我正在使用 Solr 和 Lily(HBase Indexer)对存储在 HBase 中的 Twitter 数据建立索引,并生成基于关键字(如“hadoop”、“bigdata”、“computer science”等)的搜索结果。
hbase中的一行如下所示:
838720557562609665:1488801538:180782707: column=json:tweetJSON, timestamp=1488801607097, value={"created_at":"Mon Mar 06 11:58:58 +0000 2017","id":838720557562609665,"i
d_str":"838720557562609665","text":"RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote #fintech #BigData #IoT
#insurtech #cryptocurrency\x5Cu2026 ","source":"\x5Cu003ca href=\x5C"https:\x5C/\x5C/about.twitter.com\x5C/products\x5C/tweetd
eck\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eTweetDeck\x5Cu003c\x5C/a\x5Cu003e","truncated":false,"in_reply_to_status_id":null,"in_r
eply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"i
d":180782707,"id_str":"180782707","name":"bitiji","screen_name":"bitiji","location":"Zevilla ciberespaci\x5Cu00e1","url":"http:
\x5C/\x5C/bitiji.com","description":"Nac\x5Cu00ed, crec\x5Cu00ed, me viraliz\x5Cu00e9 y mor\x5Cu00ed... y vuerta a empez\x5Cu0
0e1. hacia el infinito y + all\x5Cu00e1 .\x5Cr\x5CnMatria del eco(NO)sistema @bitiji","protected":false,"verified":false,"foll
owers_count":964,"friends_count":700,"listed_count":157,"favourites_count":124,"statuses_count":17870,"created_at":"Fri Aug 20
13:31:49 +0000 2010","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_tra
nslator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_bac
kground_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\
x5C/profile_background_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_tile":true,"profile_link_color":"
0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_us
e_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2f056cd86881e4
91f42c4bd942f5c5be_normal.png","profile_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2
f056cd86881e491f42c4bd942f5c5be_normal.png","profile_banner_url":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/1807827
07\x5C/1398242563","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notificat
ions":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Mar 01 13:35:0
5 +0000 2017","id":836932805879820288,"id_str":"836932805879820288","text":"#Blockchain Technology Breakdown [img] by @FollowMy
Vote #fintech #BigData #IoT #insurtech #cryptocurrency\x5Cu2026 https:\x5C/\x5C/t.co\x5C/KuYmu4lh8A","display_text_range":[0,1
40],"source":"\x5Cu003ca href=\x5C"http:\x5C/\x5C/www.hootsuite.com\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eHootsuite\x5Cu003c\x5C/
a\x5Cu003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply
_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3122211,"id_str":"3122211","name":"eraser ju\x5Cu24b6njo * \x
5Cu2718 \x5Cu2605","screen_name":"eraser","location":"Sevilla","url":"http:\x5C/\x5C/e-learning-teleformacion.blogspot.com","de
scription":"PhD student @fceyeUS @unisevilla elige la clave dela vida abcchdefghij... \x5Cu2718\x5Cu24d4-\x5Cu24dd\x5Cu24d0\x5C
u24e4\x5Cu24e3\x5Cu24d0, \x5Cu24d4-\x5Cu24dc\x5Cu24d4\x5Cu24dd\x5Cu24e3\x5Cu24d4 Sevilla \x5Cu2605 elearning \x5Cu2605\x5Cu24b6
r\x5Cu24e3\x5Cu2605 education \x5Cu2605 P2P \x5Cu2605 blockchain \x5Cu2605 economy","protected":false,"verified":false,"followe
rs_count":21208,"friends_count":11566,"listed_count":2074,"favourites_count":4946,"statuses_count":474839,"created_at":"Sun Apr
01 12:12:45 +0000 2007","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"en","contributors_enabled":false,"is
_translator":false,"profile_background_color":"9AE4E8","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile
_background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_image_url_https":"https:\x5C/\x
5C/pbs.twimg.com\x5C/profile_background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_til
e":true,"profile_link_color":"0000FF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E0FF92","profile_te
xt_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C
/599157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/59
9157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_banner_url":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/3122211\x
5C/1438841267","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications
":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"#
Blockchain Technology Breakdown [img] by @FollowMyVote #fintech #BigData #IoT #insurtech #cryptocurrency #smartcities #DeepLea
rning https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_text_range":[0,133],"entities":{"hashtags":[{"text":"Blockchain","indices":
[0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74]},{"text":"IoT","indices":[75,79]},{"text":"ins
urtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]},{"text":"smartcities","indices":[107,119]},{"text":"Dee
pLearning","indices":[120,133]}],"urls":[],"user_mentions":[{"screen_name":"FollowMyVote","name":"FollowMyVote","id":392924202,
"id_str":"392924202","indices":[42,55]}],"symbols":[],"media":[{"id":836932802947973120,"id_str":"836932802947973120","indices"
:[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_https":"https:\x5C/\x5C/pbs.
twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url":"pic.twitter.com\x5C/Ppb
Sfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/photo\x5C/1","type":"phot
o","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":480,"h":1200,"res
ize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":836932802947973120,"id_str":"83693
2802947973120","indices":[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_http
s":"https:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url":
"pic.twitter.com\x5C/PpbSfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/p
hoto\x5C/1","type":"photo","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium
":{"w":480,"h":1200,"resize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]}},"retweet_count":15,"favorite_count":5,"entiti
es":{"hashtags":[{"text":"Blockchain","indices":[0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74
]},{"text":"IoT","indices":[75,79]},{"text":"insurtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]}],"urls"
:[{"url":"https:\x5C/\x5C/t.co\x5C/KuYmu4lh8A","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/i\x5C/web\x5C/status\x5C/8369328
05879820288","display_url":"twitter.com\x5C/i\x5C/web\x5C/status\x5C/8\x5Cu2026","indices":[108,131]}],"user_mentions":[{"scree
n_name":"FollowMyVote","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[42,55]}],"symbols":[]},"favorited":
false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,
"favorite_count":0,"entities":{"hashtags":[{"text":"Blockchain","indices":[12,23]},{"text":"fintech","indices":[69,77]},{"text"
:"BigData","indices":[78,86]},{"text":"IoT","indices":[87,91]},{"text":"insurtech","indices":[92,102]},{"text":"cryptocurrency"
,"indices":[103,118]}],"urls":[{"url":"","expanded_url":null,"indices":[120,120]}],"user_mentions":[{"screen_name":"eraser","na
me":"eraser ju\x5Cu24b6njo * \x5Cu2718 \x5Cu2605","id":3122211,"id_str":"3122211","indices":[3,10]},{"screen_name":"FollowMyVot
e","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[54,67]}],"symbols":[]},"favorited":false,"retweeted":fa
lse,"filter_level":"low","lang":"en","timestamp_ms":"1488801538254"}\x0D\x0A
838720557562609665:1488801538:180782707: column=tweetdata:coordinates, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=tweetdata:created_at, timestamp=1488801607097, value=1488801538
838720557562609665:1488801538:180782707: column=tweetdata:created_time_lucene, timestamp=1488801607097, value=2017-03-06T11:58:58Z
838720557562609665:1488801538:180782707: column=tweetdata:hashtags, timestamp=1488801607097, value=Blockchain, fintech, BigData, IoT, insurtech, cryptocurrency
838720557562609665:1488801538:180782707: column=tweetdata:id, timestamp=1488801607097, value=838720557562609665
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_screen_name, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_status_id, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_user_id, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=tweetdata:place, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_id, timestamp=1488801607097, value=836932805879820288
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_text, timestamp=1488801607097, value=#Blockchain Technology Breakdown [img] by @FollowMyVote
#fintech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_user_id, timestamp=1488801607097, value=3122211
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_user_name, timestamp=1488801607097, value=eraser ju\xE2\x92\xB6njo * \xE2\x9C\x98 \xE2\x98\x8
5
838720557562609665:1488801538:180782707: column=tweetdata:source, timestamp=1488801607097, value=<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">T
weetDeck</a>
838720557562609665:1488801538:180782707: column=tweetdata:text, timestamp=1488801607097, value=RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote #fin
tech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6
838720557562609665:1488801538:180782707: column=tweetdata:urls, timestamp=1488801607097, value=
838720557562609665:1488801538:180782707: column=tweetdata:usermentions, timestamp=1488801607097, value=eraser, FollowMyVote
838720557562609665:1488801538:180782707: column=user:followers_count, timestamp=1488801607097, value=964
838720557562609665:1488801538:180782707: column=user:following_count, timestamp=1488801607097, value=NA
838720557562609665:1488801538:180782707: column=user:friends_count, timestamp=1488801607097, value=700
838720557562609665:1488801538:180782707: column=user:id, timestamp=1488801607097, value=180782707
838720557562609665:1488801538:180782707: column=user:profile_image_url, timestamp=1488801607097, value=http://pbs.twimg.com/profile_images/2859966744/2f056cd86881e491f4
2c4bd942f5c5be_normal.png
838720557562609665:1488801538:180782707: column=user:screen_name, timestamp=1488801607097, value=bitiji
838720557562609665:1488801538:180782707: column=user:timezone, timestamp=1488801607097, value=Madrid
我已经完成了 Solr 和 Lily 的设置,现在只剩下最后一步,即添加索引器:
./bin/hbase-indexer add-indexer -n myindexer -c indexdemo-indexer.xml \
-cp solr.zk=localhost:2181/solr -cp solr.collection=collection1
要执行上述命令,我需要创建 indexdemo-indexer.xml 文件。示例:
<?xml version="1.0"?>
<indexer table="indexdemo-user">
<field name="firstname_s" value="info:firstname"/>
<field name="lastname_s" value="info:lastname"/>
<field name="age_i" value="info:age" type="int"/>
</indexer>
如何为我的数据(即上面给出的 HBase 示例行)创建上述文件?(注意:我猜可以使用 column=tweetdata:hashtags 或 column=tweetdata:text 来实现,但具体该怎么做?)
1条答案
按热度按时间x6yk4ghg1#
这里有一种方法: