This article collects code examples of the Java method org.apache.lucene.analysis.Analyzer.tokenStream() and shows how Analyzer.tokenStream() is used in practice. The examples are drawn from selected projects on GitHub, Stack Overflow, Maven, and similar platforms, so they serve as useful references. Details of the Analyzer.tokenStream() method:
Package path: org.apache.lucene.analysis.Analyzer
Class name: Analyzer
Method name: tokenStream
Description: Creates a TokenStream which tokenizes all the text in the provided Reader. Provided for backward compatibility only.
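Before the collected examples, here is a minimal sketch of the usual way to consume the stream returned by tokenStream(): obtain the CharTermAttribute, call reset(), loop with incrementToken(), then call end() and close the stream. This block is an illustrative sketch (it assumes a Lucene 5.x or later StandardAnalyzer with a no-arg constructor), not code taken from any of the projects below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamDemo {

    public static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> result = new ArrayList<>();
        // try-with-resources closes the stream so the analyzer's reusable components are released
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                     // required before the first incrementToken()
            while (stream.incrementToken()) {
                result.add(termAtt.toString()); // copy the term out of the shared buffer
            }
            stream.end();                       // records end-of-stream offsets/positions
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokens(new StandardAnalyzer(), "Hello Lucene token streams"));
    }
}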
Code example source: stackoverflow.com

public final class LuceneUtil {

    private LuceneUtil() {}

    public static List<String> tokenizeString(Analyzer analyzer, String string) {
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close(); // release the stream so the analyzer can be reused for the next call
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        return result;
    }
}
Code example source: stackoverflow.com

public final class LuceneUtils {

    public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
        List<String> result = new ArrayList<String>();
        // TermAttribute is the pre-Lucene-4 API; Lucene 4+ removed it in favour of CharTermAttribute
        TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
        try {
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(TermAttribute.class).term());
            }
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
        }
        return result;
    }
}
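The example above no longer compiles on Lucene 4.0+, where TermAttribute was removed. A rough modern equivalent using CharTermAttribute and the now-mandatory reset()/end()/close() calls might look like the following sketch (not code from the cited answer):

public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // not expected when reading from a StringReader
        throw new RuntimeException(e);
    }
    return result;
}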
Code example source: looly/hutool (the same method also ships in the cn.hutool/hutool-all artifact)

@Override
public Result parse(CharSequence text) {
    TokenStream stream;
    try {
        stream = analyzer.tokenStream("text", StrUtil.str(text));
        stream.reset();
    } catch (IOException e) {
        throw new TokenizerException(e);
    }
    return new AnalysisResult(stream);
}
Code example source: apache/usergrid

public static List<String> keywords(String source) {
    List<String> keywords = new ArrayList<String>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("keywords", new StringReader(source));
        ts.reset();
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(CharTermAttribute.class).toString());
        }
        ts.end();
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    } finally {
        if (ts != null) { // guard against tokenStream() having thrown before ts was assigned
            try {
                ts.close();
            } catch (IOException ignored) {}
        }
    }
    return keywords;
}
Code example source: pranab/chombo

/**
 * @param text the text to tokenize
 * @param analyzer the analyzer used to produce the token stream
 * @return the list of token strings
 * @throws IOException
 */
public static List<String> tokenize(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    List<String> tokens = new ArrayList<String>();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    stream.reset(); // required before incrementToken() on recent Lucene versions
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        tokens.add(token);
    }
    return tokens;
}
Code example source: oracle/opengrok

private SToken[] getTokens(String text) throws IOException {
    //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
    //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
Code example source: tdunning/MiA

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before incrementToken() on recent Lucene versions
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    /*overallCounts.addAll(words);*/
}
Code example source: sirensolutions/siren

private CachingTokenFilter getBuffer(Analyzer analyzer, FieldQueryNode fieldNode) {
    final TokenStream source;
    final String text = fieldNode.getTextAsString();
    final String field = fieldNode.getFieldAsString();
    try {
        source = analyzer.tokenStream(field, new StringReader(text));
        source.reset();
    } catch (final IOException e1) {
        throw new RuntimeException(e1);
    }
    return new CachingTokenFilter(source);
}
Code example source: sanluan/PublicCMS

/**
 * @param text the text to tokenize
 * @return the set of distinct tokens
 */
public Set<String> getToken(String text) {
    Set<String> list = new LinkedHashSet<>();
    if (CommonUtils.notEmpty(text)) {
        try (StringReader stringReader = new StringReader(text);
                TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            tokenStream.end();
            return list;
        } catch (IOException e) {
            return list;
        }
    }
    return list;
}
Code example source: tamingtext/book

public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
    if (field == null) return new String[0];
    if (!(field.getValue() instanceof String)) return new String[0];
    //<start id="mahout.bayes.tokenize"/>
    String input = (String) field.getValue();
    ArrayList<String> tokenList = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(inputField,
        new StringReader(input));
    ts.reset(); // required before incrementToken() on recent Lucene versions
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    //<end id="mahout.bayes.tokenize"/>
    return tokens;
}
Code example source: stackoverflow.com

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
TokenStream tokenStream = analyzer.tokenStream("CONTENT", new StringReader("c/d e/f n/a"));
CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
int pos = 0;
while (tokenStream.incrementToken()) {
    String termStr = term.toString();
    int incr = position.getPositionIncrement();
    if (incr == 0) {
        System.out.print(" [" + termStr + "]");
    } else {
        pos += incr;
        System.out.println(" " + pos + ": [" + termStr + "]");
    }
}
Code example source: larsga/Duke

/**
 * Parses the query. Using this instead of a QueryParser in order
 * to avoid thread-safety issues with Lucene's query parser.
 *
 * @param fieldName the name of the field
 * @param value the value of the field
 * @return the parsed query
 */
private Query parseTokens(String fieldName, String value) {
    BooleanQuery searchQuery = new BooleanQuery();
    if (value != null) {
        Analyzer analyzer = new KeywordAnalyzer();
        try {
            TokenStream tokenStream =
                analyzer.tokenStream(fieldName, new StringReader(value));
            tokenStream.reset();
            CharTermAttribute attr =
                tokenStream.getAttribute(CharTermAttribute.class);
            while (tokenStream.incrementToken()) {
                String term = attr.toString();
                Query termQuery = new TermQuery(new Term(fieldName, term));
                searchQuery.add(termQuery, Occur.SHOULD);
            }
        } catch (IOException e) {
            throw new DukeException("Error parsing input string '" + value + "' " +
                "in field " + fieldName);
        }
    }
    return searchQuery;
}
Code example source: INL/BlackLab

public static void main(String[] args) throws IOException {
    String TEST_STR = "Hé jij И! раскази и повѣсти. Ст]' Дѣдо Нисторъ. Ива";
    try (Analyzer a = new BLStandardAnalyzer()) {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
            new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
    }
}
Code example source: org.elasticsearch/elasticsearch

try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        skipTerms.add(new Term(fieldName, termAtt.toString()));
    }
}
Code example source: tdunning/MiA

public static void main(String[] args) throws IOException {
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    StringReader in = new StringReader("text to magically vectorize");
    TokenStream ts = analyzer.tokenStream("body", in);
    // TermAttribute and termBuffer()/termLength() are the pre-Lucene-4 API,
    // replaced by CharTermAttribute in later versions
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    Vector v1 = new RandomAccessSparseVector(100);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        String w = new String(termBuffer, 0, termLen);
        encoder.addToVector(w, 1, v1);
    }
    System.out.printf("%s\n", new SequentialAccessSparseVector(v1));
}
Code example source: org.elasticsearch/elasticsearch

private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer,
        String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        BytesRefBuilder builder = new BytesRefBuilder();
        while (source.incrementToken()) {
            // UTF-8
            builder.copyChars(termAtt);
            query.add(new Term(field, builder.toBytesRef()));
        }
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
}
Code example source: org.elasticsearch/elasticsearch

        "term vectors, you must provide an Analyzer");
try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    int tokenCount = 0;
    ts.reset();
    while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
The content above was collected from the web; if it infringes your rights, please contact the author to have it removed.