Usage of the org.apache.lucene.analysis.Analyzer.tokenStream() method, with code examples

This article collects a number of Java code examples for the org.apache.lucene.analysis.Analyzer.tokenStream() method and shows how it is used in practice. The examples come from selected projects on GitHub, Stack Overflow, Maven and similar platforms, so they should serve as a useful reference. Details of the Analyzer.tokenStream() method:
Package: org.apache.lucene.analysis
Class: Analyzer
Method: tokenStream

About Analyzer.tokenStream

Creates a TokenStream which tokenizes all the text in the provided Reader. Provided for backward compatibility only.
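
Before diving into the collected examples, here is a minimal, self-contained sketch of the usual way to consume the returned TokenStream (an illustrative sketch assuming Lucene 5.x or newer with the lucene-analyzers-common module on the classpath; the field name "body" and the sample text are arbitrary choices):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamDemo {
  public static void main(String[] args) throws IOException {
    List<String> terms = new ArrayList<>();
    try (Analyzer analyzer = new StandardAnalyzer();
        // the field name only selects the per-field analysis chain; any name works here
        TokenStream ts = analyzer.tokenStream("body", "Some text to analyze")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                        // must be called before the first incrementToken()
      while (ts.incrementToken()) {
        terms.add(termAtt.toString());   // collect each produced term
      }
      ts.end();                          // records end-of-stream state (e.g. final offset)
    }
    System.out.println(terms);           // prints the lowercased terms produced by the analyzer
  }
}

The contract is always reset(), then incrementToken() in a loop, then end() and close(). Several of the snippets below were written against Lucene 3.x, where the now-removed TermAttribute was used instead of CharTermAttribute and the reset() call was often omitted.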

Code examples

Code example source: stackoverflow.com

public final class LuceneUtil {

 private LuceneUtil() {}

 public static List<String> tokenizeString(Analyzer analyzer, String string) {
  List<String> result = new ArrayList<String>();
  try {
   TokenStream stream  = analyzer.tokenStream(null, new StringReader(string));
   stream.reset();
   while (stream.incrementToken()) {
    result.add(stream.getAttribute(CharTermAttribute.class).toString());
   }
   stream.end();
   stream.close(); // release the stream so the analyzer can be reused
  } catch (IOException e) {
   // not thrown b/c we're using a string reader...
   throw new RuntimeException(e);
  }
  return result;
 }

}

Code example source: stackoverflow.com

public final class LuceneUtils {

  public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {

    List<String> result = new ArrayList<String>();
    // Note: this snippet targets Lucene 3.x; TermAttribute was removed in Lucene 4.0
    // (use CharTermAttribute instead) and stream.reset() is now required before incrementToken().
    TokenStream stream  = analyzer.tokenStream(field, new StringReader(keywords));

    try {
      while(stream.incrementToken()) {
        result.add(stream.getAttribute(TermAttribute.class).term());
      }
    }
    catch(IOException e) {
      // not thrown b/c we're using a string reader...
    }

    return result;
  }  
}

Code example source: looly/hutool

@Override
public Result parse(CharSequence text) {
  TokenStream stream;
  try {
    stream = analyzer.tokenStream("text", StrUtil.str(text));
    stream.reset();
  } catch (IOException e) {
    throw new TokenizerException(e);
  }
  return new AnalysisResult(stream);
}

Code example source: apache/usergrid

public static List<String> keywords( String source ) {
    List<String> keywords = new ArrayList<String>();
    TokenStream ts = null;
    try {
      ts = analyzer.tokenStream( "keywords", new StringReader( source ) );
      ts.reset();
      while ( ts.incrementToken() ) {
        keywords.add( ts.getAttribute( CharTermAttribute.class ).toString() );
      }
      ts.end();
    }
    catch ( IOException e ) {
      logger.error( "Error getting keywords ", e );
    }
    finally {
      if ( ts != null ) { // guard: tokenStream() may have thrown before ts was assigned
        try {
           ts.close();
        } catch (IOException ignored) {}
      }
    }
    return keywords;
  }
}

Code example source: pranab/chombo

/**
 * @param text the text to tokenize
 * @param analyzer the analyzer to apply
 * @return the list of tokens produced by the analyzer
 * @throws IOException if the token stream cannot be read
 */
public static List<String> tokenize(String text, Analyzer analyzer) throws IOException {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  List<String> tokens = new ArrayList<String>();
  CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
  stream.reset(); // required before incrementToken() in Lucene 4.x and later
  while (stream.incrementToken()) {
    String token = termAttribute.toString();
    tokens.add(token);
  } 
  
  return tokens;
}

Code example source: oracle/opengrok

private SToken[] getTokens(String text) throws IOException {
  //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
  //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter        
  ArrayList<SToken> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("full", text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
      result.add(t);
    }
    ts.end();
  }
  return result.toArray(new SToken[result.size()]);
}

Code example source: tdunning/MiA

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required on Lucene 4.x+; a harmless no-op on older 3.x versions
    while (ts.incrementToken()) {
     String s = ts.getAttribute(CharTermAttribute.class).toString();
     words.add(s);
    }
    /*overallCounts.addAll(words);*/
   }
}

Code example source: sirensolutions/siren

private CachingTokenFilter getBuffer(Analyzer analyzer, FieldQueryNode fieldNode) {
 final TokenStream source;
 final String text = fieldNode.getTextAsString();
 final String field = fieldNode.getFieldAsString();
 try {
  source = analyzer.tokenStream(field, new StringReader(text));
  source.reset();
 }
 catch (final IOException e1) {
  throw new RuntimeException(e1);
 }
 return new CachingTokenFilter(source);
}

Code example source: sanluan/PublicCMS

/**
 * @param text the text to tokenize
 * @return the set of distinct tokens produced by the analyzer
 */
public Set<String> getToken(String text) {
  Set<String> list = new LinkedHashSet<>();
  if (CommonUtils.notEmpty(text)) {
    try (StringReader stringReader = new StringReader(text);
        TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        list.add(charTermAttribute.toString());
      }
      tokenStream.end();
      return list;
    } catch (IOException e) {
      return list;
    }
  }
  return list;
}

Code example source: tamingtext/book

public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
  if (field == null) return new String[0];
  if (!(field.getValue() instanceof String)) return new String[0];
  //<start id="mahout.bayes.tokenize"/>
  String input = (String) field.getValue();
  
  ArrayList<String> tokenList = new ArrayList<String>();
  TokenStream ts = analyzer.tokenStream(inputField,
      new StringReader(input));
  ts.reset(); // required before incrementToken() on Lucene 4.x and later
  while (ts.incrementToken()) {
   tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
  }
  String[] tokens = tokenList.toArray(new String[tokenList.size()]);
  //<end id="mahout.bayes.tokenize"/>
  return tokens;
 }
}

Code example source: stackoverflow.com

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
TokenStream tokenStream = analyzer.tokenStream("CONTENT", new StringReader("c/d e/f n/a"));
CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
int pos = 0;
while (tokenStream.incrementToken()) {
  String termStr = term.toString();
  int incr = position.getPositionIncrement();
  if (incr == 0 ) {
    System.out.print(" [" + termStr + "]");
  } else {
    pos += incr;
    System.out.println(" " + pos + ": [" + termStr +"]");
  }
}

Code example source: larsga/Duke

/**
 * Parses the query. Using this instead of a QueryParser in order
 * to avoid thread-safety issues with Lucene's query parser.
 *
 * @param fieldName the name of the field
 * @param value the value of the field
 * @return the parsed query
 */
private Query parseTokens(String fieldName, String value) {
 BooleanQuery searchQuery = new BooleanQuery();
 if (value != null) {
  Analyzer analyzer = new KeywordAnalyzer();
  try {
   TokenStream tokenStream =
    analyzer.tokenStream(fieldName, new StringReader(value));
   tokenStream.reset();
   CharTermAttribute attr =
    tokenStream.getAttribute(CharTermAttribute.class);
   while (tokenStream.incrementToken()) {
    String term = attr.toString();
    Query termQuery = new TermQuery(new Term(fieldName, term));
    searchQuery.add(termQuery, Occur.SHOULD);
   }
  } catch (IOException e) {
   throw new DukeException("Error parsing input string '" + value + "' " +
               "in field " + fieldName);
  }
 }
 return searchQuery;
}

Code example source: INL/BlackLab

public static void main(String[] args) throws IOException {
  String TEST_STR = "Hé jij И!  раскази и повѣсти. Ст]' Дѣдо  	Нисторъ. Ива";
  try (Analyzer a = new BLStandardAnalyzer()) {
    TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(new String(ta.buffer(), 0, ta.length()));
    }
    TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
        new StringReader(TEST_STR));
    ta = ts2.addAttribute(CharTermAttribute.class);
    while (ts2.incrementToken()) {
      System.out.println(new String(ta.buffer(), 0, ta.length()));
    }
  }
}

Code example source: org.elasticsearch/elasticsearch

try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    skipTerms.add(new Term(fieldName, termAtt.toString()));

Code example source: tdunning/MiA

public static void main(String[] args) throws IOException {
  FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);     
  StringReader in = new StringReader("text to magically vectorize");
  TokenStream ts = analyzer.tokenStream("body", in);
  TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
  Vector v1 = new RandomAccessSparseVector(100);                   
  while (ts.incrementToken()) {
   char[] termBuffer = termAtt.termBuffer();
   int termLen = termAtt.termLength();
   String w = new String(termBuffer, 0, termLen);                 
   encoder.addToVector(w, 1, v1);                                 
  }
  System.out.printf("%s\n", new SequentialAccessSparseVector(v1));
}

Code example source: org.elasticsearch/elasticsearch

private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer,
                   String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
  // Logic similar to QueryParser#getFieldQuery
  try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
    source.reset();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    BytesRefBuilder builder = new BytesRefBuilder();
    while (source.incrementToken()) {
      // UTF-8
      builder.copyChars(termAtt);
      query.add(new Term(field, builder.toBytesRef()));
    }
  }
  query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
  query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
  return query;
}

Code example source: org.elasticsearch/elasticsearch

"term vectors, you must provide an Analyzer");
try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
  int tokenCount = 0;
  ts.reset();
  while (ts.incrementToken()) {
    String word = termAtt.toString();
    tokenCount++;
