Usage of the org.apache.lucene.util.fst.Util class, with code examples


This article collects Java code examples for org.apache.lucene.util.fst.Util, showing how the Util class is used in practice. The snippets were gathered from selected projects on GitHub/Stack Overflow/Maven and are meant as references rather than complete programs. Details of the Util class:
Package: org.apache.lucene.util.fst
Class name: Util

About Util

Static helper methods.
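Before the excerpts, here is a minimal self-contained sketch of the typical workflow on the pre-9.x Builder API that all of the examples below use: feed sorted terms through Util.toIntsRef into a Builder, finish the FST, and look a term up again with Util.get. This sketch is not taken from any of the quoted projects; the class name, terms, and weights are invented for illustration.

import java.io.IOException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstUtilDemo {                      // hypothetical demo class
  public static void main(String[] args) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

    String[] terms = {"cat", "dog", "dogs"};    // inputs must be added in sorted (byte) order
    long[] weights = {5, 7, 12};                // invented weights

    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (int i = 0; i < terms.length; i++) {
      // Util.toIntsRef converts the term's UTF-8 bytes into the IntsRef form Builder expects
      builder.add(Util.toIntsRef(new BytesRef(terms[i]), scratchInts), weights[i]);
    }
    FST<Long> fst = builder.finish();

    // Util.get is an exact-match lookup by input
    System.out.println(Util.get(fst, new BytesRef("dog"))); // prints 7
  }
}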

Code examples

Example source: org.apache.lucene/lucene-core

final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                           0, 0, true, false, Integer.MAX_VALUE,
                           outputs, true, 15);
assert bytes.length > 0;
scratchBytes.writeTo(bytes, 0);
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset();
index = indexBuilder.finish();

Example source: org.apache.lucene/lucene-core

SegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
if (f.nextEnt == -1) {
 out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
 if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
  out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
  throw new RuntimeException("seek state is broken");
 BytesRef output = Util.get(fr.index, prefix);
 if (output == null) {
  out.println("      broken seek state: prefix is not final in index");

Example source: org.apache.lucene/lucene-core

/** Reverse lookup (lookup by output instead of by input),
 *  in the special case when your FSTs outputs are
 *  strictly ascending.  This locates the input/output
 *  pair where the output is equal to the target, and will
 *  return null if that output does not exist.
 *
 *  <p>NOTE: this only works with {@code FST<Long>}, only
 *  works when the outputs are ascending in order with
 *  the inputs.
 *  For example, simple ordinals (0, 1,
 *  2, ...), or file offsets (when appending to a file)
 *  fit this. */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
 final BytesReader in = fst.getBytesReader();
 // TODO: would be nice not to alloc this on every lookup
 FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
 
 FST.Arc<Long> scratchArc = new FST.Arc<>();
 final IntsRefBuilder result = new IntsRefBuilder();
 return getByOutput(fst, targetOutput, in, arc, scratchArc, result);
}
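A hedged usage sketch for getByOutput (data invented; imports as in the introductory sketch plus org.apache.lucene.util.IntsRef and org.apache.lucene.util.BytesRefBuilder): per the Javadoc above, the outputs must ascend with the sorted inputs, so this example assigns ordinals 0, 1, 2 and then maps an ordinal back to its term.

PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
String[] sortedTerms = {"cat", "dog", "dogs"};   // already in byte order
for (int ord = 0; ord < sortedTerms.length; ord++) {
  // strictly ascending ordinal outputs satisfy getByOutput's precondition
  builder.add(Util.toIntsRef(new BytesRef(sortedTerms[ord]), scratchInts), (long) ord);
}
FST<Long> fst = builder.finish();

// Reverse lookup: which input carries output 2?
IntsRef key = Util.getByOutput(fst, 2);
BytesRefBuilder scratchBytes = new BytesRefBuilder();
System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // expected: dogs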

Example source: org.apache.lucene/lucene-core

emitDotState(out, "initial", "point", "white", "");
 emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
     emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
    out.write("  " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");

Example source: org.elasticsearch/elasticsearch

BytesRefBuilder scratch = new BytesRefBuilder();
      new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
  for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
   Util.toBytesRef(string, scratch);
   if (scratch.length() > Short.MAX_VALUE-2) {
    throw new IllegalArgumentException(
      "cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")");
   short analyzedLength = (short) scratch.length();
 Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  analyzed.append((byte) dedup);
  Util.toIntsRef(analyzed.get(), scratchInts);
   builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
  } else {
   int payloadOffset = input.getPosition() + surface.length;
   System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
   br.length = br.bytes.length;
   builder.add(scratchInts.get(), outputs.newPair(cost, br));

Example source: org.apache.lucene/lucene-analyzers-common

BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
 BytesRef scratch1 = new BytesRef();
 BytesRef scratch2 = new BytesRef();
 IntsRefBuilder currentOrds = new IntsRefBuilder();
  } else {
   encodeFlags(flagsScratch, wordForm);
   int ord = flagLookup.add(flagsScratch.get());
   if (ord < 0) {
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
 Util.toUTF32(currentEntry, scratchInts);
 words.add(scratchInts.get(), currentOrds.get());
 success2 = true;
} finally {
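The Hunspell code above feeds Util.toUTF32 output into an FST keyed by Unicode code points. A small standalone sketch (made-up input string, not taken from the code above; IntsRef import assumed) contrasting toUTF32 with toIntsRef:

IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef codePoints = Util.toUTF32("héllo", scratch);                              // 5 ints, one per Unicode code point
IntsRef utf8Bytes = Util.toIntsRef(new BytesRef("héllo"), new IntsRefBuilder());  // 6 ints, one per UTF-8 byte
// Builders created with FST.INPUT_TYPE.BYTE4 (e.g. the SynonymMap builders quoted later
// in this article) take the code-point form; BYTE1 builders take the byte form.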

Example source: harbby/presto-connectors

BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount-1] = b;
for(int i=token.length()-1;i>=0;i--) {
 if (token.byteAt(i) == separator) {
  BytesRef context = new BytesRef(token.bytes(), 0, i);
  Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
  assert output != null;
  contextCount = decodeWeight(output);
  lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
  break;
 searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
  token.setLength(prefixLength);
  Util.toBytesRef(completion.input, suffix);
  token.append(suffix);

Example source: harbby/presto-connectors

BytesRef scratch = new BytesRef();
InputIterator iter = new WFSTInputIterator(iterator);
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
 long cost = iter.weight();
 if (previous == null) {
  previous = new BytesRefBuilder();
 } else if (scratch.equals(previous.get())) {
  continue; // for duplicate suggestions, the best weight is actually
            // added
 }
 Util.toIntsRef(scratch, scratchInts);
 builder.add(scratchInts.get(), cost);
 previous.copyBytes(scratch);
 count++;
}
fst = builder.finish();

Example source: org.elasticsearch/elasticsearch

public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
  int deduplicator = 0;
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  analyzed.grow(analyzed.length());
  for (int i = 0; i < count; i++) {
    analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    SurfaceFormAndPayload candidate = surfaceFormsAndPayload[i];
    long cost = candidate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candidate.weight;
    builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
  }
  seenSurfaceForms.clear();
  count = 0;
}

Example source: lintool/warcbase

public String getUrl(int id) {
 BytesRef scratchBytes = new BytesRef();
 IntsRef key = null;
 try {
  key = Util.getByOutput(fst, id);
 } catch (IOException e) {
  LOG.error("Error id " + id);
  e.printStackTrace();
  return null;
 }
 if (key == null) {
  return null;
 }
 return Util.toBytesRef(key, scratchBytes).utf8ToString();
}

Example source: org.apache.lucene/lucene-analyzers-common

new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
 scratch.grow(estimatedSize);
 scratchOutput.reset(scratch.bytes());
 scratch.setLength(scratchOutput.getPosition());
 builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);

Example source: org.apache.lucene/lucene-spellchecker

/**
  * Builds the final automaton from a list of entries.
  */
 private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  // Build the automaton.
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object empty = outputs.getNoOutput();
  final Builder<Object> builder = new Builder<Object>(
    FST.INPUT_TYPE.BYTE1, 0, 0, true, true, 
    shareMaxTailLength, outputs, null, false);
  
  BytesRef scratch = new BytesRef();
  BytesRef entry;
  final IntsRef scratchIntsRef = new IntsRef();
  int count = 0;
  BytesRefIterator iter = sorter.iterator();
  while((entry = iter.next()) != null) {
   count++;
   if (scratch.compareTo(entry) != 0) {
    builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
    scratch.copyBytes(entry);
   }
  }
  
  return count == 0 ? null : builder.finish();
 }
}
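Because buildAutomaton uses NoOutputs, the resulting FST is effectively a sorted set of byte sequences. A hedged sketch (invented terms, and the simpler two-argument Builder constructor instead of the one above; imports and IOException handling as in the introductory sketch) of building and probing such a set:

Outputs<Object> outputs = NoOutputs.getSingleton();
Object empty = outputs.getNoOutput();
Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratch = new IntsRefBuilder();
for (String term : new String[] {"alpha", "beta", "gamma"}) {  // must be added in sorted order
  builder.add(Util.toIntsRef(new BytesRef(term), scratch), empty);
}
FST<Object> set = builder.finish();

// Membership test: Util.get should return the (non-null) no-output singleton for a
// present key and null for an absent one.
boolean present = Util.get(set, new BytesRef("beta")) != null;   // true
boolean missing = Util.get(set, new BytesRef("delta")) == null;  // true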

Example source: org.apache.lucene/lucene-codecs

OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
if (f.nextEnt == -1) {
 out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd);
 if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
  out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
  throw new RuntimeException("seek state is broken");
 Output output = Util.get(fr.index, prefix);
 if (output == null) {
  out.println("      broken seek state: prefix is not final in index");

Example source: org.apache.lucene/lucene-spellchecker

@Override
public void build(TermFreqIterator iterator) throws IOException {
 BytesRef scratch = new BytesRef();
 TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
   BytesRef.getUTF8SortedAsUnicodeComparator());
 IntsRef scratchInts = new IntsRef();
 BytesRef previous = null;
 PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
 Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
 while ((scratch = iter.next()) != null) {
  long cost = iter.weight();
  
  if (previous == null) {
   previous = new BytesRef();
  } else if (scratch.equals(previous)) {
   continue; // for duplicate suggestions, the best weight is actually
        // added
  }
  Util.toIntsRef(scratch, scratchInts);
  builder.add(scratchInts, cost);
  previous.copyBytes(scratch);
 }
 fst = builder.finish();
}

Example source: org.apache.lucene/lucene-analyzers

new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
 scratch.grow(estimatedSize);
 scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
 assert scratch.offset == 0;
 builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);

Example source: org.apache.lucene/lucene-codecs

private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
 meta.writeVInt(field.number);
 meta.writeByte(FST);
 meta.writeLong(data.getFilePointer());
 PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
 Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
 IntsRefBuilder scratch = new IntsRefBuilder();
 long ord = 0;
 for (BytesRef v : values) {
  builder.add(Util.toIntsRef(v, scratch), ord);
  ord++;
 }
 FST<Long> fst = builder.finish();
 if (fst != null) {
  fst.save(data);
 }
 meta.writeVLong(ord);
}

Example source: org.apache.lucene/lucene-codecs

final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
  outputsInner);
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
while(true) {
 SimpleTextUtil.readLine(in, scratch);
 if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
  if (lastDocsStart != -1) {
   b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
     outputs.newPair(lastDocsStart,
       outputsInner.newPair((long) docFreq, totalTermFreq)));
 } else if (StringHelper.startsWith(scratch.get(), TERM)) {
  if (lastDocsStart != -1) {
   b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
     outputsInner.newPair((long) docFreq, totalTermFreq)));

Example source: lintool/warcbase

public int getID(String url) {
 Long id = null;
 try {
  id = Util.get(fst, new BytesRef(url));
 } catch (IOException e) {
  // Log error, but assume that URL doesn't exist.
  LOG.error("Error fetching " + url);
  e.printStackTrace();
  return -1;
 }
 return id == null ? -1 : id.intValue();
}
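getID is the inverse of the getUrl method quoted earlier: Util.get maps the URL bytes to the stored id, while Util.getByOutput maps an id back to its URL, which presumably relies on the FST<Long> having been built from sorted URLs with ascending ids (getByOutput's precondition). A hypothetical round trip, where mapping is an instance of the quoted class:

int id = mapping.getID("http://example.org/");  // Util.get: input -> output, -1 if unknown
String url = mapping.getUrl(id);                // Util.getByOutput: output -> input, null if unknown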

Example source: org.apache.lucene/lucene-analyzers-common

private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
 Map<String,String> mappings = new TreeMap<>();
 
 for (int i = 0; i < num; i++) {
  String line = reader.readLine();
  String parts[] = line.split("\\s+");
  if (parts.length != 3) {
   throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
  }
  if (mappings.put(parts[1], parts[2]) != null) {
   throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
  }
 }
 
 Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
 Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
 IntsRefBuilder scratchInts = new IntsRefBuilder();
 for (Map.Entry<String,String> entry : mappings.entrySet()) {
  Util.toUTF16(entry.getKey(), scratchInts);
  builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
 }
 
 return builder.finish();
}
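A hedged lookup sketch against the FST returned by parseConversions (the variable conversions and the key "someKey" are invented): because the builder uses FST.INPUT_TYPE.BYTE2 and the keys were added via Util.toUTF16, queries must be presented as UTF-16 code units as well.

FST<CharsRef> conversions = parseConversions(reader, num);  // as built above
IntsRefBuilder scratch = new IntsRefBuilder();
CharsRef mapped = Util.get(conversions, Util.toUTF16("someKey", scratch));
if (mapped != null) {
  // mapped holds the right-hand side of the conversion for "someKey"
}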

Example source: harbby/presto-connectors

/**
  * Builds the final automaton from a list of entries.
  */
 private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  // Build the automaton.
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object empty = outputs.getNoOutput();
  final Builder<Object> builder = new Builder<>(
    FST.INPUT_TYPE.BYTE1, 0, 0, true, true, 
    shareMaxTailLength, outputs, false, 
    PackedInts.DEFAULT, true, 15);
  
  BytesRefBuilder scratch = new BytesRefBuilder();
  BytesRef entry;
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  int count = 0;
  BytesRefIterator iter = sorter.iterator();
  while((entry = iter.next()) != null) {
   count++;
   if (scratch.get().compareTo(entry) != 0) {
    builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
    scratch.copyBytes(entry);
   }
  }
  
  return count == 0 ? null : builder.finish();
 }
}
