本文整理了Java中org.apache.lucene.util.fst.Util类的一些代码示例,展示了Util类的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Util类的具体详情如下:
包路径:org.apache.lucene.util.fst.Util
类名称:Util
[英]Static helper methods.
[中]静态助手方法。
代码示例来源:origin: org.apache.lucene/lucene-core
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
outputs, true, 15);
assert bytes.length > 0;
scratchBytes.writeTo(bytes, 0);
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset();
index = indexBuilder.finish();
代码示例来源:origin: org.apache.lucene/lucene-core
SegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
if (f.nextEnt == -1) {
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
throw new RuntimeException("seek state is broken");
BytesRef output = Util.get(fr.index, prefix);
if (output == null) {
out.println(" broken seek state: prefix is not final in index");
代码示例来源:origin: org.apache.lucene/lucene-core
/**
 * Looks up an input by its output (a reverse lookup). This is only
 * valid in the special case where the FST's outputs are strictly
 * ascending: it locates the input/output pair whose output equals
 * the target, and returns null when no such output exists.
 *
 * <p>NOTE: this only works with {@code FST<Long>}, and only when the
 * outputs ascend in the same order as the inputs. Simple ordinals
 * (0, 1, 2, ...) or file offsets (when appending to a file) are
 * examples that fit.
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
  final BytesReader reader = fst.getBytesReader();

  // TODO: would be nice not to alloc these on every lookup
  final FST.Arc<Long> firstArc = fst.getFirstArc(new FST.Arc<Long>());
  final FST.Arc<Long> spareArc = new FST.Arc<>();
  final IntsRefBuilder collected = new IntsRefBuilder();

  // Delegate to the overload that takes the reader and scratch objects explicitly.
  return getByOutput(fst, targetOutput, reader, firstArc, spareArc, collected);
}
代码示例来源:origin: org.apache.lucene/lucene-core
emitDotState(out, "initial", "point", "white", "");
emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
代码示例来源:origin: org.elasticsearch/elasticsearch
BytesRefBuilder scratch = new BytesRefBuilder();
new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
Util.toBytesRef(string, scratch);
if (scratch.length() > Short.MAX_VALUE-2) {
throw new IllegalArgumentException(
"cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")");
short analyzedLength = (short) scratch.length();
Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
analyzed.append((byte) dedup);
Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br));
代码示例来源:origin: org.apache.lucene/lucene-analyzers-common
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRef scratch1 = new BytesRef();
BytesRef scratch2 = new BytesRef();
IntsRefBuilder currentOrds = new IntsRefBuilder();
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
} finally {
代码示例来源:origin: harbby/presto-connectors
BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount-1] = b;
for(int i=token.length()-1;i>=0;i--) {
if (token.byteAt(i) == separator) {
BytesRef context = new BytesRef(token.bytes(), 0, i);
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
assert output != null;
contextCount = decodeWeight(output);
lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
token.setLength(prefixLength);
Util.toBytesRef(completion.input, suffix);
token.append(suffix);
代码示例来源:origin: harbby/presto-connectors
BytesRef scratch = new BytesRef();
InputIterator iter = new WFSTInputIterator(iterator);
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
long cost = iter.weight();
previous = new BytesRefBuilder();
} else if (scratch.equals(previous.get())) {
continue; // for duplicate suggestions, the best weight is actually
Util.toIntsRef(scratch, scratchInts);
builder.add(scratchInts.get(), cost);
previous.copyBytes(scratch);
count++;
fst = builder.finish();
代码示例来源:origin: org.elasticsearch/elasticsearch
/**
 * Adds every surface form collected for the current analyzed term to the
 * FST builder, then resets the per-term state.
 *
 * <p>The first {@code count} entries of {@code surfaceFormsAndPayload} are
 * sorted, and each one is added under the analyzed bytes with a trailing
 * byte that increases per entry, so every entry gets a distinct input.
 *
 * @param defaultWeight weight to encode for entries whose own weight is -1
 */
public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);

  // Extend the analyzed form by two bytes; the last one is overwritten per entry below.
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  analyzed.grow(analyzed.length());

  int deduplicator = 0;
  for (int i = 0; i < count; i++) {
    analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator);
    deduplicator++;
    Util.toIntsRef(analyzed.get(), scratchInts);

    final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[i];
    final long cost;
    if (candidate.weight == -1) {
      // No explicit weight: fall back to the (int-clamped) default.
      cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
    } else {
      cost = candidate.weight;
    }
    builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
  }

  seenSurfaceForms.clear();
  count = 0;
}
代码示例来源:origin: lintool/warcbase
/**
 * Resolves the URL whose FST output equals the given id.
 *
 * @param id the output value passed to {@code Util.getByOutput}
 * @return the matching key decoded as a UTF-8 string, or {@code null} when
 *     no key has that output or the lookup fails with an {@link IOException}
 */
public String getUrl(int id) {
  final IntsRef key;
  try {
    key = Util.getByOutput(fst, id);
  } catch (IOException e) {
    // Pass the exception to the logger so the stack trace is recorded with
    // the message instead of being printed separately to stderr.
    LOG.error("Error id " + id, e);
    return null;
  }
  if (key == null) {
    return null;
  }
  // The scratch buffer is only needed for the conversion, so create it here.
  return Util.toBytesRef(key, new BytesRef()).utf8ToString();
}
代码示例来源:origin: org.apache.lucene/lucene-analyzers-common
new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
scratch.grow(estimatedSize);
scratchOutput.reset(scratch.bytes());
scratch.setLength(scratchOutput.getPosition());
builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);
代码示例来源:origin: org.apache.lucene/lucene-spellchecker
/**
 * Constructs the final automaton from the entries supplied by the sorter.
 *
 * <p>An entry is added only when it differs from the last one added (tracked
 * in a scratch copy that starts out empty). Returns {@code null} when the
 * sorter yields no entries at all.
 */
private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object noOutput = outputs.getNoOutput();
  final Builder<Object> fstBuilder = new Builder<Object>(
      FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
      shareMaxTailLength, outputs, null, false);

  final BytesRef lastAdded = new BytesRef();
  final IntsRef inputScratch = new IntsRef();
  int seen = 0;
  final BytesRefIterator entries = sorter.iterator();
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
    seen++;
    if (lastAdded.compareTo(entry) == 0) {
      // Same bytes as the last added entry: skip the duplicate.
      continue;
    }
    fstBuilder.add(Util.toIntsRef(entry, inputScratch), noOutput);
    lastAdded.copyBytes(entry);
  }

  if (seen == 0) {
    return null;
  }
  return fstBuilder.finish();
}
}
代码示例来源:origin: org.apache.lucene/lucene-codecs
OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
if (f.nextEnt == -1) {
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd);
if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
throw new RuntimeException("seek state is broken");
Output output = Util.get(fr.index, prefix);
if (output == null) {
out.println(" broken seek state: prefix is not final in index");
代码示例来源:origin: org.apache.lucene/lucene-spellchecker
/**
 * Builds the weighted FST from the supplied term/weight iterator and stores
 * it in {@code fst}.
 *
 * <p>The iterator is wrapped in a {@code WFSTTermFreqIteratorWrapper} using
 * the UTF-8-sorted-as-Unicode comparator. A term equal to the previously
 * added one is skipped.
 */
@Override
public void build(TermFreqIterator iterator) throws IOException {
  BytesRef term = new BytesRef();
  final TermFreqIterator sorted = new WFSTTermFreqIteratorWrapper(iterator,
      BytesRef.getUTF8SortedAsUnicodeComparator());
  final IntsRef inputScratch = new IntsRef();
  BytesRef lastTerm = null;
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
  final Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

  while ((term = sorted.next()) != null) {
    final long cost = sorted.weight();
    if (lastTerm != null && term.equals(lastTerm)) {
      // For duplicate suggestions, the best weight is actually added.
      continue;
    }
    if (lastTerm == null) {
      lastTerm = new BytesRef();
    }
    Util.toIntsRef(term, inputScratch);
    fstBuilder.add(inputScratch, cost);
    lastTerm.copyBytes(term);
  }

  fst = fstBuilder.finish();
}
代码示例来源:origin: org.apache.lucene/lucene-analyzers
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
scratch.grow(estimatedSize);
scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
assert scratch.offset == 0;
builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);
代码示例来源:origin: org.apache.lucene/lucene-codecs
/**
 * Writes an FST that maps each value to its position in the iteration
 * (0, 1, 2, ...), plus the metadata describing it.
 *
 * <p>{@code meta} receives the field number, the {@code FST} constant, the
 * file pointer of {@code data} before the FST is saved, and finally the
 * number of values. The FST itself is saved to {@code data} unless the
 * builder returns {@code null}.
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());

  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
  final IntsRefBuilder inputScratch = new IntsRefBuilder();

  long valueCount = 0;
  for (BytesRef value : values) {
    // The running count doubles as the ordinal stored for this value.
    fstBuilder.add(Util.toIntsRef(value, inputScratch), valueCount++);
  }

  final FST<Long> built = fstBuilder.finish();
  if (built != null) {
    built.save(data);
  }
  meta.writeVLong(valueCount);
}
代码示例来源:origin: org.apache.lucene/lucene-codecs
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
outputsInner);
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
while(true) {
SimpleTextUtil.readLine(in, scratch);
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
代码示例来源:origin: lintool/warcbase
/**
 * Looks up the id stored in the FST for the given URL.
 *
 * @param url the key to look up
 * @return the stored value narrowed to {@code int}, or -1 when the URL has no
 *     entry or the lookup fails with an {@link IOException}
 */
public int getID(String url) {
  final Long id;
  try {
    id = Util.get(fst, new BytesRef(url));
  } catch (IOException e) {
    // Log the error together with its cause (rather than printing the stack
    // trace to stderr), but assume that the URL doesn't exist.
    LOG.error("Error fetching " + url, e);
    return -1;
  }
  return id == null ? -1 : id.intValue();
}
代码示例来源:origin: org.apache.lucene/lucene-analyzers-common
/**
 * Reads {@code num} conversion lines from the reader and builds an FST that
 * maps the second whitespace-separated field of each line to the third.
 *
 * @param reader source of the conversion lines
 * @param num number of lines to read
 * @return the FST produced by the builder's {@code finish()}
 * @throws IOException if reading from {@code reader} fails
 * @throws ParseException if the input ends before {@code num} lines were
 *     read, or a line does not split into exactly three fields
 * @throws IllegalStateException if the same source string is mapped twice
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // TreeMap iterates its keys in sorted order, so the mappings reach the
  // builder sorted regardless of their order in the input.
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of input; report it as a parse error
      // instead of failing with a NullPointerException on split().
      throw new ParseException("unexpected end of input: expected " + num
          + " conversion lines but found " + i, reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
代码示例来源:origin: harbby/presto-connectors
/**
 * Constructs the final automaton from the entries supplied by the sorter.
 *
 * <p>An entry is added only when it differs from the last one added (tracked
 * in a scratch builder that starts out empty). Returns {@code null} when the
 * sorter yields no entries at all.
 */
private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object noOutput = outputs.getNoOutput();
  final Builder<Object> fstBuilder = new Builder<>(
      FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
      shareMaxTailLength, outputs, false,
      PackedInts.DEFAULT, true, 15);

  final BytesRefBuilder lastAdded = new BytesRefBuilder();
  final IntsRefBuilder inputScratch = new IntsRefBuilder();
  int seen = 0;
  final BytesRefIterator entries = sorter.iterator();
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
    seen++;
    if (lastAdded.get().compareTo(entry) == 0) {
      // Same bytes as the last added entry: skip the duplicate.
      continue;
    }
    fstBuilder.add(Util.toIntsRef(entry, inputScratch), noOutput);
    lastAdded.copyBytes(entry);
  }

  if (seen == 0) {
    return null;
  }
  return fstBuilder.finish();
}
}
内容来源于网络,如有侵权,请联系作者删除!