This article collects code examples for the Java class zemberek.core.collections.Histogram and shows how the Histogram class is used in practice. The examples come mainly from platforms such as GitHub, Stack Overflow, and Maven, extracted from selected projects, and are useful as reference material. Details of the Histogram class are as follows:
Package path: zemberek.core.collections.Histogram
Class name: Histogram
Description: A simple set like data structure for counting unique elements. Not thread safe.
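Before the project examples, here is a minimal usage sketch assembled only from the methods that appear in the examples below; the class and key names are made up for illustration.
import zemberek.core.collections.Histogram;

public class HistogramSketch {
  public static void main(String[] args) {
    Histogram<String> counts = new Histogram<>();
    counts.add("elma");           // add() returns the element's updated count
    counts.add("elma");
    counts.add("armut", 3);       // add with an explicit count
    System.out.println(counts.getCount("elma"));   // 2
    System.out.println(counts.size());             // 2 unique keys
    // getSortedList() returns the keys ordered by their counts.
    for (String key : counts.getSortedList()) {
      System.out.println(key + " " + counts.getCount(key));
    }
  }
}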
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Adds an element and increments its count.
*
* @param t element to add.
* @return the count of the added element.
* @throws NullPointerException if element is null.
*/
public int add(T t) {
return add(t, 1);
}
Code example source: origin: ahmetaa/zemberek-nlp
NormalizationVocabulary(
Path correct,
Path incorrect,
Path maybeIncorrect,
int correctMinCount,
int incorrectMinCount,
int maybeIncorrectMinCount) throws IOException {
Histogram<String> correctWords = Histogram.loadFromUtf8File(correct, ' ');
Histogram<String> noisyWords = Histogram.loadFromUtf8File(incorrect, ' ');
Histogram<String> maybeIncorrectWords = new Histogram<>();
if (maybeIncorrect != null) {
maybeIncorrectWords = Histogram.loadFromUtf8File(maybeIncorrect, ' ');
}
correctWords.removeSmaller(correctMinCount);
noisyWords.removeSmaller(incorrectMinCount);
maybeIncorrectWords.removeSmaller(maybeIncorrectMinCount);
this.noisyWordStart = correctWords.size();
this.words = new ArrayList<>(correctWords.getSortedList());
words.addAll(noisyWords.getSortedList());
this.maybeIncorrectWordStart = words.size();
words.addAll(maybeIncorrectWords.getSortedList());
int i = 0;
for (String word : words) {
indexes.put(word, i);
i++;
}
}
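The constructor above follows a pattern that recurs throughout these examples: load a count file, prune rare keys with removeSmaller, then work with the sorted key list. A condensed, illustrative version (the file name and class name are hypothetical):
import zemberek.core.collections.Histogram;
import java.io.IOException;
import java.nio.file.Paths;

class PruneSketch {
  static void demo() throws IOException {
    // Each line of the file holds a key and its count, separated by the delimiter.
    Histogram<String> words = Histogram.loadFromUtf8File(Paths.get("counts.txt"), ' ');
    words.removeSmaller(3);   // drop keys whose count is less than 3
    for (String w : words.getSortedList()) {
      System.out.println(w + " " + words.getCount(w));
    }
  }
}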
Code example source: origin: ahmetaa/zemberek-nlp
public static void serializeStringHistogram(Histogram<String> h, DataOutputStream dos)
throws IOException {
dos.writeInt(h.size());
for (String key : h.map) {
dos.writeUTF(key);
dos.writeInt(h.getCount(key));
}
}
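A matching reader can mirror these writes. The sketch below is an assumption based only on the layout written above (an int size followed by UTF key / int count pairs) and on the add(key, count) call used elsewhere in these examples; it is not a method from the project.
import zemberek.core.collections.Histogram;
import java.io.DataInputStream;
import java.io.IOException;

class HistogramIoSketch {
  static Histogram<String> deserializeStringHistogram(DataInputStream dis) throws IOException {
    int size = dis.readInt();              // number of key/count pairs
    Histogram<String> h = new Histogram<>(size);
    for (int i = 0; i < size; i++) {
      String key = dis.readUTF();          // key written with writeUTF
      h.add(key, dis.readInt());           // count written with writeInt
    }
    return h;
  }
}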
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Merges another Histogram into this one.
*
* @param otherSet another Histogram
*/
public void add(Histogram<T> otherSet) {
if (otherSet == null) {
throw new NullPointerException("Histogram cannot be null");
}
for (T t : otherSet) {
add(t, otherSet.getCount(t));
}
}
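A small illustrative merge (hypothetical values, not project code):
import zemberek.core.collections.Histogram;

class MergeSketch {
  static void demo() {
    Histogram<String> a = new Histogram<>();
    a.add("ev", 2);
    Histogram<String> b = new Histogram<>();
    b.add("ev", 3);
    b.add("yol");
    a.add(b);   // per-key counts are summed into this histogram
    // a.getCount("ev") == 5, a.getCount("yol") == 1
  }
}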
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Returns keys that both histograms contain.
* @param other Another Histogram
* @return A set of keys that both histograms contain.
*/
public Set<T> getIntersectionOfKeys(Histogram<T> other) {
LinkedHashSet<T> result = new LinkedHashSet<>();
Histogram<T> smaller = other.size() < size() ? other : this;
Histogram<T> larger = smaller == this ? other : this;
for (T t : smaller.getSortedList()) {
if (larger.contains(t)) {
result.add(t);
}
}
return result;
}
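For example (illustrative only), the result keeps the keys present in both histograms regardless of their counts; iterating the smaller histogram keeps the number of contains() lookups proportional to the smaller key set.
import zemberek.core.collections.Histogram;
import java.util.Set;

class IntersectionSketch {
  static void demo() {
    Histogram<String> x = new Histogram<>();
    x.add("bir");
    x.add("iki");
    Histogram<String> y = new Histogram<>();
    y.add("iki", 10);
    y.add("üç");
    Set<String> common = x.getIntersectionOfKeys(y);   // contains only "iki"
    System.out.println(common);
  }
}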
Code example source: origin: ahmetaa/zemberek-nlp
static void getQuestionSuffixes(Path in, Path out) throws IOException {
List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
Histogram<String> endings = new Histogram<>();
for (String splitLine : splitLines) {
String[] tokens = splitLine.split("=");
String s = tokens[1].trim();
String[] t2 = s.split("[ ]");
if (t2.length != 2) {
System.out.println("Problem in " + splitLine);
continue;
}
String suf = t2[1];
if (suf.startsWith("mi") ||
suf.startsWith("mu") ||
suf.startsWith("mı") ||
suf.startsWith("mü")
) {
endings.add(t2[1]);
}
}
for (String ending : endings.getSortedList()) {
System.out.println(ending + " " + endings.getCount(ending));
}
for (String ending : endings.getSortedList()) {
System.out.println(ending);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
public void noParse(String... filename) throws IOException {
Histogram<String> uniques = new Histogram<>(1000000);
int total = 0;
for (String file : filename) {
uniques.add(s);
st.allCounts = (int) uniques.totalCount();
st.allUniques = uniques.size();
for (String s : uniques.getSortedList()) {
int count = uniques.getCount(s);
if (count > 5) {
st.significantCounts += count;
Code example source: origin: ahmetaa/zemberek-nlp
void dataInfo(List<String> lines) {
Log.info("Total lines = " + lines.size());
Histogram<String> hist = new Histogram<>();
lines.stream()
.map(s -> s.substring(0, s.indexOf(' ')))
.forEach(hist::add);
Log.info("Categories :");
for (String s : hist.getSortedList()) {
Log.info(s + " " + hist.getCount(s));
}
}
}
Code example source: origin: ahmetaa/zemberek-nlp
Log.info("Language model = %s", lm.info());
Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
Log.info("%d words loaded.", wordFreq.size());
wordFreq.removeSmaller(minWordCount);
if (minWordCount > 1) {
Log.info("%d words left after removing counts less than %d.",
wordFreq.size(),
minWordCount
);
PrintWriter pwFreq =
new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
for (String word : wordFreq.getSortedList()) {
if (best.score > -7) {
pw.println(word + " = " + best.item);
pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
Code example source: origin: ahmetaa/zemberek-nlp
.collect(Collectors.toList());
Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
Log.info("Collected %d words.", wordAnalyses.size());
LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
for (String sentence : toProcess) {
stems.add(s.getStem());
if (stems.size() > minCount) {
wordAnalyses.add(analysis.getWordAnalysis());
break;
if (wordAnalyses.size() > wordCount) {
break;
Path amb = outRoot.resolve(s + "-amb.txt");
try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
for (WordAnalysis wa : wordAnalyses.getSortedList()) {
pwa.println(wa.getInput());
for (SingleAnalysis analysis : wa) {
Code example source: origin: ahmetaa/zemberek-nlp
static void countTokens(Path... paths) throws IOException {
for (Path path : paths) {
List<String> lines = TextIO.loadLines(path);
Histogram<String> hw = new Histogram<>();
Histogram<String> hl = new Histogram<>();
for (String l : lines) {
for (String s : l.split("[\\s]+")) {
if (s.contains("__label__")) {
if(s.contains("-")) {
Log.warn(l);
}
hl.add(s);
} else {
hw.add(s);
}
}
}
Log.info("There are %d lines, %d words, %d labels in %s",
lines.size(),
hw.size(),
hl.size(),
path);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
String line;
Histogram<String> histogram = new Histogram<>(50000);
SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
int count = 0;
continue;
histogram.add(words);
if (count % 500000 == 0 && count != 0) {
Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
histogram.size());
if (top >= histogram.size()) {
top = histogram.size();
} else {
Log.info("Top %d words will be used.", top);
List<String> mostFrequent = histogram.getTop(top);
Log.info("Coverage: %.3f",
100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
Code example source: origin: ahmetaa/zemberek-nlp
List<String> getEndingsFromVocabulary(List<String> words) {
Histogram<String> endings = new Histogram<>(words.size() / 10);
for (String word : words) {
WordAnalysis analyses = morphology.analyze(word);
for (SingleAnalysis analysis : analyses) {
if (analysis.isUnknown()) {
continue;
}
StemAndEnding se = analysis.getStemAndEnding();
if (se.ending.length() > 0) {
endings.add(se.ending);
}
}
}
return endings.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}
Code example source: origin: ahmetaa/zemberek-nlp
static void multipleLetterRepetitionWords(Path in, Path out) throws IOException {
Histogram<String> noisyWords = Histogram.loadFromUtf8File(in, ' ');
Histogram<String> repetitionWords = new Histogram<>();
for (String w : noisyWords) {
if (w.length() == 1) {
continue;
}
int maxRepetitionCount = 1;
int repetitionCount = 1;
char lastChar = w.charAt(0);
for (int i = 1; i < w.length(); i++) {
char c = w.charAt(i);
if (c == lastChar) {
repetitionCount++;
} else {
if (repetitionCount > maxRepetitionCount) {
maxRepetitionCount = repetitionCount;
}
repetitionCount = 0;
}
lastChar = c;
}
if (maxRepetitionCount > 1) {
repetitionWords.set(w, noisyWords.getCount(w));
}
}
repetitionWords.saveSortedByCounts(out, " ");
}
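A note on the two calls used above, illustrated with hypothetical values: set assigns a count directly instead of accumulating like add, and saveSortedByCounts writes one key and count per line, ordered by count, using the given delimiter.
import zemberek.core.collections.Histogram;
import java.io.IOException;
import java.nio.file.Paths;

class SetVsAddSketch {
  static void demo() throws IOException {
    Histogram<String> h = new Histogram<>();
    h.add("su", 2);    // accumulates: count becomes 2
    h.set("su", 7);    // assigns directly: count becomes 7
    h.saveSortedByCounts(Paths.get("counts.txt"), " ");   // e.g. "su 7"
  }
}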
Code example source: origin: ahmetaa/zemberek-nlp
Histogram<String> parseFails = new Histogram<>();
for (SentenceData sentenceData : set) {
.map(DataConverter::convert)
.collect(Collectors.toList());
parseFails.add(s + " " + p);
parseFails.removeSmaller(3);
parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
Code example source: origin: ahmetaa/zemberek-nlp
public String log() {
List<String> res = new ArrayList<>();
res.add(String.format("Number of sentences = %d", numberOfSentences));
res.add(String.format("Number of tokens = %d", numberOfTokens));
for (String type : typeHistogram.getSortedList()) {
res.add(String.format("Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
type,
typeHistogram.getCount(type),
tokenHistogram.getCount(type),
tokenHistogram.getCount(type) * 1f / typeHistogram.getCount(type)));
}
return String.join("\n", res);
}
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Loads data from the custom serialized file and generates a CharNgramCountModel from it.
*
* @param is InputStream to load data.
* @return a CharNgramCountModel generated from file.
*/
public static CharNgramCountModel load(InputStream is) throws IOException {
try (DataInputStream dis = new DataInputStream(new BufferedInputStream(is))) {
int order = dis.readInt();
String modelId = dis.readUTF();
Histogram<String>[] gramCounts = new Histogram[order + 1];
for (int j = 1; j <= order; j++) {
int size = dis.readInt();
Histogram<String> countSet = new Histogram<>(size * 2);
for (int i = 0; i < size; i++) {
String key = dis.readUTF();
countSet.add(key, dis.readInt());
}
gramCounts[j] = countSet;
}
return new CharNgramCountModel(modelId, order, gramCounts);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
public static Histogram<String> loadFromLines(
List<String> lines,
char delimiter,
boolean keyComesFirst) {
Histogram<String> result = new Histogram<>(lines.size());
for (String s : lines) {
int index = s.indexOf(delimiter);
if (index <= 0) {
throw new IllegalStateException("Bad histogram line = " + s);
}
String item = keyComesFirst ? s.substring(0, index) : s.substring(index + 1);
String countStr = keyComesFirst ? s.substring(index + 1) : s.substring(0, index);
int count = Integer.parseInt(countStr);
result.add(item, count);
}
return result;
}
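An illustrative call (names are hypothetical), parsing lines where the key comes first and a space separates key and count:
import zemberek.core.collections.Histogram;
import java.util.Arrays;
import java.util.List;

class LoadLinesSketch {
  static void demo() {
    List<String> lines = Arrays.asList("elma 12", "armut 5");
    Histogram<String> h = Histogram.loadFromLines(lines, ' ', true);   // key first, count second
    // h.getCount("elma") == 12, h.getCount("armut") == 5
  }
}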
Code example source: origin: ahmetaa/zemberek-nlp
for (Token token : tokens) {
String s = token.getText();
if (local.correct.contains(s) || globalVocabulary.correct.contains(s)) {
local.correct.add(s);
continue;
if (local.incorrect.contains(s) || globalVocabulary.incorrect.contains(s)) {
local.incorrect.add(s);
continue;
token.getType() == TurkishLexer.Emoticon ||
token.getType() == TurkishLexer.Unknown ||
local.ignored.contains(s) ||
globalVocabulary.ignored.contains(s) ||
local.ignored.add(s);
continue;
local.incorrect.add(s);
} else {
local.correct.add(s);
try {
lock.lock();
globalVocabulary.correct.add(local.correct);
globalVocabulary.incorrect.add(local.incorrect);
globalVocabulary.ignored.add(local.ignored);
Log.info("Correct = %d, Incorrect = %d, Ignored = %d",
globalVocabulary.correct.size(),
Code example source: origin: ahmetaa/zemberek-nlp
/**
* @return total count of the items in the input Iterable.
*/
public long totalCount(Iterable<T> it) {
long count = 0;
for (T t : it) {
count += getCount(t);
}
return count;
}
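Combined with getTop, which one of the examples above uses, this makes it easy to compute how much of the total mass the most frequent keys cover; the sketch below follows that example and clamps n to the histogram size as the example does.
import zemberek.core.collections.Histogram;
import java.util.List;

class CoverageSketch {
  // Percentage of all counts covered by the n most frequent keys.
  static double topCoverage(Histogram<String> h, int n) {
    int k = Math.min(n, h.size());
    List<String> top = h.getTop(k);
    return 100d * h.totalCount(top) / h.totalCount();
  }
}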