Usage of the zemberek.core.collections.Histogram class, with code examples


This article collects a number of code examples for the Java class zemberek.core.collections.Histogram and shows how the class is used. The examples are mainly taken from selected projects on GitHub/Stack Overflow/Maven and should serve as useful references. Details of the Histogram class:
Package: zemberek.core.collections
Class name: Histogram

About Histogram

A simple set-like data structure for counting unique elements. Not thread safe.
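
As a quick orientation before the examples, here is a minimal usage sketch assembled only from methods that appear in the excerpts below (add, getCount, totalCount, removeSmaller, getSortedList). It is an illustrative sketch, not code taken from the zemberek-nlp project:

import zemberek.core.collections.Histogram;

public class HistogramQuickStart {

  public static void main(String[] args) {
    Histogram<String> counts = new Histogram<>();
    counts.add("elma");        // count of "elma" becomes 1
    counts.add("elma");        // count of "elma" becomes 2
    counts.add("armut", 5);    // add with an explicit count
    System.out.println(counts.getCount("elma"));  // 2
    System.out.println(counts.totalCount());      // 7
    counts.removeSmaller(2);   // drop every key whose count is below 2
    // keys ordered by descending count
    for (String key : counts.getSortedList()) {
      System.out.println(key + " " + counts.getCount(key));
    }
  }
}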

Code examples

Code example source: ahmetaa/zemberek-nlp

/**
 * Adds an element and increments its count.
 *
 * @param t element to add.
 * @return the count of the added element.
 * @throws NullPointerException if element is null.
 */
public int add(T t) {
 return add(t, 1);
}

Code example source: ahmetaa/zemberek-nlp

NormalizationVocabulary(
  Path correct,
  Path incorrect,
  Path maybeIncorrect,
  int correctMinCount,
  int incorrectMinCount,
  int maybeIncorrectMinCount) throws IOException {
 Histogram<String> correctWords = Histogram.loadFromUtf8File(correct, ' ');
 Histogram<String> noisyWords = Histogram.loadFromUtf8File(incorrect, ' ');
 Histogram<String> maybeIncorrectWords = new Histogram<>();
 if (maybeIncorrect != null) {
  maybeIncorrectWords = Histogram.loadFromUtf8File(maybeIncorrect, ' ');
 }
 correctWords.removeSmaller(correctMinCount);
 noisyWords.removeSmaller(incorrectMinCount);
 maybeIncorrectWords.removeSmaller(maybeIncorrectMinCount);
 this.noisyWordStart = correctWords.size();
 this.words = new ArrayList<>(correctWords.getSortedList());
 words.addAll(noisyWords.getSortedList());
 this.maybeIncorrectWordStart = words.size();
 words.addAll(maybeIncorrectWords.getSortedList());
 int i = 0;
 for (String word : words) {
  indexes.put(word, i);
  i++;
 }
}

Code example source: ahmetaa/zemberek-nlp

public static void serializeStringHistogram(Histogram<String> h, DataOutputStream dos)
  throws IOException {
 dos.writeInt(h.size());
 for (String key : h.map) {
  dos.writeUTF(key);
  dos.writeInt(h.getCount(key));
 }
}
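
For completeness, a matching deserialization sketch is shown below. It is not code from the project; it simply assumes the layout written by serializeStringHistogram above (an int size followed by size UTF-string/int pairs) and rebuilds the histogram with add(key, count):

public static Histogram<String> deserializeStringHistogram(DataInputStream dis)
    throws IOException {
  int size = dis.readInt();
  Histogram<String> result = new Histogram<>(size);
  for (int i = 0; i < size; i++) {
    String key = dis.readUTF();   // key written by writeUTF
    int count = dis.readInt();    // count written by writeInt
    result.add(key, count);
  }
  return result;
}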

Code example source: ahmetaa/zemberek-nlp

/**
 * Merges another Histogram into this one.
 *
 * @param otherSet another Histogram
 */
public void add(Histogram<T> otherSet) {
 if (otherSet == null) {
  throw new NullPointerException("Histogram cannot be null");
 }
 for (T t : otherSet) {
  add(t, otherSet.getCount(t));
 }
}
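
A tiny hypothetical fragment (not from the project) showing the merge; counts for keys present in both histograms are summed:

Histogram<String> daily = new Histogram<>();
daily.add("elma", 2);
Histogram<String> total = new Histogram<>();
total.add("elma", 3);
total.add(daily);               // merge daily counts into total
// total.getCount("elma") == 5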

Code example source: ahmetaa/zemberek-nlp

/**
 * Returns the keys that both histograms contain.
 * @param other Another Histogram
 * @return A set of keys that both histograms contain.
 */
public Set<T> getIntersectionOfKeys(Histogram<T> other) {
 LinkedHashSet<T> result = new LinkedHashSet<>();
 Histogram<T> smaller = other.size() < size() ? other : this;
 Histogram<T> larger = smaller == this ? other : this;
 for (T t : smaller.getSortedList()) {
  if (larger.contains(t)) {
   result.add(t);
  }
 }
 return result;
}
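
A small, hypothetical usage fragment (not from the project) illustrates the method; the returned set can be iterated together with getCount on both histograms:

Histogram<String> a = new Histogram<>();
a.add("kitap", 3);
a.add("kalem", 1);
Histogram<String> b = new Histogram<>();
b.add("kitap", 2);
b.add("defter", 4);
for (String key : a.getIntersectionOfKeys(b)) {
  // prints: kitap 3 2
  System.out.println(key + " " + a.getCount(key) + " " + b.getCount(key));
}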

Code example source: ahmetaa/zemberek-nlp

static void getQuestionSuffixes(Path in, Path out) throws IOException {
 List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
 Histogram<String> endings = new Histogram<>();
 for (String splitLine : splitLines) {
  String[] tokens = splitLine.split("=");
  String s = tokens[1].trim();
  String[] t2 = s.split("[ ]");
  if (t2.length != 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String suf = t2[1];
  if (suf.startsWith("mi") ||
    suf.startsWith("mu") ||
    suf.startsWith("mı") ||
    suf.startsWith("mü")
  ) {
   endings.add(t2[1]);
  }
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending + " " + endings.getCount(ending));
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending);
 }
}

Code example source: ahmetaa/zemberek-nlp

public void noParse(String... filename) throws IOException {
  Histogram<String> uniques = new Histogram<>(1000000);
  int total = 0;
  for (String file : filename) {
    // ... (reading and tokenizing the file into `s` is elided in the original excerpt)
    uniques.add(s);
  }
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (count > 5) {
      st.significantCounts += count;
    }
  }
}

Code example source: ahmetaa/zemberek-nlp

void dataInfo(List<String> lines) {
  Log.info("Total lines = " + lines.size());
  Histogram<String> hist = new Histogram<>();
  lines.stream()
    .map(s -> s.substring(0, s.indexOf(' ')))
    .forEach(hist::add);
  Log.info("Categories :");
  for (String s : hist.getSortedList()) {
   Log.info(s + " " + hist.getCount(s));
  }
 }
}

Code example source: ahmetaa/zemberek-nlp

Log.info("Language model = %s", lm.info());
Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
Log.info("%d words loaded.", wordFreq.size());
wordFreq.removeSmaller(minWordCount);
if (minWordCount > 1) {
 Log.info("%d words left after removing counts less than %d.",
   wordFreq.size(),
   minWordCount
 );
  PrintWriter pwFreq =
    new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
 for (String word : wordFreq.getSortedList()) {
   if (best.score > -7) {
    pw.println(word + " = " + best.item);
    pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));

Code example source: ahmetaa/zemberek-nlp

// ... (a stream pipeline producing `lines` is elided in the original excerpt)
    .collect(Collectors.toList());
Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
Log.info("Collected %d words.", wordAnalyses.size());
LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
for (String sentence : toProcess) {
  // ... (morphological analysis producing `s` and `analysis` is elided in the original excerpt)
  stems.add(s.getStem());
  if (stems.size() > minCount) {
    wordAnalyses.add(analysis.getWordAnalysis());
    break;
  }
  if (wordAnalyses.size() > wordCount) {
    break;
  }
}
Path amb = outRoot.resolve(s + "-amb.txt");
try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
  for (WordAnalysis wa : wordAnalyses.getSortedList()) {
    pwa.println(wa.getInput());
    for (SingleAnalysis analysis : wa) {
      // ... (handling of each analysis is elided in the original excerpt)
    }
  }
}

Code example source: ahmetaa/zemberek-nlp

static void countTokens(Path... paths) throws IOException {
 for (Path path : paths) {
  List<String> lines = TextIO.loadLines(path);
  Histogram<String> hw = new Histogram<>();
  Histogram<String> hl = new Histogram<>();
  for (String l : lines) {
   for (String s : l.split("[\\s]+")) {
    if (s.contains("__label__")) {
     if(s.contains("-")) {
      Log.warn(l);
     }
     hl.add(s);
    } else {
     hw.add(s);
    }
   }
  }
  Log.info("There are %d lines, %d words, %d labels in %s",
    lines.size(),
    hw.size(),
    hl.size(),
    path);
 }
}

Code example source: ahmetaa/zemberek-nlp

// The enclosing reader type is elided in the excerpt; a BufferedReader is assumed here.
try (BufferedReader reader = new BufferedReader(
    new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
  String line;
  Histogram<String> histogram = new Histogram<>(50000);
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int count = 0;
  while ((line = reader.readLine()) != null) {
    // ... (tokenization into `words` and an empty-line guard with `continue`
    //      are elided in the original excerpt)
    histogram.add(words);
    count++;
    if (count % 500000 == 0 && count != 0) {
      Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
    }
  }
  // ... (a summary Log.info with histogram.size() is elided in the original excerpt)
  if (top >= histogram.size()) {
    top = histogram.size();
  } else {
    Log.info("Top %d words will be used.", top);
  }
  List<String> mostFrequent = histogram.getTop(top);
  Log.info("Coverage: %.3f",
      100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
}

Code example source: ahmetaa/zemberek-nlp

List<String> getEndingsFromVocabulary(List<String> words) {
 Histogram<String> endings = new Histogram<>(words.size() / 10);
 for (String word : words) {
  WordAnalysis analyses = morphology.analyze(word);
  for (SingleAnalysis analysis : analyses) {
   if (analysis.isUnknown()) {
    continue;
   }
   StemAndEnding se = analysis.getStemAndEnding();
   if (se.ending.length() > 0) {
    endings.add(se.ending);
   }
  }
 }
 return endings.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}

Code example source: ahmetaa/zemberek-nlp

static void multipleLetterRepetitionWords(Path in, Path out) throws IOException {
 Histogram<String> noisyWords = Histogram.loadFromUtf8File(in, ' ');
 Histogram<String> repetitionWords = new Histogram<>();
 for (String w : noisyWords) {
  if (w.length() == 1) {
   continue;
  }
  int maxRepetitionCount = 1;
  int repetitionCount = 1;
  char lastChar = w.charAt(0);
  for (int i = 1; i < w.length(); i++) {
   char c = w.charAt(i);
   if (c == lastChar) {
    repetitionCount++;
   } else {
    if (repetitionCount > maxRepetitionCount) {
     maxRepetitionCount = repetitionCount;
    }
    repetitionCount = 0;
   }
   lastChar = c;
  }
  if (maxRepetitionCount > 1) {
   repetitionWords.set(w, noisyWords.getCount(w));
  }
 }
 repetitionWords.saveSortedByCounts(out, " ");
}

Code example source: ahmetaa/zemberek-nlp

Histogram<String> parseFails = new Histogram<>();
for (SentenceData sentenceData : set) {
  // ... (lines elided in the original excerpt; a stream over the sentence is converted:)
  //        .map(DataConverter::convert)
  //        .collect(Collectors.toList());
  parseFails.add(s + " " + p);
}
parseFails.removeSmaller(3);
parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");

Code example source: ahmetaa/zemberek-nlp

public String log() {
 List<String> res = new ArrayList<>();
 res.add(String.format("Number of sentences      = %d", numberOfSentences));
 res.add(String.format("Number of tokens         = %d", numberOfTokens));
 for (String type : typeHistogram.getSortedList()) {
  res.add(String.format("Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
    type,
    typeHistogram.getCount(type),
    tokenHistogram.getCount(type),
    tokenHistogram.getCount(type) * 1f / typeHistogram.getCount(type)));
 }
 return String.join("\n", res);
}

Code example source: ahmetaa/zemberek-nlp

/**
 * Loads data from the custom serialized file and generates a CharNgramCountModel from it.
 *
 * @param is InputStream to load data.
 * @return a CharNgramCountModel generated from file.
 */
public static CharNgramCountModel load(InputStream is) throws IOException {
 try (DataInputStream dis = new DataInputStream(new BufferedInputStream(is))) {
  int order = dis.readInt();
  String modelId = dis.readUTF();
  Histogram<String>[] gramCounts = new Histogram[order + 1];
  for (int j = 1; j <= order; j++) {
   int size = dis.readInt();
   Histogram<String> countSet = new Histogram<>(size * 2);
   for (int i = 0; i < size; i++) {
    String key = dis.readUTF();
    countSet.add(key, dis.readInt());
   }
   gramCounts[j] = countSet;
  }
  return new CharNgramCountModel(modelId, order, gramCounts);
 }
}

Code example source: ahmetaa/zemberek-nlp

public static Histogram<String> loadFromLines(
  List<String> lines,
  char delimiter,
  boolean keyComesFirst) {
 Histogram<String> result = new Histogram<>(lines.size());
 for (String s : lines) {
  int index = s.indexOf(delimiter);
  if (index <= 0) {
   throw new IllegalStateException("Bad histogram line = " + s);
  }
  String item = keyComesFirst ? s.substring(0, index) : s.substring(index + 1);
  String countStr = keyComesFirst ? s.substring(index + 1) : s.substring(0, index);
  int count = Integer.parseInt(countStr);
  result.add(item, count);
 }
 return result;
}
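
The keyComesFirst flag selects which side of the delimiter holds the key. A short, hypothetical call (not from the project; assumes java.util.Arrays and java.util.List are imported) for lines of the form "word count":

List<String> lines = Arrays.asList("elma 10", "armut 4");
Histogram<String> h = Histogram.loadFromLines(lines, ' ', true); // key first, count second
// h.getCount("elma") == 10, h.getCount("armut") == 4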

Code example source: ahmetaa/zemberek-nlp

for (Token token : tokens) {
  String s = token.getText();
  if (local.correct.contains(s) || globalVocabulary.correct.contains(s)) {
    local.correct.add(s);
    continue;
  }
  if (local.incorrect.contains(s) || globalVocabulary.incorrect.contains(s)) {
    local.incorrect.add(s);
    continue;
  }
  // ... (further conditions are elided in the original excerpt)
  if (token.getType() == TurkishLexer.Emoticon ||
      token.getType() == TurkishLexer.Unknown ||
      local.ignored.contains(s) ||
      globalVocabulary.ignored.contains(s)) {
    local.ignored.add(s);
    continue;
  }
  // The check that decides between incorrect and correct is elided in the original excerpt.
  boolean consideredIncorrect = false; // hypothetical placeholder for the elided check
  if (consideredIncorrect) {
    local.incorrect.add(s);
  } else {
    local.correct.add(s);
  }
}
try {
  lock.lock();
  globalVocabulary.correct.add(local.correct);
  globalVocabulary.incorrect.add(local.incorrect);
  globalVocabulary.ignored.add(local.ignored);
  Log.info("Correct = %d, Incorrect = %d, Ignored = %d",
      globalVocabulary.correct.size(),
      globalVocabulary.incorrect.size(),
      globalVocabulary.ignored.size());
} finally {
  lock.unlock();
}

Code example source: ahmetaa/zemberek-nlp

/**
 * @return total count of the items in the input Iterable.
 */
public long totalCount(Iterable<T> it) {
 long count = 0;
 for (T t : it) {
  count += getCount(t);
 }
 return count;
}
