This article collects code examples for the Java class zemberek.core.collections.Histogram and shows how the Histogram class is used in practice. The examples come mainly from platforms such as GitHub, Stack Overflow, and Maven, extracted from selected projects, and are useful as reference material. Details of the Histogram class are as follows:
Package path: zemberek.core.collections.Histogram
Class name: Histogram
Description: A simple set like data structure for counting unique elements. Not thread safe.
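Before the project examples, here is a minimal usage sketch assembled only from the methods that appear in the examples below; the class and key names are made up for illustration.
import zemberek.core.collections.Histogram;

public class HistogramSketch {
  public static void main(String[] args) {
    Histogram<String> counts = new Histogram<>();
    counts.add("elma");           // add() returns the element's updated count
    counts.add("elma");
    counts.add("armut", 3);       // add with an explicit count
    System.out.println(counts.getCount("elma"));   // 2
    System.out.println(counts.size());             // 2 unique keys
    // getSortedList() returns the keys ordered by their counts.
    for (String key : counts.getSortedList()) {
      System.out.println(key + " " + counts.getCount(key));
    }
  }
}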
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Adds an element and increments its count.
*
* @param t element to add.
* @return the count of the added element.
* @throws NullPointerException if element is null.
*/
public int add(T t) {
return add(t, 1);
}
Code example source: origin: ahmetaa/zemberek-nlp
NormalizationVocabulary(
Path correct,
Path incorrect,
Path maybeIncorrect,
int correctMinCount,
int incorrectMinCount,
int maybeIncorrectMinCount) throws IOException {
Histogram<String> correctWords = Histogram.loadFromUtf8File(correct, ' ');
Histogram<String> noisyWords = Histogram.loadFromUtf8File(incorrect, ' ');
Histogram<String> maybeIncorrectWords = new Histogram<>();
if (maybeIncorrect != null) {
maybeIncorrectWords = Histogram.loadFromUtf8File(maybeIncorrect, ' ');
}
correctWords.removeSmaller(correctMinCount);
noisyWords.removeSmaller(incorrectMinCount);
maybeIncorrectWords.removeSmaller(maybeIncorrectMinCount);
this.noisyWordStart = correctWords.size();
this.words = new ArrayList<>(correctWords.getSortedList());
words.addAll(noisyWords.getSortedList());
this.maybeIncorrectWordStart = words.size();
words.addAll(maybeIncorrectWords.getSortedList());
int i = 0;
for (String word : words) {
indexes.put(word, i);
i++;
}
}
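The constructor above follows a pattern that recurs throughout these examples: load a count file, prune rare keys with removeSmaller, then work with the sorted key list. A condensed, illustrative version (the file name and class name are hypothetical):
import zemberek.core.collections.Histogram;
import java.io.IOException;
import java.nio.file.Paths;

class PruneSketch {
  static void demo() throws IOException {
    // Each line of the file holds a key and its count, separated by the delimiter.
    Histogram<String> words = Histogram.loadFromUtf8File(Paths.get("counts.txt"), ' ');
    words.removeSmaller(3);   // drop keys whose count is less than 3
    for (String w : words.getSortedList()) {
      System.out.println(w + " " + words.getCount(w));
    }
  }
}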
Code example source: origin: ahmetaa/zemberek-nlp
public static void serializeStringHistogram(Histogram<String> h, DataOutputStream dos)
throws IOException {
dos.writeInt(h.size());
for (String key : h.map) {
dos.writeUTF(key);
dos.writeInt(h.getCount(key));
}
}
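A matching reader can mirror these writes. The sketch below is an assumption based only on the layout written above (an int size followed by UTF key / int count pairs) and on the add(key, count) call used elsewhere in these examples; it is not a method from the project.
import zemberek.core.collections.Histogram;
import java.io.DataInputStream;
import java.io.IOException;

class HistogramIoSketch {
  static Histogram<String> deserializeStringHistogram(DataInputStream dis) throws IOException {
    int size = dis.readInt();              // number of key/count pairs
    Histogram<String> h = new Histogram<>(size);
    for (int i = 0; i < size; i++) {
      String key = dis.readUTF();          // key written with writeUTF
      h.add(key, dis.readInt());           // count written with writeInt
    }
    return h;
  }
}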
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Merges another Histogram into this one.
*
* @param otherSet another Histogram
*/
public void add(Histogram<T> otherSet) {
if (otherSet == null) {
throw new NullPointerException("Histogram cannot be null");
}
for (T t : otherSet) {
add(t, otherSet.getCount(t));
}
}
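A small illustrative merge (hypothetical values, not project code):
import zemberek.core.collections.Histogram;

class MergeSketch {
  static void demo() {
    Histogram<String> a = new Histogram<>();
    a.add("ev", 2);
    Histogram<String> b = new Histogram<>();
    b.add("ev", 3);
    b.add("yol");
    a.add(b);   // per-key counts are summed into this histogram
    // a.getCount("ev") == 5, a.getCount("yol") == 1
  }
}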
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Returns keys that both histograms contain.
* @param other Another Histogram
* @return A set of keys that both histograms contain.
*/
public Set<T> getIntersectionOfKeys(Histogram<T> other) {
LinkedHashSet<T> result = new LinkedHashSet<>();
Histogram<T> smaller = other.size() < size() ? other : this;
Histogram<T> larger = smaller == this ? other : this;
for (T t : smaller.getSortedList()) {
if (larger.contains(t)) {
result.add(t);
}
}
return result;
}
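For example (illustrative only), the result keeps the keys present in both histograms regardless of their counts; iterating the smaller histogram keeps the number of contains() lookups proportional to the smaller key set.
import zemberek.core.collections.Histogram;
import java.util.Set;

class IntersectionSketch {
  static void demo() {
    Histogram<String> x = new Histogram<>();
    x.add("bir");
    x.add("iki");
    Histogram<String> y = new Histogram<>();
    y.add("iki", 10);
    y.add("üç");
    Set<String> common = x.getIntersectionOfKeys(y);   // contains only "iki"
    System.out.println(common);
  }
}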
Code example source: origin: ahmetaa/zemberek-nlp
static void getQuestionSuffixes(Path in, Path out) throws IOException {
List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
Histogram<String> endings = new Histogram<>();
for (String splitLine : splitLines) {
String[] tokens = splitLine.split("=");
String s = tokens[1].trim();
String[] t2 = s.split("[ ]");
if (t2.length != 2) {
System.out.println("Problem in " + splitLine);
continue;
}
String suf = t2[1];
if (suf.startsWith("mi") ||
suf.startsWith("mu") ||
suf.startsWith("mı") ||
suf.startsWith("mü")
) {
endings.add(t2[1]);
}
}
for (String ending : endings.getSortedList()) {
System.out.println(ending + " " + endings.getCount(ending));
}
for (String ending : endings.getSortedList()) {
System.out.println(ending);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
public void noParse(String... filename) throws IOException {
Histogram<String> uniques = new Histogram<>(1000000);
int total = 0;
for (String file : filename) {
uniques.add(s);
st.allCounts = (int) uniques.totalCount();
st.allUniques = uniques.size();
for (String s : uniques.getSortedList()) {
int count = uniques.getCount(s);
if (count > 5) {
st.significantCounts += count;
Code example source: origin: ahmetaa/zemberek-nlp
void dataInfo(List<String> lines) {
Log.info("Total lines = " + lines.size());
Histogram<String> hist = new Histogram<>();
lines.stream()
.map(s -> s.substring(0, s.indexOf(' ')))
.forEach(hist::add);
Log.info("Categories :");
for (String s : hist.getSortedList()) {
Log.info(s + " " + hist.getCount(s));
}
}
}
Code example source: origin: ahmetaa/zemberek-nlp
Log.info("Language model = %s", lm.info());
Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
Log.info("%d words loaded.", wordFreq.size());
wordFreq.removeSmaller(minWordCount);
if (minWordCount > 1) {
Log.info("%d words left after removing counts less than %d.",
wordFreq.size(),
minWordCount
);
PrintWriter pwFreq =
new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
for (String word : wordFreq.getSortedList()) {
if (best.score > -7) {
pw.println(word + " = " + best.item);
pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
Code example source: origin: ahmetaa/zemberek-nlp
.collect(Collectors.toList());
Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
Log.info("Collected %d words.", wordAnalyses.size());
LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
for (String sentence : toProcess) {
stems.add(s.getStem());
if (stems.size() > minCount) {
wordAnalyses.add(analysis.getWordAnalysis());
break;
if (wordAnalyses.size() > wordCount) {
break;
Path amb = outRoot.resolve(s + "-amb.txt");
try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
for (WordAnalysis wa : wordAnalyses.getSortedList()) {
pwa.println(wa.getInput());
for (SingleAnalysis analysis : wa) {
Code example source: origin: ahmetaa/zemberek-nlp
static void countTokens(Path... paths) throws IOException {
for (Path path : paths) {
List<String> lines = TextIO.loadLines(path);
Histogram<String> hw = new Histogram<>();
Histogram<String> hl = new Histogram<>();
for (String l : lines) {
for (String s : l.split("[\\s]+")) {
if (s.contains("__label__")) {
if(s.contains("-")) {
Log.warn(l);
}
hl.add(s);
} else {
hw.add(s);
}
}
}
Log.info("There are %d lines, %d words, %d labels in %s",
lines.size(),
hw.size(),
hl.size(),
path);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
String line;
Histogram<String> histogram = new Histogram<>(50000);
SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
int count = 0;
continue;
histogram.add(words);
if (count % 500000 == 0 && count != 0) {
Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
histogram.size());
if (top >= histogram.size()) {
top = histogram.size();
} else {
Log.info("Top %d words will be used.", top);
List<String> mostFrequent = histogram.getTop(top);
Log.info("Coverage: %.3f",
100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
Code example source: origin: ahmetaa/zemberek-nlp
List<String> getEndingsFromVocabulary(List<String> words) {
Histogram<String> endings = new Histogram<>(words.size() / 10);
for (String word : words) {
WordAnalysis analyses = morphology.analyze(word);
for (SingleAnalysis analysis : analyses) {
if (analysis.isUnknown()) {
continue;
}
StemAndEnding se = analysis.getStemAndEnding();
if (se.ending.length() > 0) {
endings.add(se.ending);
}
}
}
return endings.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}
Code example source: origin: ahmetaa/zemberek-nlp
static void multipleLetterRepetitionWords(Path in, Path out) throws IOException {
Histogram<String> noisyWords = Histogram.loadFromUtf8File(in, ' ');
Histogram<String> repetitionWords = new Histogram<>();
for (String w : noisyWords) {
if (w.length() == 1) {
continue;
}
int maxRepetitionCount = 1;
int repetitionCount = 1;
char lastChar = w.charAt(0);
for (int i = 1; i < w.length(); i++) {
char c = w.charAt(i);
if (c == lastChar) {
repetitionCount++;
} else {
if (repetitionCount > maxRepetitionCount) {
maxRepetitionCount = repetitionCount;
}
repetitionCount = 0;
}
lastChar = c;
}
if (maxRepetitionCount > 1) {
repetitionWords.set(w, noisyWords.getCount(w));
}
}
repetitionWords.saveSortedByCounts(out, " ");
}
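A note on the two calls used above, illustrated with hypothetical values: set assigns a count directly instead of accumulating like add, and saveSortedByCounts writes one key and count per line, ordered by count, using the given delimiter.
import zemberek.core.collections.Histogram;
import java.io.IOException;
import java.nio.file.Paths;

class SetVsAddSketch {
  static void demo() throws IOException {
    Histogram<String> h = new Histogram<>();
    h.add("su", 2);    // accumulates: count becomes 2
    h.set("su", 7);    // assigns directly: count becomes 7
    h.saveSortedByCounts(Paths.get("counts.txt"), " ");   // e.g. "su 7"
  }
}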
Code example source: origin: ahmetaa/zemberek-nlp
Histogram<String> parseFails = new Histogram<>();
for (SentenceData sentenceData : set) {
.map(DataConverter::convert)
.collect(Collectors.toList());
parseFails.add(s + " " + p);
parseFails.removeSmaller(3);
parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
Code example source: origin: ahmetaa/zemberek-nlp
public String log() {
List<String> res = new ArrayList<>();
res.add(String.format("Number of sentences = %d", numberOfSentences));
res.add(String.format("Number of tokens = %d", numberOfTokens));
for (String type : typeHistogram.getSortedList()) {
res.add(String.format("Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
type,
typeHistogram.getCount(type),
tokenHistogram.getCount(type),
tokenHistogram.getCount(type) * 1f / typeHistogram.getCount(type)));
}
return String.join("\n", res);
}
Code example source: origin: ahmetaa/zemberek-nlp
/**
* Loads data from the custom serialized file and generates a CharNgramCountModel from it.
*
* @param is InputStream to load data.
* @return a CharNgramCountModel generated from file.
*/
public static CharNgramCountModel load(InputStream is) throws IOException {
try (DataInputStream dis = new DataInputStream(new BufferedInputStream(is))) {
int order = dis.readInt();
String modelId = dis.readUTF();
Histogram<String>[] gramCounts = new Histogram[order + 1];
for (int j = 1; j <= order; j++) {
int size = dis.readInt();
Histogram<String> countSet = new Histogram<>(size * 2);
for (int i = 0; i < size; i++) {
String key = dis.readUTF();
countSet.add(key, dis.readInt());
}
gramCounts[j] = countSet;
}
return new CharNgramCountModel(modelId, order, gramCounts);
}
}
Code example source: origin: ahmetaa/zemberek-nlp
public static Histogram<String> loadFromLines(
List<String> lines,
char delimiter,
boolean keyComesFirst) {
Histogram<String> result = new Histogram<>(lines.size());
for (String s : lines) {
int index = s.indexOf(delimiter);
if (index <= 0) {
throw new IllegalStateException("Bad histogram line = " + s);
}
String item = keyComesFirst ? s.substring(0, index) : s.substring(index + 1);
String countStr = keyComesFirst ? s.substring(index + 1) : s.substring(0, index);
int count = Integer.parseInt(countStr);
result.add(item, count);
}
return result;
}
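An illustrative call (names are hypothetical), parsing lines where the key comes first and a space separates key and count:
import zemberek.core.collections.Histogram;
import java.util.Arrays;
import java.util.List;

class LoadLinesSketch {
  static void demo() {
    List<String> lines = Arrays.asList("elma 12", "armut 5");
    Histogram<String> h = Histogram.loadFromLines(lines, ' ', true);   // key first, count second
    // h.getCount("elma") == 12, h.getCount("armut") == 5
  }
}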
Code example source: origin: ahmetaa/zemberek-nlp
for (Token token : tokens) {
String s = token.getText();
if (local.correct.contains(s) || globalVocabulary.correct.contains(s)) {
local.correct.add(s);
continue;
if (local.incorrect.contains(s) || globalVocabulary.incorrect.contains(s)) {
local.incorrect.add(s);
continue;
token.getType() == TurkishLexer.Emoticon ||
token.getType() == TurkishLexer.Unknown ||
local.ignored.contains(s) ||
globalVocabulary.ignored.contains(s) ||
local.ignored.add(s);
continue;
local.incorrect.add(s);
} else {
local.correct.add(s);
try {
lock.lock();
globalVocabulary.correct.add(local.correct);
globalVocabulary.incorrect.add(local.incorrect);
globalVocabulary.ignored.add(local.ignored);
Log.info("Correct = %d, Incorrect = %d, Ignored = %d",
globalVocabulary.correct.size(),
Code example source: origin: ahmetaa/zemberek-nlp
/**
* @return total count of the items in the input Iterable.
*/
public long totalCount(Iterable<T> it) {
long count = 0;
for (T t : it) {
count += getCount(t);
}
return count;
}
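Combined with getTop, which one of the examples above uses, this makes it easy to compute how much of the total mass the most frequent keys cover; the sketch below follows that example and clamps n to the histogram size as the example does.
import zemberek.core.collections.Histogram;
import java.util.List;

class CoverageSketch {
  // Percentage of all counts covered by the n most frequent keys.
  static double topCoverage(Histogram<String> h, int n) {
    int k = Math.min(n, h.size());
    List<String> top = h.getTop(k);
    return 100d * h.totalCount(top) / h.totalCount();
  }
}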