zemberek.core.collections.Histogram.getSortedList()方法的使用及代码示例

x33g5p2x  于2022-01-20 转载在 其他  
字(9.1k)|赞(0)|评价(0)|浏览(123)

本文整理了Java中zemberek.core.collections.Histogram.getSortedList()方法的一些代码示例,展示了Histogram.getSortedList()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,希望能在一定程度上对你有所帮助。Histogram.getSortedList()方法的具体详情如下:
包路径:zemberek.core.collections.Histogram
类名称:Histogram
方法名:getSortedList

Histogram.getSortedList介绍

[英]Returns the elements in a list sorted by count, descending.
[中]以列表形式返回所有元素,并按计数降序排序。

代码示例

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Returns the keys of the histogram stored at {@code gramCounts[order]}, in the order produced
 * by that histogram's {@code getSortedList()} (by count, descending).
 *
 * @param order index into {@code gramCounts}
 * @return the sorted keys, exposed as an {@link Iterable}
 */
public Iterable<String> getSortedKeyIterator(int order) {
 final Iterable<String> keysByCount = gramCounts[order].getSortedList();
 return keysByCount;
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Writes every key of this histogram to a UTF-8 text file, one key per line, in the order
 * returned by {@code getSortedList()} (by count, descending). Each line has the form
 * {@code key + delimiter + count}.
 *
 * @param path file to write
 * @param delimiter text placed between the key and its count
 * @throws IOException if the file cannot be opened for writing
 */
public void saveSortedByCounts(Path path, String delimiter)
  throws IOException {
 final String charsetName = StandardCharsets.UTF_8.name();
 try (PrintWriter writer = new PrintWriter(path.toFile(), charsetName)) {
  for (T key : getSortedList()) {
   writer.println(key + delimiter + getCount(key));
  }
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Writes every key of this histogram to a UTF-8 text file, one key per line, in the order
 * returned by {@code getSortedList(comparator)}. Each line has the form
 * {@code key + delimiter + count}.
 *
 * @param path file to write
 * @param delimiter text placed between the key and its count
 * @param comparator passed to {@code getSortedList(Comparator)}; presumably defines key order
 * @throws IOException if the file cannot be opened for writing
 */
public void saveSortedByKeys(Path path, String delimiter, Comparator<T> comparator)
  throws IOException {
 final String charsetName = StandardCharsets.UTF_8.name();
 try (PrintWriter writer = new PrintWriter(path.toFile(), charsetName)) {
  for (T key : getSortedList(comparator)) {
   writer.println(key + delimiter + getCount(key));
  }
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Builds a multi-line, human-readable report: the sentence and token totals followed by one
 * line per entry of {@code typeHistogram} (in {@code getSortedList()} order) showing its type
 * count, its token count and the average tokens per type.
 *
 * @return the report lines joined with {@code '\n'} (no trailing newline)
 */
public String log() {
 StringBuilder report = new StringBuilder();
 report.append(String.format("Number of sentences      = %d", numberOfSentences));
 report.append('\n');
 report.append(String.format("Number of tokens         = %d", numberOfTokens));
 for (String type : typeHistogram.getSortedList()) {
  int typeCount = typeHistogram.getCount(type);
  int tokenCount = tokenHistogram.getCount(type);
  // Multiplying by 1f forces floating-point division.
  float averageTokens = tokenCount * 1f / typeCount;
  report.append('\n');
  report.append(String.format(
    "Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
    type, typeCount, tokenCount, averageTokens));
 }
 return report.toString();
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Reads {@code in} as UTF-8 lines of the form {@code left=right}, takes the second
 * space-separated token of the trimmed right-hand side, and counts those tokens that start
 * with "mi", "mu", "mı" or "mü". It then prints every counted ending with its count, followed
 * by the endings alone, both in {@code getSortedList()} order (by count, descending).
 *
 * <p>Lines that contain no {@code '='}, or whose right-hand side is not exactly two
 * space-separated tokens, are reported on standard output and skipped.
 *
 * @param in input file
 * @param out not used by this method
 * @throws IOException if {@code in} cannot be read
 */
static void getQuestionSuffixes(Path in, Path out) throws IOException {
 List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
 Histogram<String> endings = new Histogram<>();
 for (String splitLine : splitLines) {
  String[] tokens = splitLine.split("=");
  // A line without '=' (or with nothing after it) has no tokens[1]; report it the same way
  // as other malformed lines instead of failing with ArrayIndexOutOfBoundsException.
  if (tokens.length < 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String[] t2 = tokens[1].trim().split("[ ]");
  if (t2.length != 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String suf = t2[1];
  if (suf.startsWith("mi") ||
    suf.startsWith("mu") ||
    suf.startsWith("mı") ||
    suf.startsWith("mü")
  ) {
   endings.add(suf);
  }
 }
 // The histogram is not modified between the two print passes, so sort once.
 List<String> sortedEndings = endings.getSortedList();
 for (String ending : sortedEndings) {
  System.out.println(ending + " " + endings.getCount(ending));
 }
 for (String ending : sortedEndings) {
  System.out.println(ending);
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the number of lines, then logs each distinct category with its occurrence count. A
 * line's category is the text before its first space. Categories are logged in
 * {@code getSortedList()} order (by count, descending).
 *
 * @param lines input lines
 */
void dataInfo(List<String> lines) {
  Log.info("Total lines = " + lines.size());
  Histogram<String> hist = new Histogram<>();
  for (String line : lines) {
    int firstSpace = line.indexOf(' ');
    // A line with no space would make substring(0, -1) throw
    // StringIndexOutOfBoundsException; count the whole line as its category instead.
    hist.add(firstSpace < 0 ? line : line.substring(0, firstSpace));
  }
  Log.info("Categories :");
  for (String s : hist.getSortedList()) {
    Log.info(s + " " + hist.getCount(s));
  }
}
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the number of lines, then logs each distinct category with its occurrence count. A
 * line's category is the text before its first space. Categories are logged in
 * {@code getSortedList()} order (by count, descending).
 *
 * @param lines input lines
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int firstSpace = line.indexOf(' ');
  // A line with no space would make substring(0, -1) throw
  // StringIndexOutOfBoundsException; count the whole line as its category instead.
  hist.add(firstSpace < 0 ? line : line.substring(0, firstSpace));
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the number of lines, then logs each distinct category with its occurrence count. A
 * line's category is the text before its first space. Categories are logged in
 * {@code getSortedList()} order (by count, descending).
 *
 * @param lines input lines
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int firstSpace = line.indexOf(' ');
  // A line with no space would make substring(0, -1) throw
  // StringIndexOutOfBoundsException; count the whole line as its category instead.
  hist.add(firstSpace < 0 ? line : line.substring(0, firstSpace));
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the number of lines, then logs each distinct category with its occurrence count. A
 * line's category is the text before its first space. Categories are logged in
 * {@code getSortedList()} order (by count, descending).
 *
 * @param lines input lines
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int firstSpace = line.indexOf(' ');
  // A line with no space would make substring(0, -1) throw
  // StringIndexOutOfBoundsException; count the whole line as its category instead.
  hist.add(firstSpace < 0 ? line : line.substring(0, firstSpace));
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Scans the regular files directly under {@code root} and counts occurrences of characters in
 * the range U+0300..U+036F and of characters contained in {@code Scripts.undesiredChars},
 * using the sentences returned by {@code getSentences(file)}. It prints each file path as it
 * is processed, then prints every counted character as {@code <hex code point> <count>} in
 * {@code getSortedList()} order (by count, descending).
 *
 * @param root directory whose immediate files are scanned (walk depth 1)
 * @throws IOException if the directory cannot be walked, or if thrown by {@code getSentences}
 */
private static void checkWeirdChars(Path root) throws IOException {
 List<Path> files;
 // Files.walk keeps directory handles open until the stream is closed, so close it as soon
 // as the file list has been collected.
 try (java.util.stream.Stream<Path> walk = Files.walk(root, 1)) {
  files = walk.filter(s -> s.toFile().isFile())
    .collect(Collectors.toList());
 }
 Histogram<String> chars = new Histogram<>();
 for (Path file : files) {
  System.out.println(file);
  LinkedHashSet<String> sentences = getSentences(file);
  for (String sentence : sentences) {
   for (int i = 0; i < sentence.length(); i++) {
    char c = sentence.charAt(i);
    if (c >= 0x300 && c <= 0x036f) {
     chars.add(String.valueOf(c));
    }
    // NOTE(review): a character that is both in the range above and in undesiredChars is
    // counted twice; confirm whether that is intended.
    if (Scripts.undesiredChars.contains(c)) {
     chars.add(String.valueOf(c));
    }
   }
  }
 }
 for (String s : chars.getSortedList()) {
  System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Returns the keys that are present in both this histogram and {@code other}.
 *
 * <p>The histogram with fewer keys is the one iterated (in its {@code getSortedList()} order),
 * and each of its keys is looked up in the other one; the result keeps that iteration order.
 *
 * @param other another histogram
 * @return the set of keys contained in both histograms
 */
public Set<T> getIntersectionOfKeys(Histogram<T> other) {
 final Histogram<T> iterated;
 final Histogram<T> probed;
 if (other.size() < size()) {
  iterated = other;
  probed = this;
 } else {
  iterated = this;
  probed = other;
 }
 LinkedHashSet<T> common = new LinkedHashSet<>();
 for (T key : iterated.getSortedList()) {
  if (!probed.contains(key)) {
   continue;
  }
  common.add(key);
 }
 return common;
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Builds the vocabulary from up to three word-count files.
 *
 * <p>Each file is loaded with {@code Histogram.loadFromUtf8File(path, ' ')}, and
 * {@code removeSmaller} is applied to each histogram with its respective minimum count. The
 * word list is then the correct words, followed by the noisy words, followed by the
 * maybe-incorrect words, each group in {@code getSortedList()} order. {@code noisyWordStart}
 * and {@code maybeIncorrectWordStart} record where the second and third groups begin, and
 * {@code indexes} maps each word to its position in the list.
 *
 * @param correct file of correct words
 * @param incorrect file of noisy words
 * @param maybeIncorrect file of possibly incorrect words; may be {@code null}, in which case
 *     that group is empty
 * @param correctMinCount value passed to {@code removeSmaller} for the correct words
 * @param incorrectMinCount value passed to {@code removeSmaller} for the noisy words
 * @param maybeIncorrectMinCount value passed to {@code removeSmaller} for the maybe-incorrect
 *     words
 * @throws IOException if thrown while loading a file
 */
NormalizationVocabulary(
  Path correct,
  Path incorrect,
  Path maybeIncorrect,
  int correctMinCount,
  int incorrectMinCount,
  int maybeIncorrectMinCount) throws IOException {
 Histogram<String> correctHist = Histogram.loadFromUtf8File(correct, ' ');
 Histogram<String> noisyHist = Histogram.loadFromUtf8File(incorrect, ' ');
 Histogram<String> maybeHist;
 if (maybeIncorrect == null) {
  maybeHist = new Histogram<>();
 } else {
  maybeHist = Histogram.loadFromUtf8File(maybeIncorrect, ' ');
 }

 correctHist.removeSmaller(correctMinCount);
 noisyHist.removeSmaller(incorrectMinCount);
 maybeHist.removeSmaller(maybeIncorrectMinCount);

 // The noisy group starts right after the (already pruned) correct words.
 this.noisyWordStart = correctHist.size();
 this.words = new ArrayList<>(correctHist.getSortedList());
 words.addAll(noisyHist.getSortedList());
 this.maybeIncorrectWordStart = words.size();
 words.addAll(maybeHist.getSortedList());

 int position = 0;
 for (String word : words) {
  indexes.put(word, position++);
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Collects the non-empty endings of all known analyses of the given words.
 *
 * <p>Each word is analysed with {@code morphology.analyze}; analyses flagged as unknown are
 * ignored. The endings are returned via
 * {@code getSortedList(Turkish.STRING_COMPARATOR_ASC)}.
 *
 * @param words words to analyse
 * @return the distinct endings, as returned by the comparator-based sorted list
 */
List<String> getEndingsFromVocabulary(List<String> words) {
 // Initial size passed to the Histogram constructor: one tenth of the word count.
 Histogram<String> endingCounts = new Histogram<>(words.size() / 10);
 for (String word : words) {
  for (SingleAnalysis analysis : morphology.analyze(word)) {
   if (!analysis.isUnknown()) {
    StemAndEnding stemAndEnding = analysis.getStemAndEnding();
    if (stemAndEnding.ending.length() > 0) {
     endingCounts.add(stemAndEnding.ending);
    }
   }
  }
 }
 return endingCounts.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}

代码示例来源:origin: ahmetaa/zemberek-nlp

st.allCounts = (int) uniques.totalCount();
st.allUniques = uniques.size();
for (String s : uniques.getSortedList()) {
 int count = uniques.getCount(s);
 if (count > 5) {

代码示例来源:origin: ahmetaa/zemberek-nlp

Path amb = outRoot.resolve(s + "-amb.txt");
try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
 for (WordAnalysis wa : wordAnalyses.getSortedList()) {
  pwa.println(wa.getInput());
  for (SingleAnalysis analysis : wa) {

代码示例来源:origin: ahmetaa/zemberek-nlp

PrintWriter pwFreq =
   new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
for (String word : wordFreq.getSortedList()) {

代码示例来源:origin: ahmetaa/zemberek-nlp

st.allCounts = (int) uniques.totalCount();
st.allUniques = uniques.size();
for (String s : uniques.getSortedList()) {
 int count = uniques.getCount(s);
 if (st.overCutoff(count)) {

代码示例来源:origin: ahmetaa/zemberek-nlp

st.allCounts = (int) uniques.totalCount();
st.allUniques = uniques.size();
for (String s : uniques.getSortedList()) {
 int count = uniques.getCount(s);
 if (st.overCutoff(count)) {
  System.out.println(s + " : " + count + "    " + pp(p1));
  Histogram<String> members = ambiguityGroups.get(s);
  for (String member : members.getSortedList()) {
   int memberCount = members.getCount(member);
   if (pct(memberCount, count) > 0.1) {

代码示例来源:origin: ahmetaa/zemberek-nlp

writer.writeLine("Sentence count:" + sentences.size());
writer.writeLine("\nCoarse POS values:\n");
for (CoarsePosTag coarsePo : coarsePos.getSortedList()) {
 writer.writeLine(coarsePo.getAsConnlValue() + " : " + coarsePos.getCount(coarsePo));
for (PosTag posTag : pos.getSortedList()) {
 writer.writeLine(posTag.getAsConnlValue() + " : " + pos.getCount(posTag));
for (DependencyRelation depRel : depRelations.getSortedList()) {
 writer.writeLine(depRel.getAsConnlString() + " : " + depRelations.getCount(depRel));
for (String morphItem : morphItems.getSortedList()) {
 writer.writeLine(morphItem + " : " + morphItems.getCount(morphItem));

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Demonstrates basic {@code Histogram} usage: adds a fixed array of fruit names, logs the
 * counts in iteration order, logs them again in {@code getSortedList()} order (by count,
 * descending), then calls {@code removeSmaller(2)} and logs the remaining entries.
 */
public static void counts() {
 String[] fruits = {"apple", "pear", "grape", "apple", "apple", "apricot", "grape"};
 Log.info("Adding elements to histogram:" + Arrays.toString(fruits));
 Histogram<String> fruitCounts = new Histogram<>();
 fruitCounts.add(fruits);

 Log.info("\nPrint with no order");
 for (String fruit : fruitCounts) {
  Log.info(fruit + " count: " + fruitCounts.getCount(fruit));
 }

 Log.info("\nPrint with count order");
 fruitCounts.getSortedList()
   .forEach(fruit -> Log.info(fruit + " count: " + fruitCounts.getCount(fruit)));

 fruitCounts.removeSmaller(2);
 Log.info("\nAfter removing elements with counts less than 2");
 fruitCounts.getSortedList()
   .forEach(fruit -> Log.info(fruit + " count: " + fruitCounts.getCount(fruit)));
}

相关文章