Usage of the zemberek.core.collections.Histogram.getCount() method, with code examples

This article collects a number of code examples for the Java method zemberek.core.collections.Histogram.getCount() and shows how it is used in practice. The examples come from selected open-source projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of Histogram.getCount() are as follows:
Package path: zemberek.core.collections.Histogram
Class name: Histogram
Method name: getCount

Histogram.getCount overview

Returns the current count of the given element.
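
Before the collected examples, here is a minimal self-contained sketch of typical usage (a hypothetical demo, not taken from the project; add() accepting varargs matches the counts() example further below):

import zemberek.core.collections.Histogram;

public class HistogramGetCountDemo {
 public static void main(String[] args) {
  Histogram<String> histogram = new Histogram<>();
  // add() accepts varargs; each occurrence increments the element's count.
  histogram.add("apple", "pear", "apple");
  // getCount returns the current count of the given element.
  System.out.println(histogram.getCount("apple")); // prints 2
 }
}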

Code examples

Code example source: ahmetaa/zemberek-nlp

/**
 * @return total count of the items in the input Iterable.
 */
public long totalCount(Iterable<T> it) {
 long count = 0;
 for (T t : it) {
  count += getCount(t);
 }
 return count;
}

Code example source: ahmetaa/zemberek-nlp

public int getCount(int order, String key) {
 return gramCounts[order].getCount(key);
}

Code example source: ahmetaa/zemberek-nlp

public String log() {
 List<String> res = new ArrayList<>();
 res.add(String.format("Number of sentences      = %d", numberOfSentences));
 res.add(String.format("Number of tokens         = %d", numberOfTokens));
 for (String type : typeHistogram.getSortedList()) {
  res.add(String.format("Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
    type,
    typeHistogram.getCount(type),
    tokenHistogram.getCount(type),
    tokenHistogram.getCount(type) * 1f / typeHistogram.getCount(type)));
 }
 return String.join("\n", res);
}

Code example source: ahmetaa/zemberek-nlp

public void saveSortedByCounts(Path path, String delimiter)
  throws IOException {
 try (PrintWriter pw = new PrintWriter(path.toFile(), StandardCharsets.UTF_8.name())) {
  List<T> sorted = getSortedList();
  for (T t : sorted) {
   pw.println(t + delimiter + getCount(t));
  }
 }
}

Code example source: ahmetaa/zemberek-nlp

public static void serializeStringHistogram(Histogram<String> h, DataOutputStream dos)
  throws IOException {
 dos.writeInt(h.size());
 for (String key : h.map) {
  dos.writeUTF(key);
  dos.writeInt(h.getCount(key));
 }
}
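
For reference, a matching deserializer simply mirrors these writes. A minimal sketch (the method name is hypothetical; add(T, int) is used the same way in the merge example further below):

public static Histogram<String> deserializeStringHistogram(DataInputStream dis)
  throws IOException {
 int size = dis.readInt();
 Histogram<String> histogram = new Histogram<>();
 for (int i = 0; i < size; i++) {
  String key = dis.readUTF();
  // Restore each element together with its serialized count.
  histogram.add(key, dis.readInt());
 }
 return histogram;
}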

Code example source: ahmetaa/zemberek-nlp

public void saveSortedByKeys(Path path, String delimiter, Comparator<T> comparator)
  throws IOException {
 try (PrintWriter pw = new PrintWriter(path.toFile(), StandardCharsets.UTF_8.name())) {
  List<T> sorted = getSortedList(comparator);
  for (T t : sorted) {
   pw.println(t + delimiter + getCount(t));
  }
 }
}

Code example source: ahmetaa/zemberek-nlp

/**
 * Merges another Histogram into this one.
 *
 * @param otherSet another Histogram
 */
public void add(Histogram<T> otherSet) {
 if (otherSet == null) {
  throw new NullPointerException("Histogram cannot be null");
 }
 for (T t : otherSet) {
  add(t, otherSet.getCount(t));
 }
}

Code example source: ahmetaa/zemberek-nlp

private int subSumCount(Term t, List<Histogram<Term>> histograms) {
 int sum = 0;
 for (int i = t.order(); i < order; i++) {
  for (Term t2 : histograms.get(i)) {
   if (t2.contains(t)) {
    sum += histograms.get(i).getCount(t2);
   }
  }
 }
 return sum;
}

Code example source: ahmetaa/zemberek-nlp

/**
 * A custom serializer. Big-endian format is like this: int32 order Utf id int32 keyCount utf key
 * int32 count ... int32 keyCount utf key int32 count ...
 *
 * @param f file to serialize.
 */
public void save(File f) throws IOException {
 try (DataOutputStream dos = new DataOutputStream(
   new BufferedOutputStream(new FileOutputStream(f)))) {
  dos.writeInt(order);
  dos.writeUTF(id);
  for (int i = 1; i < gramCounts.length; i++) {
   dos.writeInt(gramCounts[i].size());
   for (String key : gramCounts[i]) {
    dos.writeUTF(key);
    dos.writeInt(gramCounts[i].getCount(key));
   }
  }
 }
}
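
A loader for this layout would mirror the writes. A minimal sketch, assuming (as the writer's loop suggests) that gramCounts holds order + 1 histograms with index 0 unused:

void load(File f) throws IOException {
 try (DataInputStream dis = new DataInputStream(
   new BufferedInputStream(new FileInputStream(f)))) {
  order = dis.readInt();
  id = dis.readUTF();
  gramCounts = new Histogram[order + 1]; // index 0 stays unused
  for (int i = 1; i <= order; i++) {
   gramCounts[i] = new Histogram<>();
   int keyCount = dis.readInt();
   for (int k = 0; k < keyCount; k++) {
    // Arguments are read left to right: the key first, then its count.
    gramCounts[i].add(dis.readUTF(), dis.readInt());
   }
  }
 }
}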

Code example source: ahmetaa/zemberek-nlp

public List<ScoredItem<Term>> initialRank(List<Histogram<Term>> histograms) {
 List<ScoredItem<Term>> scores = new ArrayList<>(); // TODO: use a priority queue
 int termCount = histograms.get(0).size();
 for (Histogram<Term> ngramTerms : histograms) {
  for (Term term : ngramTerms) {
   double tf = ((double) ngramTerms.getCount(term)) / termCount;
   // add 1 for smoothing.
   double termDocCount =
     term.order() == 1 ? (statistics.documentFrequencies.getCount(term.words[0])) + 1 : 1;
   double idf = Math.log(statistics.documentCount / termDocCount);
   ScoredItem<Term> scoredItem = new ScoredItem<>(term, (float) (tf * idf));
   scores.add(scoredItem);
  }
 }
 Collections.sort(scores);
 return new ArrayList<>(scores.subList(0, scores.size()));
}

Code example source: ahmetaa/zemberek-nlp

void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 lines.stream()
   .map(s -> s.substring(0, s.indexOf(' ')))
   .forEach(hist::add);
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

Code example source: ahmetaa/zemberek-nlp

public List<ScoredItem<Term>> rescoreStatistics(
  List<Histogram<Term>> histograms,
  List<ScoredItem<Term>> initialScores) {
 int termCount = histograms.get(0).size();
 List<ScoredItem<Term>> scores = new ArrayList<>(); // TODO: use a priority queue
 for (ScoredItem<Term> si : initialScores) {
  Term term = si.item;
  // position of first occurrence
  double pfo = Math.log(DEFAULT_CUTOFF_POSITION / (si.item.firstOccurrenceIndex + 1));
  double termLength = Math.sqrt(term.order());
  double tf = termCount - subSumCount(term, histograms, initialScores.subList(0, 100));
  // add 1 for smoothing.
  double termDocCount =
    term.order() == 1 ? (statistics.documentFrequencies.getCount(term.words[0])) + 1 : 1;
  double idf = Math.log(statistics.documentCount / termDocCount);
  ScoredItem<Term> scoredItem = new ScoredItem<>(term, (float) (tf * pfo * idf * termLength));
  scores.add(scoredItem);
 }
 Collections.sort(scores);
 return new ArrayList<>(scores.subList(0, scores.size()));
}

Code example source: ahmetaa/zemberek-nlp

static void getQuestionSuffixes(Path in, Path out) throws IOException {
 List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
 Histogram<String> endings = new Histogram<>();
 for (String splitLine : splitLines) {
  String[] tokens = splitLine.split("=");
  String s = tokens[1].trim();
  String[] t2 = s.split("[ ]");
  if (t2.length != 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String suf = t2[1];
  if (suf.startsWith("mi") ||
    suf.startsWith("mu") ||
    suf.startsWith("mı") ||
    suf.startsWith("mü")
  ) {
   endings.add(t2[1]);
  }
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending + " " + endings.getCount(ending));
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending);
 }
}

Code example source: ahmetaa/zemberek-nlp

private static void checkWeirdChars(Path root) throws IOException {
 List<Path> files = Files.walk(root, 1).filter(s -> s.toFile().isFile())
   .collect(Collectors.toList());
 Histogram<String> chars = new Histogram<>();
 for (Path file : files) {
  System.out.println(file);
  LinkedHashSet<String> sentences = getSentences(file);
  for (String sentence : sentences) {
   for (int i = 0; i < sentence.length(); i++) {
    char c = sentence.charAt(i);
    if (c >= 0x300 && c <= 0x036f) {
     chars.add(String.valueOf(c));
    }
    if (Scripts.undesiredChars.contains(c)) {
     chars.add(String.valueOf(c));
    }
   }
  }
 }
 for (String s : chars.getSortedList()) {
  System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
 }
}

Code example source: ahmetaa/zemberek-nlp

private int subSumCount(Term t, List<Histogram<Term>> histograms, List<ScoredItem<Term>> top) {
 int sum = 0;
 for (ScoredItem<Term> scoredItem : top) {
  Term t2 = scoredItem.item;
  if (t.order() >= t2.order()) {
   continue;
  }
  if (t2.contains(t)) {
   sum += histograms.get(t2.order() - 1).getCount(t2);
  }
 }
 return sum;
}

Code example source: ahmetaa/zemberek-nlp

public static void counts() {
 String[] fruits = {"apple", "pear", "grape", "apple", "apple", "apricot", "grape"};
 Log.info("Adding elements to histogram:" + Arrays.toString(fruits));
 Histogram<String> histogram = new Histogram<>();
 histogram.add(fruits);
 Log.info("\nPrint with no order");
 for (String s : histogram) {
  Log.info(s + " count: " + histogram.getCount(s));
 }
 Log.info("\nPrint with count order");
 for (String s : histogram.getSortedList()) {
  Log.info(s + " count: " + histogram.getCount(s));
 }
 histogram.removeSmaller(2);
 Log.info("\nAfter removing elements with counts less than 2");
 for (String s : histogram.getSortedList()) {
  Log.info(s + " count: " + histogram.getCount(s));
 }
}
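
Given the input above, the count-ordered pass prints apple count: 3, grape count: 2, pear count: 1, apricot count: 1 (elements with equal counts may appear in either order), and after removeSmaller(2) only apple (3) and grape (2) remain.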

Code example source: ahmetaa/zemberek-nlp

public Quantizer getQuantizer(QuantizerType type) {
 Log.info("Unique value count:" + histogram.size());
 double[] lookup = new double[histogram.size()];
 int[] counts = new int[histogram.size()];
 int j = 0;
 for (double key : histogram) {
  lookup[j] = key;
  counts[j] = histogram.getCount(key);
  j++;
 }
 Log.info("Quantizing to " + bitCount + " bits");
 switch (type) {
  case BINNING:
   return BinningQuantizer.linearBinning(lookup, bitCount);
  case BINNING_WEIGHTED:
   return BinningQuantizer.logCountBinning(lookup, counts, bitCount);
  case KMEANS:
   return KMeansQuantizer.generateFromRawData(lookup, bitCount);
  default:
   throw new UnsupportedOperationException("Linear cannot be used in this operation");
 }
}
