zemberek.core.collections.Histogram.<init>()方法的使用及代码示例

x33g5p2x  于2022-01-20 转载在 其他  
字(10.1k)|赞(0)|评价(0)|浏览(166)

本文整理了Java中zemberek.core.collections.Histogram.<init>()方法的一些代码示例,展示了Histogram.<init>()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Histogram.<init>()方法的具体详情如下:
包路径:zemberek.core.collections.Histogram
类名称:Histogram
方法名:<init>

Histogram.<init>介绍

暂无

代码示例

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Creates a calculator for the given bit count.
 *
 * @param bitCount number of bits; {@code n} is set to {@code 1 << bitCount}.
 */
LookupCalculator(int bitCount) {
 this.bitCount = bitCount;
 n = 1 << bitCount;
 // The histogram receives half of n as its constructor argument.
 this.histogram = new Histogram<>(n / 2);
}

代码示例来源:origin: ahmetaa/zemberek-nlp

public CharNgramCountModel(String modelId, int order) {
 super(modelId, order);
 gramCounts = new Histogram[order + 1];
 for (int i = 0; i < gramCounts.length; i++) {
  gramCounts[i] = new Histogram<>();
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Creates the term frequency and document frequency histograms.
 *
 * @param expectedTermCount value handed to both histogram constructors.
 */
CorpusStatistics(int expectedTermCount) {
 this.termFrequencies = new Histogram<>(expectedTermCount);
 this.documentFrequencies = new Histogram<>(expectedTermCount);
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Reads a String histogram from a binary stream.
 *
 * <p>The stream is read as: an int entry count, followed by that many pairs of a UTF string
 * (as read by {@link DataInputStream#readUTF()}) and an int count.
 *
 * @param dis stream to read from.
 * @return the histogram holding every key with the count read for it.
 * @throws IOException if reading from the stream fails.
 * @throws IllegalStateException if the entry count read from the stream is negative.
 */
public static Histogram<String> deserializeStringHistogram(DataInputStream dis)
  throws IOException {
 final int entryCount = dis.readInt();
 if (entryCount < 0) {
  throw new IllegalStateException(
    "Cannot deserialize String histogram. Count value is negative : " + entryCount);
 }
 Histogram<String> histogram = new Histogram<>(entryCount);
 int remaining = entryCount;
 while (remaining > 0) {
  // The key precedes its count in the stream.
  String key = dis.readUTF();
  int count = dis.readInt();
  histogram.set(key, count);
  remaining--;
 }
 return histogram;
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Builds a histogram from text lines of the form {@code key<delimiter>count} or
 * {@code count<delimiter>key}. Each line is split at the first occurrence of the delimiter.
 *
 * @param lines lines to parse.
 * @param delimiter character separating key and count.
 * @param keyComesFirst {@code true} if the key is before the delimiter, {@code false} if the
 *     count is.
 * @return histogram with the parsed counts added per key.
 * @throws IllegalStateException if a line has no delimiter or starts with it.
 * @throws NumberFormatException if the count part is not a valid int.
 */
public static Histogram<String> loadFromLines(
  List<String> lines,
  char delimiter,
  boolean keyComesFirst) {
 Histogram<String> histogram = new Histogram<>(lines.size());
 for (String line : lines) {
  final int split = line.indexOf(delimiter);
  if (split < 1) {
   throw new IllegalStateException("Bad histogram line = " + line);
  }
  final String head = line.substring(0, split);
  final String tail = line.substring(split + 1);
  final String item;
  final String countText;
  if (keyComesFirst) {
   item = head;
   countText = tail;
  } else {
   item = tail;
   countText = head;
  }
  histogram.add(item, Integer.parseInt(countText));
 }
 return histogram;
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Reads a CharNgramCountModel from its custom binary serialization.
 *
 * <p>The stream is read as: int order, UTF model id, then for each gram level from 1 to
 * {@code order} an int entry count followed by that many (UTF key, int count) pairs.
 * The stream is closed when this method returns.
 *
 * @param is InputStream to load data.
 * @return a CharNgramCountModel generated from the stream content.
 * @throws IOException if reading from the stream fails.
 */
public static CharNgramCountModel load(InputStream is) throws IOException {
 try (DataInputStream in = new DataInputStream(new BufferedInputStream(is))) {
  final int order = in.readInt();
  final String modelId = in.readUTF();
  Histogram<String>[] gramCounts = new Histogram[order + 1];
  // Index 0 is never assigned here; only levels 1..order are read from the stream.
  int level = 1;
  while (level <= order) {
   final int entries = in.readInt();
   Histogram<String> counts = new Histogram<>(entries * 2);
   for (int read = 0; read < entries; read++) {
    String gram = in.readUTF();
    int count = in.readInt();
    counts.add(gram, count);
   }
   gramCounts[level] = counts;
   level++;
  }
  return new CharNgramCountModel(modelId, order, gramCounts);
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the total line count and how many lines fall into each category, where the category of
 * a line is the text before its first space.
 *
 * <p>A line that contains no space is counted with the whole line as its category.
 *
 * @param lines lines to summarize.
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int spaceIndex = line.indexOf(' ');
  // indexOf returns -1 when there is no space; substring(0, -1) would throw
  // StringIndexOutOfBoundsException, so fall back to the whole line.
  String category = spaceIndex < 0 ? line : line.substring(0, spaceIndex);
  hist.add(category);
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the total line count and how many lines fall into each category, where the category of
 * a line is the text before its first space.
 *
 * <p>A line that contains no space is counted with the whole line as its category.
 *
 * @param lines lines to summarize.
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int spaceIndex = line.indexOf(' ');
  // indexOf returns -1 when there is no space; substring(0, -1) would throw
  // StringIndexOutOfBoundsException, so fall back to the whole line.
  String category = spaceIndex < 0 ? line : line.substring(0, spaceIndex);
  hist.add(category);
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the total line count and how many lines fall into each category, where the category of
 * a line is the text before its first space.
 *
 * <p>A line that contains no space is counted with the whole line as its category.
 *
 * @param lines lines to summarize.
 */
void dataInfo(List<String> lines) {
  Log.info("Total lines = " + lines.size());
  Histogram<String> hist = new Histogram<>();
  for (String line : lines) {
   int spaceIndex = line.indexOf(' ');
   // indexOf returns -1 when there is no space; substring(0, -1) would throw
   // StringIndexOutOfBoundsException, so fall back to the whole line.
   String category = spaceIndex < 0 ? line : line.substring(0, spaceIndex);
   hist.add(category);
  }
  Log.info("Categories :");
  for (String s : hist.getSortedList()) {
   Log.info(s + " " + hist.getCount(s));
  }
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Logs the total line count and how many lines fall into each category, where the category of
 * a line is the text before its first space.
 *
 * <p>A line that contains no space is counted with the whole line as its category.
 *
 * @param lines lines to summarize.
 */
void dataInfo(List<String> lines) {
 Log.info("Total lines = " + lines.size());
 Histogram<String> hist = new Histogram<>();
 for (String line : lines) {
  int spaceIndex = line.indexOf(' ');
  // indexOf returns -1 when there is no space; substring(0, -1) would throw
  // StringIndexOutOfBoundsException, so fall back to the whole line.
  String category = spaceIndex < 0 ? line : line.substring(0, spaceIndex);
  hist.add(category);
 }
 Log.info("Categories :");
 for (String s : hist.getSortedList()) {
  Log.info(s + " " + hist.getCount(s));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Collects n-gram histograms for gram sizes 1 through {@code order} from the given paragraphs.
 *
 * <p>Paragraphs are split into sentences with {@code extractor}, each sentence is tokenized
 * with {@code lexer}, and {@code collectGrams} is invoked once per gram size. The last
 * argument passed to {@code collectGrams} is the number of tokens seen in earlier sentences.
 *
 * @param paragraphs input paragraphs.
 * @return list of {@code order} histograms; index {@code i} holds grams of size {@code i + 1}.
 */
private List<Histogram<Term>> wordNgrams(List<String> paragraphs) {
 List<Histogram<Term>> histograms = new ArrayList<>(order + 1);
 for (int created = 0; created < order; created++) {
  histograms.add(new Histogram<>(100));
 }
 int tokensSoFar = 0;
 for (String sentence : extractor.fromParagraphs(paragraphs)) {
  List<Token> tokens = lexer.tokenize(sentence);
  for (int gramSize = 1; gramSize <= order; gramSize++) {
   collectGrams(tokens, histograms.get(gramSize - 1), gramSize, tokensSoFar);
  }
  // TODO: should we count only term tokens?
  tokensSoFar += tokens.size();
 }
 return histograms;
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Scans the regular files directly under {@code root} and prints, for every character that is
 * either in the range U+0300..U+036F (Combining Diacritical Marks) or contained in
 * {@code Scripts.undesiredChars}, its hexadecimal code and occurrence count.
 *
 * <p>Each visited file path is printed before it is processed.
 *
 * @param root directory whose immediate files are checked.
 * @throws IOException if the directory listing or reading a file fails.
 */
private static void checkWeirdChars(Path root) throws IOException {
 List<Path> files;
 // Files.walk keeps a directory handle open until the stream is closed, so close it
 // deterministically instead of relying on garbage collection.
 try (java.util.stream.Stream<Path> walk = Files.walk(root, 1)) {
  files = walk.filter(s -> s.toFile().isFile())
    .collect(Collectors.toList());
 }
 Histogram<String> chars = new Histogram<>();
 for (Path file : files) {
  System.out.println(file);
  LinkedHashSet<String> sentences = getSentences(file);
  for (String sentence : sentences) {
   for (int i = 0; i < sentence.length(); i++) {
    char c = sentence.charAt(i);
    if (c >= 0x300 && c <= 0x036f) {
     chars.add(String.valueOf(c));
    }
    // NOTE(review): a character matching both conditions is counted twice; confirm intent.
    if (Scripts.undesiredChars.contains(c)) {
     chars.add(String.valueOf(c));
    }
   }
  }
 }
 for (String s : chars.getSortedList()) {
  System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Reads lines of the form {@code left=first second} from {@code in}, counts every
 * {@code second} part that starts with "mi", "mu", "mı" or "mü", and prints the counted
 * endings to standard output: first with their counts, then the endings alone.
 *
 * <p>Lines that have no text after an {@code '='} or whose right side does not split into
 * exactly two space separated parts are reported with "Problem in ..." and skipped.
 *
 * @param in UTF-8 file to read.
 * @param out not used by this method.
 * @throws IOException if the input file cannot be read.
 */
static void getQuestionSuffixes(Path in, Path out) throws IOException {
 List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
 Histogram<String> endings = new Histogram<>();
 for (String splitLine : splitLines) {
  String[] tokens = splitLine.split("=");
  // split drops trailing empty strings, so "abc" and "abc=" both yield a single token.
  // Report such lines like other malformed ones instead of failing on tokens[1].
  if (tokens.length < 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String s = tokens[1].trim();
  String[] t2 = s.split("[ ]");
  if (t2.length != 2) {
   System.out.println("Problem in " + splitLine);
   continue;
  }
  String suf = t2[1];
  if (suf.startsWith("mi") ||
    suf.startsWith("mu") ||
    suf.startsWith("mı") ||
    suf.startsWith("mü")
  ) {
   endings.add(suf);
  }
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending + " " + endings.getCount(ending));
 }
 for (String ending : endings.getSortedList()) {
  System.out.println(ending);
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Loads a word histogram from {@code in} and writes to {@code out} those words that contain
 * the same character at least twice in a row, keeping their original counts.
 *
 * <p>Runs are detected anywhere in the word, including in the middle and at the end. Words
 * shorter than two characters are skipped.
 *
 * @param in space delimited histogram file to load.
 * @param out file the selected words are saved to, sorted by count.
 * @throws IOException if loading or saving fails.
 */
static void multipleLetterRepetitionWords(Path in, Path out) throws IOException {
 Histogram<String> noisyWords = Histogram.loadFromUtf8File(in, ' ');
 Histogram<String> repetitionWords = new Histogram<>();
 for (String w : noisyWords) {
  // A word with fewer than two characters cannot contain a repetition.
  if (w.length() <= 1) {
   continue;
  }
  int maxRepetitionCount = 1;
  int repetitionCount = 1;
  for (int i = 1; i < w.length(); i++) {
   if (w.charAt(i) == w.charAt(i - 1)) {
    repetitionCount++;
    // Update on every extension so a run that ends the word is not missed.
    maxRepetitionCount = Math.max(maxRepetitionCount, repetitionCount);
   } else {
    // The current character starts a new run of length one.
    repetitionCount = 1;
   }
  }
  if (maxRepetitionCount > 1) {
   repetitionWords.set(w, noisyWords.getCount(w));
  }
 }
 repetitionWords.saveSortedByCounts(out, " ");
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * For each file, logs the number of lines, distinct word tokens and distinct label tokens.
 *
 * <p>Lines are split on whitespace. A token containing "__label__" is counted as a label,
 * every other token as a word. A line is logged as a warning for each of its label tokens
 * that contains a '-'.
 *
 * @param paths files to inspect.
 * @throws IOException if a file cannot be loaded.
 */
static void countTokens(Path... paths) throws IOException {
 for (Path path : paths) {
  List<String> lines = TextIO.loadLines(path);
  Histogram<String> words = new Histogram<>();
  Histogram<String> labels = new Histogram<>();
  for (String line : lines) {
   String[] tokens = line.split("[\\s]+");
   for (String token : tokens) {
    if (!token.contains("__label__")) {
     words.add(token);
     continue;
    }
    if (token.contains("-")) {
     Log.warn(line);
    }
    labels.add(token);
   }
  }
  Log.info("There are %d lines, %d words, %d labels in %s",
    lines.size(),
    words.size(),
    labels.size(),
    path);
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Collects the non-empty endings of all known analyses of the given words.
 *
 * <p>Each word is analyzed with {@code morphology}; analyses flagged as unknown are ignored.
 *
 * @param words words to analyze.
 * @return the collected endings as returned by
 *     {@code getSortedList(Turkish.STRING_COMPARATOR_ASC)}.
 */
List<String> getEndingsFromVocabulary(List<String> words) {
 Histogram<String> endings = new Histogram<>(words.size() / 10);
 for (String word : words) {
  for (SingleAnalysis analysis : morphology.analyze(word)) {
   if (!analysis.isUnknown()) {
    StemAndEnding parts = analysis.getStemAndEnding();
    if (parts.ending.length() > 0) {
     endings.add(parts.ending);
    }
   }
  }
 }
 return endings.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}

代码示例来源:origin: ahmetaa/zemberek-nlp

public void ambiguousWordStats(String filename) throws IOException {
 List<String> lines = readAll(filename);
 Histogram<String> uniques = new Histogram<>(1000000);
 int total = 0;
 Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();

代码示例来源:origin: ahmetaa/zemberek-nlp

public void dumpStats(List<DependencySentence> sentences, File statFile) throws IOException {
 Histogram<CoarsePosTag> coarsePos = new Histogram<>();
 Histogram<PosTag> pos = new Histogram<>();
 Histogram<DependencyRelation> depRelations = new Histogram<>();
 Histogram<String> morphItems = new Histogram<>();

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Builds the vocabulary from up to three space delimited histogram files.
 *
 * <p>Entries with counts below the respective minimum are removed. The word list holds the
 * correct words first, then the incorrect words, then the maybe-incorrect words;
 * {@code noisyWordStart} and {@code maybeIncorrectWordStart} record where the second and
 * third groups begin. Every word is mapped to its list position in {@code indexes}.
 *
 * @param correct histogram file of correct words.
 * @param incorrect histogram file of incorrect words.
 * @param maybeIncorrect histogram file of possibly incorrect words; may be {@code null}, in
 *     which case that group is empty.
 * @param correctMinCount minimum count applied to the correct words.
 * @param incorrectMinCount minimum count applied to the incorrect words.
 * @param maybeIncorrectMinCount minimum count applied to the possibly incorrect words.
 * @throws IOException if a histogram file cannot be loaded.
 */
NormalizationVocabulary(
  Path correct,
  Path incorrect,
  Path maybeIncorrect,
  int correctMinCount,
  int incorrectMinCount,
  int maybeIncorrectMinCount) throws IOException {
 Histogram<String> correctWords = Histogram.loadFromUtf8File(correct, ' ');
 Histogram<String> noisyWords = Histogram.loadFromUtf8File(incorrect, ' ');
 Histogram<String> maybeIncorrectWords;
 if (maybeIncorrect == null) {
  maybeIncorrectWords = new Histogram<>();
 } else {
  maybeIncorrectWords = Histogram.loadFromUtf8File(maybeIncorrect, ' ');
 }

 correctWords.removeSmaller(correctMinCount);
 noisyWords.removeSmaller(incorrectMinCount);
 maybeIncorrectWords.removeSmaller(maybeIncorrectMinCount);

 // Group boundaries are captured before the next group is appended.
 this.noisyWordStart = correctWords.size();
 this.words = new ArrayList<>(correctWords.getSortedList());
 this.words.addAll(noisyWords.getSortedList());
 this.maybeIncorrectWordStart = this.words.size();
 this.words.addAll(maybeIncorrectWords.getSortedList());

 int position = 0;
 for (String word : this.words) {
  indexes.put(word, position);
  position++;
 }
}

代码示例来源:origin: ahmetaa/zemberek-nlp

/**
 * Demonstrates basic Histogram usage: adds a fixed set of fruit names, logs the counts in
 * iteration order and in sorted-list order, then removes entries with counts below 2 and logs
 * the remaining ones.
 */
public static void counts() {
 final String[] fruits = {"apple", "pear", "grape", "apple", "apple", "apricot", "grape"};
 Log.info("Adding elements to histogram:" + Arrays.toString(fruits));
 Histogram<String> fruitCounts = new Histogram<>();
 fruitCounts.add(fruits);

 Log.info("\nPrint with no order");
 for (String fruit : fruitCounts) {
  int count = fruitCounts.getCount(fruit);
  Log.info(fruit + " count: " + count);
 }

 Log.info("\nPrint with count order");
 for (String fruit : fruitCounts.getSortedList()) {
  int count = fruitCounts.getCount(fruit);
  Log.info(fruit + " count: " + count);
 }

 fruitCounts.removeSmaller(2);
 Log.info("\nAfter removing elements with counts less than 2");
 for (String fruit : fruitCounts.getSortedList()) {
  int count = fruitCounts.getCount(fruit);
  Log.info(fruit + " count: " + count);
 }
}

相关文章