This article collects code examples of the Java method zemberek.core.collections.Histogram.getCount() and shows how it is used. The examples are extracted from selected open-source projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of Histogram.getCount():
Package: zemberek.core.collections
Class: Histogram
Method: getCount
Description: current count of the given element
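Before the individual examples, a minimal self-contained sketch of how getCount() fits into the Histogram API may help. The method names are taken from the examples below; treating add as a varargs method is an assumption for illustration.

import zemberek.core.collections.Histogram;

public class GetCountDemo {

  public static void main(String[] args) {
    Histogram<String> histogram = new Histogram<>();
    // add(...) increments the count of each given element by one
    // (assumed varargs, matching the counts() example at the end of this page).
    histogram.add("apple", "pear", "apple");
    // getCount returns the current count of the given element.
    System.out.println(histogram.getCount("apple")); // 2
    System.out.println(histogram.getCount("pear"));  // 1
  }
}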
Code example source: ahmetaa/zemberek-nlp
/**
 * @return total count of the items in the input Iterable.
 */
public long totalCount(Iterable<T> it) {
  long count = 0;
  for (T t : it) {
    count += getCount(t);
  }
  return count;
}
Code example source: ahmetaa/zemberek-nlp
public int getCount(int order, String key) {
  return gramCounts[order].getCount(key);
}
Code example source: ahmetaa/zemberek-nlp
public String log() {
  List<String> res = new ArrayList<>();
  res.add(String.format("Number of sentences = %d", numberOfSentences));
  res.add(String.format("Number of tokens = %d", numberOfTokens));
  for (String type : typeHistogram.getSortedList()) {
    res.add(String.format("Type = %s (Count = %d, Token Count = %d Av. Token = %.2f )",
        type,
        typeHistogram.getCount(type),
        tokenHistogram.getCount(type),
        tokenHistogram.getCount(type) * 1f / typeHistogram.getCount(type)));
  }
  return String.join("\n", res);
}
Code example source: ahmetaa/zemberek-nlp
public void saveSortedByCounts(Path path, String delimiter)
    throws IOException {
  try (PrintWriter pw = new PrintWriter(path.toFile(), StandardCharsets.UTF_8.name())) {
    List<T> sorted = getSortedList();
    for (T t : sorted) {
      pw.println(t + delimiter + getCount(t));
    }
  }
}
Code example source: ahmetaa/zemberek-nlp
public static void serializeStringHistogram(Histogram<String> h, DataOutputStream dos)
    throws IOException {
  dos.writeInt(h.size());
  for (String key : h.map) {
    dos.writeUTF(key);
    dos.writeInt(h.getCount(key));
  }
}
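For symmetry with the serializer above, a matching deserializer could look like the following sketch. It reads back the same layout (int32 size, then size pairs of utf key / int32 count) and assumes Histogram exposes an add(key, count) method, as used in the merge example further down; this is an illustration, not necessarily the library's actual API.

public static Histogram<String> deserializeStringHistogram(DataInputStream dis)
    throws IOException {
  int size = dis.readInt();
  Histogram<String> histogram = new Histogram<>();
  for (int i = 0; i < size; i++) {
    String key = dis.readUTF();
    // add(key, count) is assumed to increment the key's count by the given amount.
    histogram.add(key, dis.readInt());
  }
  return histogram;
}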
Code example source: ahmetaa/zemberek-nlp
public void saveSortedByKeys(Path path, String delimiter, Comparator<T> comparator)
    throws IOException {
  try (PrintWriter pw = new PrintWriter(path.toFile(), StandardCharsets.UTF_8.name())) {
    List<T> sorted = getSortedList(comparator);
    for (T t : sorted) {
      pw.println(t + delimiter + getCount(t));
    }
  }
}
Code example source: ahmetaa/zemberek-nlp
/**
 * Merges another Histogram into this one.
 *
 * @param otherSet another Histogram
 */
public void add(Histogram<T> otherSet) {
  if (otherSet == null) {
    throw new NullPointerException("Histogram cannot be null");
  }
  for (T t : otherSet) {
    add(t, otherSet.getCount(t));
  }
}
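Note that merging accumulates rather than overwrites: the loop calls add(t, otherSet.getCount(t)) for every element, so a key present in both histograms ends up with the sum of its two counts. A small sketch, assuming add(T, int) increments the element's count by the given amount:

Histogram<String> a = new Histogram<>();
a.add("x", 2);
Histogram<String> b = new Histogram<>();
b.add("x", 3);
a.add(b);
// a.getCount("x") is now 5: the counts from both histograms are summed.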
Code example source: ahmetaa/zemberek-nlp
private int subSumCount(Term t, List<Histogram<Term>> histograms) {
  int sum = 0;
  for (int i = t.order(); i < order; i++) {
    for (Term t2 : histograms.get(i)) {
      if (t2.contains(t)) {
        sum += histograms.get(i).getCount(t2);
      }
    }
  }
  return sum;
}
Code example source: ahmetaa/zemberek-nlp
/**
 * A custom serializer. The big-endian format is:
 * int32 order, utf id, then for each order: int32 keyCount followed by
 * (utf key, int32 count) pairs.
 *
 * @param f file to serialize to.
 */
public void save(File f) throws IOException {
  try (DataOutputStream dos = new DataOutputStream(
      new BufferedOutputStream(new FileOutputStream(f)))) {
    dos.writeInt(order);
    dos.writeUTF(id);
    for (int i = 1; i < gramCounts.length; i++) {
      dos.writeInt(gramCounts[i].size());
      for (String key : gramCounts[i]) {
        dos.writeUTF(key);
        dos.writeInt(gramCounts[i].getCount(key));
      }
    }
  }
}
Code example source: ahmetaa/zemberek-nlp
public List<ScoredItem<Term>> initialRank(List<Histogram<Term>> histograms) {
  List<ScoredItem<Term>> scores = new ArrayList<>(); // TODO: use a priority queue
  int termCount = histograms.get(0).size();
  for (Histogram<Term> ngramTerms : histograms) {
    for (Term term : ngramTerms) {
      double tf = ((double) ngramTerms.getCount(term)) / termCount;
      // add 1 for smoothing.
      double termDocCount =
          term.order() == 1 ? (statistics.documentFrequencies.getCount(term.words[0])) + 1 : 1;
      double idf = Math.log(statistics.documentCount / termDocCount);
      ScoredItem<Term> scoredItem = new ScoredItem<>(term, (float) (tf * idf));
      scores.add(scoredItem);
    }
  }
  Collections.sort(scores);
  return new ArrayList<>(scores.subList(0, scores.size()));
}
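This is a plain TF-IDF score: the term frequency is the n-gram's count normalized by the number of distinct terms in the first histogram (presumably unigrams), and the document frequency is incremented by one (the smoothing mentioned in the comment) so the logarithm's argument never divides by zero. Since termDocCount is a double, documentCount / termDocCount is floating-point division even if documentCount is an integer.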
Code example source: ahmetaa/zemberek-nlp
void dataInfo(List<String> lines) {
  Log.info("Total lines = " + lines.size());
  Histogram<String> hist = new Histogram<>();
  lines.stream()
      .map(s -> s.substring(0, s.indexOf(' ')))
      .forEach(hist::add);
  Log.info("Categories :");
  for (String s : hist.getSortedList()) {
    Log.info(s + " " + hist.getCount(s));
  }
}
Code example source: ahmetaa/zemberek-nlp
public List<ScoredItem<Term>> rescoreStatistics(
    List<Histogram<Term>> histograms,
    List<ScoredItem<Term>> initialScores) {
  int termCount = histograms.get(0).size();
  List<ScoredItem<Term>> scores = new ArrayList<>(); // TODO: use a priority queue
  for (ScoredItem<Term> si : initialScores) {
    Term term = si.item;
    // position of first occurrence
    double pfo = Math.log(DEFAULT_CUTOFF_POSITION / (si.item.firstOccurrenceIndex + 1));
    double termLength = Math.sqrt(term.order());
    double tf = termCount - subSumCount(term, histograms, initialScores.subList(0, 100));
    // add 1 for smoothing.
    double termDocCount =
        term.order() == 1 ? (statistics.documentFrequencies.getCount(term.words[0])) + 1 : 1;
    double idf = Math.log(statistics.documentCount / termDocCount);
    ScoredItem<Term> scoredItem = new ScoredItem<>(term, (float) (tf * pfo * idf * termLength));
    scores.add(scoredItem);
  }
  Collections.sort(scores);
  return new ArrayList<>(scores.subList(0, scores.size()));
}
Code example source: ahmetaa/zemberek-nlp
static void getQuestionSuffixes(Path in, Path out) throws IOException {
  List<String> splitLines = Files.readAllLines(in, Charsets.UTF_8);
  Histogram<String> endings = new Histogram<>();
  for (String splitLine : splitLines) {
    String[] tokens = splitLine.split("=");
    String s = tokens[1].trim();
    String[] t2 = s.split("[ ]");
    if (t2.length != 2) {
      System.out.println("Problem in " + splitLine);
      continue;
    }
    String suf = t2[1];
    // mi/mu/mı/mü are the vowel-harmony variants of the Turkish question particle.
    if (suf.startsWith("mi") ||
        suf.startsWith("mu") ||
        suf.startsWith("mı") ||
        suf.startsWith("mü")) {
      endings.add(t2[1]);
    }
  }
  for (String ending : endings.getSortedList()) {
    System.out.println(ending + " " + endings.getCount(ending));
  }
  for (String ending : endings.getSortedList()) {
    System.out.println(ending);
  }
}
Code example source: ahmetaa/zemberek-nlp
private static void checkWeirdChars(Path root) throws IOException {
  List<Path> files = Files.walk(root, 1).filter(s -> s.toFile().isFile())
      .collect(Collectors.toList());
  Histogram<String> chars = new Histogram<>();
  for (Path file : files) {
    System.out.println(file);
    LinkedHashSet<String> sentences = getSentences(file);
    for (String sentence : sentences) {
      for (int i = 0; i < sentence.length(); i++) {
        char c = sentence.charAt(i);
        // U+0300..U+036F is the Unicode "Combining Diacritical Marks" block.
        if (c >= 0x300 && c <= 0x036f) {
          chars.add(String.valueOf(c));
        }
        if (Scripts.undesiredChars.contains(c)) {
          chars.add(String.valueOf(c));
        }
      }
    }
  }
  for (String s : chars.getSortedList()) {
    System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
  }
}
Code example source: ahmetaa/zemberek-nlp
private int subSumCount(Term t, List<Histogram<Term>> histograms, List<ScoredItem<Term>> top) {
  int sum = 0;
  for (ScoredItem<Term> scoredItem : top) {
    Term t2 = scoredItem.item;
    if (t.order() >= t2.order()) {
      continue;
    }
    if (t2.contains(t)) {
      sum += histograms.get(t2.order() - 1).getCount(t2);
    }
  }
  return sum;
}
Code example source: ahmetaa/zemberek-nlp
public static void counts() {
  String[] fruits = {"apple", "pear", "grape", "apple", "apple", "apricot", "grape"};
  Log.info("Adding elements to histogram:" + Arrays.toString(fruits));
  Histogram<String> histogram = new Histogram<>();
  histogram.add(fruits);
  Log.info("\nPrint with no order");
  for (String s : histogram) {
    Log.info(s + " count: " + histogram.getCount(s));
  }
  Log.info("\nPrint with count order");
  for (String s : histogram.getSortedList()) {
    Log.info(s + " count: " + histogram.getCount(s));
  }
  histogram.removeSmaller(2);
  Log.info("\nAfter removing elements with counts less than 2");
  for (String s : histogram.getSortedList()) {
    Log.info(s + " count: " + histogram.getCount(s));
  }
}
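For the fruits array above, the count-ordered printout would be apple (3), grape (2), pear (1), apricot (1), with the order of the two ties unspecified; after removeSmaller(2), only apple and grape remain.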
Code example source: ahmetaa/zemberek-nlp
public Quantizer getQuantizer(QuantizerType type) {
  Log.info("Unique value count:" + histogram.size());
  double[] lookup = new double[histogram.size()];
  int[] counts = new int[histogram.size()];
  int j = 0;
  for (double key : histogram) {
    lookup[j] = key;
    counts[j] = histogram.getCount(key);
    j++;
  }
  Log.info("Quantizing to " + bitCount + " bits");
  switch (type) {
    case BINNING:
      return BinningQuantizer.linearBinning(lookup, bitCount);
    case BINNING_WEIGHTED:
      return BinningQuantizer.logCountBinning(lookup, counts, bitCount);
    case KMEANS:
      return KMeansQuantizer.generateFromRawData(lookup, bitCount);
    default:
      throw new UnsupportedOperationException("Linear cannot be used in this operation");
  }
}
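In this example the histogram's distinct keys become the quantizer's lookup table, and getCount() supplies their frequencies; the BINNING_WEIGHTED branch passes those counts to logCountBinning, presumably so that frequently occurring values influence bin placement more strongly.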