MapReduce 程序找不到停用词(stop word)

ljsrvy3e  于 2021-05-29  发布在  Hadoop
关注(0)|答案(1)|浏览(430)

我是 MapReduce 新手,正在尝试编写一个程序来统计文件中停用词(stop word)的数量。我通过命令行参数指定 stopword.txt 文件,但每次运行的结果都是 stop words = 0、good words = 30(正确结果应为 5 和 25)。程序可以正常编译和运行,没有抛出任何异常,我不知道还能尝试什么。下面是我的代码,Hadoop 版本是 2.0。
StopWord.java

/**
 * Driver for a map-only job that counts stop words versus "good" words.
 *
 * <p>Usage: {@code StopWord [generic options] <input> <output> [-skip <stopWordFile>]}.
 * Each {@code -skip <file>} pair registers a file with the distributed cache and may
 * appear anywhere among the remaining arguments. The first two other arguments are
 * used as the input and output paths.
 */
public class StopWord {

    /** Job counters; one of them is incremented by the mapper for every token. */
    public enum COUNTERS {
        STOPWORDS, GOODWORDS
    }

    /**
     * Configures and runs the job, then prints the two counters.
     *
     * @param args command-line arguments; Hadoop generic options are stripped first
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();

        Job job = new Job(conf, "StopWord");
        job.setOutputKeyClass(Text.class);
        // NOTE(review): MyMapper declares NullWritable as its output value type;
        // confirm whether IntWritable is really intended here.
        job.setOutputValueClass(IntWritable.class);
        job.setJarByClass(StopWord.class);
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; i++) {
            if ("-skip".equals(args[i])) {
                // The flag needs a value; fail with a usage message instead of
                // running off the end of the array.
                if (i + 1 >= args.length) {
                    System.err.println("Missing file name after -skip");
                    printUsageAndExit();
                }
                // Register on the job's own configuration so the tasks see it.
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(),
                        job.getConfiguration());
                continue;
            }
            other_args.add(args[i]);
        }

        if (other_args.size() < 2) {
            printUsageAndExit();
        }

        FileInputFormat.setInputPaths(job, new Path(other_args.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));

        boolean succeeded = job.waitForCompletion(true);
        Counters counters = job.getCounters();
        System.out.printf("Good Words: %d, Stop Words: %d\n",
                counters.findCounter(COUNTERS.GOODWORDS).getValue(),
                counters.findCounter(COUNTERS.STOPWORDS).getValue());

        // Surface job failure to the calling shell instead of always exiting 0.
        if (!succeeded) {
            System.exit(1);
        }
    }

    /** Prints the expected command line to stderr and terminates with status 2. */
    private static void printUsageAndExit() {
        System.err.println(
                "Usage: StopWord [generic options] <input> <output> [-skip <stopWordFile>]");
        System.exit(2);
    }
}

MyMapper.java

public class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

private Text word = new Text();
private Set<String> stopWordList = new HashSet<String>();
private BufferedReader fis;

protected void setup(Context context) throws java.io.IOException,
        InterruptedException {

    try {
        Path[] stopWordFiles = new Path[0];
        stopWordFiles = context.getLocalCacheFiles();
        System.out.println(stopWordFiles.toString());
        if (stopWordFiles != null && stopWordFiles.length > 0) {
            for (Path stopWordFile : stopWordFiles) {
                readStopWordFile(stopWordFile);
            }
        }
    } catch (IOException e) {
        System.err.println("Exception reading stop word file: " + e);
    }
}

 //reading the stop word file
private void readStopWordFile(Path stopWordFile) {
    try {
        fis = new BufferedReader(new FileReader(stopWordFile.toString()));
        String stopWord = null;
        while ((stopWord = fis.readLine()) != null) {
            stopWordList.add(stopWord);
        }
    } catch (IOException e) {
        System.err.println("Exception while reading stop word file '"
                + stopWordFile + "' : " + e.toString());
    }
}

public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line);

    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        if (stopWordList.contains(token)) {
            context.getCounter(StopWord.COUNTERS.STOPWORDS)
                    .increment(1);
        } else {
            context.getCounter(StopWord.COUNTERS.GOODWORDS)
                    .increment(1);
            word.set(token);
            context.write(word, null);
        }
    }
}
}
6ie5vjzr

6ie5vjzr1#

据我所见,你的停用词文件可能是空的;另外,你是在作业初始化之后才添加分布式缓存的。
有关访问hadoop分布式缓存中的文件的更多信息,请参阅本文

相关问题