Error when running this program in Hadoop

ccrfmcuu · asked 2021-06-01 · in Hadoop

I'm trying to write a MapReduce program in Hadoop that counts the words tweeted by a particular user in a Twitter dump. My code is below. I believe the bug is in my mapper, where I parse the CSV file, but I'm not sure how to fix it. Can anyone offer some insight?

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;    
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.commons.lang.StringEscapeUtils;
import java.util.HashMap;

public class Problem1 {

    public static class SOWordCountMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private Text txt = new Text();
        private Text user = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {

            Configuration conf = context.getConfiguration();
            String tweetHandle = conf.get("tweetHandle");

            String line = value.toString();
            String parsed[] = line.split("\" , ");

            String skip[] = new String[1];
            skip[0] = "Handle,Tweet,Favs,RTs,Latitude,Longitude";
            if (parsed[0].equals(skip[0])) {
                return;
            }

            String handle = parsed[0].replace("\"", "").toLowerCase();
            String text = parsed[1].replace("\"", "");

            if (user == null || txt == null || !user.equals(tweetHandle)) {
                return;
            }
        }
    }

    public static class IntSumReducer extends
        Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }

        result.set(sum);
        context.write(key, result);

    }
}
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args)
            .getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: Problem1 <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Problem1");

    job.setJarByClass(Problem1.class);
    job.setMapperClass(SOWordCountMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);

}
}

I get the following error when trying to run the program:

java.lang.Exception: java.lang.ArrayIndexOutOfBoundsException: 1
        at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
        at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:522)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 1
        at Problem1$SOWordCountMapper.map(Problem1.java:49)
        at Problem1$SOWordCountMapper.map(Problem1.java:24)
        at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
        at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
        at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)

s3fp2yjn (answer 1):

The program is breaking at String text = parsed[1].replace("\"", ""); because parsed[1] does not exist.
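A defensive guard before indexing (my suggestion, not in the original code) lets the mapper skip malformed rows instead of crashing:

    // Skip any line that doesn't split into at least two fields,
    // instead of letting parsed[1] throw ArrayIndexOutOfBoundsException.
    if (parsed.length < 2) {
        return;
    }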
The problem is the line String parsed[] = line.split("\" , "); I think that split() argument is incorrect (there are three double quotes inside the parentheses). If you want to split on a comma:
String parsed[] = line.split(Pattern.quote(","), -1);

If you want to split on a double quote:

String parsed[] = line.split(Pattern.quote("\""), -1);

For the Pattern class, add import java.util.regex.Pattern. Pattern.quote() makes split() treat its argument as a literal string rather than a regular expression, which avoids exactly this kind of escaping mistake.
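Putting the pieces together, here is a minimal sketch of how the map() method could parse and count (my own illustration, not from the asker's code). It assumes the columns are Handle,Tweet,Favs,RTs,Latitude,Longitude as in the header row, reuses the class's txt and ONE fields, and adds the word-emitting loop, since the original mapper never writes anything to the context:

    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String tweetHandle = context.getConfiguration().get("tweetHandle");

        // Split on literal commas; -1 keeps trailing empty fields.
        String[] parsed = value.toString().split(Pattern.quote(","), -1);

        // With a comma split, the header row's first field is just "Handle".
        // Also skip any line with fewer than two fields, which is what
        // caused the ArrayIndexOutOfBoundsException.
        if (parsed.length < 2 || parsed[0].equals("Handle")) {
            return;
        }

        String handle = parsed[0].replace("\"", "").toLowerCase();
        if (tweetHandle == null || !handle.equals(tweetHandle.toLowerCase())) {
            return; // not the user we are counting
        }

        // Emit (word, 1) for every token in the tweet text.
        String text = parsed[1].replace("\"", "");
        StringTokenizer tokenizer = new StringTokenizer(text);
        while (tokenizer.hasMoreTokens()) {
            txt.set(tokenizer.nextToken());
            context.write(txt, ONE);
        }
    }

Note that tweet text can itself contain commas, in which case the tweet would be split across several fields; joining parsed[1] onward, or using a real CSV library such as OpenCSV, would handle that case more robustly.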
