我正在尝试用hadoop编写这个map reduce程序,它统计任何特定用户从twitter垃圾堆发tweet的字数。下面是我的代码:我相信在解析csv文件时,我的Map器中有一个错误,但是我不确定如何去解决它。有人能提供一些见解吗?
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.commons.lang.StringEscapeUtils;
import java.util.HashMap;
public class Problem1{
public static class SOWordCountMapper extends
Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable ONE = new IntWritable(1);
private Text txt = new Text();
private Text user = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String tweetHandle = conf.get("tweetHandle");
String line = value.toString();
String parsed[] = line.split("\" , ");
String skip[] = new String[1];
skip[0] = "Handle,Tweet,Favs,RTs,Latitude,Longitude";
if(parsed[0].equals(skip[0])){
return;
}
String handle = parsed[0].replace("\"","").toLowerCase();
String text = parsed[1].replace("\"", "");
if(user == null || txt == null || !user.equals(tweetHandle)){
return;
}
}
}
public static class IntSumReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: Problem1 <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Problem1");
job.setJarByClass(Problem1.class);
job.setMapperClass(SOWordCountMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
尝试运行程序时出现以下错误:
java.lang.Exception: java.lang.ArrayIndexOutOfBoundsException: 1
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:522)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 1
at Problem1$SOWordCountMapper.map(Problem1.java:49)
at Problem1$SOWordCountMapper.map(Problem1.java:24)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
1条答案
按热度按时间s3fp2yjn1#
程序正在中断
String text = parsed[1].replace("\"", "");
; 因为parsed[1]
不存在。问题在于代码
String parsed[] = line.split("\" , ");
我认为split()
不正确(括号中有3个双引号)。如果你想分开","
```String parsed[] = line.split(Pattern.quote(","), -1);
为了
Pattern
类,使用import java.util.regex.Pattern
.