I need to run two MapReduce jobs so that the second job takes the output of the first job as its input. I want to do this in a single invocation, where MyClass extends Configured and implements Tool.
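In other words, I want a single driver along these lines (a stripped-down sketch of the pattern only, not my actual code; TwoJobChain and the paths are placeholders):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Minimal sketch: two jobs in one Tool.run(), where job 2 reads exactly
// what job 1 wrote. Uses identity map/reduce so it runs as-is.
public class TwoJobChain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Path in = new Path(args[0]);
        Path mid = new Path(args[1], "job1out"); // intermediate directory
        Path out = new Path(args[1], "job2out");

        Job job1 = new Job(conf, "job 1");
        job1.setJarByClass(TwoJobChain.class);
        FileInputFormat.setInputPaths(job1, in);
        FileOutputFormat.setOutputPath(job1, mid);
        if (!job1.waitForCompletion(true)) {
            return 1;
        }

        // third-party code (e.g. an HBase bulk load) would run here

        Job job2 = new Job(conf, "job 2");
        job2.setJarByClass(TwoJobChain.class);
        FileInputFormat.setInputPaths(job2, mid); // job 1's output is job 2's input
        FileOutputFormat.setOutputPath(job2, out);
        return job2.waitForCompletion(true) ? 0 : 2;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new TwoJobChain(), args));
    }
}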
I have written the code, and it works as long as I don't run both jobs in the same invocation. This works:
hadoop jar myjar.jar path.to.my.class.MyClass -i input -o output -m job1
hadoop jar myjar.jar path.to.my.class.MyClass -i dummy -o output -m job2
but this does not:
hadoop jar myjar.jar path.to.my.class.MyClass -i input -o output -m all
(-m stands for "mode")
In that case, the output of the first job is never passed to the second job's mapper (I established this by debugging), and I can't figure out why.
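One way to sanity-check the hand-off (sketched below with a placeholder path) is to list whatever is sitting in the intermediate directory right before job 2 is submitted:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DumpDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // e.g. /path/job1out
        FileSystem fs = FileSystem.get(dir.toUri(), conf);
        // print every entry the next job's FileInputFormat would see
        for (FileStatus st : fs.listStatus(dir)) {
            System.out.println(st.getPath() + "\t" + st.getLen() + " bytes"
                    + (st.isDir() ? " (dir)" : ""));
        }
    }
}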
I have seen other posts about chaining jobs, but they target the "old" mapred API. I need to run third-party code between the jobs, so I don't know whether ChainMapper/ChainReducer would work for my use case.
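As far as I can tell, ChainMapper (old mapred API) composes mappers inside a single job's map phase, so there is no point between them where external code could run. A minimal sketch of that wiring, with made-up UpperMapper/TrimMapper classes:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.ChainMapper;

public class ChainExample {
    // two trivial old-API mappers, just to show the wiring
    public static class UpperMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable k, Text v, OutputCollector<Text, Text> out,
                Reporter r) throws IOException {
            out.collect(new Text("k"), new Text(v.toString().toUpperCase()));
        }
    }

    public static class TrimMapper extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        public void map(Text k, Text v, OutputCollector<Text, Text> out,
                Reporter r) throws IOException {
            out.collect(k, new Text(v.toString().trim()));
        }
    }

    public static void configure(JobConf job) {
        // both mappers execute inside ONE job's map tasks, back to back;
        // there is no hook between them for external code
        ChainMapper.addMapper(job, UpperMapper.class,
                LongWritable.class, Text.class, Text.class, Text.class,
                true, new JobConf(false));
        ChainMapper.addMapper(job, TrimMapper.class,
                Text.class, Text.class, Text.class, Text.class,
                true, new JobConf(false));
    }
}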
I am using Hadoop 1.0.3 on the AWS Elastic MapReduce distribution.
Code:
import java.io.IOException;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyClass extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MyClass(), args);
        System.exit(res);
    }
    @Override
    public int run(String[] args) throws Exception {
        CommandLineParser parser = new BasicParser();
        Options allOptions = setupOptions();
        Configuration conf = getConf();
        String[] argv_ = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmdLine = parser.parse(allOptions, argv_);
        boolean doJob1 = true;
        boolean doJob2 = true;
        if (cmdLine.hasOption('m')) {
            String mode = cmdLine.getOptionValue('m');
            if ("job1".equals(mode)) {
                doJob2 = false;
            } else if ("job2".equals(mode)) {
                doJob1 = false;
            }
        }
        Path outPath = new Path(cmdLine.getOptionValue("output"), "job1out");
        Job job = new Job(conf, "HBase Prep For Data Build");
        Job job2 = new Job(conf, "HBase SessionIndex load");
        if (doJob1) {
            conf = job.getConfiguration();
            String[] values = cmdLine.getOptionValues("input");
            if (values != null && values.length > 0) {
                for (String input : values) {
                    System.out.println("input:" + input);
                    FileInputFormat.addInputPaths(job, input);
                }
            }
            job.setJarByClass(MyClass.class);
            job.setMapperClass(SessionMapper.class);
            MultipleOutputs.setCountersEnabled(job, false);
            MultipleOutputs.addNamedOutput(job, "sessionindex", TextOutputFormat.class, Text.class, Text.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            job.setOutputFormatClass(HFileOutputFormat.class);
            HTable hTable = new HTable(conf, "session");
            // Auto configure partitioner and reducer
            HFileOutputFormat.configureIncrementalLoad(job, hTable);
            FileOutputFormat.setOutputPath(job, outPath);
            if (!job.waitForCompletion(true)) {
                return 1;
            }
            // Load generated HFiles into table
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(outPath, hTable);
            FileSystem fs = FileSystem.get(outPath.toUri(), conf);
            // I delete this because after the HBase bulk load it is left as an
            // empty directory, which causes problems later
            fs.delete(new Path(outPath, "cf"), true);
        }

        /////////////////////////////////////////////
        //                SECOND JOB                //
        /////////////////////////////////////////////

        if (doJob2) {
            conf = job2.getConfiguration();
            System.out.println("-- job 2 input path : " + outPath.toString());
            FileInputFormat.setInputPaths(job2, outPath.toString());
            job2.setJarByClass(MyClass.class);
            job2.setMapperClass(SessionIndexMapper.class);
            MultipleOutputs.setCountersEnabled(job2, false);
            job2.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job2.setMapOutputValueClass(KeyValue.class);
            job2.setOutputFormatClass(HFileOutputFormat.class);
            HTable hTable = new HTable(conf, "session_index_by_hour");
            // Auto configure partitioner and reducer
            HFileOutputFormat.configureIncrementalLoad(job2, hTable);
            outPath = new Path(cmdLine.getOptionValue("output"), "job2out");
            System.out.println("-- job 2 output path: " + outPath.toString());
            FileOutputFormat.setOutputPath(job2, outPath);
            if (!job2.waitForCompletion(true)) {
                return 2;
            }
            // Load generated HFiles into table
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(outPath, hTable);
        }
        return 0;
    }
    public static class SessionMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        private MultipleOutputs<ImmutableBytesWritable, KeyValue> multiOut;

        @Override
        public void setup(Context context) throws IOException {
            multiOut = new MultipleOutputs<ImmutableBytesWritable, KeyValue>(context);
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            ...
            context.write(..., ...); // this is called multiple times
            multiOut.write("sessionindex", new Text(...), new Text(...), "sessionindex");
        }

        @Override
        public void cleanup(Context context) throws IOException, InterruptedException {
            // close MultipleOutputs so its writers are flushed and closed;
            // without this the named "sessionindex" output can be incomplete
            multiOut.close();
        }
    }
    public static class SessionIndexMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new ImmutableBytesWritable(...), new KeyValue(...));
        }
    }
    private static Options setupOptions() {
        Option input = createOption("i", "input",
                "input file(s) for the Map step", "path", Integer.MAX_VALUE, true);
        Option output = createOption("o", "output",
                "output directory for the Reduce step", "path", 1, true);
        Option mode = createOption("m", "mode",
                "what mode ('all', 'job1', 'job2')", "-mode-", 1, false);
        return new Options().addOption(input).addOption(output).addOption(mode);
    }

    public static Option createOption(String name, String longOpt, String desc,
            String argName, int max, boolean required) {
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArgs(max);
        OptionBuilder.withDescription(desc);
        OptionBuilder.isRequired(required);
        OptionBuilder.withLongOpt(longOpt);
        return OptionBuilder.create(name);
    }
}
Output (single invocation):
input:s3n://...snip...
13/12/09 23:08:43 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/12/09 23:08:43 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
13/12/09 23:08:43 INFO compress.CodecPool: Got brand-new compressor
13/12/09 23:08:43 INFO mapred.JobClient: Default number of map tasks: null
13/12/09 23:08:43 INFO mapred.JobClient: Setting default number of map tasks based on cluster size to : 2
13/12/09 23:08:43 INFO mapred.JobClient: Default number of reduce tasks: 1
13/12/09 23:08:43 INFO security.ShellBasedUnixGroupsMapping: add hadoop to shell userGroupsCache
13/12/09 23:08:43 INFO mapred.JobClient: Setting group to hadoop
13/12/09 23:08:43 INFO input.FileInputFormat: Total input paths to process : 1
13/12/09 23:08:43 INFO lzo.GPLNativeCodeLoader: Loaded native gpl library
13/12/09 23:08:43 WARN lzo.LzoCodec: Could not find build properties file with revision hash
13/12/09 23:08:43 INFO lzo.LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev UNKNOWN]
13/12/09 23:08:43 WARN snappy.LoadSnappy: Snappy native library is available
13/12/09 23:08:43 INFO snappy.LoadSnappy: Snappy native library loaded
13/12/09 23:08:44 INFO mapred.JobClient: Running job: job_201312062235_0044
13/12/09 23:08:45 INFO mapred.JobClient: map 0% reduce 0%
13/12/09 23:09:09 INFO mapred.JobClient: map 100% reduce 0%
13/12/09 23:09:27 INFO mapred.JobClient: map 100% reduce 100%
13/12/09 23:09:32 INFO mapred.JobClient: Job complete: job_201312062235_0044
13/12/09 23:09:32 INFO mapred.JobClient: Counters: 42
13/12/09 23:09:32 INFO mapred.JobClient: MyCounter1
13/12/09 23:09:32 INFO mapred.JobClient: ValidCurrentDay=3526
13/12/09 23:09:32 INFO mapred.JobClient: Job Counters
13/12/09 23:09:32 INFO mapred.JobClient: Launched reduce tasks=1
13/12/09 23:09:32 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=19693
13/12/09 23:09:32 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
13/12/09 23:09:32 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
13/12/09 23:09:32 INFO mapred.JobClient: Rack-local map tasks=1
13/12/09 23:09:32 INFO mapred.JobClient: Launched map tasks=1
13/12/09 23:09:32 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=15201
13/12/09 23:09:32 INFO mapred.JobClient: File Output Format Counters
13/12/09 23:09:32 INFO mapred.JobClient: Bytes Written=1979245
13/12/09 23:09:32 INFO mapred.JobClient: FileSystemCounters
13/12/09 23:09:32 INFO mapred.JobClient: S3N_BYTES_READ=51212
13/12/09 23:09:32 INFO mapred.JobClient: FILE_BYTES_READ=400417
13/12/09 23:09:32 INFO mapred.JobClient: HDFS_BYTES_READ=231
13/12/09 23:09:32 INFO mapred.JobClient: FILE_BYTES_WRITTEN=859881
13/12/09 23:09:32 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=2181624
13/12/09 23:09:32 INFO mapred.JobClient: File Input Format Counters
13/12/09 23:09:32 INFO mapred.JobClient: Bytes Read=51212
13/12/09 23:09:32 INFO mapred.JobClient: MyCounter2
13/12/09 23:09:32 INFO mapred.JobClient: ASCII=3526
13/12/09 23:09:32 INFO mapred.JobClient: StatsUnaggregatedMapEventTypeCurrentDay
13/12/09 23:09:32 INFO mapred.JobClient: adProgress0=343
13/12/09 23:09:32 INFO mapred.JobClient: asset=562
13/12/09 23:09:32 INFO mapred.JobClient: podComplete=612
13/12/09 23:09:32 INFO mapred.JobClient: adProgress100=247
13/12/09 23:09:32 INFO mapred.JobClient: adProgress25=247
13/12/09 23:09:32 INFO mapred.JobClient: click=164
13/12/09 23:09:32 INFO mapred.JobClient: adProgress50=247
13/12/09 23:09:32 INFO mapred.JobClient: adCall=244
13/12/09 23:09:32 INFO mapred.JobClient: adProgress75=247
13/12/09 23:09:32 INFO mapred.JobClient: podStart=613
13/12/09 23:09:32 INFO mapred.JobClient: Map-Reduce Framework
13/12/09 23:09:32 INFO mapred.JobClient: Map output materialized bytes=400260
13/12/09 23:09:32 INFO mapred.JobClient: Map input records=3526
13/12/09 23:09:32 INFO mapred.JobClient: Reduce shuffle bytes=400260
13/12/09 23:09:32 INFO mapred.JobClient: Spilled Records=14104
13/12/09 23:09:32 INFO mapred.JobClient: Map output bytes=2343990
13/12/09 23:09:32 INFO mapred.JobClient: Total committed heap usage (bytes)=497549312
13/12/09 23:09:32 INFO mapred.JobClient: CPU time spent (ms)=10120
13/12/09 23:09:32 INFO mapred.JobClient: Combine input records=0
13/12/09 23:09:32 INFO mapred.JobClient: SPLIT_RAW_BYTES=231
13/12/09 23:09:32 INFO mapred.JobClient: Reduce input records=7052
13/12/09 23:09:32 INFO mapred.JobClient: Reduce input groups=246
13/12/09 23:09:32 INFO mapred.JobClient: Combine output records=0
13/12/09 23:09:32 INFO mapred.JobClient: Physical memory (bytes) snapshot=519942144
13/12/09 23:09:32 INFO mapred.JobClient: Reduce output records=7052
13/12/09 23:09:32 INFO mapred.JobClient: Virtual memory (bytes) snapshot=3076526080
13/12/09 23:09:32 INFO mapred.JobClient: Map output records=7052
13/12/09 23:09:32 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job1out/_SUCCESS
13/12/09 23:09:32 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job1out/sessionindex-m-00000
1091740526
-- job 2 input path : /path/job1out
-- job 2 output path: /path/job2out
13/12/09 23:09:32 INFO mapred.JobClient: Default number of map tasks: null
13/12/09 23:09:32 INFO mapred.JobClient: Setting default number of map tasks based on cluster size to : 2
13/12/09 23:09:32 INFO mapred.JobClient: Default number of reduce tasks: 1
13/12/09 23:09:33 INFO mapred.JobClient: Setting group to hadoop
13/12/09 23:09:33 INFO input.FileInputFormat: Total input paths to process : 1
13/12/09 23:09:33 INFO mapred.JobClient: Running job: job_201312062235_0045
13/12/09 23:09:34 INFO mapred.JobClient: map 0% reduce 0%
13/12/09 23:09:51 INFO mapred.JobClient: map 100% reduce 0%
13/12/09 23:10:03 INFO mapred.JobClient: map 100% reduce 33%
13/12/09 23:10:06 INFO mapred.JobClient: map 100% reduce 100%
13/12/09 23:10:11 INFO mapred.JobClient: Job complete: job_201312062235_0045
13/12/09 23:10:11 INFO mapred.JobClient: Counters: 27
13/12/09 23:10:11 INFO mapred.JobClient: Job Counters
13/12/09 23:10:11 INFO mapred.JobClient: Launched reduce tasks=1
13/12/09 23:10:11 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=13533
13/12/09 23:10:11 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
13/12/09 23:10:11 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
13/12/09 23:10:11 INFO mapred.JobClient: Launched map tasks=1
13/12/09 23:10:11 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=12176
13/12/09 23:10:11 INFO mapred.JobClient: File Output Format Counters
13/12/09 23:10:11 INFO mapred.JobClient: Bytes Written=0
13/12/09 23:10:11 INFO mapred.JobClient: FileSystemCounters
13/12/09 23:10:11 INFO mapred.JobClient: FILE_BYTES_READ=173
13/12/09 23:10:11 INFO mapred.JobClient: HDFS_BYTES_READ=134
13/12/09 23:10:11 INFO mapred.JobClient: FILE_BYTES_WRITTEN=57735
13/12/09 23:10:11 INFO mapred.JobClient: File Input Format Counters
13/12/09 23:10:11 INFO mapred.JobClient: Bytes Read=0
13/12/09 23:10:11 INFO mapred.JobClient: Map-Reduce Framework
13/12/09 23:10:11 INFO mapred.JobClient: Map output materialized bytes=16
13/12/09 23:10:11 INFO mapred.JobClient: Map input records=0
13/12/09 23:10:11 INFO mapred.JobClient: Reduce shuffle bytes=16
13/12/09 23:10:11 INFO mapred.JobClient: Spilled Records=0
13/12/09 23:10:11 INFO mapred.JobClient: Map output bytes=0
13/12/09 23:10:11 INFO mapred.JobClient: Total committed heap usage (bytes)=434634752
13/12/09 23:10:11 INFO mapred.JobClient: CPU time spent (ms)=2270
13/12/09 23:10:11 INFO mapred.JobClient: Combine input records=0
13/12/09 23:10:11 INFO mapred.JobClient: SPLIT_RAW_BYTES=134
13/12/09 23:10:11 INFO mapred.JobClient: Reduce input records=0
13/12/09 23:10:11 INFO mapred.JobClient: Reduce input groups=0
13/12/09 23:10:11 INFO mapred.JobClient: Combine output records=0
13/12/09 23:10:11 INFO mapred.JobClient: Physical memory (bytes) snapshot=423612416
13/12/09 23:10:11 INFO mapred.JobClient: Reduce output records=0
13/12/09 23:10:11 INFO mapred.JobClient: Virtual memory (bytes) snapshot=3058089984
13/12/09 23:10:11 INFO mapred.JobClient: Map output records=0
13/12/09 23:10:11 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job2out/_SUCCESS
13/12/09 23:10:11 WARN mapreduce.LoadIncrementalHFiles: Bulk load operation did not find any files to load in directory /path/job2out. Does it contain files in subdirectories that correspond to column family names?