HCatalog integration with Oozie in MapReduce and MapR

Asked by imzjd6km on 2021-05-29, in Hadoop

I have written a MapReduce program that reads data from a Hive table using HCatalog and writes it to HBase. It is a map-only job with no reducer. I have run the program from the command line and it works as expected (I built a fat jar to avoid jar dependency issues). Now I want to integrate it with Oozie (with the help of Hue). I have two options for running it:
Using a mapreduce action
Using a java action
My MapReduce program has a driver method, which contains the following code:

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;

public class HBaseValdiateInsertDriver {

public static void main(String[] args) throws Exception {

    String dbName = "Test";
    String tableName = "emp";
    Configuration conf = new Configuration();

    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = new Job(conf, "HBase Get Put Demo");
    // Read input rows from the Hive table via HCatalog
    job.setInputFormatClass(HCatInputFormat.class);
    HCatInputFormat.setInput(job, dbName, tableName, null);

    job.setJarByClass(HBaseValdiateInsertDriver.class);

    job.setMapperClass(HBaseValdiateInsert.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Map-only job: the mapper writes to HBase directly, so no reducer is needed
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path("maprfs:///user/input"));
    FileOutputFormat.setOutputPath(job, new Path("maprfs:///user/output"));

    job.waitForCompletion(true);

    }
}

How do I specify the driver method in Oozie? All I can see is a way to specify the mapper and reducer classes. Can anyone guide me on how to set the properties for this?
With a java action I can specify my driver class as the main class and execute it, but I run into errors such as table not found, HCatalog jars not found, and so on. I include hive-site.xml in the workflow (using Hue), but it seems the system does not pick up those properties. Can anyone tell me what I am missing? Are there any other configuration properties I need to include?
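For reference, a minimal sketch of what such a java action could look like, assuming hive-site.xml sits in the workflow directory and the HCatalog/Hive jars come from the Oozie share lib (the action name, the sharelib value and the file placement are assumptions, not taken from the original post):

<action name="hbase-validate-insert">
    <java>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <!-- merge hive-site.xml from the workflow directory into the action configuration -->
        <job-xml>hive-site.xml</job-xml>
        <configuration>
            <!-- assumption: pull the HCatalog/Hive jars from the Oozie share lib -->
            <property>
                <name>oozie.action.sharelib.for.java</name>
                <value>hcatalog,hive</value>
            </property>
        </configuration>
        <main-class>HBaseValdiateInsertDriver</main-class>
        <!-- also ship hive-site.xml so it is available to the launcher -->
        <file>hive-site.xml</file>
    </java>
    <ok to="end"/>
    <error to="kill"/>
</action>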
The sample program I referenced on the Cloudera website uses:

HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
                inputTableName, null));

whereas I use the call below (I do not see a method that accepts the input above):

HCatInputFormat.setInput(job, dbName, tableName, null);

Below is my mapper code:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;

public class HBaseValdiateInsert extends Mapper<WritableComparable, HCatRecord, Text, Text> {

    static HTableInterface table;
    static HTableInterface inserted;
    private String hbaseDate = null;
    String existigValue=null;
    List<Put> putList = new ArrayList<Put>();

    @Override
    public void setup(Context context) throws IOException {

                Configuration conf = context.getConfiguration();
                String tablename = "dev_arch186";
        // Open the HBase connection via the Utils helper and disable auto-flush
        // so puts are buffered on the client side
        Utils.getHBConnection();
        table = Utils.getTable(tablename);
        table.setAutoFlushTo(false);
    }
    @Override
    public void cleanup(Context context) {
        try {
            table.put(putList);
            table.flushCommits();
            table.close();
        } catch (IOException e) {

            e.printStackTrace();
        }
        Utils.closeConnection();
    }

    @Override
    public void map(WritableComparable key, HCatRecord value, Context context) throws IOException, InterruptedException {

                String name_hive = (String) value.get(0);
                String id_hive = (String) value.get(1);

        // (name, id) pair read from the HCatRecord
        String rec[] = { name_hive, id_hive };
        Get g = new Get(Bytes.toBytes(name_hive));

        existigValue=getOneRecord(Bytes.toBytes("Info"),Bytes.toBytes("name"),name_hive);
        if (existigValue.equalsIgnoreCase("NA") || !existigValue.equalsIgnoreCase(id_hive)) {
            Put put = new Put(Bytes.toBytes(rec[0]));
            put.add(Bytes.toBytes("Info"),
                    Bytes.toBytes("name"),
                    Bytes.toBytes(rec[1]));
            put.setDurability(Durability.SKIP_WAL);
            putList.add(put);
            if(putList.size()>25000){
                table.put(putList);
                table.flushCommits();
                // Clear the buffer after flushing so the same puts are not re-sent
                putList.clear();
            }
        }

    }

    public String getOneRecord(byte[] columnFamily, byte[] columnQualifier, String rowKey)
            throws IOException {
        Get get = new Get(rowKey.getBytes());
        get.setMaxVersions(1);
        Result rs = table.get(get);
        rs.getColumn(columnFamily, columnQualifier);
        System.out.println(rs.containsColumn(columnFamily, columnQualifier));
        KeyValue result = rs.getColumnLatest(columnFamily,columnQualifier);

        if (rs.containsColumn(columnFamily, columnQualifier))
            return (Bytes.toString(result.getValue()));
        else
            return "NA";
    }

    public boolean columnQualifierExists(String tableName, String ColumnFamily,
            String ColumnQualifier, String rowKey) throws IOException  {
        Get get = new Get(rowKey.getBytes());
        Result rs = table.get(get);
        return(rs.containsColumn(ColumnFamily.getBytes(),ColumnQualifier.getBytes()));
    }

}

Note: I am using a MapR (M3) cluster with Hue as the interface to Oozie. Hive version: 1.0, HCat version: 1.0.

Answer (o2rvlv0m):

I could not find any way to initialize HCatInputFormat from an Oozie mapreduce action, but I have a workaround, described below.
I created a LazyHCatInputFormat by extending HCatInputFormat and overrode the getJobInfo method to handle the initialization. It is invoked as part of the getSplits(..) call.

private static void lazyInit(Configuration conf){
    try{

        if(conf==null){
            conf = new Configuration(false);
        }
        // Pull in the Oozie action configuration and the hive-config.xml shipped with the workflow
        conf.addResource(new Path(System.getProperty("oozie.action.conf.xml")));
        conf.addResource(new org.apache.hadoop.fs.Path("hive-config.xml"));

        String databaseName = conf.get("LazyHCatInputFormat.databaseName");
        String tableName = conf.get("LazyHCatInputFormat.tableName");
        String partitionFilter = conf.get("LazyHCatInputFormat.partitionFilter");

        setInput(conf, databaseName, tableName);
        //setFilter(partitionFilter);

        //System.out.println("After lazyinit : "+conf.get("mapreduce.lib.hcat.job.info"));
    }catch(Exception e){
        System.out.println("***LAZY INIT FAILED***");
        //e.printStackTrace();
    }
}

public static InputJobInfo getJobInfo(Configuration conf)
        throws IOException {
    // HCatInputFormat.setInput(..) stores the serialized job info under this key;
    // if it is missing, the input was never initialized, so fall back to lazyInit(..)
    String jobString = conf.get("mapreduce.lib.hcat.job.info");
    if (jobString == null) {
        lazyInit(conf);
        jobString = conf.get("mapreduce.lib.hcat.job.info");
        if(jobString == null){
            throw new IOException("job information not found in JobContext. HCatInputFormat.setInput() not called?");   
        }
    }
    return (InputJobInfo) HCatUtil.deserialize(jobString);
}
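
For completeness, a sketch of how the two methods above could be packaged (the package name com.xyz matches the class name used in the action configuration below; the imports and class skeleton are assumptions, since the answer only shows the method bodies):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.InputJobInfo;

public class LazyHCatInputFormat extends HCatInputFormat {

    // lazyInit(Configuration) and getJobInfo(Configuration) from the answer go here.
    // getSplits(..) consults "mapreduce.lib.hcat.job.info", so the lazy
    // initialization runs before splits are computed when Oozie itself never
    // called HCatInputFormat.setInput().
}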

In the Oozie map-reduce action, configure the following properties:

<property>
    <name>mapreduce.job.inputformat.class</name>
    <value>com.xyz.LazyHCatInputFormat</value>
</property>
<property>
    <name>LazyHCatInputFormat.databaseName</name>
    <value>HCAT DatabaseNameHere</value>
</property>
<property>
    <name>LazyHCatInputFormat.tableName</name>
    <value>HCAT TableNameHere</value>
</property>

This may not be the best implementation, but it is a quick hack to get things working.
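
Putting it together, the properties above could sit in a mapreduce action configuration roughly like this (the new-API flags, the mapper class and the Test/emp values are assumptions based on the question's driver, not part of the original answer):

<configuration>
    <!-- assumption: the job uses the new org.apache.hadoop.mapreduce API -->
    <property>
        <name>mapred.mapper.new-api</name>
        <value>true</value>
    </property>
    <property>
        <name>mapreduce.job.map.class</name>
        <value>HBaseValdiateInsert</value>
    </property>
    <!-- map-only job -->
    <property>
        <name>mapreduce.job.reduces</name>
        <value>0</value>
    </property>
    <property>
        <name>mapreduce.job.inputformat.class</name>
        <value>com.xyz.LazyHCatInputFormat</value>
    </property>
    <!-- database and table names taken from the question's driver -->
    <property>
        <name>LazyHCatInputFormat.databaseName</name>
        <value>Test</value>
    </property>
    <property>
        <name>LazyHCatInputFormat.tableName</name>
        <value>emp</value>
    </property>
</configuration>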
