Part One: Importing data into HBase

1. Configure hbase-site.xml to point to HDFS

<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase</value>
    </property>
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper</value>
    </property>
    <property>
        <name>hbase.unsafe.stream.capability.enforce</name>
        <value>false</value>
        <description>
            Controls whether HBase will check for stream capabilities (hflush/hsync).
            Disable this if you intend to run on LocalFileSystem, denoted by a rootdir
            with the 'file://' scheme, but be mindful of the NOTE below.
            WARNING: Setting this to false blinds you to potential data loss and
            inconsistent system state in the event of process and/or node failures. If
            HBase is complaining of an inability to use hsync or hflush it's most
            likely not a false positive.
        </description>
    </property>
</configuration>
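
HBaseConfiguration.create() reads hbase-site.xml from the classpath, so a client only needs this file on its classpath to find the cluster. A minimal, hedged connectivity check (not part of the import job) could look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class HBaseConnectionCheck {
    public static void main(String[] args) throws Exception {
        // hbase-site.xml is picked up from the classpath
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Listing the tables proves the client can reach ZooKeeper and the master
            for (TableName name : admin.listTableNames()) {
                System.out.println(name.getNameAsString());
            }
        }
    }
}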

2. Dependencies

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.0.4</version>
        </dependency>

3. Mapper

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Input: plain text lines; output: row-key bytes as the key, an HBase Mutation as the value
public class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {

    // Counter
    public enum Counters {
        LINES
    }

    private byte[] family = null;
    private byte[] qualifier = null;

    /**
     * Called once at the beginning of the task.
     *
     * @param context
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the column family/qualifier from the configuration; it is supplied on the
        // command line and passed through the job configuration
        String column = context.getConfiguration().get("conf.column");
        ColParser parser = new ColParser();
        parser.parse(column);
        if (!parser.isValid()) throw new IOException("family or qualifier error");
        family = parser.getFamily();
        qualifier = parser.getQualifier();
    }

    /**
     * Called once for each key/value pair in the input split. Most applications
     * should override this, but the default is the identity function.
     *
     * @param key
     * @param value
     * @param context
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        try {
            String line = value.toString();
            // Hash each line to build the row key; adjust to your own key design
            byte[] rowKey = DigestUtils.md5(line);
            Put put = new Put(rowKey);
            put.addColumn(this.family, this.qualifier, Bytes.toBytes(line));
            context.write(new ImmutableBytesWritable(rowKey), put);
            context.getCounter(Counters.LINES).increment(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    class ColParser {
        private byte[] family;
        private byte[] qualifier;
        private boolean valid;

        public byte[] getFamily() {
            return family;
        }

        public byte[] getQualifier() {
            return qualifier;
        }

        public boolean isValid() {
            return valid;
        }

        public void parse(String value) {
            try {
                String[] sValue = value.split(":");
                if (sValue == null || sValue.length < 2 || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                    valid = false;
                    return;
                }
                family = Bytes.toBytes(sValue[0]);
                qualifier = Bytes.toBytes(sValue[1]);
                valid = true;
            } catch (Exception e) {
                valid = false;
            }
        }
    }
}
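
For reference, a quick sketch of what ColParser does with the -c argument (assuming the "data:json" value used later in this post, and a caller in the same package as ImportMapper):

// Hypothetical check of the family:qualifier parsing
ImportMapper.ColParser parser = new ImportMapper().new ColParser();
parser.parse("data:json");
// parser.getFamily()    -> Bytes.toBytes("data")
// parser.getQualifier() -> Bytes.toBytes("json")
// parser.isValid()      -> true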

4. Main class

import org.apache.commons.cli.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

public class ImportFromFile {
    // private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    public static final String NAME = "ImportFromFile";

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();

        Option option = new Option("t", "table", true, "table name must not be empty");
        option.setArgName("table-name");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("c", "column", true, "column family and qualifier must not be empty");
        option.setArgName("family:qualifier");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("i", "input", true, "input file or directory");
        option.setArgName("path-in-HDFS");
        option.setRequired(true);
        options.addOption(option);

        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String input = cmd.getOptionValue("i");
        String column = cmd.getOptionValue("c");
        // Stored in the configuration here and read back in the mapper's setup()
        conf.set("conf.column", column);

        Job job = Job.getInstance(conf, "Import from file " + input + " into table " + table);
        job.setJarByClass(ImportFromFile.class);
        job.setMapperClass(ImportMapper.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Writable.class);
        job.setNumReduceTasks(0); // map-only job, no reduce phase needed
        FileInputFormat.addInputPath(job, new Path(input));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
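
The LINES counter shows up in the job summary once the job finishes. If you want to read it programmatically, a hedged sketch that would replace the final System.exit(...) call in main above:

// Sketch: read the custom counter after the job has completed
boolean success = job.waitForCompletion(true);
long lines = job.getCounters()
        .findCounter(ImportMapper.Counters.LINES)
        .getValue();
System.out.println("Imported lines: " + lines);
System.exit(success ? 0 : 1);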

5. Run

First create the table in the HBase shell:

create 'importTable','data'

Then upload the jar and run it:

hadoop jar ImportFromFile.jar -t importTable -i /input/test-data.txt -c data:json
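
To spot-check the import afterwards, a small client-side scan like the sketch below can be used (not part of the job; the table and column names match the command above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyImport {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            // Scan a handful of rows to confirm the data:json column was written
            Scan scan = new Scan();
            scan.addColumn(Bytes.toBytes("data"), Bytes.toBytes("json"));
            scan.setLimit(5);
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    System.out.println(Bytes.toStringBinary(result.getRow()) + " => " +
                            Bytes.toString(result.getValue(Bytes.toBytes("data"), Bytes.toBytes("json"))));
                }
            }
        }
    }
}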

Part Two: Reading data out of HBase for computation

Pull the data written to HBase in the previous example back out and count how many records each author has.

Add one extra dependency:

        <dependency>
            <groupId>com.googlecode.json-simple</groupId>
            <artifactId>json-simple</artifactId>
            <version>1.1.1</version>
        </dependency>
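
The stored values are JSON documents, and the mapper below pulls the author field out of each one with json-simple. Roughly, that parsing step works like this (the record content here is only a made-up example):

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class JsonParseDemo {
    public static void main(String[] args) throws Exception {
        JSONParser parser = new JSONParser();
        // Hypothetical record; the real rows come from the data:json column
        String val = "{\"author\":\"alice\",\"title\":\"some post\"}";
        JSONObject json = (JSONObject) parser.parse(val);
        System.out.println((String) json.get("author")); // -> alice
    }
}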

1. Mapper

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class AnalyzeMapper extends TableMapper<Text, IntWritable> {
    private JSONParser parser = new JSONParser();

    public enum Counters { ROWS, COLS, ERROR, VALID }

    private IntWritable ONE = new IntWritable(1);

    /**
     * Called once for each key/value pair in the input split. Most applications
     * should override this, but the default is the identity function.
     *
     * @param key
     * @param value
     * @param context
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        context.getCounter(Counters.ROWS).increment(1);
        String val = null;
        try {
            for (Cell cell : value.listCells()) {
                context.getCounter(Counters.COLS).increment(1);
                val = Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
                JSONObject json = (JSONObject) parser.parse(val);
                String author = (String) json.get("author");
                if (context.getConfiguration().get("conf.debug") != null)
                    System.out.println("Author: " + author);
                context.write(new Text(author), ONE);
                context.getCounter(Counters.VALID).increment(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("Row: " + Bytes.toStringBinary(key.get()) +
                    ", JSON: " + val);
            context.getCounter(Counters.ERROR).increment(1);
        }
    }
}

2. Reducer

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class AnalyzeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * This method is called once for each key. Most applications will define
     * their reduce class by overriding this method. The default implementation
     * is an identity function.
     *
     * @param key
     * @param values
     * @param context
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable one : values) count++;
        if (context.getConfiguration().get("conf.debug") != null)
            System.out.println("Author: " + key.toString() + ", Count: " + count);
        context.write(key, new IntWritable(count));
    }
}
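
Note that this reducer only counts the number of values per key, so it cannot also serve as a combiner as written: a combiner would emit partial counts that the reducer would then treat as single entries. If you want combining, a hedged variant that sums the values works correctly on both sides:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Sketch: sum the values instead of counting them, so the same class can be
// used both as combiner and as reducer
public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) sum += v.get();
        context.write(key, new IntWritable(sum));
    }
}

In the driver you would then register it with job.setCombinerClass(SumReducer.class) and job.setReducerClass(SumReducer.class).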

3. Main class

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

public class AnalyzeData {
    private static final Log LOG = LogFactory.getLog(AnalyzeData.class);

    public static final String NAME = "AnalyzeData";

    /**
     * Parse the command line parameters.
     *
     * @param args The parameters to parse.
     * @return The parsed command line.
     * @throws org.apache.commons.cli.ParseException When the parsing of the parameters fails.
     */
    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("t", "table", true,
                "table to read from (must exist)");
        o.setArgName("table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        o = new Option("o", "output", true,
                "the directory to write to");
        o.setArgName("path-in-HDFS");
        o.setRequired(true);
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String column = cmd.getOptionValue("c");
        String output = cmd.getOptionValue("o");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);

        Job job = Job.getInstance(conf, "Analyze data in " + table);
        job.setJarByClass(AnalyzeData.class);
        TableMapReduceUtil.initTableMapperJob(table, scan, AnalyzeMapper.class, Text.class, IntWritable.class, job);
        job.setMapperClass(AnalyzeMapper.class);
        job.setReducerClass(AnalyzeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job, new Path(output));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The ColumnParser helper class used above (a standalone version of the ColParser inner class from Part One):

import org.apache.hadoop.hbase.util.Bytes;

public class ColumnParser {
    private byte[] family;
    private byte[] qualifier;
    private boolean valid;

    public byte[] getFamily() {
        return family;
    }

    public byte[] getQualifier() {
        return qualifier;
    }

    public boolean isValid() {
        return valid;
    }

    public void parse(String value) {
        try {
            String[] sValue = value.split(":");
            if (sValue == null || sValue.length < 2 || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                valid = false;
                return;
            }
            family = Bytes.toBytes(sValue[0]);
            qualifier = Bytes.toBytes(sValue[1]);
            valid = true;
        } catch (Exception e) {
            valid = false;
        }
    }
}
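
One common tweak for jobs that scan a whole table, sketched against the Scan built in main above: raise the scanner caching and keep scanned blocks out of the region servers' block cache. The numbers are only illustrative.

// Sketch: typical Scan settings for a full-table MapReduce read
Scan scan = new Scan();
scan.addColumn(family, qualifier);
scan.setCaching(500);         // rows fetched per RPC to the region server; tune to your row size
scan.setCacheBlocks(false);   // one-off MapReduce scans should not evict hot data from the block cache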

4. Run

hadoop jar AnalyzeData.jar -t importTable -c data:json -o /output9

Result:
... ...
AnalyzeMapper$Counters
COLS=993
ERROR=6
ROWS=993
VALID=987
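
The per-author counts end up as text files in the output directory (one part-r-XXXXX file per reducer, so a single file here). A hedged sketch for reading them back through the FileSystem API, assuming the /output9 path used above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.BufferedReader;
import java.io.InputStreamReader;

public class PrintOutput {
    public static void main(String[] args) throws Exception {
        // Assumes fs.defaultFS points at the cluster (core-site.xml on the classpath)
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path part = new Path("/output9/part-r-00000"); // single reducer -> single part file
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(part)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // "author<TAB>count"
            }
        }
    }
}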

Part Three: Reading from HBase, computing, and writing the result back to HBase

Read back the JSON strings stored in the earlier example, split each one into key-value pairs, and write them back to HBase with each JSON key as the column qualifier and the corresponding JSON value as the cell value.

import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class ParseJson {
    private static final String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    private static final Log LOG = LogFactory.getLog(ParseJson.class);

    public static final String NAME = "ParseJson";

    public enum Counters { ROWS, COLS, VALID, ERROR }

    static class ParseMapper extends TableMapper<ImmutableBytesWritable, Mutation> {
        private JSONParser parser = new JSONParser();
        private byte[] columnFamily = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            columnFamily = Bytes.toBytes(context.getConfiguration().get("conf.columnFamily"));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1);
            String val = null;
            try {
                Put put = new Put(key.get());
                for (Cell cell : value.listCells()) {
                    context.getCounter(Counters.COLS).increment(1);
                    val = Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
                    JSONObject json = (JSONObject) parser.parse(val);
                    for (Object jsonKey : json.keySet()) {
                        Object jsonValue = json.get(jsonKey);
                        put.addColumn(columnFamily, Bytes.toBytes(jsonKey.toString()), Bytes.toBytes(jsonValue.toString()));
                    }
                }
                context.write(key, put);
                context.getCounter(Counters.VALID).increment(1);
            } catch (Exception e) {
                e.printStackTrace();
                System.err.println("Error: " + e.getMessage() + ", Row: " +
                        Bytes.toStringBinary(key.get()) + ", JSON: " + val);
                context.getCounter(Counters.ERROR).increment(1);
            }
        }
    }

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("i", "input", true,
                "table to read from (must exist)");
        o.setArgName("input-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("o", "output", true,
                "table to write to (must exist)");
        o.setArgName("output-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // conf.set("hbase.master","192.168.31.10");
        // conf.set("hbase.zookeeper.quorum", "192.168.31.10");
        // conf.set("hbase.rootdir","hdfs://bigdata-senior01.home.com:9000/hbase");
        // conf.set("hbase.zookeeper.property.dataDir","hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper");
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");
        String input = cmd.getOptionValue("i");
        String output = cmd.getOptionValue("o");
        String column = cmd.getOptionValue("c");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);
        conf.set("conf.columnFamily", Bytes.toStringBinary(family));

        Job job = Job.getInstance(conf, "Parse data in " + input +
                ", write to " + output);
        job.setJarByClass(ParseJson.class);
        TableMapReduceUtil.initTableMapperJob(input, scan, ParseMapper.class, ImmutableBytesWritable.class, Put.class, job);
        TableMapReduceUtil.initTableReducerJob(output, IdentityTableReducer.class, job);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Run:

hadoop jar ParseJson.jar -i importTable -c data:json -o importTable
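
Because the output table is the same importTable, each row now carries the original data:json column plus one column per JSON key. A quick hedged check of a single row (the actual qualifiers depend on the JSON fields in your data):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyParsedColumns {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            Scan scan = new Scan();
            scan.setLimit(1); // one row is enough to see the exploded columns
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    for (Cell cell : result.listCells()) {
                        System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell)) + " = " +
                                Bytes.toString(CellUtil.cloneValue(cell)));
                    }
                }
            }
        }
    }
}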
