Part One: Importing data into HBase

1. Configure hbase-site.xml to point to HDFS

<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase</value>
    </property>
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper</value>
    </property>
    <property>
        <name>hbase.unsafe.stream.capability.enforce</name>
        <value>false</value>
        <description>
            Controls whether HBase will check for stream capabilities (hflush/hsync).
            Disable this if you intend to run on LocalFileSystem, denoted by a rootdir
            with the 'file://' scheme, but be mindful of the NOTE below.
            WARNING: Setting this to false blinds you to potential data loss and
            inconsistent system state in the event of process and/or node failures. If
            HBase is complaining of an inability to use hsync or hflush it's most
            likely not a false positive.
        </description>
    </property>
</configuration>
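
HBaseConfiguration.create() reads hbase-site.xml from the classpath, so a client only needs this file on its classpath to find the cluster. A minimal, hedged connectivity check (not part of the import job) could look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class HBaseConnectionCheck {
    public static void main(String[] args) throws Exception {
        // hbase-site.xml is picked up from the classpath
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Listing the tables proves the client can reach ZooKeeper and the master
            for (TableName name : admin.listTableNames()) {
                System.out.println(name.getNameAsString());
            }
        }
    }
}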

2. Dependencies

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.0.4</version>
        </dependency>

3. Mapper

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Input: plain text lines; output: row-key bytes as the key, an HBase Mutation as the value
public class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {

    // Counter
    public enum Counters {
        LINES
    }

    private byte[] family = null;
    private byte[] qualifier = null;

    /**
     * Called once at the beginning of the task.
     *
     * @param context
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the column family/qualifier from the configuration; it is supplied on the
        // command line and passed through the job configuration
        String column = context.getConfiguration().get("conf.column");
        ColParser parser = new ColParser();
        parser.parse(column);
        if (!parser.isValid()) throw new IOException("family or qualifier error");
        family = parser.getFamily();
        qualifier = parser.getQualifier();
    }

    /**
     * Called once for each key/value pair in the input split. Most applications
     * should override this, but the default is the identity function.
     *
     * @param key
     * @param value
     * @param context
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        try {
            String line = value.toString();
            // Hash each line to build the row key; adjust to your own key design
            byte[] rowKey = DigestUtils.md5(line);
            Put put = new Put(rowKey);
            put.addColumn(this.family, this.qualifier, Bytes.toBytes(line));
            context.write(new ImmutableBytesWritable(rowKey), put);
            context.getCounter(Counters.LINES).increment(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    class ColParser {
        private byte[] family;
        private byte[] qualifier;
        private boolean valid;

        public byte[] getFamily() {
            return family;
        }

        public byte[] getQualifier() {
            return qualifier;
        }

        public boolean isValid() {
            return valid;
        }

        public void parse(String value) {
            try {
                String[] sValue = value.split(":");
                if (sValue == null || sValue.length < 2 || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                    valid = false;
                    return;
                }
                family = Bytes.toBytes(sValue[0]);
                qualifier = Bytes.toBytes(sValue[1]);
                valid = true;
            } catch (Exception e) {
                valid = false;
            }
        }
    }
}
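
For reference, a quick sketch of what ColParser does with the -c argument (assuming the "data:json" value used later in this post, and a caller in the same package as ImportMapper):

// Hypothetical check of the family:qualifier parsing
ImportMapper.ColParser parser = new ImportMapper().new ColParser();
parser.parse("data:json");
// parser.getFamily()    -> Bytes.toBytes("data")
// parser.getQualifier() -> Bytes.toBytes("json")
// parser.isValid()      -> true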

4. Main class

import org.apache.commons.cli.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

public class ImportFromFile {
    // private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    public static final String NAME = "ImportFromFile";

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();

        Option option = new Option("t", "table", true, "table name must not be empty");
        option.setArgName("table-name");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("c", "column", true, "column family and qualifier must not be empty");
        option.setArgName("family:qualifier");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("i", "input", true, "input file or directory");
        option.setArgName("path-in-HDFS");
        option.setRequired(true);
        options.addOption(option);

        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String input = cmd.getOptionValue("i");
        String column = cmd.getOptionValue("c");
        // Stored in the configuration here and read back in the mapper's setup()
        conf.set("conf.column", column);

        Job job = Job.getInstance(conf, "Import from file " + input + " into table " + table);
        job.setJarByClass(ImportFromFile.class);
        job.setMapperClass(ImportMapper.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Writable.class);
        job.setNumReduceTasks(0); // map-only job, no reduce phase needed
        FileInputFormat.addInputPath(job, new Path(input));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
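
The LINES counter shows up in the job summary once the job finishes. If you want to read it programmatically, a hedged sketch that would replace the final System.exit(...) call in main above:

// Sketch: read the custom counter after the job has completed
boolean success = job.waitForCompletion(true);
long lines = job.getCounters()
        .findCounter(ImportMapper.Counters.LINES)
        .getValue();
System.out.println("Imported lines: " + lines);
System.exit(success ? 0 : 1);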

5. Run

First create the table in the HBase shell:

create 'importTable','data'

Then upload the jar and run it:

hadoop jar ImportFromFile.jar -t importTable -i /input/test-data.txt -c data:json
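
To spot-check the import afterwards, a small client-side scan like the sketch below can be used (not part of the job; the table and column names match the command above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyImport {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            // Scan a handful of rows to confirm the data:json column was written
            Scan scan = new Scan();
            scan.addColumn(Bytes.toBytes("data"), Bytes.toBytes("json"));
            scan.setLimit(5);
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    System.out.println(Bytes.toStringBinary(result.getRow()) + " => " +
                            Bytes.toString(result.getValue(Bytes.toBytes("data"), Bytes.toBytes("json"))));
                }
            }
        }
    }
}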

Part Two: Reading data out of HBase for computation

Pull the data written to HBase in the previous example back out and count how many records each author has.

Add one extra dependency:

        <dependency>
            <groupId>com.googlecode.json-simple</groupId>
            <artifactId>json-simple</artifactId>
            <version>1.1.1</version>
        </dependency>
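
The stored values are JSON documents, and the mapper below pulls the author field out of each one with json-simple. Roughly, that parsing step works like this (the record content here is only a made-up example):

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class JsonParseDemo {
    public static void main(String[] args) throws Exception {
        JSONParser parser = new JSONParser();
        // Hypothetical record; the real rows come from the data:json column
        String val = "{\"author\":\"alice\",\"title\":\"some post\"}";
        JSONObject json = (JSONObject) parser.parse(val);
        System.out.println((String) json.get("author")); // -> alice
    }
}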

1. Mapper

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class AnalyzeMapper extends TableMapper<Text, IntWritable> {
    private JSONParser parser = new JSONParser();

    public enum Counters { ROWS, COLS, ERROR, VALID }

    private IntWritable ONE = new IntWritable(1);

    /**
     * Called once for each key/value pair in the input split. Most applications
     * should override this, but the default is the identity function.
     *
     * @param key
     * @param value
     * @param context
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        context.getCounter(Counters.ROWS).increment(1);
        String val = null;
        try {
            for (Cell cell : value.listCells()) {
                context.getCounter(Counters.COLS).increment(1);
                val = Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
                JSONObject json = (JSONObject) parser.parse(val);
                String author = (String) json.get("author");
                if (context.getConfiguration().get("conf.debug") != null)
                    System.out.println("Author: " + author);
                context.write(new Text(author), ONE);
                context.getCounter(Counters.VALID).increment(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("Row: " + Bytes.toStringBinary(key.get()) +
                    ", JSON: " + val);
            context.getCounter(Counters.ERROR).increment(1);
        }
    }
}

2. Reducer

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class AnalyzeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * This method is called once for each key. Most applications will define
     * their reduce class by overriding this method. The default implementation
     * is an identity function.
     *
     * @param key
     * @param values
     * @param context
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable one : values) count++;
        if (context.getConfiguration().get("conf.debug") != null)
            System.out.println("Author: " + key.toString() + ", Count: " + count);
        context.write(key, new IntWritable(count));
    }
}
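
Note that this reducer only counts the number of values per key, so it cannot also serve as a combiner as written: a combiner would emit partial counts that the reducer would then treat as single entries. If you want combining, a hedged variant that sums the values works correctly on both sides:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Sketch: sum the values instead of counting them, so the same class can be
// used both as combiner and as reducer
public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) sum += v.get();
        context.write(key, new IntWritable(sum));
    }
}

In the driver you would then register it with job.setCombinerClass(SumReducer.class) and job.setReducerClass(SumReducer.class).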

3. Main class

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

public class AnalyzeData {
    private static final Log LOG = LogFactory.getLog(AnalyzeData.class);

    public static final String NAME = "AnalyzeData";

    /**
     * Parse the command line parameters.
     *
     * @param args The parameters to parse.
     * @return The parsed command line.
     * @throws org.apache.commons.cli.ParseException When the parsing of the parameters fails.
     */
    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("t", "table", true,
                "table to read from (must exist)");
        o.setArgName("table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        o = new Option("o", "output", true,
                "the directory to write to");
        o.setArgName("path-in-HDFS");
        o.setRequired(true);
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String column = cmd.getOptionValue("c");
        String output = cmd.getOptionValue("o");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);

        Job job = Job.getInstance(conf, "Analyze data in " + table);
        job.setJarByClass(AnalyzeData.class);
        TableMapReduceUtil.initTableMapperJob(table, scan, AnalyzeMapper.class, Text.class, IntWritable.class, job);
        job.setMapperClass(AnalyzeMapper.class);
        job.setReducerClass(AnalyzeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job, new Path(output));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The ColumnParser helper class used above (a standalone version of the ColParser inner class from Part One):

import org.apache.hadoop.hbase.util.Bytes;

public class ColumnParser {
    private byte[] family;
    private byte[] qualifier;
    private boolean valid;

    public byte[] getFamily() {
        return family;
    }

    public byte[] getQualifier() {
        return qualifier;
    }

    public boolean isValid() {
        return valid;
    }

    public void parse(String value) {
        try {
            String[] sValue = value.split(":");
            if (sValue == null || sValue.length < 2 || sValue[0].isEmpty() || sValue[1].isEmpty()) {
                valid = false;
                return;
            }
            family = Bytes.toBytes(sValue[0]);
            qualifier = Bytes.toBytes(sValue[1]);
            valid = true;
        } catch (Exception e) {
            valid = false;
        }
    }
}
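
One common tweak for jobs that scan a whole table, sketched against the Scan built in main above: raise the scanner caching and keep scanned blocks out of the region servers' block cache. The numbers are only illustrative.

// Sketch: typical Scan settings for a full-table MapReduce read
Scan scan = new Scan();
scan.addColumn(family, qualifier);
scan.setCaching(500);         // rows fetched per RPC to the region server; tune to your row size
scan.setCacheBlocks(false);   // one-off MapReduce scans should not evict hot data from the block cache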

4. Run

hadoop jar AnalyzeData.jar -t importTable -c data:json -o /output9

Result:
... ...
AnalyzeMapper$Counters
COLS=993
ERROR=6
ROWS=993
VALID=987
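
The per-author counts end up as text files in the output directory (one part-r-XXXXX file per reducer, so a single file here). A hedged sketch for reading them back through the FileSystem API, assuming the /output9 path used above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.BufferedReader;
import java.io.InputStreamReader;

public class PrintOutput {
    public static void main(String[] args) throws Exception {
        // Assumes fs.defaultFS points at the cluster (core-site.xml on the classpath)
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path part = new Path("/output9/part-r-00000"); // single reducer -> single part file
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(part)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // "author<TAB>count"
            }
        }
    }
}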

Part Three: Reading from HBase, computing, and writing the result back to HBase

Read back the JSON strings stored in the earlier example, split each one into key-value pairs, and write them back to HBase with each JSON key as the column qualifier and the corresponding JSON value as the cell value.

import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

public class ParseJson {
    private static final String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    private static final Log LOG = LogFactory.getLog(ParseJson.class);

    public static final String NAME = "ParseJson";

    public enum Counters { ROWS, COLS, VALID, ERROR }

    static class ParseMapper extends TableMapper<ImmutableBytesWritable, Mutation> {
        private JSONParser parser = new JSONParser();
        private byte[] columnFamily = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            columnFamily = Bytes.toBytes(context.getConfiguration().get("conf.columnFamily"));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1);
            String val = null;
            try {
                Put put = new Put(key.get());
                for (Cell cell : value.listCells()) {
                    context.getCounter(Counters.COLS).increment(1);
                    val = Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
                    JSONObject json = (JSONObject) parser.parse(val);
                    for (Object jsonKey : json.keySet()) {
                        Object jsonValue = json.get(jsonKey);
                        put.addColumn(columnFamily, Bytes.toBytes(jsonKey.toString()), Bytes.toBytes(jsonValue.toString()));
                    }
                }
                context.write(key, put);
                context.getCounter(Counters.VALID).increment(1);
            } catch (Exception e) {
                e.printStackTrace();
                System.err.println("Error: " + e.getMessage() + ", Row: " +
                        Bytes.toStringBinary(key.get()) + ", JSON: " + val);
                context.getCounter(Counters.ERROR).increment(1);
            }
        }
    }

    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("i", "input", true,
                "table to read from (must exist)");
        o.setArgName("input-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("o", "output", true,
                "table to write to (must exist)");
        o.setArgName("output-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // conf.set("hbase.master","192.168.31.10");
        // conf.set("hbase.zookeeper.quorum", "192.168.31.10");
        // conf.set("hbase.rootdir","hdfs://bigdata-senior01.home.com:9000/hbase");
        // conf.set("hbase.zookeeper.property.dataDir","hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper");
        String[] runArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");
        String input = cmd.getOptionValue("i");
        String output = cmd.getOptionValue("o");
        String column = cmd.getOptionValue("c");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if (!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family, qualifier);
        conf.set("conf.columnFamily", Bytes.toStringBinary(family));

        Job job = Job.getInstance(conf, "Parse data in " + input +
                ", write to " + output);
        job.setJarByClass(ParseJson.class);
        TableMapReduceUtil.initTableMapperJob(input, scan, ParseMapper.class, ImmutableBytesWritable.class, Put.class, job);
        TableMapReduceUtil.initTableReducerJob(output, IdentityTableReducer.class, job);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Run:

hadoop jar ParseJson.jar -i importTable -c data:json -o importTable
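
Because the output table is the same importTable, each row now carries the original data:json column plus one column per JSON key. A quick hedged check of a single row (the actual qualifiers depend on the JSON fields in your data):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyParsedColumns {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("importTable"))) {
            Scan scan = new Scan();
            scan.setLimit(1); // one row is enough to see the exploded columns
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    for (Cell cell : result.listCells()) {
                        System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell)) + " = " +
                                Bytes.toString(CellUtil.cloneValue(cell)));
                    }
                }
            }
        }
    }
}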
