Flink Connector - Batch Processing - Reading and Writing HBase
Reading from and writing to HBase with the Flink batch (DataSet) API.
source-hbase
Parent class
It is modeled after the official Flink implementation.
import lombok.SneakyThrows;
import org.apache.flink.api.common.io.LocatableInputSplitAssigner;
import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.io.InputSplitAssigner;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import static com.hecaiyun.common.bean.HBaseConstant.*;
/**
* @author WeiJiQian
*/
public abstract class SourceHBaseInputBase<T> extends RichInputFormat<T, MyTableInputSplit>{
protected static final Logger LOG = LoggerFactory.getLogger(SourceHBaseInputBase.class);
// helper variable to decide whether the input is exhausted or not
protected boolean endReached = false;
protected transient HTable table = null;
protected transient Scan scan = null;
protected transient Connection connection = null;
/** HBase iterator wrapper. */
protected ResultScanner resultScanner = null;
protected byte[] currentRow;
protected long scannedRows;
protected ParameterTool parameterTool;
protected abstract T mapResultToOutType(Result r);
protected abstract void getScan();
protected abstract TableName getTableName();
protected void getTable() throws IOException {
org.apache.hadoop.conf.Configuration configuration;
parameterTool = PropertiesUtil.PARAMETER_TOOL;
configuration = HBaseConfiguration.create();
configuration.set(HBASE_ZOOKEEPER_QUORUM, parameterTool.get(HBASE_ZOOKEEPER_QUORUM));
configuration.set(HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT, parameterTool.get(HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT));
configuration.set(HBASE_RPC_TIMEOUT, parameterTool.get(HBASE_RPC_TIMEOUT));
configuration.set(HBASE_CLIENT_OPERATION_TIMEOUT, parameterTool.get(HBASE_CLIENT_OPERATION_TIMEOUT));
configuration.set(HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, parameterTool.get(HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD));
connection = ConnectionFactory.createConnection(configuration);
table = (HTable) connection.getTable(getTableName());
}
@SneakyThrows
@Override
public void configure(Configuration parameters) {
getTable();
getScan();
}
@Override
public void open(MyTableInputSplit split) throws IOException {
System.out.println("open:" + table == null);
if (table == null) {
System.out.println("open:table is null ---------");
throw new IOException("The HBase table has not been opened! " +
"This needs to be done in configure().");
}
if (scan == null) {
throw new IOException("Scan has not been initialized! " +
"This needs to be done in configure().");
}
if (split == null) {
throw new IOException("Input split is null!");
}
logSplitInfo("opening", split);
// set scan range
currentRow = split.getStartRow();
scan.withStartRow(currentRow);
scan.withStopRow(split.getEndRow());
resultScanner = table.getScanner(scan);
endReached = false;
scannedRows = 0;
}
@Override
public T nextRecord(T reuse) throws IOException {
if (resultScanner == null) {
throw new IOException("No table result scanner provided!");
}
Result res;
try {
res = resultScanner.next();
} catch (Exception e) {
resultScanner.close();
//workaround for timeout on scan
LOG.warn("Error after scan of " + scannedRows + " rows. Retry with a new scanner...", e);
scan.withStartRow(currentRow, false);
resultScanner = table.getScanner(scan);
res = resultScanner.next();
}
if (res != null) {
scannedRows++;
currentRow = res.getRow();
return mapResultToOutType(res);
}
endReached = true;
return null;
}
private void logSplitInfo(String action, MyTableInputSplit split) {
int splitId = split.getSplitNumber();
String splitStart = Bytes.toString(split.getStartRow());
String splitEnd = Bytes.toString(split.getEndRow());
String splitStartKey = splitStart.isEmpty() ? "-" : splitStart;
String splitStopKey = splitEnd.isEmpty() ? "-" : splitEnd;
String[] hostnames = split.getHostnames();
LOG.info("{} split (this={})[{}|{}|{}|{}]", action, this, splitId, hostnames, splitStartKey, splitStopKey);
}
@Override
public boolean reachedEnd() throws IOException {
return endReached;
}
@Override
public void close() throws IOException {
LOG.info("Closing split (scanned {} rows)", scannedRows);
currentRow = null;
try {
if (resultScanner != null) {
resultScanner.close();
}
} finally {
resultScanner = null;
}
}
@Override
public void closeInputFormat() throws IOException {
try {
if (connection != null) {
connection.close();
}
} finally {
connection = null;
}
try {
if (table != null) {
table.close();
}
} finally {
table = null;
}
}
@Override
public MyTableInputSplit[] createInputSplits(final int minNumSplits) throws IOException {
if (table == null) {
throw new IOException("The HBase table has not been opened! " +
"This needs to be done in configure().");
}
if (scan == null) {
throw new IOException("Scan has not been initialized! " +
"This needs to be done in configure().");
}
// Get the starting and ending row keys for every region in the currently open table
final Pair<byte[][], byte[][]> keys = table.getRegionLocator().getStartEndKeys();
if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
throw new IOException("Expecting at least one region.");
}
final byte[] startRow = scan.getStartRow();
final byte[] stopRow = scan.getStopRow();
final boolean scanWithNoLowerBound = startRow.length == 0;
final boolean scanWithNoUpperBound = stopRow.length == 0;
final List<MyTableInputSplit> splits = new ArrayList<MyTableInputSplit>(minNumSplits);
for (int i = 0; i < keys.getFirst().length; i++) {
final byte[] startKey = keys.getFirst()[i];
final byte[] endKey = keys.getSecond()[i];
final String regionLocation = table.getRegionLocator().getRegionLocation(startKey, false).getHostnamePort();
// Test if the given region is to be included in the InputSplit while splitting the regions of a table
if (!includeRegionInScan(startKey, endKey)) {
continue;
}
// Find the region on which the given row is being served
final String[] hosts = new String[]{regionLocation};
// Determine if regions contains keys used by the scan
boolean isLastRegion = endKey.length == 0;
if ((scanWithNoLowerBound || isLastRegion || Bytes.compareTo(startRow, endKey) < 0) &&
(scanWithNoUpperBound || Bytes.compareTo(stopRow, startKey) > 0)) {
final byte[] splitStart = scanWithNoLowerBound || Bytes.compareTo(startKey, startRow) >= 0 ? startKey : startRow;
final byte[] splitStop = (scanWithNoUpperBound || Bytes.compareTo(endKey, stopRow) <= 0)
&& !isLastRegion ? endKey : stopRow;
int id = splits.size();
final MyTableInputSplit split = new MyTableInputSplit(id, hosts, table.getName().getName(), splitStart, splitStop);
splits.add(split);
}
}
LOG.info("Created " + splits.size() + " splits");
for (MyTableInputSplit split : splits) {
logSplitInfo("created", split);
}
return splits.toArray(new MyTableInputSplit[splits.size()]);
}
/**
* Test if the given region is to be included in the scan while splitting the regions of a table.
*
* @param startKey Start key of the region
* @param endKey End key of the region
* @return true, if this region needs to be included as part of the input (default).
*/
protected boolean includeRegionInScan(final byte[] startKey, final byte[] endKey) {
return true;
}
@Override
public InputSplitAssigner getInputSplitAssigner(MyTableInputSplit[] inputSplits) {
return new LocatableInputSplitAssigner(inputSplits);
}
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStatistics) {
return null;
}
}
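The split type MyTableInputSplit referenced above is not shown in the post. As a minimal sketch, it can be modeled on Flink's own TableInputSplit: a LocatableInputSplit that additionally carries the table name and the start/end rows of the region slice. The class below is an assumed reconstruction, not the author's exact code.

import org.apache.flink.core.io.LocatableInputSplit;

/**
 * Assumed sketch of the split type used by SourceHBaseInputBase.
 * It only carries location info (inherited) plus the table name and row range.
 */
public class MyTableInputSplit extends LocatableInputSplit {

    private final byte[] tableName; // HBase table this split reads from
    private final byte[] startRow;  // first row of the split (inclusive)
    private final byte[] endRow;    // end row of the split (exclusive)

    public MyTableInputSplit(int splitNumber, String[] hostnames, byte[] tableName,
                             byte[] startRow, byte[] endRow) {
        super(splitNumber, hostnames); // LocatableInputSplit stores the split id and host list
        this.tableName = tableName;
        this.startRow = startRow;
        this.endRow = endRow;
    }

    public byte[] getTableName() { return tableName; }

    public byte[] getStartRow() { return startRow; }

    public byte[] getEndRow() { return endRow; }
}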
Subclass
import org.apache.flink.configuration.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryPrefixComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.util.List;
import static org.apache.hadoop.hbase.filter.FilterList.Operator.MUST_PASS_ONE;
/**
* @author WeiJiQian
*/
public class SourceDaysHbase extends SourceHBaseInputBase<UsersBean> {
private List<String> dates;
private UsersBean usersBean = new UsersBean();
public SourceDaysHbase(List<String> dates) {
this.dates = dates;
}
@Override
public void configure(Configuration parameters) {
super.configure(parameters);
}
@Override
protected UsersBean mapResultToOutType(Result r) {
usersBean.setPhone11(CustomizeUtils.getPhoneOfPersonaDataRowKey(Bytes.toString(r.getRow())));
usersBean.setPhone8(CustomizeUtils.getPhone8(usersBean.getPhone11()));
return usersBean;
}
@Override
protected void getScan() {
scan = new Scan();
scan.addColumn(HBaseConstant.HBASE_PERSONA_FAMILY_MONTH_DAY, HBaseConstant.HBASE_PERSONA_ACTIVITE_DATE);
}
@Override
protected TableName getTableName() {
return TableName.valueOf(parameterTool.get(HBaseConstant.HBASE_TABLE_NAME_PERSONA_DATA));
}
}
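With the subclass in place, the input format is wired into a batch job through ExecutionEnvironment#createInput. The job below is a minimal usage sketch, assuming it lives in the same package as SourceDaysHbase and UsersBean; the class name and the concrete dates are placeholders.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

import java.util.Arrays;
import java.util.List;

public class ReadUsersJob {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // placeholder dates; in practice these would come from job parameters
        List<String> dates = Arrays.asList("2020-10-01", "2020-10-02");

        // createInput drives the custom format: configure() connects to HBase,
        // createInputSplits() turns regions into splits, nextRecord() emits UsersBean objects
        DataSet<UsersBean> users = env.createInput(new SourceDaysHbase(dates));

        users.print(); // print() triggers execution of the batch job
    }
}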
sink-hbase
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import static com.hecaiyun.common.bean.HBaseConstant.*;
/**
* @author WeiJiQian
*/
@Slf4j
public abstract class HBaseOutputFormatBase<T> implements OutputFormat<T> {
protected final String valueString = "1";
protected String date;
protected Table table;
protected Connection connection;
protected BufferedMutatorParams params;
protected BufferedMutator mutator;
protected org.apache.hadoop.conf.Configuration configuration;
protected ParameterTool parameterTool;
public abstract TableName getTableName();
public void configure(Configuration parameters) {
parameterTool = PropertiesUtil.PARAMETER_TOOL;
configuration = HBaseConfiguration.create();
configuration.set(HBASE_ZOOKEEPER_QUORUM, parameterTool.get(HBASE_ZOOKEEPER_QUORUM));
configuration.set(HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT, parameterTool.get(HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT));
configuration.set(HBASE_RPC_TIMEOUT, parameterTool.get(HBASE_RPC_TIMEOUT));
configuration.set(HBASE_CLIENT_OPERATION_TIMEOUT, parameterTool.get(HBASE_CLIENT_OPERATION_TIMEOUT));
configuration.set(HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, parameterTool.get(HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD));
}
public void open(int taskNumber, int numTasks) throws IOException {
connection = ConnectionFactory.createConnection(configuration);
table = connection.getTable(getTableName());
params = new BufferedMutatorParams(table.getName());
// set the write buffer size (100 MB)
params.writeBufferSize(parameterTool.getLong(HBASE_WRITEBUFFER_SIZE));
mutator = connection.getBufferedMutator(params);
}
/**
* Write a single cell; an existing value for the same cell is overwritten.
*
* @param rowKey row key
* @param family column family
* @param column column qualifier
* @param value  cell value
*/
public void putData(String rowKey, byte[] family, byte[] column, String value) throws IOException {
Put put = new Put(Bytes.toBytes(rowKey));
put.addColumn(family, column, Bytes.toBytes(value));
put.setDurability(Durability.SKIP_WAL);
mutator.mutate(put);
}
public void close() throws IOException {
if (mutator != null){
mutator.flush();
mutator.close();
}
if (table != null){
table.close();
}
if (connection != null){
connection.close();
}
}
}
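The base class leaves getTableName and OutputFormat#writeRecord to concrete sinks. The subclass below is a minimal sketch of what one could look like; the class name, parameter key, column family, qualifier, and the UsersBean getter are hypothetical, chosen only to illustrate how putData is meant to be called.

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

public class UsersDayHBaseOutputFormat extends HBaseOutputFormatBase<UsersBean> {

    // hypothetical family and qualifier, for illustration only
    private static final byte[] FAMILY = Bytes.toBytes("f");
    private static final byte[] QUALIFIER = Bytes.toBytes("active");

    @Override
    public TableName getTableName() {
        // hypothetical parameter key; the real key would live in HBaseConstant
        return TableName.valueOf(parameterTool.get("hbase.table.name.users.day"));
    }

    @Override
    public void writeRecord(UsersBean record) throws IOException {
        // one cell per user; valueString ("1") marks the user as active
        putData(record.getPhone11(), FAMILY, QUALIFIER, valueString);
    }
}

A DataSet<UsersBean> produced by the source above can then be written with dataSet.output(new UsersDayHBaseOutputFormat()).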