Hadoop HDFS HBase optimization example
Requirement:
Data is read from HDFS, and for each record the url field must be resolved to a url_type. Doing this with a Hive LEFT OUTER JOIN was very slow, so the URL-type mapping was loaded into HBase instead, and a MapReduce job tags each record by taking advantage of HBase's fast point lookups.
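The post does not show how the URL-type mapping gets into HBase in the first place. Below is a minimal, illustrative sketch of that loading step; it assumes a rule file with one "host|type" entry per line, the url_rule table with column family url_type and qualifier type that the improved job further down reads from, the same HBaseMain.conf configuration object the jobs use, and the old HTable/Put API used throughout the post. The class name UrlRuleLoader and the input format are hypothetical.

package com.bonc.db;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

import com.bonc.URLMatch.HBaseMain;

// Hypothetical loader: reads "host|type" lines and writes them into url_rule.
public class UrlRuleLoader {
    public static void main(String[] args) throws IOException {
        HTable table = new HTable(HBaseMain.conf, "url_rule");
        table.setAutoFlush(false); // buffer puts client-side instead of flushing per row
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        String line;
        while ((line = in.readLine()) != null) {
            String[] f = line.split("\\|");
            if (f.length < 2) {
                continue;
            }
            Put p = new Put(f[0].getBytes()); // row key = host (or visit IP)
            p.add("url_type".getBytes(), "type".getBytes(), f[1].getBytes());
            table.put(p);
        }
        in.close();
        table.flushCommits();
        table.close();
    }
}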
The initial MapReduce program looked like this:
package com.bonc.db;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE2 {
public static void main(String args[]) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE");
job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE2.class);
job.setMapperClass(DataCleanMapper.class);
job.setReducerClass(DataCleanReduce.class);
job.setNumReduceTasks(150);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

// Mapper: split each record on '|', extract the URL host, and emit the assembled key/value field strings.
public static class DataCleanMapper extends
Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String lines = value.toString();
String[] strs = lines.split("\\|");
ParesURL pu = new ParesURL();
String url = "NULL";
if (strs.length > 25) {
url = pu.execute(strs[25], "HOST");
}
String keys = "";
String values = "";
if (strs.length > 16) {
keys = strs[0] + "|" + strs[1] + "|" + strs[2] + "|" + strs[3]
+ "|" + strs[4] + "|" + use_seg(strs[5]) + "|"
+ strs[11] + "|" + strs[16] + "|" + url + "|" + strs[7]
+ "|" + strs[8] + "|" + strs[9] + "|" + strs[10] + "|";
}
if (strs.length > 15) {
values = url + "|" + strs[13] + "|" + strs[15] + "|" + "1";
}
context.write(new Text(keys), new Text(values));
}

// Return the hour portion (characters 11-13) of start_date if it is a valid 0-23 value, otherwise "**".
public String use_seg(String start_date) {
String s = "**";
if (start_date.toString().length() > 23) {
if (isNum(start_date.toString().substring(11, 13))
&& Integer.parseInt(start_date.toString().substring(11,
13)) >= 0
&& Integer.parseInt(start_date.toString().substring(11,
13)) <= 23) {
s = start_date.toString().substring(11, 13);
}
}
return s;
}

public static boolean isNum(String str) {
return str
.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
}
}

// Reducer: resolve each key's URL type with an individual HBase Get (falling back to the visit IP).
public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
private HTable table;

@Override
protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
throws IOException, InterruptedException {
String keys = arg0.toString();
String value[] = { "" };
String url = "NULL";
String visitIP = "NULL";
String value2 = "NULL";
for (Text c : arg1) {
value = c.toString().split("\\|");
if (value.length > 0) {
url = value[0];
}
if (value.length > 1) {
visitIP = value[1];
}
if (value.length > 2) {
value2 = value[2];
}
}
String matchResult = urlMatch(url);
if (matchResult.equals("NULL")) {
matchResult = urlMatch(visitIP);
}
String output = matchResult + "|" + value2 + "|" + "1";
// System.out.println(output+"+++++++++++++++++");
context.write(new Text(keys), new Text(output));
}

@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.close();
}

@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
table = (HTable) pool.getTable("22222");
}

// Look up a single row key in HBase; return the stored value or "NULL" on a miss.
public String urlMatch(String url) {
String s = "NULL";
if (url == null || url.equals("NULL")) {
s = "NULL";
} else {
try {
Get getu = new Get(url.getBytes());
Result ru = table.get(getu);
if (!ru.isEmpty()) {
s = new String(ru.getValue("123".getBytes(), "456".getBytes()));
}
} catch (IOException e) {
e.printStackTrace();
}
}
return s;
}
}
}
This turned out to be very slow, mainly because every single record triggered a random lookup against HBase. The approach was therefore changed: the row keys are first collected into a list and then looked up in one batched call, which cut the running time roughly in half.
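Stripped of the MapReduce plumbing, the change boils down to replacing one Get per lookup with a single batched table.get(List<Get>) call. A minimal sketch of the two styles, assuming the same url_rule table and url_type:type column (the class and method names here are illustrative only, not part of the original job):

package com.bonc.db;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;

// Illustration only: the two lookup styles the post compares.
public class UrlTypeLookup {

    // Per-record lookup (original job): one random read per key.
    public static String lookupOne(HTable table, String key) throws IOException {
        Result r = table.get(new Get(key.getBytes()));
        return r.isEmpty() ? "NULL"
                : new String(r.getValue("url_type".getBytes(), "type".getBytes()));
    }

    // Batched lookup (improved job): collect all row keys first, then issue one get(List<Get>).
    public static List<String> lookupAll(HTable table, List<String> keys) throws IOException {
        List<Get> gets = new ArrayList<Get>();
        for (String k : keys) {
            gets.add(new Get(k.getBytes()));
        }
        Result[] results = table.get(gets); // single batched round trip instead of keys.size() calls
        List<String> types = new ArrayList<String>();
        for (Result r : results) {
            types.add(r.isEmpty() ? "NULL"
                    : new String(r.getValue("url_type".getBytes(), "type".getBytes())));
        }
        return types;
    }
}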
The improved code:
package com.bonc.db;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE {
public static void main(String args[]) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE2");
job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE.class);
job.setMapperClass(DataCleanMapper.class);
job.setReducerClass(DataCleanReduce.class);
job.setNumReduceTasks(150);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

public static class DataCleanMapper extends
Mapper<LongWritable, Text, Text, Text> {
public static Counter ct = null;
public static long i = 0;

@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// "|" + "1" is appended to the line so that, even when the trailing fields are empty, they still survive the split and can be output.
String lines = value.toString()+"|"+"1";
String[] strs = lines.split("\\|");
ParesURL pu = new ParesURL();
String url = "NULL";
String keys = "";
String values = "";
if (strs.length > 25) {
i++;
if(!strs[25].startsWith("http://")){
strs[25]="http://"+strs[25];
}
url = pu.execute(EmptyParse(strs[25]), "HOST");
keys = EmptyParse(strs[0]) + "|" + EmptyParse(strs[1]) + "|"
+ EmptyParse(strs[2]) + "|" + EmptyParse(strs[3]) + "|"
+ EmptyParse(strs[4]) + "|"
+ EmptyParse(use_seg(strs[5])) + "|"
+ EmptyParse(strs[11]) + "|" + EmptyParse(strs[16])
+ "|" + EmptyParse(url) + "|" + EmptyParse(strs[7])
+ "|" + EmptyParse(strs[8]) + "|" + EmptyParse(strs[9])
+ "|" + EmptyParse(strs[10]) + "|";
values = EmptyParse(url) + "|" + EmptyParse(strs[13]) + "|"
+ EmptyParse(strs[15]) + "|" + i;
context.write(new Text(String.valueOf(i % 10000)), new Text(
keys + values));
}
}

public String use_seg(String start_date) {
String s = "**";
if (start_date.toString().length() > 23) {
if (isNum(start_date.toString().substring(11, 13))
&& Integer.parseInt(start_date.toString().substring(11,
13)) >= 0
&& Integer.parseInt(start_date.toString().substring(11,
13)) <= 23) {
s = start_date.toString().substring(11, 13);
}
}
return s;
}

public static boolean isNum(String str) {
return str
.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
}

// Normalize null or empty fields to the literal string "NULL".
public static String EmptyParse(String str) {
if (str == null || str.length() < 1 || str.equals("")
|| str.isEmpty()) {
return "NULL";
} else {
return str;
}
}
}

// Reducer: collect the Gets for a whole group into lists, then resolve them with batched table.get() calls.
public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
private HTable table;
private long index = 0;

@Override
protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
throws IOException, InterruptedException {
String keys = arg0.toString();
String value[] = { "" };
String url = "NULL";
String visitIP = "NULL";
String value2 = "NULL";
String reduceoutput = "NULL";
String urlMatch = "NULL";
String output = "NULL";
Get getu;
Get getip;
List<Get> lg = new ArrayList<Get>();
List<Get> li = new ArrayList<Get>();
List<String> lo = new ArrayList<String>();
List<String> useragent = new ArrayList<String>();
for (Text c : arg1) {
value = c.toString().split("\\|");
url = value[13];
visitIP = value[14];
value2 = value[15];
output = value[0] + "|" + value[1] + "|" + value[2] + "|"
+ value[3] + "|" + value[4] + "|" + value[5] + "|"
+ value[6] + "|" + value[7] + "|" + value[8] + "|"
+ value[9] + "|" + value[10] + "|" + value[11] + "|"
+ value[12] + "|";
getu = new Get(url.getBytes());
getip = new Get(visitIP.getBytes());
lg.add(getu);
li.add(getip);
lo.add(output);
useragent.add(value2);
}

Result ru[];
Result ri[];
ru = table.get(lg);
ri = table.get(li);
for (int i = 0; i < lo.size(); i++) {
if (!ru[i].isEmpty()) {
urlMatch = new String(ru[i].getValue("url_type".getBytes(),
"type".getBytes()));
} else if (!ri[i].isEmpty()) {
urlMatch = new String(ri[i].getValue("url_type".getBytes(),
"type".getBytes()));
}
reduceoutput = urlMatch + "|" + useragent.get(i) + "|" + "1";
context.write(new Text(lo.get(i)), new Text(reduceoutput));
}
}

@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.close();
}

@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
table = (HTable) pool.getTable("url_rule");
}

// Single-key lookup kept over from the original version; no longer called from reduce().
public String urlMatch(String url) {
String s = "NULL";
Result ru;
if (url == null || url.equals("NULL")) {
s = "NULL";
} else {
try {
Get getu = new Get(url.getBytes());
ru = table.get(getu);
if (!ru.isEmpty()) {
s = new String(ru.getValue("123123".getBytes(),
"123".getBytes()));
}
} catch (IOException e) {
e.printStackTrace();
}
}
return s;
}
}
}
Limited resources can spark a person's creativity. Let that serve as the summary.