Hadoop HDFS HBase optimization example
Requirement:
Data is read from HDFS, and for each record the url field must be resolved to a url_type. Doing this with a Hive LEFT OUTER JOIN was very slow, so the URL-type mapping was loaded into HBase instead, and a MapReduce job tags each record by taking advantage of HBase's fast point lookups.
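The post does not show how the URL-type mapping gets into HBase in the first place. Below is a minimal, illustrative sketch of that loading step; it assumes a rule file with one "host|type" entry per line, the url_rule table with column family url_type and qualifier type that the improved job further down reads from, the same HBaseMain.conf configuration object the jobs use, and the old HTable/Put API used throughout the post. The class name UrlRuleLoader and the input format are hypothetical.

package com.bonc.db;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

import com.bonc.URLMatch.HBaseMain;

// Hypothetical loader: reads "host|type" lines and writes them into url_rule.
public class UrlRuleLoader {
    public static void main(String[] args) throws IOException {
        HTable table = new HTable(HBaseMain.conf, "url_rule");
        table.setAutoFlush(false); // buffer puts client-side instead of flushing per row
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        String line;
        while ((line = in.readLine()) != null) {
            String[] f = line.split("\\|");
            if (f.length < 2) {
                continue;
            }
            Put p = new Put(f[0].getBytes()); // row key = host (or visit IP)
            p.add("url_type".getBytes(), "type".getBytes(), f[1].getBytes());
            table.put(p);
        }
        in.close();
        table.flushCommits();
        table.close();
    }
}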
The initial MapReduce program looked like this:
package com.bonc.db;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE2 {
public static void main(String args[]) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE");
job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE2.class);
job.setMapperClass(DataCleanMapper.class);
job.setReducerClass(DataCleanReduce.class);
job.setNumReduceTasks(150);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

// Mapper: split each record on '|', extract the URL host, and emit the assembled key/value field strings.
public static class DataCleanMapper extends
Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String lines = value.toString();
String[] strs = lines.split("\\|");
ParesURL pu = new ParesURL();
String url = "NULL";
if (strs.length > 25) {
url = pu.execute(strs[25], "HOST");
}
String keys = "";
String values = "";
if (strs.length > 16) {
keys = strs[0] + "|" + strs[1] + "|" + strs[2] + "|" + strs[3]
+ "|" + strs[4] + "|" + use_seg(strs[5]) + "|"
+ strs[11] + "|" + strs[16] + "|" + url + "|" + strs[7]
+ "|" + strs[8] + "|" + strs[9] + "|" + strs[10] + "|";
}
if (strs.length > 15) {
values = url + "|" + strs[13] + "|" + strs[15] + "|" + "1";
}
context.write(new Text(keys), new Text(values));
}

// Return the hour portion (characters 11-13) of start_date if it is a valid 0-23 value, otherwise "**".
public String use_seg(String start_date) {
String s = "**";
if (start_date.toString().length() > 23) {
if (isNum(start_date.toString().substring(11, 13))
&& Integer.parseInt(start_date.toString().substring(11,
13)) >= 0
&& Integer.parseInt(start_date.toString().substring(11,
13)) <= 23) {
s = start_date.toString().substring(11, 13);
}
}
return s;
}

public static boolean isNum(String str) {
return str
.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
}
}

// Reducer: resolve each key's URL type with an individual HBase Get (falling back to the visit IP).
public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
private HTable table;

@Override
protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
throws IOException, InterruptedException {
String keys = arg0.toString();
String value[] = { "" };
String url = "NULL";
String visitIP = "NULL";
String value2 = "NULL";
for (Text c : arg1) {
value = c.toString().split("\\|");
if (value.length > 0) {
url = value[0];
}
if (value.length > 1) {
visitIP = value[1];
}
if (value.length > 2) {
value2 = value[2];
}
}
String matchResult = urlMatch(url);
if (matchResult.equals("NULL")) {
matchResult = urlMatch(visitIP);
}
String output = matchResult + "|" + value2 + "|" + "1";
// System.out.println(output+"+++++++++++++++++");
context.write(new Text(keys), new Text(output));
}

@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.close();
}

@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
table = (HTable) pool.getTable("22222");
}

// Look up a single row key in HBase; return the stored value or "NULL" on a miss.
public String urlMatch(String url) {
String s = "NULL";
if (url == null || url.equals("NULL")) {
s = "NULL";
} else {
try {
Get getu = new Get(url.getBytes());
Result ru = table.get(getu);
if (!ru.isEmpty()) {
s = new String(ru.getValue("123".getBytes(), "456".getBytes()));
}
} catch (IOException e) {
e.printStackTrace();
}
}
return s;
}
}
}
This turned out to be very slow, mainly because every single record triggered a random lookup against HBase. The approach was therefore changed: the row keys are first collected into a list and then looked up in one batched call, which cut the running time roughly in half.
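Stripped of the MapReduce plumbing, the change boils down to replacing one Get per lookup with a single batched table.get(List<Get>) call. A minimal sketch of the two styles, assuming the same url_rule table and url_type:type column (the class and method names here are illustrative only, not part of the original job):

package com.bonc.db;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;

// Illustration only: the two lookup styles the post compares.
public class UrlTypeLookup {

    // Per-record lookup (original job): one random read per key.
    public static String lookupOne(HTable table, String key) throws IOException {
        Result r = table.get(new Get(key.getBytes()));
        return r.isEmpty() ? "NULL"
                : new String(r.getValue("url_type".getBytes(), "type".getBytes()));
    }

    // Batched lookup (improved job): collect all row keys first, then issue one get(List<Get>).
    public static List<String> lookupAll(HTable table, List<String> keys) throws IOException {
        List<Get> gets = new ArrayList<Get>();
        for (String k : keys) {
            gets.add(new Get(k.getBytes()));
        }
        Result[] results = table.get(gets); // single batched round trip instead of keys.size() calls
        List<String> types = new ArrayList<String>();
        for (Result r : results) {
            types.add(r.isEmpty() ? "NULL"
                    : new String(r.getValue("url_type".getBytes(), "type".getBytes())));
        }
        return types;
    }
}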
The improved code:
package com.bonc.db;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

public class DWA_S_D_USE_MB_COUNT_BASE {
public static void main(String args[]) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE2");
job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE.class);
job.setMapperClass(DataCleanMapper.class);
job.setReducerClass(DataCleanReduce.class);
job.setNumReduceTasks(150);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

public static class DataCleanMapper extends
Mapper<LongWritable, Text, Text, Text> {
public static Counter ct = null;
public static long i = 0;

@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// "|" + "1" is appended to the line so that, even when the trailing fields are empty, they still survive the split and can be output.
String lines = value.toString()+"|"+"1";
String[] strs = lines.split("\\|");
ParesURL pu = new ParesURL();
String url = "NULL";
String keys = "";
String values = "";
if (strs.length > 25) {
i++;
if(!strs[25].startsWith("http://")){
strs[25]="http://"+strs[25];
}
url = pu.execute(EmptyParse(strs[25]), "HOST");
keys = EmptyParse(strs[0]) + "|" + EmptyParse(strs[1]) + "|"
+ EmptyParse(strs[2]) + "|" + EmptyParse(strs[3]) + "|"
+ EmptyParse(strs[4]) + "|"
+ EmptyParse(use_seg(strs[5])) + "|"
+ EmptyParse(strs[11]) + "|" + EmptyParse(strs[16])
+ "|" + EmptyParse(url) + "|" + EmptyParse(strs[7])
+ "|" + EmptyParse(strs[8]) + "|" + EmptyParse(strs[9])
+ "|" + EmptyParse(strs[10]) + "|";
values = EmptyParse(url) + "|" + EmptyParse(strs[13]) + "|"
+ EmptyParse(strs[15]) + "|" + i;
context.write(new Text(String.valueOf(i % 10000)), new Text(
keys + values));
}
}

public String use_seg(String start_date) {
String s = "**";
if (start_date.toString().length() > 23) {
if (isNum(start_date.toString().substring(11, 13))
&& Integer.parseInt(start_date.toString().substring(11,
13)) >= 0
&& Integer.parseInt(start_date.toString().substring(11,
13)) <= 23) {
s = start_date.toString().substring(11, 13);
}
}
return s;
}

public static boolean isNum(String str) {
return str
.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
}

// Normalize null or empty fields to the literal string "NULL".
public static String EmptyParse(String str) {
if (str == null || str.length() < 1 || str.equals("")
|| str.isEmpty()) {
return "NULL";
} else {
return str;
}
}
}

// Reducer: collect the Gets for a whole group into lists, then resolve them with batched table.get() calls.
public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
private HTable table;
private long index = 0;

@Override
protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
throws IOException, InterruptedException {
String keys = arg0.toString();
String value[] = { "" };
String url = "NULL";
String visitIP = "NULL";
String value2 = "NULL";
String reduceoutput = "NULL";
String urlMatch = "NULL";
String output = "NULL";
Get getu;
Get getip;
List<Get> lg = new ArrayList<Get>();
List<Get> li = new ArrayList<Get>();
List<String> lo = new ArrayList<String>();
List<String> useragent = new ArrayList<String>();
for (Text c : arg1) {
value = c.toString().split("\\|");
url = value[13];
visitIP = value[14];
value2 = value[15];
output = value[0] + "|" + value[1] + "|" + value[2] + "|"
+ value[3] + "|" + value[4] + "|" + value[5] + "|"
+ value[6] + "|" + value[7] + "|" + value[8] + "|"
+ value[9] + "|" + value[10] + "|" + value[11] + "|"
+ value[12] + "|";
getu = new Get(url.getBytes());
getip = new Get(visitIP.getBytes());
lg.add(getu);
li.add(getip);
lo.add(output);
useragent.add(value2);
}

Result ru[];
Result ri[];
ru = table.get(lg);
ri = table.get(li);
for (int i = 0; i < lo.size(); i++) {
if (!ru[i].isEmpty()) {
urlMatch = new String(ru[i].getValue("url_type".getBytes(),
"type".getBytes()));
} else if (!ri[i].isEmpty()) {
urlMatch = new String(ri[i].getValue("url_type".getBytes(),
"type".getBytes()));
}
reduceoutput = urlMatch + "|" + useragent.get(i) + "|" + "1";
context.write(new Text(lo.get(i)), new Text(reduceoutput));
}
}

@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
table.close();
}

@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
HTablePool pool = new HTablePool(HBaseMain.conf, 1000);
table = (HTable) pool.getTable("url_rule");
}

// Single-key lookup kept over from the original version; no longer called from reduce().
public String urlMatch(String url) {
String s = "NULL";
Result ru;
if (url == null || url.equals("NULL")) {
s = "NULL";
} else {
try {
Get getu = new Get(url.getBytes());
ru = table.get(getu);
if (!ru.isEmpty()) {
s = new String(ru.getValue("123123".getBytes(),
"123".getBytes()));
}
} catch (IOException e) {
e.printStackTrace();
}
}
return s;
}
}
}
Limited resources can spark a person's creativity. Let that serve as the summary.