MapReduce清洗日志数据统计PV量

 package mapreduce.webpv;

 import java.io.IOException;

 import org.apache.commons.lang.StringUtils;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.conf.Configured;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.util.Tool;

 import org.apache.hadoop.util.ToolRunner;

 /**
  * MapReduce job that cleans tab-separated web access log lines and counts
  * page views (PV) per province id.
  *
  * <p>The mapper discards malformed records (recording each reason in a job
  * counter) and emits {@code (provinceId, 1)} for every valid record; the
  * reducer sums the ones for each province id.
  *
  * <p>Usage: {@code WebPvMapReduce [generic options] <input path> <output path>}.
  * When no paths are supplied the job falls back to {@link #DEFAULT_INPUT_PATH}
  * and {@link #DEFAULT_OUTPUT_PATH}.
  */
 public class WebPvMapReduce extends Configured implements Tool {

     /** Input path used when no paths are given on the command line. */
     private static final String DEFAULT_INPUT_PATH =
             "hdfs://beifeng01:8020//user/beifeng01/mapreduce/input/testdata/2015082818";

     /** Output path used when no paths are given on the command line. */
     private static final String DEFAULT_OUTPUT_PATH =
             "hdfs://beifeng01:8020//user/beifeng01/mapreduce/output1";

     // step 1: Mapper

     /**
      * Validates one log line and emits {@code (provinceId, 1)} when it is usable.
      */
     public static class WebPvMapper extends
             Mapper<LongWritable, Text, IntWritable, IntWritable> {

         // NOTE: the group name is misspelled ("CUUNTERS"), but it is visible at
         // runtime in the job's counter report, so it is kept unchanged.
         private static final String COUNTER_GROUP = "WEBPVMAPPER_CUUNTERS";

         /** Minimum number of tab-separated fields a record must have. */
         private static final int MIN_FIELD_COUNT = 30;

         /** Index of the url field. */
         private static final int URL_INDEX = 1;

         /** Index of the province id field. */
         private static final int PROVINCE_ID_INDEX = 23;

         // Writables are reused across map() calls to avoid per-record allocation.
         private final IntWritable mapOutputKey = new IntWritable();

         private final IntWritable mapOutputValue = new IntWritable(1);

         /**
          * Parses one log line and writes {@code (provinceId, 1)} if the line has
          * at least {@value #MIN_FIELD_COUNT} fields, a non-blank url, and a
          * non-blank integer province id. Otherwise increments the matching
          * counter and emits nothing.
          *
          * @param key     input key supplied by the input format (not used)
          * @param value   one line of the log file
          * @param context task context used for counters and output
          * @throws IOException          if writing the output record fails
          * @throws InterruptedException if the task is interrupted while writing
          */
         @Override
         public void map(LongWritable key, Text value, Context context)
                 throws IOException, InterruptedException {
             // split the line into its tab-separated fields
             String[] values = value.toString().split("\t");

             // The length check must come before any indexed access; otherwise a
             // line with fewer than two fields throws
             // ArrayIndexOutOfBoundsException and fails the task instead of
             // being counted as malformed.
             if (values.length < MIN_FIELD_COUNT) {
                 context.getCounter(COUNTER_GROUP, "LENGTH_LT_30").increment(1L);
                 return;
             }

             // url
             if (StringUtils.isBlank(values[URL_INDEX])) {
                 context.getCounter(COUNTER_GROUP, "URL_BLANK").increment(1L);
                 return;
             }

             // province id
             String provinceIdValue = values[PROVINCE_ID_INDEX];
             if (StringUtils.isBlank(provinceIdValue)) {
                 context.getCounter(COUNTER_GROUP, "PROVINCEID_BLANK")
                         .increment(1L);
                 return;
             }

             int provinceId;
             try {
                 provinceId = Integer.parseInt(provinceIdValue);
             } catch (NumberFormatException e) {
                 // not an integer: count the record and skip it
                 context.getCounter(COUNTER_GROUP, "PROVINCEID_NOT_NUMBER")
                         .increment(1L);
                 return;
             }

             // map output: (provinceId, 1)
             mapOutputKey.set(provinceId);
             context.write(mapOutputKey, mapOutputValue);
         }
     }

     // step 2: Reducer

     /**
      * Sums the counts received for each province id. Input and output types
      * are identical and the operation is a plain sum, so the class is also
      * registered as the job's combiner.
      */
     public static class WebPvReducer extends
             Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

         private final IntWritable outputValue = new IntWritable();

         /**
          * Writes {@code (provinceId, total)} where {@code total} is the sum of
          * all incoming values for the key.
          *
          * @param key     province id
          * @param values  partial counts for this province id
          * @param context task context used for output
          * @throws IOException          if writing the output record fails
          * @throws InterruptedException if the task is interrupted while writing
          */
         @Override
         protected void reduce(IntWritable key, Iterable<IntWritable> values,
                 Context context) throws IOException, InterruptedException {
             int sum = 0;
             for (IntWritable value : values) {
                 sum += value.get();
             }

             outputValue.set(sum);
             context.write(key, outputValue);
         }
     }

     // step 3: Driver

     /**
      * Configures the job, submits it and waits for completion.
      *
      * @param args remaining command-line arguments after generic options have
      *             been removed: {@code <input path> <output path>}; when empty,
      *             the default paths are used
      * @return 0 if the job succeeded, 1 if it failed, 2 on a usage error
      * @throws Exception if job setup or submission fails
      */
     @Override
     public int run(String[] args) throws Exception {
         String input;
         String output;
         if (args == null || args.length == 0) {
             // no paths supplied: keep the original hard-coded behaviour
             input = DEFAULT_INPUT_PATH;
             output = DEFAULT_OUTPUT_PATH;
         } else if (args.length >= 2) {
             input = args[0];
             output = args[1];
         } else {
             System.err.println("Usage: " + getClass().getSimpleName()
                     + " [generic options] <input path> <output path>");
             return 2;
         }

         Configuration configuration = this.getConf();
         Job job = Job.getInstance(configuration, this.getClass()
                 .getSimpleName());
         job.setJarByClass(WebPvMapReduce.class);

         // input
         FileInputFormat.addInputPath(job, new Path(input));

         // output
         FileOutputFormat.setOutputPath(job, new Path(output));

         // Mapper
         job.setMapperClass(WebPvMapper.class);
         job.setMapOutputKeyClass(IntWritable.class);
         job.setMapOutputValueClass(IntWritable.class);

         // Combiner: pre-aggregates the (provinceId, 1) pairs on the map side
         // to reduce shuffle volume; the final totals are unchanged.
         job.setCombinerClass(WebPvReducer.class);

         // Reducer
         job.setReducerClass(WebPvReducer.class);
         job.setOutputKeyClass(IntWritable.class);
         job.setOutputValueClass(IntWritable.class);

         // submit job -> YARN
         boolean isSuccess = job.waitForCompletion(true);
         return isSuccess ? 0 : 1;
     }

     /**
      * Command-line entry point. Arguments are handed to {@link ToolRunner};
      * the process exits with the status returned by {@link #run(String[])}.
      *
      * @param args optional generic options followed by
      *             {@code <input path> <output path>}
      * @throws Exception if the job cannot be set up or submitted
      */
     public static void main(String[] args) throws Exception {
         Configuration configuration = new Configuration();

         int status = ToolRunner.run(configuration, new WebPvMapReduce(), args);

         // exit program
         System.exit(status);
     }
 }

查看结果

 $ bin/hdfs dfs -text /user/beifeng01/mapreduce/output1/pa*

 1       3527

 2       1672

 3       511

 4       325

 5       776

 6       661

 7       95

 8       80

 9       183

 10      93

 11      135

 12      289

 13      264

 14      374

 15      163

 16      419

 17      306

 18      272

 19      226

 20      2861

 21      124

 22      38

 23      96

 24      100

 25      20

 26      157

 27      49

 28      21

 29      85

 30      42

 32      173

MapReduce清洗日志数据统计PV量的更多相关文章

利用mapreduce清洗日志内存不足问题
package com.libc; import java.io.IOException; import java.io.UnsupportedEncodingException; import ja ...
nginx日志分析及其统计PV、UV、IP
一.nginx日志结构 nginx中access.log 的日志结构: $remote_addr 客户端地址 211.28.65.253 $remote_user 客户端用户名称 -- $time_l ...
基于WebForm+EasyUI的业务管理系统形成之旅 -- 数据统计(Ⅳ)
上篇<基于WebForm+EasyUI的业务管理系统形成之旅 -- 首页快捷方式>,主要介绍通过添加首页快捷方式,快速进入各个应用菜单功能. 将常用的菜单功能作为快捷方式,避免由于寻找诸多 ...
Git 常用命令和统计代码量
摘要分享Git日常操作中常用的命令,分享如何统计在项目中贡献的代码量. 下面列出Git bash常用命令. 1. git clone **(项目地址) 克隆一个git项目到本地,将git项目拉取到本 ...
使用mapreduce对日志进行清洗
网站日志分析项目案例(一)项目介绍:http://www.cnblogs.com/edisonchou/p/4449082.html 网站日志分析项目案例(二)数据清洗:当前页面网站日志分析项目案例 ...
有关“数据统计”的一些概念 -- PV UV VV IP跳出率等
有关"数据统计"的一些概念 -- PV UV VV IP跳出率等版权声明:本文为博主原创文章,未经博主允许不得转载. 此文是本人工作中碰到的,随时记下来的零散概念,特此整理一下. ...
视频网站数据MapReduce清洗及Hive数据分析
一.需求描述利用MapReduce清洗视频网站的原数据,用Hive统计出各种TopN常规指标: 视频观看数 Top10 视频类别热度 Top10 视频观看数 Top20 所属类别包含这 Top20 ...
登录日志的访问日志的统计 MapReduce
登录日志的访问日志的统计 MapReduce <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-commo ...
mapreduce清洗数据
继上篇 MapReduce清洗数据 package mapreduce; import java.io.IOException; import org.apache.hadoop.conf.Confi ...

随机推荐

ArrayList排序Sort（）方法(转)
//使用Sort方法,可以对集合中的元素进行排序.Sort有三种重载方法,声明代码如下所//示. public void Sort(); //使用集合元素的比较方式进行排序 public void S ...
SQL Server ->> Database Programming Object Security Control（数据库编程对象安全控制）
对于SQL Server内编程对象的安全控制是今天我在思考的问题.在MSDN上找到了几篇有用的文章. 首先微软推荐了三种做法: 1)第一种做法是在SQL Server中对一个应用程序对应创建应用程序角 ...
【Leetcode】【Easy】Same Tree
Given two binary trees, write a function to check if they are equal or not. Two binary trees are con ...
redis持久化那些事(kēng)儿
这是一篇包含了介绍性质和吐槽性质的日志.主要介绍一下我学习redis持久化时候被坑的经历.redis的使用介绍现在没有打算写,因为比较多,以我如此懒的性格...好吧,还是有点这方面想法的,不过一篇博客 ...
ADF系列-2.EO的高级属性
在上一篇博客 ADF系列-1.EO的各个属性初探中介绍了EO的一些常用简单属性.本次将介绍EO中一些比较常用的一些高级属性一.基于Sequence创建EO,一下介绍三种方式(以HR用户的Emplo ...
UVa 1625 - Color Length（线性DP + 滚动数组）
链接: https://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&page=show_problem& ...
富文本使用之wangEditor3
一.介绍: wangEditor —— 轻量级 web 富文本编辑器,配置方便,使用简单.支持 IE10+ 浏览器. 二.使用方式: 直接下载:https://github.com/wangfupen ...
iPhone 耳机在PC电脑上使用方法
把主声道(Master)从正中间调整到最左或者最右就行了
iOS之序列化与反序列化
所谓的序列化和反序列化就是将数据结构或对象和二进制串之间相互转换的过程: 本人的理解是当你于写数据需要本地存储时,即将你的数据写到硬盘上的时候,你就必须对他进行序列化,转换成二进制文件,从而便于在磁盘 ...
【转载】iPhone屏幕尺寸、分辨率及适配
iPhone屏幕尺寸.分辨率及适配转载http://m.blog.csdn.net/article/details?id=42174937 1.iPhone尺寸规格 iPhone 整机宽度Width ...

MapReduce清洗日志数据统计PV量

MapReduce清洗日志数据统计PV量的更多相关文章

随机推荐

热门专题