MapReduce数据筛选

需求：

编写MapReduce程序，算出高峰时间段（如9-10点）内被访问得最频繁的表、这段时间内访问这张表次数最多的用户，以及该用户访问这张表的总时间开销。

测试数据：

TableName(表名)，Time(时间)，User(用户)，TimeSpan(时间开销)

*t003 6:00 u002 180

*t003 7:00 u002 180

*t003 7:08 u002 180

*t003 7:25 u002 180

*t002 8:00 u002 180

*t001 8:00 u001 240

*t001 9:00 u002 300

*t001 9:11 u001 240

*t003 9:26 u001 180

*t001 9:39 u001 300

*t001 10:00 u001 200

代码

方法一：

package com.table.main;

import java.io.IOException;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableUsed {

	public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String[] split = value.toString().substring(1).split("\\s+");

			Long time = Long.parseLong(split[1].charAt(0) + "");

			// 筛选9-10点使用过的表

			if (time == 9 || time == 10) {

				context.write(new Text(split[0]), new Text(split[2] + ":" + split[3]));

			}

		}

	}

	public static class MRReducer extends Reducer<Text, Text, Text, Text> {

		// 存放使用量最大的表的表名及用户

		public static HashMap<String, HashMap<String, Integer>> map = new HashMap<String, HashMap<String, Integer>>();

		// 最大用使用量

		public static int max_used_num = 0;

		// 使用量最大的表

		public static String table = "";

		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

			HashMap<String, Integer> user_map = new HashMap<String, Integer>();

			int table_used_num = 0;

			for (Text t : values) {

				table_used_num++;

				String[] split = t.toString().split(":");

				// 如map中已经存在的用户则把使用时间叠加 不存在则添加该用户

				if (user_map.get(split[0]) == null) {

					user_map.put(split[0], Integer.parseInt(split[1]));

				} else {

					Integer use_time = user_map.get(split[0]);

					use_time += Integer.parseInt(split[1]);

					user_map.put(split[0], use_time);

				}

			}

			if (table_used_num > max_used_num) {

				map.put(key.toString(), user_map);

				table = key.toString();

				max_used_num = table_used_num;

			}

		}

		protected void cleanup(Context context) throws IOException, InterruptedException {

			// 循环map，查出使用时间最长的用户信息

			HashMap<String, Integer> map2 = map.get(table);

			int max = 0;

			String max_used_user = "";

			for (HashMap.Entry<String, Integer> m : map2.entrySet()) {

				if (m.getValue() > max) {

					max = m.getValue();

					max_used_user = m.getKey();

				}

			}

			context.write(new Text(table), new Text("\t" + max_used_user + "\t" + map2.get(max_used_user)));

		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(TableUsed.class);

		job.setMapperClass(MRMapper.class);

		job.setReducerClass(MRReducer.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop5:9000/input/table_time.txt"));

		FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop5:9000/output/put2"));

		System.out.println(job.waitForCompletion(true) ? 1 : 0);

	}

}

缺点：只算出了使用时间最长的用户，没有判断该用户是否也是访问次数最多的用户。

方法二：

package com.table.main;

import java.io.IOException;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableUsed {

	public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String[] split = value.toString().substring(1).split("\\s+");

			Long time = Long.parseLong(split[1].charAt(0) + "");

			// 筛选9-10点使用过的表

			if (time == 9 || time == 10) {

				context.write(new Text(split[0]), new Text(split[2] + ":" + split[3]));

			}

		}

	}

	public static class MRReducer extends Reducer<Text, Text, Text, Text> {

		// 					表的最大使用次数		使用该表最多的用户

		public static int max_used_num = 0, max_user_used = 0;

		//						使用量最大的表		使用该表最多的用户名

		public static String max_used_table = "", user_name = "";

		// 					使用次数最多的用户的 使用时间

		public static Integer user_used_time = 0;

		protected void reduce(Text key, Iterable<Text> values, Context context)

				throws IOException, InterruptedException {

			HashMap<String, Integer> user_map = new HashMap<String, Integer>();

			HashMap<String, Integer> user_used_map = new HashMap<String, Integer>();

			int table_used_num = 0;// 表的使用次数

			Integer use_num = 0;// 用户使用次数

			Integer use_time = 0;//使用时间

			String username = "";//用户名

			for (Text t : values) {

				table_used_num++;

				String[] split = t.toString().split(":");

				// 如map中已经存在的用户则把使用时间叠加 不存在则添加该用户

				if (user_map.get(split[0]) == null) {

					user_map.put(split[0], Integer.parseInt(split[1]));

					user_used_map.put(split[0], 1);

				} else {

					use_time = user_map.get(split[0]);

					use_time += Integer.parseInt(split[1]);

					user_map.put(split[0], use_time);

					use_num = user_used_map.get(split[0]);

					use_num ++;

					user_used_map.put(split[0], use_num);

				}

				/**

				 * 判断该用户是否为此表使用次数最多的,

				 * 是则存进user_map和user_used_map，否则不存;

				 * 由于只需要求使用量最多的用户,因此使用量不是最多用户没有必要存在于map中

				 */

				if (use_num > max_user_used) {

					username = split[0];

					max_user_used = use_num;

					user_used_time = use_time;

					//此处也可以不remove()

					user_used_map.remove(split[0]);

					user_map.remove(split[0]);

				}

			}

			if (table_used_num > max_used_num) {

				max_used_table = key.toString();

				max_used_num = table_used_num;

				user_name = username;

			}

		}

		protected void cleanup(Context context) throws IOException, InterruptedException {

			context.write(new Text(max_used_table), new Text(max_user_used + "\t" + user_name + "\t" + user_used_time));

		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(TableUsed.class);

		job.setMapperClass(MRMapper.class);

		job.setReducerClass(MRReducer.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop5:9000/input/table_time.txt"));

		FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop5:9000/output/put6"));

		System.out.println(job.waitForCompletion(true) ? 1 : 0);

	}

}

MapReduce数据筛选的更多相关文章

ASP.NET MVC5+EF6+EasyUI 后台管理系统（81）-数据筛选（万能查询）
系列目录前言听标题的名字似乎是一个非常牛X复杂的功能,但是实际上它确实是非常复杂的,我们本节将演示如何实现对数据,进行组合查询(数据筛选) 我们都知道Excel中是如何筛选数据的.就像下面一样他 ...
DataGridView如何实现列标头带数据筛选功能，就象Excel高级筛选功能一样
'近日有本论坛网友问:DataGridView如何实现列标头带数据筛选功能,就象Excel高级筛选功能一样 '今晚正好闲着没事,加之以前也没用到过这个需求,所以就写了个模拟功能,供各位坛友酌情参考. ...
layui table 根据条件改变更换表格颜色高亮显示数据筛选
请问想让当layui表格的某个字段符合某个条件的时候,让该行变颜色.这样可以实现么. layui数据表格怎么更换表格颜色 layui表格通过判断某一行中的某一列的值进行设置这一行的颜色 LayUI之 ...
C#进行数据筛选（二）
这里介绍LINQ+Lambda表达式进行数据筛选的方式这里是第一种方式,还是使用了if条件语句去判断,根据选择的条件去筛选出我所需要的数据 public GxAnaly SelectDay(stri ...
C#进行数据筛选（一）
这里介绍数据筛选的第一种方式,不用过滤器,给新手看得 public DataTable SourceList(string Wmain, string OrderNo, string Process) ...
python之pandas数据筛选和csv操作
本博主要总结DaraFrame数据筛选方法(loc,iloc,ix,at,iat),并以操作csv文件为例进行说明 1. 数据筛选 a b c (1)单条件筛选 df[df[] # 如果想筛选a列的取 ...
Pandas 数据筛选,去重结合group by
Pandas 数据筛选,去重结合group by 需求今小伙伴有一个Excel表, 是部门里的小伙9月份打卡记录, 关键字段如下: 姓名, 工号, 日期, 打卡方式, 时间, 详细位置, IP地址. ...
【杂记】mysql 左右连接查询中的NULL的数据筛选问题，查询NULL设置默认值，DATE_FORMAT函数
MySQL左右连接查询中的NULL的数据筛选问题 xpression 为 Null,则 IsNull 将返回 True:否则 IsNull 将返回 False. 如果 expression 由多个变量 ...
4-Pandas之数据类型与数据筛选
一.数据类型 1.Pandas的数据类型主要结合了pandas和numpy两个模块中的数据类型,包括以下几种: float int bool datetime64[ns]------>日期类型 ...

随机推荐

Webphere WAS 启动
如果WebSphere是默认安装的话,是自带两个profile,Dmgr和AppSrv,只需要到指定目录下启动管理器和节点即可/usr/IBM/WebSphere/AppServer/profiles ...
（转）Linux-epoll
在Linux网络编程中,Linux内核2.6版本之前大多都是用 select() 作为非阻塞的事件触发模型,但是效率低,使用受限已经很明显的暴露了select()(包括poll)的缺陷,为了解决这些缺 ...
grunt的简单应用
grunt是干什么的呢,一句话:自动化.对于需要反复重复的任务,例如压缩(minification).编译.单元测试.linting等,自动化工具可以减轻你的劳动,简化你的工作.当你在 Gruntfi ...
《JAVA多线程编程核心技术》笔记：第七章：拾遗增补
一.线程的状态 1.1 状态种类及理解:(一共6个) 文字说明和理解: NEW状态:线程实例化后还从未执行start()方法时的状态: RUNNABLE状态:线程进入运行的状态: TERMINATED ...
《JAVA多线程编程核心技术》笔记：第六章：单例模式与多线程
一.立即加载/"饿汉模式"和延迟加载/"懒汉模式" 立即加载(又称饿汉模式):在使用类的时候已经将对象创建完毕,常见实现方法是直接new实例化延迟加载(又称懒 ...
Powershell About Active Directory Group Membership of a domain user
使用Get-User命令去寻找group membership of a domain user $((Get-ADUser Wendy -Properties *).MemberOf -split ...
并发测试 java.lang.OutOfMemoryError: GC overhead limit exceeded Xms Xmx 阻塞请求单节点请求分发负载均衡
at javax.servlet.http.HttpServlet.service(HttpServlet.java:705) at javax.servlet.http.HttpServlet.se ...
pycharm调试scrapy
pycharm调试scrapy 创建一个run.py文件作为调试入口 run.py中,name是要调试的爬虫的名字(注意,是爬虫类中的name,而不是爬虫类所在文件的名字) 拼接爬虫运行的命令,然后用 ...
django_forms组件用ajax发送数据验证注册
forms组件 -forms是什么? 就是一个类,可以校验字段(前台传过来的字段) -怎么用: -校验字段功能: -先写一个类,继承Form from django.shortcuts import ...

MapReduce数据筛选

需求：

测试数据：

代码

MapReduce数据筛选的更多相关文章

随机推荐

热门专题