利用mapreduce清洗日志内存不足问题

package com.libc;

import java.io.IOException;

import java.io.UnsupportedEncodingException;

import java.util.HashMap;

import java.util.Iterator;

import java.util.Map;

import java.util.Set;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class Process {

	public static class TokenizerMapper extends

			Mapper<Object, Text, Text, Text> {

		private Text word = new Text();

		public void map(Object key, Text value, Context context)

				throws IOException, InterruptedException {

			// TODO Auto-generated method stub

			String datas = "";

			try {

				datas = new String(value.getBytes(), 0, value.getLength(),

						"GBK");

			} catch (UnsupportedEncodingException e1) {

				// TODO Auto-generated catch block

				e1.printStackTrace();

			}

			// datas = value.toString();

			try {

				String[] split = datas.split(" time=");

				// 处理头中包含空格的字段

				Pattern p = Pattern.compile("phonemodel=\"(.*?)\"");

				String pm = getIndex(split[0], p);

				split[0] = split[0].replaceAll(pm, pm.replace(" ", ""));

				Pattern p1 = Pattern.compile("networktype=\"(.*?)\"");

				String nt = getIndex(split[0], p1);

				split[0] = split[0].replaceAll(nt, nt.replace(" ", ""));

				for (int i = 1; i < split.length; i++) {

					String[] codes = split[i].split(" ", 4);

					int headLen = split[0].split(" ").length;

					if (headLen != 20) {

						// 丢掉错误日志

						continue;

					}

					// 处理旧版本日志判别标准：|

					if (codes[2].equals("code=\"100\"")){

						if(codes[3].indexOf("contact_name")>-1){

							codes[3] = process100(codes[3]);

						}

							codes[3] = codes[3].replace(' ', '#');

					}else if(codes[2].equals("code=\"101\"") ){

						if(codes[3].indexOf("message_to_")>-1){

							codes[3] = process101(codes[3]);

						}

							codes[3] = codes[3].replace(' ', '#');

					}

					else if(codes[2].equals("code=\"102\"")){

						if(codes[3].indexOf("caller_n")>-1||codes[3].indexOf("caller_d")>-1){

							codes[3] = process102(codes[3]);

						}

							codes[3] = codes[3].replace(' ', '#');

					}else{

						codes[3] = codes[3].replace("  ", " ");

					}

					String collect = split[0] + " time=" + codes[0] + " "

							+ codes[1] + " " + codes[2] + " " + codes[3];

					word.set(collect);

					context.write(word, new Text(""));

				}

			} catch (Exception e) {

				// TODO Auto-generated catch block

			}

		}

	}

	/**
	 * Regroups the per-field contact entries of a code-100 payload into one
	 * entry per contact index.
	 *
	 * <p>The payload is split on single spaces. For every non-empty token the
	 * digits between {@code '_'} and {@code '='} are taken as the contact index
	 * and the text between the first and the last double quote as the value.
	 * Tokens starting with {@code contact_name_} set the name, tokens starting
	 * with {@code contact_num_} set the number. Tokens without an index are
	 * ignored.
	 *
	 * @param code the raw payload
	 * @return the collected contacts rendered by {@link #printToString(Map)}
	 * @throws Exception declared for callers; nothing here throws a checked
	 *                   exception
	 */
	public static String process100(String code) throws Exception{
		final Pattern indexPattern = Pattern.compile("_(\\d*)=");
		final Pattern valuePattern = Pattern.compile("\"(.*)\"");
		final HashMap<String, Contact> contactsByIndex = new HashMap<String, Contact>();

		for (String token : code.split(" ")) {
			if (token.equals("")) {
				continue;
			}
			String index = getIndex(token, indexPattern);
			if (index == null) {
				continue;
			}
			String value = getIndex(token, valuePattern);

			// Reuse the entry for this index if an earlier token created it.
			Contact contact = contactsByIndex.get(index);
			if (contact == null) {
				contact = new Contact();
			}

			if (token.startsWith("contact_name_")) {
				contact.contactName = value;
			} else if (token.startsWith("contact_num_")) {
				contact.contactNum = value;
			}
			contact.index = index;
			contactsByIndex.put(index, contact);
		}
		return printToString(contactsByIndex);
	}

	/**
	 * Regroups the per-field message entries of a code-101 payload into one
	 * entry per message index.
	 *
	 * <p>The payload is split on a double quote followed by two spaces. For
	 * every piece the digits between {@code '_'} and {@code '='} are taken as
	 * the message index and everything after the first double quote as the
	 * value. Pieces starting with {@code message_time_} set the time, pieces
	 * starting with {@code message_to_} set the recipient. Pieces without an
	 * index are ignored.
	 *
	 * @param code the raw payload
	 * @return the collected messages rendered by {@link #printToString(Map)}
	 * @throws Exception declared for callers; nothing here throws a checked
	 *                   exception
	 */
	public static String process101(String code) throws Exception{
		final Pattern indexPattern = Pattern.compile("_(\\d*)=");
		final Pattern valuePattern = Pattern.compile("\"(.*)");
		final HashMap<String, Message> messagesByIndex = new HashMap<String, Message>();

		for (String piece : code.split("\"  ")) {
			String index = getIndex(piece, indexPattern);
			if (index == null) {
				continue;
			}
			String value = getIndex(piece, valuePattern);

			// Reuse the entry for this index if an earlier piece created it.
			Message message = messagesByIndex.get(index);
			if (message == null) {
				message = new Message();
			}

			if (piece.startsWith("message_time_")) {
				message.messageTime = value;
			} else if (piece.startsWith("message_to_")) {
				message.messageTo = value;
			}
			message.index = index;
			messagesByIndex.put(index, message);
		}
		return printToString(messagesByIndex);
	}

	/**
	 * Regroups the per-field call-log entries of a code-102 payload into one
	 * entry per call index.
	 *
	 * <p>The payload is split on a double quote followed by two spaces. For
	 * every piece the digits between {@code '_'} and {@code '='} are taken as
	 * the call index and everything after the first double quote as the value.
	 * The piece prefix ({@code caller_date_}, {@code caller_duration_},
	 * {@code caller_name_}, {@code caller_num_}) selects which field of the
	 * entry receives the value. Pieces without an index are ignored.
	 *
	 * @param code the raw payload
	 * @return the collected call logs rendered by {@link #printToString(Map)}
	 * @throws Exception declared for callers; nothing here throws a checked
	 *                   exception
	 */
	public static String process102(String code) throws Exception{
		final Pattern indexPattern = Pattern.compile("_(\\d*)=");
		final Pattern valuePattern = Pattern.compile("\"(.*)");
		final HashMap<String, CallLog> callsByIndex = new HashMap<String, CallLog>();

		for (String piece : code.split("\"  ")) {
			String index = getIndex(piece, indexPattern);
			if (index == null) {
				continue;
			}
			String value = getIndex(piece, valuePattern);

			// Reuse the entry for this index if an earlier piece created it.
			CallLog callLog = callsByIndex.get(index);
			if (callLog == null) {
				callLog = new CallLog();
			}

			if (piece.startsWith("caller_date_")) {
				callLog.callerDate = value;
			} else if (piece.startsWith("caller_duration_")) {
				callLog.callerDuration = value;
			} else if (piece.startsWith("caller_name_")) {
				callLog.callerName = value;
			} else if (piece.startsWith("caller_num_")) {
				callLog.callerNum = value;
			}
			callLog.index = index;
			callsByIndex.put(index, callLog);
		}
		return printToString(callsByIndex);
	}

	/**
	 * Concatenates the string form of every value in the map, each followed
	 * by {@code '|'}.
	 *
	 * <p>Values are visited in the map's own iteration order. A {@code null}
	 * value is rendered as {@code "null"}.
	 *
	 * @param hs the map whose values are rendered; keys are not used
	 * @return the concatenated values, or an empty string for an empty map
	 */
	public static String printToString(Map<?, ?> hs) {
		// A builder avoids re-copying the growing result on every entry.
		StringBuilder result = new StringBuilder();
		for (Object value : hs.values()) {
			result.append(value).append('|');
		}
		return result.toString();
	}

	/**
	 * Returns the first capture group of the first match of {@code p} in
	 * {@code code}.
	 *
	 * @param code the text to search
	 * @param p    a pattern with at least one capture group
	 * @return the text captured by group 1, or {@code null} if the pattern
	 *         does not match anywhere in {@code code}
	 */
	public static String getIndex(String code, Pattern p) {
		Matcher firstMatch = p.matcher(code);
		return firstMatch.find() ? firstMatch.group(1) : null;
	}

	public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

		public void reduce(Text key, Text rr, Context context)

				throws IOException, InterruptedException {

			context.write(key, new Text(""));

		}

	}

	/** One contact entry collected from a code-100 payload. */
	public static class Contact {

		/** Index digits taken from the field name. */
		public String index;

		/** Value of the {@code contact_name_} field, if one was seen. */
		public String contactName;

		/** Value of the {@code contact_num_} field, if one was seen. */
		public String contactNum;

		/**
		 * Renders the entry as {@code contact_<index>=<name>;<num>}; unset
		 * fields appear as {@code null}.
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder("contact_");
			text.append(index).append("=");
			text.append(contactName).append(";").append(contactNum);
			return text.toString();
		}
	}

	/** One message entry collected from a code-101 payload. */
	public static class Message {

		/** Index digits taken from the field name. */
		public String index;

		/** Value of the {@code message_time_} field, if one was seen. */
		public String messageTime;

		/** Value of the {@code message_to_} field, if one was seen. */
		public String messageTo;

		/**
		 * Renders the entry as {@code message_<index>=<to>;<time>}; unset
		 * fields appear as {@code null}.
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder("message_");
			text.append(index).append("=");
			text.append(messageTo).append(";").append(messageTime);
			return text.toString();
		}
	}

	/** One call-log entry collected from a code-102 payload. */
	public static class CallLog {

		/** Index digits taken from the field name. */
		public String index;

		/** Value of the {@code caller_duration_} field, if one was seen. */
		public String callerDuration;

		/** Value of the {@code caller_num_} field, if one was seen. */
		public String callerNum;

		/** Value of the {@code caller_name_} field, if one was seen. */
		public String callerName;

		/** Value of the {@code caller_date_} field, if one was seen. */
		public String callerDate;

		/**
		 * Renders the entry as
		 * {@code callLog_<index>=<name>;<num>;<date>;<duration>}; unset fields
		 * appear as {@code null}.
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder("callLog_");
			text.append(index).append("=");
			text.append(callerName).append(";");
			text.append(callerNum).append(";");
			text.append(callerDate).append(";");
			text.append(callerDuration);
			return text.toString();
		}
	}

	/**
	 * Configures and runs the "process" job.
	 *
	 * <p>Generic Hadoop options are parsed out of {@code args} first; exactly
	 * two arguments must remain, the input path and the output path. The
	 * process exits with 2 on a usage error, 0 when the job succeeds and 1
	 * when it fails.
	 *
	 * @param args generic options followed by {@code <in> <out>}
	 * @throws Exception if job setup or execution fails
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		GenericOptionsParser optionsParser = new GenericOptionsParser(conf, args);
		String[] paths = optionsParser.getRemainingArgs();
		if (paths.length != 2) {
			System.err.println("Usage: process <in> <out>");
			System.exit(2);
		}

		Job job = new Job(conf, "process");
		job.setJarByClass(Process.class);

		// The same class serves as combiner and reducer.
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, new Path(paths[0]));
		FileOutputFormat.setOutputPath(job, new Path(paths[1]));

		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}

}

　　此版本为第一版，运行几天后服务器日志量暴增，导致 JVM 堆内存不足（内存溢出）错误，

因此修改为第二版，使 jvm 内存可以自定义配置。

方案一：

/opt/aimcpro/mapred/bin/hadoop jar libc_process.jar com.libc.Process -D mapred.child.java.opts=-Xmx2048m hdfs://mycluster/libc/input hdfs://mycluster/libc/output

方案二：

Configuration cc = job.getConfiguration();
String mem = cc.get("mapred.child.java.opts");
System.out.println(mem);

即在代码中更改设置：上面的代码只是读取并打印当前值，要真正修改，还需在提交 job 之前调用 cc.set("mapred.child.java.opts", "-Xmx2048m")。

当jvm从1G设为2G后，job顺利通过了

数据一直在增长啊：

20140801 6058177
20140802 7490572
20140803 8114244
20140804 7278280
20140805 7673678
20140806 8213066
20140807 9192677
20140808 9362143
20140809 10989437
20140810 11396093
20140811 10229799
20140812 10346527
20140813 10064709
20140814 11017971
20140815 11634611
20140818 10422815
20140819 12874181
20140820 13478590
20140821 12530974
20140822 11590312
20140823 15705258

利用mapreduce清洗日志内存不足问题的更多相关文章

MapReduce清洗日志数据统计PV量
package mapreduce.webpv; import java.io.IOException; import org.apache.commons.lang.StringUtils; imp ...
视频网站数据MapReduce清洗及Hive数据分析
一.需求描述利用MapReduce清洗视频网站的原数据,用Hive统计出各种TopN常规指标: 视频观看数 Top10 视频类别热度 Top10 视频观看数 Top20 所属类别包含这 Top20 ...
利用RELK进行日志收集
利用RELK进行日志收集发布时间:April 3, 2018 // 分类:运维工作,开发笔记,python // No Comments 前不久在做应急的总是遇到要求对日志进行分析溯源,当时就想到如 ...
Hadoop 中利用 mapreduce 读写 mysql 数据
Hadoop 中利用 mapreduce 读写 mysql 数据有时候我们在项目中会遇到输入结果集很大,但是输出结果很小,比如一些 pv.uv 数据,然后为了实时查询的需求,或者一些 OLAP ...
.NET Core的日志[5]:利用TraceSource写日志
从微软推出第一个版本的.NET Framework的时候,就在“System.Diagnostics”命名空间中提供了Debug和Trace两个类帮助我们完成针对调试和跟踪信息的日志记录.在.NET ...
Hadoop阅读笔记（二）——利用MapReduce求平均数和去重
前言:圣诞节来了,我怎么能虚度光阴呢?!依稀记得,那一年,大家互赠贺卡,短短几行字,字字融化在心里:那一年,大家在水果市场,寻找那些最能代表自己心意的苹果香蕉梨,摸着冰冷的水果外皮,内心早已滚烫.这一 ...
利用TraceSource写日志
利用TraceSource写日志从微软推出第一个版本的.NET Framework的时候,就在“System.Diagnostics”命名空间中提供了Debug和Trace两个类帮助我们完成针对调试 ...
hadoop笔记之MapReduce的应用案例(利用MapReduce进行排序)
MapReduce的应用案例(利用MapReduce进行排序) MapReduce的应用案例(利用MapReduce进行排序) 思路: Reduce之后直接进行结果合并具体样例: 程序名:Sort. ...
SQL调优日志--内存问题
SQL调优日志--内存问题排查入门篇概述很多系统的性能问题,是由内存导致的.内存不够会导致页面频繁换入换出,IO队列高,进而影响数据库整体性能. 排查内存对数据库性能非常重要.那么我当出现问 ...

随机推荐

Hoeffding连接到机器学习
统计学场景: 一个罐子中有红球和绿球,红球比例$v$未知,数量未知,如何得到红球比例?方法---随机抽样N个球,在其中红球占比为$u$ 由hoeffding可以知道:$P(|u-v|>\epsi ...
在centos中添加开机自启动服务
将服务的shell脚本添加到/etc/rc.d的rc.local文件的最后面,需要在服务名称的前面加上其路径. 例如我要将httpd添加到开机自启动中,需要在rc.local添加如下代码 /usr/s ...
Android中数据库的操作流程详解
Android中数据库的操作方法: 1.Android平台提供了一个数据库辅助类来创建或打开数据库. 这个辅助类继承自SQLiteOpenHelper类.继承和扩展SQLiteOpenHelper类主 ...
Tomcat 设置为服务使用脚本 service
进入到Tomcat的bin目录下,如果使用的是Windows系统则使用service.bat进行操作;Linux系统则使用service.sh进行. service.bat install/remov ...
struts2中的表达元素标签使用详解
级联标签是使用:一级下拉框应该使用map对象的key集合作为下拉框元素,二级下了框应该使用一级下拉框对应的选择值自动的弹出待选择的元素值(集合) 页面代码如下:<s:set name=" ...
js 获取月份格式yy-mm-dd
/** * 获取上一个月 * * @date 格式为yyyy-mm-dd的日期,如:2014-01-25 */ function getPreMonth(date) { var arr = date. ...
你的阅读造就了你 You are what you read
在豆瓣上看到的一篇很有思想和正能量的文章,在这里请允许我用原创的方式来呈现给大家.如果你是在校的大学生或者研究生博士生,这篇文章会让你有很多的共鸣.如果你已真正的踏入这个社会,也将受益匪浅. 电脑 ...
linux shell 札记
shell 数组数组索引: 单个元素索引: ${array[n]} 全部元素: ${array[*]} 或者 ${array[@]} 部分索引: ${array[2:]} 数组所有元素统一加 ...
OpenGL绘制环形渐变
开始看计算机图形学和OpenGL,挺有意思就自己随便写了一些效果. 以中间点坐标为圆心,计算每一点和圆心距离,根据距离算出一个RGB值,于是整体便呈现环形分布. 代码如下: #include < ...
stormzhang的推荐！
欢迎转载,但请务必在明确位置注明出处!http://stormzhang.com/android/2014/07/07/learn-android-from-rookie/ QQ交流群:入群理由请正确 ...

利用mapreduce清洗日志内存不足问题

利用mapreduce清洗日志内存不足问题的更多相关文章

随机推荐

热门专题