[Hadoop] 1. Analyzing logs with MapReduce and sorting the aggregated results
1. There is plenty of material online about setting up a Hadoop cluster, so I will not repeat it here. Running a Hadoop program locally does not require an HDFS cluster anyway; a local run like the one here is only a demo, and a real cluster only becomes necessary when there is genuinely big data to process.
2. The same goes for word-count examples: between the official documentation and everything online, a quick search turns up hundreds of them.
What I could not find, however, was how to globally sort the word-count results. After a long, fruitless search I tinkered my way to a demo of my own. I have also decided to stop making excuses: I kept telling myself I was too busy to write blog posts, but looking back that was just an excuse, because there will never be enough spare time to learn things at leisure.
Enough chatter. The key to sorting the results is actually simple: we lean on MapReduce's own sort mechanism and run two MapReduce jobs back to back.
In the second job we swap the key and the value, and the framework then sorts the result data by count for us, as sketched below.
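Before the full listing, here is a minimal sketch of that swap. The class name SwapMapper is purely illustrative (the complete code below implements the same idea in SortMap1), and it uses the same imports as the full listing: the first job writes lines of the form "exception-line \t count", and the second job's mapper re-emits them as <count, line> so that the shuffle phase sorts the records by count.

    // Illustrative sketch only; the real implementation below is SortMap1 / SortReduce1.
    public static class SwapMapper extends Mapper<Object, Text, IntWritable, Text> {
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Job 1 output is "line \t count"; emit <count, line> so MapReduce sorts by count.
            String[] parts = value.toString().split("\t");
            if (parts.length == 2) {
                context.write(new IntWritable(Integer.parseInt(parts[1])), new Text(parts[0]));
            }
        }
    }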
package cn.cutter.demo.hadoop.demo;

import cn.cutter.demo.hadoop.utils.ZipUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

import java.io.*;
import java.net.URI;
import java.util.*;

/**
* @program: cutter-point
 * @description: Hadoop demo, with result sorting added
* @author: xiaof
* @create: 2018-11-24 20:47
*
* (input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
**/
public class WorkCount4 {

    /**
     * Mapper class
     */
    public static class TokenizerMapper extends Mapper<Object, Text, NewKey1, IntWritable> {
        static enum CountersEnum { INPUT_WORDS }
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        // Fields carried over from the stock WordCount v2.0 example; there is no setup() here,
        // so caseSensitive stays false and patternsToSkip stays empty.
        private boolean caseSensitive;
        private Set<String> patternsToSkip = new HashSet<String>();
        private Configuration conf;
        private BufferedReader fis;

        public void map(Object key, Text value, Mapper<Object, Text, NewKey1, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "").replace(" ", "").trim();
            }
            // Only lines that look like exceptions are counted.
            if (line.contains("Exception") || line.contains("exception")) {
                NewKey1 newKey1 = new NewKey1(line, 1L);
                context.write(newKey1, one);
            }
        }
    }

    /**
     * Reducer class
     */
    public static class IntSumReducer extends Reducer<NewKey1, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(NewKey1 newKey1, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            newKey1.setSecond((long) result.get());
            // Output format is "exception line \t count"; job 2 re-reads and sorts this output.
            context.write(new Text(newKey1.getFirst()), result);
        }
    }

    /**
     * Composite key: the log line plus its count, comparable so the framework can sort it.
     */
    public static class NewKey1 implements WritableComparable<NewKey1> {
        private String first;
        private Long second;

        public NewKey1() {}

        public NewKey1(String first, Long second) {
            this.first = first;
            this.second = second;
        }

        public String getFirst() {
            return first;
        }

        public void setFirst(String first) {
            this.first = first;
        }

        public Long getSecond() {
            return second;
        }

        public void setSecond(Long second) {
            this.second = second;
        }

        @Override
        public int compareTo(NewKey1 o) {
            // Sort by the count first, then by the line itself.
            Long result = this.second - o.second;
            if (result != 0) {
                return result.intValue();
            } else {
                return first.compareTo(o.first);
            }
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.write((this.first + "\n").getBytes());
            dataOutput.writeLong(this.second);
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.first = dataInput.readLine();
            this.second = dataInput.readLong();
        }

        @Override
        public int hashCode() {
            // Must be deterministic, otherwise partitioning and grouping misbehave.
            return this.first.hashCode() * 31 + this.second.hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof NewKey1)) {
                return false;
            }
            NewKey1 newKey1 = (NewKey1) obj;
            // Compare the Long values, not the object references.
            return this.first.equals(newKey1.first) && this.second.equals(newKey1.second);
        }
    }

    public static String random(int i) {
        String sources = "0123456789";
        Random random = new Random();
        StringBuffer flag = new StringBuffer();
        for (int j = 0; j < i; ++j) {
            flag.append(sources.charAt(random.nextInt(9)));
        }
        return flag.toString();
    }

    /**
     * Mapper of the second job: swap key and value so the shuffle sorts by count.
     */
    public static class SortMap1 extends Mapper<Object, Text, IntWritable, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, IntWritable, Text>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String words[] = line.split("\t");
            if (words.length == 2) {
                IntWritable intWritable = new IntWritable();
                intWritable.set(Integer.valueOf(words[1]));
                context.write(intWritable, new Text(words[0]));
            }
        }
    }

    public static class SortReduce1 extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Emit every line that shares this count, not just the first one.
            for (Text value : values) {
                context.write(value, key);
            }
        }
    }

    /**
     * To read data from a remote input directory, the path must be an HDFS path; a local path will not work there.
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        System.setProperty("hadoop.home.dir", "F:\\hadoop-2.7.7");
        Configuration conf = new Configuration();
        GenericOptionsParser optionsParser = new GenericOptionsParser(conf, args);
        // conf.set("fs.default.name", "hdfs://127.0.0.1:9000");

        // Job 1: count the exception lines.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WorkCount4.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setMapOutputKeyClass(NewKey1.class);
        // job.setCombinerClass(NewKey1.class);
        // Set the reducer class.
        job.setReducerClass(IntSumReducer.class);
        // Set the output <k3, v3> types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // First unpack the zip files and delete the archives.
        // H:\ideaworkspace\1-tmp\input H:\ideaworkspace\1-tmp\output
        String temp[] = {"H:\\ideaworkspace\\1-tmp\\input", "H:\\ideaworkspace\\1-tmp\\output"};
        String name = random(temp.length);
        args = temp;
        ZipUtil.unZipFilesAndDel(args[0]);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1] + name));

        // Wrap job 1 in a controller.
        ControlledJob ctrlJob1 = new ControlledJob(conf);
        ctrlJob1.setJob(job);

        // Job 2: swap key and value to sort the counts.
        Job job2 = Job.getInstance(conf, "word count2");
        job2.setJarByClass(WorkCount4.class);
        job2.setMapperClass(SortMap1.class);
        job2.setMapOutputKeyClass(IntWritable.class);
        job2.setMapOutputValueClass(Text.class);
        // Set the reducer class.
        job2.setReducerClass(SortReduce1.class);
        // Set the output <k3, v3> types.
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(IntWritable.class);

        // Wrap job 2 in a controller; its input is job 1's output directory.
        ControlledJob ctrlJob2 = new ControlledJob(conf);
        ctrlJob2.setJob(job2);
        FileInputFormat.setInputPaths(job2, new Path(args[1] + name));
        FileOutputFormat.setOutputPath(job2, new Path(args[1] + name + "-result"));

        // Declare the dependency between the jobs: job 2 depends on job 1's output.
        ctrlJob2.addDependingJob(ctrlJob1);

        // The master controller drives both jobs.
        JobControl jobCtrl = new JobControl("myCtrl");
        jobCtrl.addJob(ctrlJob1);
        jobCtrl.addJob(ctrlJob2);

        // JobControl must be run in a thread; do not forget this.
        Thread thread = new Thread(jobCtrl);
        thread.start();
        while (true) {
            if (jobCtrl.allFinished()) {
                System.out.println(jobCtrl.getSuccessfulJobList());
                jobCtrl.stop();
                break;
            }
            // Avoid busy-waiting while the jobs run.
            Thread.sleep(500);
        }
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
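One note on the javadoc above: to read input from a remote directory the jobs must be pointed at HDFS rather than the local filesystem. A minimal sketch of what that might look like follows, where the NameNode address and the /logs paths are assumptions rather than values from this setup. On Hadoop 2.x the property is fs.defaultFS; the commented-out fs.default.name in main() is the deprecated older name for the same setting.

    // Sketch only: run the same jobs against HDFS. The NameNode address and paths are assumptions.
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://127.0.0.1:9000");

    Job job = Job.getInstance(conf, "word count");
    // ... same mapper/reducer/output-type setup as in main() above ...
    FileInputFormat.addInputPath(job, new Path("hdfs://127.0.0.1:9000/logs/input"));
    FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/logs/output"));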
A helper class: the logs on the server are compressed automatically, so before any analysis can happen they first have to be unzipped.
package cn.cutter.demo.hadoop.utils;

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Random;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

/**
* @ClassName ZipUtil
 * @Description Helper that unzips the compressed log files before analysis
* @Author xiaof
* @Date 2018/12/11 23:08
* @Version 1.0
**/
public class ZipUtil {

    private static byte[] ZIP_HEADER_1 = new byte[] { 80, 75, 3, 4 };
    private static byte[] ZIP_HEADER_2 = new byte[] { 80, 75, 5, 6 };

    /**
     * Unzip every zip file under this path.
     * @param zipPath
     */
    public static void unZipFilesAndDel(String zipPath) throws IOException {
        File file = new File(zipPath);
        if (file.isDirectory()) {
            // Recurse into every file in the directory.
            File files[] = file.listFiles();
            for (int i = 0; i < files.length; ++i) {
                unZipFilesAndDel(files[i].getAbsolutePath());
            }
        } else {
            if (isArchiveFile(file)) {
                String filename = file.getName();
                unZipFile(file);
                file.delete();
                System.out.println("Unzipped: " + filename);
            }
        }
    }

    public static String random(int i) {
        String sources = "0123456789";
        Random random = new Random();
        StringBuffer flag = new StringBuffer();
        for (int j = 0; j < i; ++j) {
            flag.append(sources.charAt(random.nextInt(9)));
        }
        return flag.toString();
    }

    private static void unZipFile(File file) throws IOException {
        ZipFile zip = new ZipFile(file, Charset.forName("UTF-8")); // avoids garbled Chinese entry names
        String name = zip.getName().substring(zip.getName().lastIndexOf('\\') + 1, zip.getName().lastIndexOf('.'));

        BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(file));
        ZipInputStream zipInputStream = new ZipInputStream(bufferedInputStream);
        BufferedOutputStream bufferedOutputStream = null;
        ZipEntry zipEntry = null;
        while ((zipEntry = zipInputStream.getNextEntry()) != null) {
            String entryName = zipEntry.getName();
            // Each entry is written next to the archive as "<archive name><random suffix>".
            bufferedOutputStream = new BufferedOutputStream(
                    new FileOutputStream(file.getParentFile() + "\\" + name + random(6)));
            int b = 0;
            while ((b = zipInputStream.read()) != -1) {
                bufferedOutputStream.write(b);
            }
            bufferedOutputStream.flush();
            bufferedOutputStream.close();
        }
        zipInputStream.close();
        bufferedInputStream.close();
        zip.close();
    }

    /**
     * Check whether a file is a zip archive, by inspecting its magic header.
     *
     * @param file
     * @return
     */
    public static boolean isArchiveFile(File file) {
        if (file == null) {
            return false;
        }
        if (file.isDirectory()) {
            return false;
        }
        boolean isArchive = false;
        InputStream input = null;
        try {
            // Read the first four bytes and compare them with the zip magic numbers (PK\03\04 / PK\05\06).
            input = new FileInputStream(file);
            byte[] buffer = new byte[4];
            int length = input.read(buffer, 0, 4);
            if (length == 4) {
                isArchive = (Arrays.equals(ZIP_HEADER_1, buffer)) || (Arrays.equals(ZIP_HEADER_2, buffer));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException e) {
                }
            }
        }
        return isArchive;
    }

    public static void main(String[] args) {
        File file = new File("H:\\ideaworkspace\\1-tmp\\input\\111 - 副本.zip");
        try {
            unZipFile(file);
            boolean res = file.delete();
            System.out.println(res);
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(isArchiveFile(file));
        System.out.println(file.getAbsolutePath());
    }
}
One observation: analyzing a mere 20 GB of data on a single machine took forever; after several hours nothing at all had come out.
Embarrassing, so I could only test the water with a file of a few dozen MB instead.
The output shows which error is reported most frequently, and that is the place to start when working out which SQL statements cause it. The program still needs improvement: it can only surface the most frequent exceptions, and it cannot pinpoint exactly where they occur (which is also partly down to the log format being such a mess).