1, tf-idf

Compute how important each term is within each user's posts (its TF-IDF weight).

Three MapReduce jobs are required: the first computes TF and N (the total number of posts), the second computes DF, and the third plugs these values into the TF-IDF formula to produce the final weights.
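
As a quick reference for the formula the third job implements, weight(term, post) = TF(term, post) * log(N / DF(term)). A minimal sketch with made-up numbers (class name and values are hypothetical, not from the real data set):

    // Minimal TF-IDF sketch with made-up numbers, just to illustrate the formula
    // weight(term, post) = TF(term, post) * log(N / DF(term)).
    public class TfIdfFormulaDemo {
        public static void main(String[] args) {
            int tf = 3;        // the term appears 3 times in this post (hypothetical)
            int n = 1000;      // total number of posts (hypothetical)
            int df = 50;       // the term appears in 50 different posts (hypothetical)

            double weight = tf * Math.log((double) n / df);
            System.out.println("tf-idf weight = " + weight);   // about 8.99
        }
    }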

1, First job

    package com.wenbronk.weibo;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    /**
     * First mapper: computes TF and N.
     *
     * @author root
     */
    public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /**
         * TF: how often a term appears in one post; N: total number of posts.
         * Input arrives one line (one post) at a time.
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {

            String[] values = value.toString().trim().split("\t");

            if (values.length >= 2) {
                String id = values[0].trim();
                String content = values[1].trim();

                // tokenize the post content with the IK analyzer
                StringReader stringReader = new StringReader(content);
                IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
                Lexeme word = null;
                while ((word = ikSegmenter.next()) != null) {
                    String w = word.getLexemeText();
                    // one occurrence of this term in this post
                    context.write(new Text(w + "_" + id), new IntWritable(1));
                }
                // one post seen: contributes 1 to the total post count N
                context.write(new Text("count"), new IntWritable(1));
            } else {
                System.out.println(value.toString() + "---");
            }
        }
    }
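
To make the mapper's output concrete, the following standalone sketch tokenizes one post the same way and prints the key/value pairs the mapper would emit. It assumes the same IK Analyzer jar used above is on the classpath; the post id and content are made up:

    import java.io.IOException;
    import java.io.StringReader;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    // Standalone check of the tokenization step: prints "word_id<TAB>1" pairs
    // plus one "count<TAB>1" record, mirroring FirstMapper's output.
    public class FirstMapperDemo {
        public static void main(String[] args) throws IOException {
            String id = "3823890210294392";          // hypothetical post id
            String content = "今天我约了豆浆，油条";  // hypothetical post content

            IKSegmenter seg = new IKSegmenter(new StringReader(content), true);
            Lexeme word;
            while ((word = seg.next()) != null) {
                System.out.println(word.getLexemeText() + "_" + id + "\t1");
            }
            System.out.println("count\t1");
        }
    }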

reduce

    package com.wenbronk.weibo;

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    /**
     * Sums up TF per term_id key, and the global post count N for the "count" key.
     *
     * @author root
     */
    public class FirstReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable intWritable : arg1) {
                sum += intWritable.get();
            }
            if (arg0.equals(new Text("count"))) {
                System.err.println(arg0.toString() + "---");
            }
            arg2.write(arg0, new IntWritable(sum));
        }
    }

partition

    package com.wenbronk.weibo;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

    /**
     * Decides the partition: 4 reducers in total, one for N and three for TF.
     *
     * @author root
     */
    public class FirstPartition extends HashPartitioner<Text, IntWritable> {

        @Override
        public int getPartition(Text key, IntWritable value, int numReduceTasks) {
            if (key.equals(new Text("count"))) {
                // the global post count always goes to the last partition
                return 3;
            } else {
                // hash the term_id keys over the remaining partitions 0-2
                return super.getPartition(key, value, numReduceTasks - 1);
            }
        }
    }
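
A quick way to see the routing is to call the partitioner directly. This is only a sketch; it assumes the 4 reduce tasks configured in FirstJob below, and the term key is made up:

    package com.wenbronk.weibo;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    // With 4 reduce tasks the "count" key is pinned to partition 3,
    // everything else is hashed over partitions 0-2.
    public class FirstPartitionDemo {
        public static void main(String[] args) {
            FirstPartition partition = new FirstPartition();

            int countPartition = partition.getPartition(new Text("count"), new IntWritable(1), 4);
            int wordPartition = partition.getPartition(new Text("hadoop_123456"), new IntWritable(1), 4);

            System.out.println("count         -> partition " + countPartition); // 3
            System.out.println("hadoop_123456 -> partition " + wordPartition);  // 0, 1, or 2
        }
    }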

mainJob

    package com.wenbronk.weibo;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class FirstJob {

        public static void main(String[] args) {
            Configuration config = new Configuration();
            config.set("fs.defaultFS", "hdfs://192.168.208.106:8020");
            config.set("yarn.resourcemanager.hostname", "192.168.208.106");
            // config.set("mapred.jar", "E:\\sxt\\target\\weibo1.jar");

            try {
                Job job = Job.getInstance(config);
                job.setJarByClass(FirstJob.class);
                job.setJobName("first");

                job.setPartitionerClass(FirstPartition.class);
                job.setMapperClass(FirstMapper.class);
                // 4 reducers: 3 for TF, 1 for the total post count N
                job.setNumReduceTasks(4);
                job.setCombinerClass(FirstReducer.class);
                job.setReducerClass(FirstReducer.class);

                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(IntWritable.class);

                FileInputFormat.addInputPath(job, new Path("E:\\sxt\\1-MapReduce\\data\\weibo2.txt"));

                FileSystem fileSystem = FileSystem.get(config);

                Path outPath = new Path("E:\\sxt\\1-MapReduce\\data\\weibo1");
                if (fileSystem.exists(outPath)) {
                    fileSystem.delete(outPath, true);
                }
                FileOutputFormat.setOutputPath(job, outPath);

                boolean waitForCompletion = job.waitForCompletion(true);
                if (waitForCompletion) {
                    System.out.println("first success");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

2, Second job: computes DF. Its input is the first job's output directory; part-r-00003 only holds the total post count, so the mapper skips that file.

    package com.wenbronk.weibo;

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;

    /**
     * Computes DF: in how many posts each term appears.
     */
    public class SecondMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {

            // get the input split this map task is reading
            FileSplit inputSplit = (FileSplit) context.getInputSplit();

            // skip the file that holds the total post count
            if (!inputSplit.getPath().getName().contains("part-r-00003")) {

                String[] values = value.toString().trim().split("\t");

                if (values.length >= 2) {
                    String[] split = values[0].trim().split("_");
                    if (split.length >= 2) {
                        String word = split[0];
                        // this term appeared in one more post
                        context.write(new Text(word), new IntWritable(1));
                    }
                }
            } else {
                System.out.println(value.toString() + "----");
            }
        }
    }

reduce

    package com.wenbronk.weibo;

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    /**
     * Sums the per-post occurrences into DF for each term.
     *
     * @author root
     */
    public class SecondReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable intWritable : arg1) {
                sum += intWritable.get();
            }
            arg2.write(new Text(arg0), new IntWritable(sum));
        }
    }

mainjob

    package com.wenbronk.weibo;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class SecondJob {

        public static void main(String[] args) {
            Configuration config = new Configuration();
            config.set("fs.defaultFS", "hdfs://192.168.208.106:8020");
            config.set("yarn.resourcemanager.hostname", "192.168.208.106");

            try {
                Job job = Job.getInstance(config);
                job.setJarByClass(SecondJob.class);
                job.setJobName("second");

                job.setMapperClass(SecondMapper.class);
                job.setCombinerClass(SecondReducer.class);
                job.setReducerClass(SecondReducer.class);

                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(IntWritable.class);

                FileInputFormat.addInputPath(job, new Path("E:\\sxt\\1-MapReduce\\data\\weibo1"));

                FileSystem fileSystem = FileSystem.get(config);
                Path outPath = new Path("E:\\sxt\\1-MapReduce\\data\\weibo2");
                if (fileSystem.exists(outPath)) {
                    fileSystem.delete(outPath, true);
                }
                FileOutputFormat.setOutputPath(job, outPath);

                boolean f = job.waitForCompletion(true);
                if (f) {
                    System.out.println("job2 success");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

3, Third job: computes the final TF-IDF weight. It reads the TF output of the first job, and preloads the total post count (part-r-00003 of job 1) and the DF values (output of job 2) via the distributed cache.

    package com.wenbronk.weibo;

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URI;
    import java.text.NumberFormat;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;

    public class ThirdMapper extends Mapper<LongWritable, Text, Text, Text> {

        // holds the total number of posts; small data preloaded into memory
        public static Map<String, Integer> cmap = null;
        // holds the DF of every term
        public static Map<String, Integer> df = null;

        // runs once per task before map(): preload the cached files into the maps
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {

            System.out.println("*****");
            if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
                URI[] cacheFiles = context.getCacheFiles();
                if (cacheFiles != null) {
                    for (URI uri : cacheFiles) {
                        if (uri.getPath().endsWith("part-r-00003")) {
                            Path path = new Path(uri.getPath());
                            // open the file that holds the total post count
                            Configuration configuration = context.getConfiguration();
                            FileSystem fs = FileSystem.get(configuration);
                            FSDataInputStream open = fs.open(path);
                            BufferedReader reader = new BufferedReader(new InputStreamReader(open));

                            // BufferedReader reader = new BufferedReader(new FileReader(path.getName()));
                            String line = reader.readLine();
                            if (line.startsWith("count")) {
                                String[] split = line.split("\t");
                                cmap = new HashMap<>();
                                cmap.put(split[0], Integer.parseInt(split[1].trim()));
                            }
                            reader.close();
                        } else if (uri.getPath().endsWith("part-r-00000")) {
                            df = new HashMap<>();
                            Path path = new Path(uri.getPath());

                            // open the file that holds the DF values
                            Configuration configuration = context.getConfiguration();
                            FileSystem fs = FileSystem.get(configuration);
                            FSDataInputStream open = fs.open(path);
                            BufferedReader reader = new BufferedReader(new InputStreamReader(open));
                            // BufferedReader reader = new BufferedReader(new FileReader(path.getName()));

                            String line = null;
                            while ((line = reader.readLine()) != null) {
                                String[] ls = line.split("\t");
                                df.put(ls[0], Integer.parseInt(ls[1].trim()));
                            }
                            reader.close();
                        }
                    }
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // get the input split; skip the total-count file
            FileSplit inputSplit = (FileSplit) context.getInputSplit();

            if (!inputSplit.getPath().getName().contains("part-r-00003")) {
                String[] values = value.toString().trim().split("\t");

                if (values.length >= 2) {

                    int tf = Integer.parseInt(values[1].trim());

                    String[] ss = values[0].split("_");
                    if (ss.length >= 2) {
                        String word = ss[0];
                        String id = ss[1];

                        // TF-IDF formula: tf * log(N / df)
                        double s = tf * Math.log((double) cmap.get("count") / df.get(word));
                        NumberFormat format = NumberFormat.getInstance();
                        // keep at most 5 fraction digits
                        format.setMaximumFractionDigits(5);

                        context.write(new Text(id), new Text(word + ": " + format.format(s)));
                    } else {
                        System.out.println(value.toString() + "------");
                    }
                }
            }
        }
    }
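
For reference, setup() expects the cached files in the formats the first two jobs produce: one "count<TAB>N" line in part-r-00003, and one "word<TAB>df" line per term in the DF output. A minimal sketch of that parsing with made-up lines (not real job output):

    import java.util.HashMap;
    import java.util.Map;

    // Mirrors the parsing in ThirdMapper.setup(); the sample lines are hypothetical.
    public class CacheFileFormatDemo {
        public static void main(String[] args) {
            String countLine = "count\t1065";   // hypothetical part-r-00003 content
            String dfLine = "hadoop\t37";       // hypothetical DF output line

            Map<String, Integer> cmap = new HashMap<>();
            String[] c = countLine.split("\t");
            cmap.put(c[0], Integer.parseInt(c[1].trim()));

            Map<String, Integer> df = new HashMap<>();
            String[] d = dfLine.split("\t");
            df.put(d[0], Integer.parseInt(d[1].trim()));

            System.out.println("N = " + cmap.get("count") + ", df(hadoop) = " + df.get("hadoop"));
        }
    }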

reduce

    package com.wenbronk.weibo;

    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    /**
     * Concatenates all "word: weight" pairs of one post into a single output line.
     */
    public class ThirdReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text arg0, Iterable<Text> arg1, Reducer<Text, Text, Text, Text>.Context arg2)
                throws IOException, InterruptedException {

            StringBuffer sb = new StringBuffer();
            for (Text text : arg1) {
                sb.append(text.toString() + "\t");
            }
            arg2.write(arg0, new Text(sb.toString()));
        }
    }

mainJob

    package com.wenbronk.weibo;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class ThirdJob {

        public static void main(String[] args) {

            Configuration config = new Configuration();
            config.set("fs.defaultFS", "hdfs://192.168.208.106:8020");
            config.set("yarn.resourcemanager.hostname", "192.168.208.106");
            try {
                Job job = Job.getInstance(config);
                job.setJarByClass(ThirdJob.class);
                job.setJobName("third");
                // job.setInputFormatClass(KeyValueTextInputFormat.class);

                // load the total post count into memory
                job.addCacheFile(new Path("E:\\sxt\\1-MapReduce\\data\\weibo1\\part-r-00003").toUri());
                // load the DF values into memory
                job.addCacheFile(new Path("E:\\sxt\\1-MapReduce\\data\\weibo2\\part-r-00000").toUri());

                job.setMapperClass(ThirdMapper.class);
                job.setReducerClass(ThirdReducer.class);

                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);

                FileSystem fs = FileSystem.get(config);
                FileInputFormat.addInputPath(job, new Path("E:\\sxt\\1-MapReduce\\data\\weibo1"));
                Path path = new Path("E:\\sxt\\1-MapReduce\\data\\weibo3");
                if (fs.exists(path)) {
                    fs.delete(path, true);
                }
                FileOutputFormat.setOutputPath(job, path);

                boolean waitForCompletion = job.waitForCompletion(true);
                if (waitForCompletion) {
                    System.out.println("third success");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

This series is based on the 尚学堂 (Shangxuetang) video course.
