使用 MapReduce 计算 TF-IDF
问题描述:给定一个大文件,文件中的每一行内容为:文档名,文档内容。
input
文档名1,word1 Word2 .......
文档名2,word1 Word2 .......
output
word 文档名 tfidf值
package com.elex.mapreduce; import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import com.elex.mapreduce.TFIDF_4.IDFMap;
import com.elex.mapreduce.TFIDF_4.IDFReduce;
import com.elex.utils.DataClean;
import com.google.common.io.Closeables; public class TFIDF_5 {
public static String hdfsURL = "hdfs://namenode:8020";
public static String fileURL = "/tmp/usercount"; public static class TFMap extends Mapper<Object, Text, Text, Text> {
/**
 * Computes per-document term frequencies.
 *
 * <p>Each input line is expected to look like {@code docName,content}. For every line one
 * record is written: the key is the document name, the value is a list of
 * {@code "word tf,"} entries (each entry terminated by a comma), where
 * {@code tf = count(word) / total}.
 *
 * <p>Only the FIRST comma separates the name from the content. Splitting on every comma
 * would compute TF per fragment and emit the same word several times for one document,
 * which inflates the document frequency counted later by the IDF reducer.
 *
 * <p>Lines without a document name are skipped and counted in the
 * {@code TFIDF / MALFORMED_TF_INPUT} counter instead of failing the task. A line with a
 * name but no content still produces a record with an empty value.
 *
 * @param key     input key supplied by the input format; not used
 * @param value   one or more {@code '\n'}-separated input lines
 * @param context task context used to emit {@code (docName, tfList)} records
 */
public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
    StringTokenizer lines = new StringTokenizer(value.toString(), "\n");
    while (lines.hasMoreTokens()) {
        String line = lines.nextToken();
        int comma = line.indexOf(',');
        String user = comma < 0 ? line : line.substring(0, comma);
        if (user.isEmpty()) {
            context.getCounter("TFIDF", "MALFORMED_TF_INPUT").increment(1);
            continue;
        }

        StringBuilder tfList = new StringBuilder();
        if (comma >= 0 && comma + 1 < line.length()) {
            String content = line.substring(comma + 1);
            // NOTE(review): DataClean.clean presumably returns word -> count plus the total
            // number of words under the "!total" key; inferred from how the result is used.
            Map<String, Integer> wordMap = DataClean.clean(content, "!total");
            Integer wordTotal = wordMap.remove("!total");
            // Without a positive total no meaningful TF exists; emit an empty list rather
            // than crash on unboxing null or write NaN/Infinity.
            if (wordTotal != null && wordTotal > 0) {
                for (Map.Entry<String, Integer> wordEntry : wordMap.entrySet()) {
                    float tf = (float) wordEntry.getValue() / (float) wordTotal;
                    tfList.append(wordEntry.getKey()).append(' ').append(tf).append(',');
                }
            }
        }

        // Text.set/Text(String) always encodes as UTF-8, unlike String.getBytes() which
        // depends on the platform default charset.
        context.write(new Text(user), new Text(tfList.toString()));
    }
}
} public static class TFReduce extends Reducer<Text, Text, Text, Text> {
/**
 * Identity reduce step: every value received for {@code key} is written back unchanged,
 * one output record per value.
 *
 * @param key     document name
 * @param values  TF lists emitted for that document
 * @param context task context used to emit the records
 */
public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    for (Text tfList : values) {
        context.write(key, tfList);
    }
}
} public static class IDFMap extends Mapper<Object, Text, Text, Text> {
/**
 * Inverts the TF job output from "document -> words" to "word -> document".
 *
 * <p>Each input line is expected to look like {@code docName \t word tf,word tf,...}.
 * For every {@code word tf} entry one record {@code (word, docName \t tf)} is written.
 *
 * <p>Entries that do not contain both a word and a TF value are skipped and counted in
 * the {@code TFIDF / MALFORMED_TF_ENTRY} counter instead of failing the task with a
 * {@code NoSuchElementException}.
 *
 * @param key     input key supplied by the input format; not used
 * @param value   one or more {@code '\n'}-separated lines of TF job output
 * @param context task context used to emit {@code (word, docName \t tf)} records
 */
public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
    // Reused across writes: the values are set anew before every context.write call.
    Text outputKey = new Text();
    Text outputValue = new Text();

    StringTokenizer lines = new StringTokenizer(value.toString(), "\n");
    while (lines.hasMoreTokens()) {
        StringTokenizer fields = new StringTokenizer(lines.nextToken(), "\t");
        if (!fields.hasMoreTokens()) {
            // Line consisted of tab characters only.
            continue;
        }
        String user = fields.nextToken();
        while (fields.hasMoreTokens()) {
            StringTokenizer wordTFs = new StringTokenizer(fields.nextToken(), ",");
            while (wordTFs.hasMoreTokens()) {
                // Default delimiters: the entry is "word<whitespace>tf".
                StringTokenizer wordTF = new StringTokenizer(wordTFs.nextToken());
                if (wordTF.countTokens() < 2) {
                    context.getCounter("TFIDF", "MALFORMED_TF_ENTRY").increment(1);
                    continue;
                }
                String word = wordTF.nextToken();
                String tf = wordTF.nextToken();
                outputKey.set(word);
                outputValue.set(user + "\t" + tf);
                context.write(outputKey, outputValue);
            }
        }
    }
}
} public static class IDFReduce extends Reducer<Text, Text, Text, Text> {
/** Total number of documents, read in {@code setup}; stays 0 when the count file is empty. */
long userCount = 0;

/**
 * Loads the total document count from the file {@code TFIDF_5.fileURL} on the file system
 * {@code TFIDF_5.hdfsURL}. If the file does not exist an empty one is created, in which
 * case {@code userCount} stays 0.
 *
 * <p>NOTE(review): with {@code userCount == 0} the IDF computed in {@code reduce} is
 * {@code log(0)}, i.e. negative infinity; confirm whether a missing count file should
 * rather fail the job.
 *
 * @param context task context providing the job configuration
 * @throws IOException if the count file cannot be created or read
 */
public void setup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    Path path = new Path(fileURL);
    FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
    if (!fs.isFile(path)) {
        fs.create(path, true).close();
    }

    StringBuilder content = new StringBuilder();
    FSDataInputStream input = fs.open(path);
    try {
        byte[] buffer = new byte[1024];
        int read;
        while ((read = input.read(buffer)) != -1) {
            // Decode only the bytes actually read; decoding the whole buffer would append
            // zero padding or stale bytes left over from a previous read.
            content.append(new String(buffer, 0, read,
                    java.nio.charset.StandardCharsets.UTF_8));
        }
    } finally {
        // Release the stream even when reading fails.
        input.close();
    }

    String text = content.toString().trim();
    if (!text.isEmpty()) {
        userCount = Long.parseLong(text);
    }
}

/**
 * Computes TF-IDF for one word across all documents containing it.
 *
 * <p>Every incoming value has the form {@code docName \t tf}. The number of values is the
 * document frequency {@code df}; the IDF is {@code ln(userCount / (df + 1))}. One record
 * {@code (word, docName \t tf*idf)} is written per incoming value.
 *
 * @param key     the word
 * @param values  {@code docName \t tf} entries for that word
 * @param context task context used to emit the results
 */
public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    // The document frequency must be known before anything can be emitted, so the values
    // are buffered (as Strings) during the counting pass.
    LinkedList<String> userList = new LinkedList<String>();
    for (Text value : values) {
        userList.add(value.toString());
    }
    long wordCount = userList.size();

    float idf = (float) Math.log((float) userCount / (float) (wordCount + 1));

    Text outputValue = new Text();
    for (String userTf : userList) {
        StringTokenizer parts = new StringTokenizer(userTf, "\t");
        String user = parts.nextToken();
        float tf = Float.parseFloat(parts.nextToken().trim());
        outputValue.set(user + "\t" + (tf * idf));
        context.write(key, outputValue);
    }
}
} // closes IDFReduce

/**
 * Mapper of the stand-alone document-count job: writes one {@code ("usercount", "1")}
 * record per input line. The job that would use it is not scheduled by {@code main} in
 * this file.
 */
public static class UserCountMap extends Mapper<Object, Text, Text, Text> {
    /**
     * Emits {@code ("usercount", "1")} once for every non-empty {@code '\n'}-separated
     * line contained in {@code value}.
     *
     * @param key     input key supplied by the input format; not used
     * @param value   one or more input lines
     * @param context task context used to emit the records
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer lines = new StringTokenizer(value.toString(), "\n");
        int lineCount = lines.countTokens();
        for (int i = 0; i < lineCount; i++) {
            context.write(new Text("usercount"), new Text("1"));
        }
    }
} public static class UserCountCombine extends
Reducer<Text, Text, Text, Text> {
/**
 * Adds up the decimal numbers carried by {@code values} and writes the sum, as text,
 * under the same key.
 *
 * @param key     key shared by all values
 * @param values  partial counts encoded as decimal strings
 * @param context task context used to emit the summed count
 */
public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    long partialSum = 0;
    Iterator<Text> counts = values.iterator();
    while (counts.hasNext()) {
        partialSum += Long.parseLong(counts.next().toString());
    }
    context.write(key, new Text(Long.toString(partialSum)));
}
} public static class UserCountReduce extends Reducer<Text, Text, Text, Text> {
/**
 * Running total of all counts seen by this reducer. Declared {@code long}: the values are
 * parsed as {@code long}, and an {@code int} accumulator would silently truncate them on
 * {@code +=}.
 */
long userCount = 0;

/**
 * Adds every value (a decimal count) to {@code userCount}. Nothing is written to the job
 * output; the total is persisted in {@code cleanup}.
 *
 * @param key     key shared by all values; not used
 * @param values  counts encoded as decimal strings
 * @param context task context; not used
 */
public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    for (Text value : values) {
        userCount += Long.parseLong(value.toString());
    }
}

/**
 * Writes the accumulated total, as decimal text, to the file {@code TFIDF_5.fileURL} on
 * the file system {@code TFIDF_5.hdfsURL}, overwriting any existing file.
 *
 * <p>NOTE(review): every reducer task runs this method, so with more than one reduce task
 * the tasks would overwrite each other's file; confirm the job is run with one reducer.
 *
 * @param context task context providing the job configuration
 * @throws IOException if the file cannot be created or written
 */
public void cleanup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
    Path path = new Path(fileURL);
    FSDataOutputStream output = fs.create(path, true);
    try {
        output.write(Long.toString(userCount)
                .getBytes(java.nio.charset.StandardCharsets.UTF_8));
        output.flush();
    } finally {
        // Release the stream even when the write fails.
        output.close();
    }
}
} // closes UserCountReduce

/**
 * Stores {@code documentCount}, as decimal text, in the file {@code fileURL} on the file
 * system {@code hdfsURL}, overwriting any existing file. {@code IDFReduce.setup} reads
 * the value back from the same location.
 *
 * @param conf          Hadoop configuration used to obtain the file system
 * @param documentCount number of documents to persist
 * @throws IOException if the file cannot be created or written
 */
private static void writeDocumentCount(Configuration conf, long documentCount)
        throws IOException {
    FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
    FSDataOutputStream output = fs.create(new Path(fileURL), true);
    try {
        output.write(Long.toString(documentCount)
                .getBytes(java.nio.charset.StandardCharsets.UTF_8));
        output.flush();
    } finally {
        // Release the stream even when the write fails.
        output.close();
    }
}

/**
 * Runs the TF job, persists the document count, then runs the IDF job.
 *
 * <p>Arguments: {@code args[0]} is the input path and {@code args[1]} the TF job output
 * (also the IDF job input). The final TF-IDF output is {@code args[3]} when four or more
 * arguments are given (the historical layout, where {@code args[2]} belonged to the
 * separate count job) and {@code args[2]} when exactly three are given.
 *
 * <p>The document count is taken from the TF job's {@code MAP_INPUT_RECORDS} counter, so
 * no separate counting job is needed.
 *
 * <p>The process exits with 0 on success, 1 if either job fails and 2 on wrong usage.
 *
 * @param args command-line paths as described above
 */
public static void main(String[] args) throws IOException,
        ClassNotFoundException, InterruptedException {
    if (args.length < 3) {
        System.err.println(
                "Usage: TFIDF_5 <input> <tf-output> [<unused>] <tfidf-output>");
        System.exit(2);
    }
    String idfOutput = args.length > 3 ? args[3] : args[2];

    Configuration conf = new Configuration();
    Job tfJob = Job.getInstance(conf, "tfjob");
    tfJob.setJarByClass(TFIDF_5.class);
    tfJob.setMapperClass(TFMap.class);
    tfJob.setReducerClass(TFReduce.class);
    tfJob.setOutputKeyClass(Text.class);
    tfJob.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(tfJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(tfJob, new Path(args[1]));
    if (!tfJob.waitForCompletion(true)) {
        // Without TF output and a valid counter the IDF job would only produce garbage.
        System.exit(1);
    }

    // Compute the document count and store it temporarily on HDFS.
    Counter ct = tfJob.getCounters().findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS");
    // Diagnostic output: the document count and the available counter groups.
    System.out.println(ct.getValue());
    for (String groupName : tfJob.getCounters().getGroupNames()) {
        System.out.println(groupName);
    }
    writeDocumentCount(conf, ct.getValue());

    Job idfJob = Job.getInstance(conf, "idfjob");
    idfJob.setJarByClass(TFIDF_5.class);
    idfJob.setMapperClass(IDFMap.class);
    idfJob.setReducerClass(IDFReduce.class);
    idfJob.setOutputKeyClass(Text.class);
    idfJob.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(idfJob, new Path(args[1]));
    FileOutputFormat.setOutputPath(idfJob, new Path(idfOutput));
    System.exit(idfJob.waitForCompletion(true) ? 0 : 1);
}
}
最初使用了一个单独的 job 来计算文档数,后来经过公司前辈的指点,发现可以在计算 tf 时利用输入数据的条数(MAP_INPUT_RECORDS 计数器)巧妙地得到文档数。
运用mapreduce计算tf-idf的更多相关文章
- TF/IDF(term frequency/inverse document frequency)
TF/IDF(term frequency/inverse document frequency) 的概念被公认为信息检索中最重要的发明. 一. TF/IDF描述单个term与特定document的相 ...
- TF/IDF计算方法
FROM:http://blog.csdn.net/pennyliang/article/details/1231028 我们已经谈过了如何自动下载网页.如何建立索引.如何衡量网页的质量(Page R ...
- tf–idf算法解释及其python代码实现(下)
tf–idf算法python代码实现 这是我写的一个tf-idf的简单实现的代码,我们知道tfidf=tf*idf,所以可以分别计算tf和idf值在相乘,首先我们创建一个简单的语料库,作为例子,只有四 ...
- tf–idf算法解释及其python代码实现(上)
tf–idf算法解释 tf–idf, 是term frequency–inverse document frequency的缩写,它通常用来衡量一个词对在一个语料库中对它所在的文档有多重要,常用在信息 ...
- 文本分类学习(三) 特征权重(TF/IDF)和特征提取
上一篇中,主要说的就是词袋模型.回顾一下,在进行文本分类之前,我们需要把待分类文本先用词袋模型进行文本表示.首先是将训练集中的所有单词经过去停用词之后组合成一个词袋,或者叫做字典,实际上一个维度很大的 ...
- 信息检索中的TF/IDF概念与算法的解释
https://blog.csdn.net/class_brick/article/details/79135909 概念 TF-IDF(term frequency–inverse document ...
- Elasticsearch学习之相关度评分TF&IDF
relevance score算法,简单来说,就是计算出,一个索引中的文本,与搜索文本,他们之间的关联匹配程度 Elasticsearch使用的是 term frequency/inverse doc ...
- tf idf公式及sklearn中TfidfVectorizer
在文本挖掘预处理之向量化与Hash Trick中我们讲到在文本挖掘的预处理中,向量化之后一般都伴随着TF-IDF的处理,那么什么是TF-IDF,为什么一般我们要加这一步预处理呢?这里就对TF-IDF的 ...
- 25.TF&IDF算法以及向量空间模型算法
主要知识点: boolean model IF/IDF vector space model 一.boolean model 在es做各种搜索进行打分排序时,会先用boolean mo ...
随机推荐
- 盘点:#AzureChat - 虚拟机和自动伸缩
感谢大家跟 Corey Sanders 和 Stephen Siciliano 一起参加本次 #AzureChat.我们很高兴能借这次在线讨论的机会,倾听各位社区成员对我们最受欢迎的两个主题的意见 - ...
- Spring源代码解析 ---- 循环依赖
一.循环引用: 1. 定义: 循环依赖就是循环引用,就是两个或多个Bean相互之间的持有对方,比方CircularityA引用CircularityB,CircularityB引用Circularit ...
- 数据绑定以及Container.DataItem几种方式与使用方法分析
灵活的运用数据绑定操作 绑定到简单属性:<%#UserName%> 绑定到集合:<asp:ListBox id="ListBox1" ...
- sql server 实现sleep延时
sql server中实现与C++ 中Sleep类似的功能,可以使用 waitfor delay '00:00:00:10' 表示延时10毫秒
- Reader开发(二)增加PDF阅读功能
最近任务很多很忙,所以更新博客的速度很慢. 大概上周就为Reader加了一个PDF阅读的功能,但是一直没时间写上来.昨晚找一下文件发现扩展了功能的Demo居然在文件目录下看不到任何文件,但是却显示有文 ...
- Oracle、DB2、MySql、SQLServer JDBC驱动
四种数据库JDBC驱动,还列出了连接的Class驱动名和Url Pattern,DB2包括Type 2.Type 3和Type 4三种模式.注意驱动包名称的大小写. Oralce连接驱动包名和URL ...
- Cocos2d-x游戏的场景结构布局
- Android Content Provider的启动过程源码分析
本文參考Android应用程序组件Content Provider的启动过程源码分析http://blog.csdn.net/luoshengyang/article/details/6963418和 ...
- android5.0(Lollipop) BLE Central牛刀小试
转载请表明作者:http://blog.csdn.net/lansefeiyang08/article/details/46482073 昨天写了android L BLE Peripheral的简单 ...
- javascript面向对象基础讲解(工厂模式、构造函数模式、原型模式、混合模式、动态原型模式)
面向对象可以把程序中的关键模块都视为对象,而模块拥有属性及方法.这样我们如果把一些属性及方法封装起来,日后使用将非常方便,也可以避免繁琐重复的工作.接下来将为大家讲解在JS中面向对象的实现. 工厂 ...