问题描述:给定一个大文件,文件中的每一行内容为:文档名,文档内容。

input

文档名1,word1 Word2 .......

文档名2,word1 Word2 .......

output

word  文档名  tfidf值

package com.elex.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import com.elex.mapreduce.TFIDF_4.IDFMap;
import com.elex.mapreduce.TFIDF_4.IDFReduce;
import com.elex.utils.DataClean;
import com.google.common.io.Closeables; public class TFIDF_5 {
public static String hdfsURL = "hdfs://namenode:8020";
public static String fileURL = "/tmp/usercount"; public static class TFMap extends Mapper<Object, Text, Text, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String userWordstmp = value.toString();
StringTokenizer userWords = new StringTokenizer(userWordstmp, "\n");
while (userWords.hasMoreTokens()) {
String userWordFragtmp = userWords.nextToken();
StringTokenizer userWordFrag = new StringTokenizer(
userWordFragtmp, ",");
String user = userWordFrag.nextToken();
Text outputKey = new Text();
Text outputValue = new Text();
while (userWordFrag.hasMoreTokens()) {
String words = userWordFrag.nextToken();
HashMap<String, Integer> wordMap = DataClean.clean(words,
"!total");
int wordTotal = wordMap.get("!total");
wordMap.remove("!total");
for (Map.Entry<String, Integer> wordEntry : wordMap
.entrySet()) {
String word = wordEntry.getKey();
int wordCount = wordEntry.getValue();
float tf = (float) wordCount / (float) wordTotal;
String outputStr = word + " " + Float.toString(tf)
+ ",";
byte[] bytes = outputStr.getBytes();
outputValue.append(bytes, 0, bytes.length);
}
}
outputKey.set(user);
context.write(outputKey, outputValue);
}
}
} public static class TFReduce extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// StringBuffer sb = new StringBuffer();
Iterator<Text> iter = values.iterator();
while (iter.hasNext()) {
// sb.append(iter.next().toString() + "\t");
context.write(key, iter.next());
}
// Text outputValue = new Text();
// outputValue.set(sb.toString());
// context.write(key, outputValue);
}
} public static class IDFMap extends Mapper<Object, Text, Text, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String valuesTmp = value.toString();
StringTokenizer userWordFrag = new StringTokenizer(valuesTmp, "\n");
while (userWordFrag.hasMoreTokens()) {
// String userWordtmp = userWordFrag.nextToken();
StringTokenizer userWords = new StringTokenizer(
userWordFrag.nextToken(), "\t");
String user = userWords.nextToken();
while (userWords.hasMoreTokens()) {
StringTokenizer wordTFs = new StringTokenizer(
userWords.nextToken(), ",");
while (wordTFs.hasMoreTokens()) {
StringTokenizer wordTF = new StringTokenizer(
wordTFs.nextToken());
String word = wordTF.nextToken();
String tf = wordTF.nextToken();
Text outputKey = new Text();
Text outputValue = new Text();
outputKey.set(word);
outputValue.set(user + "\t" + tf);
context.write(outputKey, outputValue);
}
}
} }
} public static class IDFReduce extends Reducer<Text, Text, Text, Text> {
long userCount = 0; public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
Path path = new Path(fileURL);
FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
if (!fs.isFile(path)) {
FSDataOutputStream output = fs.create(path, true);
output.close();
}
FSDataInputStream input = fs.open(path);
StringBuffer sb = new StringBuffer();
byte[] bytes = new byte[1024];
int status = input.read(bytes);
while (status != -1) {
sb.append(new String(bytes));
status = input.read(bytes);
}
if (!"".equals(sb.toString())) {
userCount = Long.parseLong(sb.toString().trim());
}
input.close();
} public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
LinkedList<String> userList = new LinkedList<String>();
Iterator<Text> iter = values.iterator();
long wordCount = 0;
while (iter.hasNext()) {
wordCount++;
userList.add(iter.next().toString());
}
float idf = (float) Math.log((float) userCount
/ (float) (wordCount + 1));
Iterator<String> userIter = userList.iterator();
Text outputValue = new Text();
while (userIter.hasNext()) {
String usertftmp = userIter.next();
StringTokenizer usertf = new StringTokenizer(usertftmp, "\t");
String user = usertf.nextToken();
String tfStr = usertf.nextToken();
float tf = Float.parseFloat(tfStr.trim().toString());
float tfidf = tf * idf;
String outputTmp = user + "\t" + tfidf;
outputValue.set(outputTmp);
context.write(key, outputValue);
}
}
} public static class UserCountMap extends Mapper<Object, Text, Text, Text> { public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String userWordtmp = value.toString();
StringTokenizer userWord = new StringTokenizer(userWordtmp, "\n");
while (userWord.hasMoreTokens()) {
userWord.nextToken();
Text outputKey = new Text();
outputKey.set("usercount");
Text one = new Text();
one.set("1");
context.write(outputKey, one);
}
}
} public static class UserCountCombine extends
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
long user = 0;
for (Text value : values) {
String valueTmp = value.toString();
user += Long.parseLong(valueTmp);
}
Text outputValue = new Text();
outputValue.set(Long.toString(user));
context.write(key, outputValue);
}
} public static class UserCountReduce extends Reducer<Text, Text, Text, Text> {
int userCount = 0; public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
String valueTmp = value.toString();
userCount += Long.parseLong(valueTmp);
}
} public void cleanup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
Path path = new Path(fileURL);
FSDataOutputStream output = fs.create(path, true);
String content = Long.toString(userCount);
output.write(content.getBytes());
output.flush();
output.close();
}
} public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
// conf.set("mapred.child.java.opts", "-Xmx4096m");
Job tfJob = Job.getInstance(conf, "tfjob");
tfJob.setJarByClass(TFIDF_5.class);
tfJob.setMapperClass(TFMap.class);
// tfJob.setCombinerClass(TFCombine.class);
tfJob.setReducerClass(TFReduce.class);
tfJob.setOutputKeyClass(Text.class);
tfJob.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(tfJob, new Path(args[0]));
FileOutputFormat.setOutputPath(tfJob, new Path(args[1]));
tfJob.waitForCompletion(true); // Job userCountJob = Job.getInstance(conf, "usercountjob");
// userCountJob.setJarByClass(TFIDF_5.class);
// userCountJob.setMapperClass(UserCountMap.class);
// userCountJob.setCombinerClass(UserCountCombine.class);
// userCountJob.setReducerClass(UserCountReduce.class);
// userCountJob.setOutputKeyClass(Text.class);
// userCountJob.setOutputValueClass(Text.class);
// FileInputFormat.setInputPaths(userCountJob, new Path(args[1]));
// FileOutputFormat.setOutputPath(userCountJob, new Path(args[2]));
// userCountJob.waitForCompletion(true);
<span style="white-space: pre;">		</span>//计算文档数,并暂时储存到hdfs上
		Counter ct = tfJob.getCounters().findCounter(
"org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS");
System.out.println(ct.getValue());
Iterable<String> groupNames = tfJob.getCounters().getGroupNames();
for (String groupName : groupNames) {
System.out.println(groupName);
}
FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
Path path = new Path(fileURL);
FSDataOutputStream output = fs.create(path, true);
String content = Long.toString(ct.getValue());
output.write(content.getBytes());
output.flush();
output.close(); Job idfJob = Job.getInstance(conf, "idfjob");
idfJob.setJarByClass(TFIDF_5.class);
idfJob.setMapperClass(IDFMap.class);
idfJob.setReducerClass(IDFReduce.class);
idfJob.setOutputKeyClass(Text.class);
idfJob.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(idfJob, new Path(args[1]));
FileOutputFormat.setOutputPath(idfJob, new Path(args[3]));
System.exit(idfJob.waitForCompletion(true) ? 0 : 1); } }

最初使用了一个单独的job来计算文档数,后来经过公司前辈的指点,发现可以在计算tf时利用输入数据的条数(MAP_INPUT_RECORDS计数器)巧妙地得到文档数。

运用mapreduce计算tf-idf的更多相关文章

  1. TF/IDF(term frequency/inverse document frequency)

    TF/IDF(term frequency/inverse document frequency) 的概念被公认为信息检索中最重要的发明. 一. TF/IDF描述单个term与特定document的相 ...

  2. TF/IDF计算方法

    FROM:http://blog.csdn.net/pennyliang/article/details/1231028 我们已经谈过了如何自动下载网页.如何建立索引.如何衡量网页的质量(Page R ...

  3. tf–idf算法解释及其python代码实现(下)

    tf–idf算法python代码实现 这是我写的一个tf-idf的简单实现的代码,我们知道tfidf=tf*idf,所以可以分别计算tf和idf值在相乘,首先我们创建一个简单的语料库,作为例子,只有四 ...

  4. tf–idf算法解释及其python代码实现(上)

    tf–idf算法解释 tf–idf, 是term frequency–inverse document frequency的缩写,它通常用来衡量一个词对在一个语料库中对它所在的文档有多重要,常用在信息 ...

  5. 文本分类学习(三) 特征权重(TF/IDF)和特征提取

    上一篇中,主要说的就是词袋模型.回顾一下,在进行文本分类之前,我们需要把待分类文本先用词袋模型进行文本表示.首先是将训练集中的所有单词经过去停用词之后组合成一个词袋,或者叫做字典,实际上一个维度很大的 ...

  6. 信息检索中的TF/IDF概念与算法的解释

    https://blog.csdn.net/class_brick/article/details/79135909 概念 TF-IDF(term frequency–inverse document ...

  7. Elasticsearch学习之相关度评分TF&IDF

    relevance score算法,简单来说,就是计算出,一个索引中的文本,与搜索文本,他们之间的关联匹配程度 Elasticsearch使用的是 term frequency/inverse doc ...

  8. tf idf公式及sklearn中TfidfVectorizer

    在文本挖掘预处理之向量化与Hash Trick中我们讲到在文本挖掘的预处理中,向量化之后一般都伴随着TF-IDF的处理,那么什么是TF-IDF,为什么一般我们要加这一步预处理呢?这里就对TF-IDF的 ...

  9. 25.TF&IDF算法以及向量空间模型算法

    主要知识点: boolean model IF/IDF vector space model     一.boolean model     在es做各种搜索进行打分排序时,会先用boolean mo ...

随机推荐

  1. 浅谈MySql的存储引擎(表类型) (转)

    什么是MySql数据库 通常意义上,数据库也就是数据的集合,具体到计算机上数据库可以是存储器上一些文件的集合或者一些内存数据的集合. 我们通常说的MySql数据库,sql server数据库等等其实是 ...

  2. python去掉html标签

    s = '<SPAN style="FONT- SIZE: 9pt">开始1~3<SPAN lang=EN-US>& lt;?xml:namespa ...

  3. 基于visual Studio2013解决C语言竞赛题之0702函数设计

       题目

  4. GDSOI2015 task4 ACU

    题目大意 只要你有耐心看完题目,你就可以得到以下模型: 给出一个有向图,有若干询问,每次询问对于某条边\((v,u)\),求删掉这条边后,\(v\)到\(u\)的最短路. 算法1 暴力出奇迹,期望得分 ...

  5. meanShift算法介绍

    meanShift,均值漂移,在聚类.图像平滑.切割.跟踪等方面有着广泛的应用.meanShift这个概念最早是由Fukunage在1975年提出的,其最初的含义正如其名:偏移的均值向量:但随着理论的 ...

  6. 浅析SSH核心原理(二)

    Hibernate是一个开放源代码的ORM(对象-关系映射)框架,它对JDBC进行了非常轻量级的对象封装,使得Java程序员可以随心所欲的使用对象编程思维来操纵数据库. Hibernate可以应用在任 ...

  7. 【图像处理】Gabor过滤器

    Gabor内核参考wiki 使用实数Real的公式计算核函数代码: Mat getGaborFilter(float lambda, float theta, float sigma2,float g ...

  8. Swift - 实现拨打电话

    要实现打电话功能,最简单最直接的方式便是:直接跳到拨号界面 (注意:这个需要真机调试,模拟器无效果) 1 2 //自动打开拨号页面并自动拨打电话 UIApplication.sharedApplica ...

  9. VB.net数据库编程(03):一个SQLserver连接查询的简单样例

    这个样例,因为在ADO.net入门已经专门学了,再次进行复习 一下. 主要掌握连接字串的情况. 过程就是: 1.引用System.Data.SqlClient.而Access中引用 的是System. ...

  10. 数据库元数据MetaData

    本篇介绍数据库方面的元数据(MetaData)的有关知识.元数据在建立框架和架构方面是特别重要的知识,再下一篇我们仿造开源数据库工具类DbUtils就要使用数据库的元数据来创建自定义JDBC框架. 在 ...