问题描述:给定一个大文件,文件里的每一行内容为:文档名,文档内容。

input

文档名1,word1 Word2 .......

文档名2,word1 Word2 .......

output

word  文档名  tfidf值

package com.elex.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import com.elex.mapreduce.TFIDF_4.IDFMap;
import com.elex.mapreduce.TFIDF_4.IDFReduce;
import com.elex.utils.DataClean;
// TFIDF_5 (declared on the next line, after the last import) groups the mapper/reducer
// classes and the driver used to compute TF-IDF with MapReduce.
import com.google.common.io.Closeables; public class TFIDF_5 {
// URI passed to FileSystem.get(...) whenever the document-count file is read or written.
public static String hdfsURL = "hdfs://namenode:8020";
// fileURL: path of the file that carries the document count from the driver to IDFReduce.setup.
// TFMap (header on the same line): mapper that turns each input line into a "word tf," list keyed by document.
public static String fileURL = "/tmp/usercount"; public static class TFMap extends Mapper<Object, Text, Text, Text> {
/**
 * Computes the term frequency of every word of a document.
 *
 * <p>Each newline-delimited line in {@code value} is split on {@code ','}: the first
 * token is the document (user) name, every following token is a piece of the document
 * body. All pieces are cleaned with {@code DataClean.clean} and their counts are merged,
 * so the term frequency is {@code count / total words of the whole document}. The
 * original code computed it per comma-separated piece, which used the wrong denominator
 * and could emit the same word several times.
 *
 * <p>Output: key = document name, value = {@code "word tf,word tf,..."} (empty when the
 * document has no countable words).
 *
 * @param key     input key supplied by the input format; not used
 * @param value   one or more lines of the form {@code docName,content}
 * @param context MapReduce context the record is written to
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if the task is interrupted while writing
 */
@Override
public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
    StringTokenizer lines = new StringTokenizer(value.toString(), "\n");
    while (lines.hasMoreTokens()) {
        StringTokenizer fragments = new StringTokenizer(lines.nextToken(), ",");
        if (!fragments.hasMoreTokens()) {
            // The line held nothing but separators; there is no document name to emit.
            continue;
        }
        String user = fragments.nextToken();

        // Merge the counts of every fragment so that TF is relative to the whole document.
        Map<String, Integer> documentCounts = new HashMap<String, Integer>();
        long documentTotal = 0;
        while (fragments.hasMoreTokens()) {
            // NOTE(review): assumes DataClean.clean returns per-word counts plus the
            // fragment's word total under the "!total" key, as the original code used it.
            HashMap<String, Integer> fragmentCounts = DataClean.clean(
                    fragments.nextToken(), "!total");
            if (fragmentCounts == null) {
                continue;
            }
            Integer fragmentTotal = fragmentCounts.remove("!total");
            if (fragmentTotal != null) {
                documentTotal += fragmentTotal.intValue();
            }
            for (Map.Entry<String, Integer> entry : fragmentCounts.entrySet()) {
                if (entry.getValue() == null) {
                    continue;
                }
                Integer seen = documentCounts.get(entry.getKey());
                int count = entry.getValue().intValue();
                documentCounts.put(entry.getKey(),
                        Integer.valueOf(seen == null ? count : seen.intValue() + count));
            }
        }

        StringBuilder tfList = new StringBuilder();
        if (documentTotal > 0) {
            for (Map.Entry<String, Integer> entry : documentCounts.entrySet()) {
                float tf = (float) entry.getValue().intValue() / (float) documentTotal;
                tfList.append(entry.getKey()).append(' ')
                        .append(Float.toString(tf)).append(',');
            }
        }

        // Text.set(String) encodes as UTF-8, unlike String.getBytes() which depends on
        // the platform default charset.
        Text outputKey = new Text();
        Text outputValue = new Text();
        outputKey.set(user);
        outputValue.set(tfList.toString());
        context.write(outputKey, outputValue);
    }
}
}

/**
 * Pass-through reducer of the TF job: every value received for a key is written back
 * out, unchanged, under that same key.
 */
public static class TFReduce extends Reducer<Text, Text, Text, Text> {
    /**
     * Forwards each value of {@code values} to the output paired with {@code key}.
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text tfList : values) {
            context.write(key, tfList);
        }
    }
}

/**
 * Mapper of the IDF job: re-keys the TF records by word.
 */
public static class IDFMap extends Mapper<Object, Text, Text, Text> {
    /**
     * Parses each newline-delimited record of {@code value}. The first tab-separated
     * field is the user (document) name; every further field is split on {@code ','}
     * into "word tf" pairs, and each pair is split on whitespace. One output record
     * {@code (word, user + "\t" + tf)} is written per pair.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer records = new StringTokenizer(value.toString(), "\n");
        while (records.hasMoreTokens()) {
            String record = records.nextToken();
            StringTokenizer fields = new StringTokenizer(record, "\t");
            String user = fields.nextToken();
            while (fields.hasMoreTokens()) {
                String tfList = fields.nextToken();
                StringTokenizer pairs = new StringTokenizer(tfList, ",");
                while (pairs.hasMoreTokens()) {
                    StringTokenizer pair = new StringTokenizer(pairs.nextToken());
                    String word = pair.nextToken();
                    String tf = pair.nextToken();
                    Text wordKey = new Text(word);
                    Text userAndTf = new Text(user + "\t" + tf);
                    context.write(wordKey, userAndTf);
                }
            }
        }
    }
}

/**
 * Reducer of the IDF job: for every word, turns the per-document TF values into TF-IDF.
 */
public static class IDFReduce extends Reducer<Text, Text, Text, Text> {
    /** Total number of documents, loaded in {@link #setup}; stays 0 if the count file is empty. */
    long userCount = 0;

    /**
     * Loads the document count from the file {@code fileURL} on {@code hdfsURL}.
     * If the file does not exist an empty one is created, and {@code userCount} stays 0.
     *
     * @param context task context providing the job configuration
     * @throws IOException if the file cannot be created or read
     */
    @Override
    public void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        Path path = new Path(fileURL);
        FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
        if (!fs.isFile(path)) {
            FSDataOutputStream output = fs.create(path, true);
            output.close();
        }

        StringBuilder content = new StringBuilder();
        FSDataInputStream input = fs.open(path);
        try {
            byte[] buffer = new byte[1024];
            int read = input.read(buffer);
            while (read != -1) {
                // Only the bytes actually read are valid; the rest of the buffer is
                // padding or data left over from a previous read.
                content.append(new String(buffer, 0, read, "UTF-8"));
                read = input.read(buffer);
            }
        } finally {
            // Close the stream even when reading fails.
            input.close();
        }

        String text = content.toString().trim();
        if (text.length() > 0) {
            userCount = Long.parseLong(text);
        }
    }

    /**
     * Computes {@code idf = ln(userCount / (documentsContainingWord + 1))} and writes one
     * {@code (word, user + "\t" + tf * idf)} record per incoming value.
     *
     * <p>The values must be buffered because the number of documents containing the word
     * is only known after the whole iterable has been consumed. When {@code userCount}
     * is 0 the logarithm is {@code -Infinity}, so the emitted scores are not meaningful.
     *
     * @param key     the word
     * @param values  strings of the form {@code user + "\t" + tf}
     * @param context MapReduce context the records are written to
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        LinkedList<String> userTfs = new LinkedList<String>();
        for (Text value : values) {
            // Hadoop reuses the Text instance between iterations, so keep a String copy.
            userTfs.add(value.toString());
        }
        long documentFrequency = userTfs.size();
        float idf = (float) Math.log((float) userCount
                / (float) (documentFrequency + 1));

        Text outputValue = new Text();
        for (String userTf : userTfs) {
            StringTokenizer parts = new StringTokenizer(userTf, "\t");
            String user = parts.nextToken();
            float tf = Float.parseFloat(parts.nextToken().trim());
            outputValue.set(user + "\t" + (tf * idf));
            context.write(key, outputValue);
        }
    }
}

/**
 * Mapper that counts input lines: one {@code ("usercount", "1")} record is written for
 * every newline-delimited token of the input value.
 */
public static class UserCountMap extends Mapper<Object, Text, Text, Text> {
    /**
     * Writes {@code ("usercount", "1")} once per non-empty line contained in {@code value}.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // countTokens() is exactly the number of times nextToken() could be called.
        int lineCount = new StringTokenizer(value.toString(), "\n").countTokens();
        for (int i = 0; i < lineCount; i++) {
            Text counterKey = new Text("usercount");
            Text increment = new Text("1");
            context.write(counterKey, increment);
        }
    }
}

/**
 * Combiner that adds up the numeric values of a key and writes a single partial sum.
 */
public static class UserCountCombine extends
        Reducer<Text, Text, Text, Text> {
    /**
     * Parses every value as a {@code long}, sums them and writes {@code (key, sum)}.
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        long partialSum = 0;
        Iterator<Text> counts = values.iterator();
        while (counts.hasNext()) {
            partialSum += Long.parseLong(counts.next().toString());
        }
        context.write(key, new Text(Long.toString(partialSum)));
    }
}

/**
 * Reducer that accumulates the total of all values it receives and, when the task
 * finishes, writes that total to the file {@code fileURL} on {@code hdfsURL}.
 */
public static class UserCountReduce extends Reducer<Text, Text, Text, Text> {
    /**
     * Running total. Declared as {@code long}: the original {@code int} field silently
     * truncated the {@code long} values added to it.
     */
    long userCount = 0;

    /**
     * Adds every value (parsed as a {@code long}) to the running total; writes nothing.
     *
     * @param key     the counter key; not used
     * @param values  decimal counts to add up
     * @param context MapReduce context; not used
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            userCount += Long.parseLong(value.toString());
        }
    }

    /**
     * Writes the accumulated total, as decimal text, to {@code fileURL}, overwriting any
     * existing file.
     *
     * @param context task context providing the job configuration
     * @throws IOException if the file cannot be created or written
     */
    @Override
    public void cleanup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
        Path path = new Path(fileURL);
        FSDataOutputStream output = fs.create(path, true);
        try {
            output.write(Long.toString(userCount).getBytes("UTF-8"));
            output.flush();
        } finally {
            // Release the stream even if the write fails.
            output.close();
        }
    }
}

/**
 * Driver: runs the TF job, stores the number of documents on HDFS, then runs the IDF job.
 *
 * <p>Arguments: {@code args[0]} is the input path and {@code args[1]} the TF job output
 * (also the IDF job input). The TF-IDF output path is {@code args[3]} when four or more
 * arguments are given (the historical layout, where {@code args[2]} belonged to a
 * separate counting job that is no longer run), otherwise {@code args[2]}.
 *
 * @param args command-line paths as described above
 * @throws IOException            if HDFS access or job submission fails
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException   if waiting for a job is interrupted
 */
public static void main(String[] args) throws IOException,
        ClassNotFoundException, InterruptedException {
    if (args.length < 3) {
        System.err.println(
                "Usage: TFIDF_5 <input> <tf output> [<unused>] <tfidf output>");
        System.exit(2);
    }
    String tfidfOutput = args.length > 3 ? args[3] : args[2];

    Configuration conf = new Configuration();
    Job tfJob = Job.getInstance(conf, "tfjob");
    tfJob.setJarByClass(TFIDF_5.class);
    tfJob.setMapperClass(TFMap.class);
    tfJob.setReducerClass(TFReduce.class);
    tfJob.setOutputKeyClass(Text.class);
    tfJob.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(tfJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(tfJob, new Path(args[1]));
    if (!tfJob.waitForCompletion(true)) {
        // Without TF output and its counters the IDF job has nothing valid to work on.
        System.exit(1);
    }

    // Count the documents and store the number on HDFS for the IDF reducers: the
    // number of map input records of the TF job is used as the document count.
    Counter ct = tfJob.getCounters().findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS");
    System.out.println(ct.getValue());
    Iterable<String> groupNames = tfJob.getCounters().getGroupNames();
    for (String groupName : groupNames) {
        System.out.println(groupName);
    }
    FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
    Path path = new Path(fileURL);
    FSDataOutputStream output = fs.create(path, true);
    try {
        output.write(Long.toString(ct.getValue()).getBytes("UTF-8"));
        output.flush();
    } finally {
        // Release the stream even if the write fails.
        output.close();
    }

    Job idfJob = Job.getInstance(conf, "idfjob");
    idfJob.setJarByClass(TFIDF_5.class);
    idfJob.setMapperClass(IDFMap.class);
    idfJob.setReducerClass(IDFReduce.class);
    idfJob.setOutputKeyClass(Text.class);
    idfJob.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(idfJob, new Path(args[1]));
    FileOutputFormat.setOutputPath(idfJob, new Path(tfidfOutput));
    System.exit(idfJob.waitForCompletion(true) ? 0 : 1);
}
}

最初使用了一个单独的job计算文档数,后来经过公司前辈的指点,可以在计算tf的时候利用输入数据的条数来巧妙地计算文档数。

运用mapreduce计算tf-idf的更多相关文章

  1. TF/IDF(term frequency/inverse document frequency)

    TF/IDF(term frequency/inverse document frequency) 的概念被公认为信息检索中最重要的发明. 一. TF/IDF描述单个term与特定document的相 ...

  2. TF/IDF计算方法

    FROM:http://blog.csdn.net/pennyliang/article/details/1231028 我们已经谈过了如何自动下载网页.如何建立索引.如何衡量网页的质量(Page R ...

  3. tf–idf算法解释及其python代码实现(下)

    tf–idf算法python代码实现 这是我写的一个tf-idf的简单实现的代码,我们知道tfidf=tf*idf,所以可以分别计算tf和idf值在相乘,首先我们创建一个简单的语料库,作为例子,只有四 ...

  4. tf–idf算法解释及其python代码实现(上)

    tf–idf算法解释 tf–idf, 是term frequency–inverse document frequency的缩写,它通常用来衡量一个词对在一个语料库中对它所在的文档有多重要,常用在信息 ...

  5. 文本分类学习(三) 特征权重(TF/IDF)和特征提取

    上一篇中,主要说的就是词袋模型.回顾一下,在进行文本分类之前,我们需要把待分类文本先用词袋模型进行文本表示.首先是将训练集中的所有单词经过去停用词之后组合成一个词袋,或者叫做字典,实际上一个维度很大的 ...

  6. 信息检索中的TF/IDF概念与算法的解释

    https://blog.csdn.net/class_brick/article/details/79135909 概念 TF-IDF(term frequency–inverse document ...

  7. Elasticsearch学习之相关度评分TF&IDF

    relevance score算法,简单来说,就是计算出,一个索引中的文本,与搜索文本,他们之间的关联匹配程度 Elasticsearch使用的是 term frequency/inverse doc ...

  8. tf idf公式及sklearn中TfidfVectorizer

    在文本挖掘预处理之向量化与Hash Trick中我们讲到在文本挖掘的预处理中,向量化之后一般都伴随着TF-IDF的处理,那么什么是TF-IDF,为什么一般我们要加这一步预处理呢?这里就对TF-IDF的 ...

  9. 25.TF&IDF算法以及向量空间模型算法

    主要知识点: boolean model TF/IDF vector space model     一.boolean model     在es做各种搜索进行打分排序时,会先用boolean mo ...

随机推荐

  1. xmlns:android="http://schemas.android.com/apk/res/android的作用是

    xmlns:android="http://schemas.android.com/apk/res/android的作用是 这个是xml的命名空间,有了他,你就可以alt+/作为提示,提示你 ...

  2. IE浏览器下web调试工具之--IE WebDeveloper介绍

    做Web项目的架构设计.开发.测试,免不了要熟悉Web页面调试工具,以此来获知哪些浏览器支持Web页面的显示,哪些浏览器下显示有问题. 目前市面上比较火爆的浏览器内核提供商,有微软的IE.mozill ...

  3. C# - 接口的继承

    代码: using System; using System.Collections.Generic; using System.Linq; using System.Text; using Syst ...

  4. C#游戏框架uFrame

    C#游戏框架uFrame兼谈游戏架构设计 c#语言规范 阅读目录 1.概览 2.基本概念 3.依赖注入 4.Manager of Managers 5.利用UniRX实现响应式编程 6.研究总结 回到 ...

  5. 基于visual Studio2013解决面试题之1102合并字符串

     题目

  6. 基于visual Studio2013解决C语言竞赛题之1023判断排序

         题目 解决代码及点评 /* 23. 有10个两位整数,把这些数作以下变化,如果它是素数, 则把它乘以2,若它是偶数则除以2,其余的数减1, 请将变化后的10个数按从小到大 ...

  7. [置顶] NS2中TCP拥塞控制仿真过程中盲点解析

    最近利用NS2做TCP拥塞控制协议的仿真,发现很多变量的方法含义都是解释的不清楚,给核心模块修改带来很多麻烦,所以决定用最准确的语言解释成员变量.方法,术语等的含义.限于个人水平,若有错误请留言指正! ...

  8. Java ArrayList add(int index, E element) example

    Simple add() method is used for adding an element at the end of the list however there is another va ...

  9. CSS3实现8种Loading效果【第二波】

    原文:CSS3实现8种Loading效果[第二波] 今晚吃完饭回宿舍又捣鼓了另外几种Loading效果,老规矩,直接“上菜“…… 注:gif图片动画有些卡顿,非实际效果! PS:若要转载请注明出处,尊 ...

  10. 树莓派的.bashrc和.bash_aliases文件

    在你的home文件夹中,你可以找到一个包含用户配置的隐藏文件.bashrc。你可以根据自己的需要修改这个文件。文件里为你提供了一些实用的调整设置,默认情况下其中一些设置是被注释掉的。比如,一些l ...