hadoop实现共同出现的单词(Word co-occurrence)

共同出现的单词(Word co-occurrence)是指在同一行(句子)中相邻的两个单词。每一对相邻的单词构成一个Co-Occurrence对。单词对不区分先后顺序和大小写:统计前先把单词转成小写,并把每对单词按字典序排列,因此 "we a" 和 "a we" 都记为 a:we。

Sample Input:

a b cc, c d d c
I Love U.
dd ee f g s sa dew ad da
So shaken as we are, so wan with care.
Find we a time for frighted peace to pant.
And breathe short-winded accents of new broil.
To be commenced in strands afar remote.
I Love U U love i.
i i i i

Sample Output:

a:b 1
a:time 1
a:we 1
accents:of 1
accents:short-winded 1
ad:da 1
ad:dew 1
afar:remote 1
afar:strands 1
and:breathe 1
are:so 1
are:we 1
as:shaken 1
as:we 1
b:cc 1
be:commenced 1
be:to 1
breathe:short-winded 1
broil:new 1
c:cc 1
c:d 2
care:with 1
commenced:in 1
d:d 1
dd:ee 1
dew:sa 1
ee:f 1
f:g 1
find:we 1
for:frighted 1
for:time 1
frighted:peace 1
g:s 1
i:i 3
i:love 3
in:strands 1
love:u 3
new:of 1
pant:to 1
peace:to 1
s:sa 1
shaken:so 1
so:wan 1
u:u 1
wan:with 1

Code:

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.RawComparator;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.WritableComparator;

import org.apache.hadoop.io.WritableUtils;

import org.apache.hadoop.mapred.Reporter;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.util.GenericOptionsParser;

public class CoOccurrence {

  /**
   * An unordered pair of words used as the MapReduce key.
   *
   * <p>The two words are stored in sorted order (smaller one first, by
   * {@link String#compareTo}), so {@code (a, b)} and {@code (b, a)} produce
   * the same key. The pair is serialized as the first {@link Text} followed
   * by the second {@link Text}.
   */
  public static class TextPair implements WritableComparable<TextPair> {

    private Text first;

    private Text second;

    /** Creates an empty pair; needed so the framework can deserialize into it. */
    public TextPair() {
      set(new Text(), new Text());
    }

    /**
     * Creates a pair from two strings.
     *
     * @param left one word of the pair
     * @param right the other word of the pair
     */
    public TextPair(String left, String right) {
      set(new Text(left), new Text(right));
    }

    /**
     * Creates a pair from two {@link Text} objects. The objects are stored by
     * reference, not copied, so later changes to them change this pair.
     *
     * @param left one word of the pair
     * @param right the other word of the pair
     */
    public TextPair(Text left, Text right) {
      set(left, right);
    }

    /**
     * Stores the two words in sorted order so that the pair is unordered.
     * The arguments are kept by reference, not copied.
     *
     * @param left one word of the pair
     * @param right the other word of the pair
     */
    public void set(Text left, Text right) {
      boolean alreadyOrdered = left.toString().compareTo(right.toString()) <= 0;
      if (alreadyOrdered) {
        this.first = left;
        this.second = right;
      } else {
        this.first = right;
        this.second = left;
      }
    }

    /** @return the smaller word of the pair */
    public Text getFirst() {
      return first;
    }

    /** @return the larger word of the pair */
    public Text getSecond() {
      return second;
    }

    /** Reads the two words in the order {@link #write} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
      first.readFields(in);
      second.readFields(in);
    }

    /** Writes the first word followed by the second word. */
    @Override
    public void write(DataOutput out) throws IOException {
      first.write(out);
      second.write(out);
    }

    @Override
    public int hashCode() {
      // 163 is just an odd multiplier that mixes the two hashes so that
      // (a, b) and (b, a)-style collisions are unlikely; any odd prime works.
      return first.hashCode() * 163 + second.hashCode();
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof TextPair)) {
        return false;
      }
      TextPair other = (TextPair) o;
      return first.equals(other.first) && second.equals(other.second);
    }

    /** @return the pair formatted as {@code first:second} */
    @Override
    public String toString() {
      return first + ":" + second;
    }

    /** Orders pairs by the first word, then by the second word. */
    @Override
    public int compareTo(TextPair tp) {
      int cmp = first.compareTo(tp.first);
      if (cmp != 0) {
        return cmp;
      }
      return second.compareTo(tp.second);
    }

    /**
     * A comparator that compares serialized {@link TextPair} keys directly on
     * their bytes, without deserializing them.
     */
    public static class Comparator extends WritableComparator {

      private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

      public Comparator() {
        super(TextPair.class);
      }

      /**
       * Compares two serialized pairs: first word, then second word.
       *
       * @param b1 buffer holding the first key
       * @param s1 offset of the first key in {@code b1}
       * @param l1 serialized length of the first key
       * @param b2 buffer holding the second key
       * @param s2 offset of the second key in {@code b2}
       * @param l2 serialized length of the second key
       */
      @Override
      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        try {
          // Serialized size of the first word: its length prefix plus its bytes.
          int firstLen1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
          int firstLen2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
          int cmp = TEXT_COMPARATOR.compare(b1, s1, firstLen1, b2, s2, firstLen2);
          if (cmp != 0) {
            return cmp;
          }
          // The second word occupies the rest of each key. Each key's own total
          // length must be used here (l2 for the second key, not l1), otherwise
          // keys of different sizes are compared over the wrong byte range.
          return TEXT_COMPARATOR.compare(b1, s1 + firstLen1, l1 - firstLen1,
                                         b2, s2 + firstLen2, l2 - firstLen2);
        } catch (IOException e) {
          throw new IllegalArgumentException(e);
        }
      }

    } // End of Comparator

    static { // register this comparator as the default for TextPair
      WritableComparator.define(TextPair.class, new Comparator());
    }

    /**
     * Compares serialized pairs by their first word only. Intended for use as
     * a grouping comparator, so that reduce is called once per first word.
     */
    public static class FirstComparator extends WritableComparator {

      private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

      public FirstComparator() {
        super(TextPair.class);
      }

      /** Compares only the first word of each serialized pair. */
      @Override
      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        try {
          // Serialized size of the first word: its length prefix plus its bytes.
          int firstLen1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
          int firstLen2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
          return TEXT_COMPARATOR.compare(b1, s1, firstLen1, b2, s2, firstLen2);
        } catch (IOException e) {
          throw new IllegalArgumentException(e);
        }
      }

    } // End of FirstComparator

  } // End of TextPair

  //Partition based on the first part of the pair.

  /**
   * Partitions keys by the first word of the pair, so that all pairs sharing
   * a first word are sent to the same reducer.
   */
  public static class FirstPartitioner extends Partitioner<TextPair, IntWritable> {

    /**
     * @param key the word pair
     * @param value the count (unused)
     * @param numPartitions number of reduce partitions
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(TextPair key, IntWritable value, int numPartitions) {
      // Hash the first word. Masking with Integer.MAX_VALUE clears the sign bit,
      // which is safe even for Integer.MIN_VALUE (where Math.abs stays negative).
      return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

  } // End of FirstPartitioner

  /**
   * Emits {@code (pair, 1)} for every two adjacent words on an input line.
   *
   * <p>Each line is cleaned by replacing every character other than ASCII
   * letters, digits, {@code -} and {@code '} with a space, then lower-cased
   * and split into words on spaces.
   */
  public static class MyMapper extends Mapper<LongWritable, Text, TextPair, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    // Compiled once instead of on every map() call. Fully qualified so that
    // no additional import is required.
    private static final java.util.regex.Pattern NON_WORD_CHARS =
        java.util.regex.Pattern.compile("[^a-zA-Z0-9-']");

    // Per-mapper buffers, reused across records to avoid allocating two Text
    // objects per pair.
    private final Text leftWord = new Text();

    private final Text rightWord = new Text();

    /**
     * @param inKey position of the line in the input (unused)
     * @param inValue one line of input text
     * @param context receives one {@code (TextPair, 1)} per adjacent word pair
     */
    @Override
    public void map(LongWritable inKey, Text inValue, Context context)
        throws IOException, InterruptedException {
      String cleaned = NON_WORD_CHARS.matcher(inValue.toString()).replaceAll(" ");
      // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
      cleaned = cleaned.toLowerCase(java.util.Locale.ROOT);

      // StringTokenizer never yields empty tokens, so a line that starts with
      // a space or punctuation does not produce a bogus pair with an empty word.
      StringTokenizer words = new StringTokenizer(cleaned, " ");
      if (!words.hasMoreTokens()) {
        return;
      }
      String previous = words.nextToken();
      while (words.hasMoreTokens()) {
        String current = words.nextToken();
        leftWord.set(previous);
        rightWord.set(current);
        // The pair is serialized by write(), so reusing the buffers is safe.
        context.write(new TextPair(leftWord, rightWord), ONE);
        previous = current;
      }
    }

  } // End of MyMapper

  /**
   * Adds up all counts received for a word pair and emits the pair with its
   * total. Input and output types are the same, so the class can also serve
   * as the combiner.
   */
  public static class MyReducer extends Reducer<TextPair, IntWritable, TextPair, IntWritable> {

    // Reused output value, overwritten on every reduce() call.
    private IntWritable result = new IntWritable();

    /**
     * @param inKey the word pair
     * @param inValues the partial counts for that pair
     * @param context receives {@code (inKey, total)}
     */
    @Override
    public void reduce(TextPair inKey, Iterable<IntWritable> inValues, Context context)
        throws IOException, InterruptedException {
      int total = 0;
      java.util.Iterator<IntWritable> counts = inValues.iterator();
      while (counts.hasNext()) {
        total = total + counts.next().get();
      }
      result.set(total);
      context.write(inKey, result);
    }

  } // End of MyReducer

  /**
   * Configures and runs the co-occurrence counting job.
   *
   * <p>Usage: {@code CoOccurrence [generic Hadoop options] <in> <out>}. Exits
   * with status 0 when the job succeeds, 1 when it fails, and 2 when the input
   * or output path is missing.
   *
   * @param args command-line arguments, optionally starting with generic
   *     Hadoop options
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Generic options (e.g. -D key=value) are applied to conf by the parser;
    // only the remaining arguments are the job's own input and output paths.
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: CoOccurrence <in> <out>");
      System.exit(2);
    }

    // NOTE(review): this constructor is deprecated in newer Hadoop releases in
    // favor of Job.getInstance(conf, name); kept for compatibility with the
    // Hadoop version this file was written against - confirm before changing.
    Job job = new Job(conf, "Co-Occurrence");
    job.setJarByClass(CoOccurrence.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Summing is associative, so the reducer doubles as the combiner.
    job.setCombinerClass(MyReducer.class);

    // FirstPartitioner and TextPair.FirstComparator are intentionally NOT set:
    // grouping by the first word alone would make MyReducer sum the counts of
    // different pairs together. The default partitioner and the registered
    // TextPair.Comparator group by the full pair.

    // The reduce output is (TextPair, IntWritable).
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(IntWritable.class);

    // Use the parsed arguments, not the raw args, so that generic options in
    // front of the paths are not mistaken for the paths themselves.
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  } // End of main

}//End of CoOccurrence

hadoop实现共同出现的单词(Word co-occurrence)的更多相关文章

Hadoop 统计文件中某个单词出现的次数
如文件word.txt内容如下: what is you name? my name is zhang san. 要求统计word.txt中出现“is”的次数? 代码如下: PerWordMapper ...
Hadoop入门实例——WordCount统计单词
首先要说明的是运行Hadoop需要jdk1.6或以上版本,如果你还没有搭建好Hadoop集群,请参考我的另一篇文章: Linux环境搭建Hadoop伪分布模式马上进入正题. 1.启动Hadoop集群 ...
linux makefile字符串操作函数替换subst、模式替换patsubst、去首尾空格strip、查找字符串findstring、过滤filter、反过滤filter-out、排序函数sort、取单词word、取单词串wordlist、个数统计words
1.1 字符操作函数使用在Makefile中可以使用函数来处理变量,从而让我们的命令或是规则更为的灵活和具有智能.make所支持的函数也不算很多,不过已经足够我们的操作了.函数调用后,函 ...
[LeetCode] Shortest Completing Word 最短完整的单词
Find the minimum length word from a given dictionary words, which has all the letters from the strin ...
Hadoop：使用原生python编写MapReduce
功能实现功能:统计文本文件中所有单词出现的频率功能. 下面是要统计的文本文件 [/root/hadooptest/input.txt] foo foo quux labs foo bar quux ...
Hadoop上路-03_Hadoop JavaAPI
一.Eclipse安装 1.下载解压下载:http://www.eclipse.org/downloads/ 解压:SHELL$ sudo tar -zxvf eclipse.tar.gz 2.快捷 ...
大数据【四】MapReduce（单词计数；二次排序；计数器；join；分布式缓存）
前言: 根据前面的几篇博客学习,现在可以进行MapReduce学习了.本篇博客首先阐述了MapReduce的概念及使用原理,其次直接从五个实验中实践学习(单词计数,二次排序,计数器,join,分 ...
Hadoop世界中的HelloWorld之WordCount具体分析
MapReduce 应用举例:单词计数 WorldCount可以说是MapReduce中的helloworld了,下面来看看hadoop中的例子worldcount对其进行的处理过程,也能对mapre ...
在Hadoop上用Python实现WordCount
一.简单说明本例中我们用Python写一个简单的运行在Hadoop上的MapReduce程序,即WordCount(读取文本文件并统计单词的词频).这里我们将要输入的单词文本input.txt和Py ...

随机推荐

构建高可用web站点学习--前言
前言:本人对于提高web站点的访问量等的有很浓厚的兴趣,也学习了将近一年的时间,希望能总结点东西,虽然很多东西都是从书籍和资料中学习的,而不是原创,但是这是我总结的一点感悟和进行的分类吧.而且可能思路 ...
mongodb 排序 Unable to determine the serialization information for the expression 异常
好久没用mongodb了...最近又开始用起来了. 遇到情景: 2句话分开写.是正常的,因为我是先取再排序的然而.我想直接排序出来. 就写在了一起.最后.ToList() 然后报 Una ...
如何让Qt 的程序等待一段时间（等待的同时，还让主界面刷新图片）good
后面这种方法可以不影响其他线程的响应,又可以达到等待的目的. 测试的一个小例子: class Widget : public QWidget { Q_OBJECT public: Widget(QWi ...
new Thread的弊端（转）
new Thread的弊端如下: a. 每次new Thread新建对象性能差.b. 线程缺乏统一管理,可能无限制新建线程,相互之间竞争,及可能占用过多系统资源导致死机或oom.c. 缺乏更多功能,如 ...
POJ2513 Colored Sticks(欧拉)
题目链接. 题目大意: 给很多木棍,两端被涂了颜色.任意两根木棍的相同颜色处可以拼接在一起,问有没有可能将所有的木棍都连起来,成一条直线? 分析: 考点,欧拉道路. 将一根木棍看成一条边,两端的颜色看 ...
【转】ListView与RadioButton组合——自定义单选列表
原文网址:http://blog.csdn.net/checkin001/article/details/11519131 Android自带的RadioButton单选框只支持添加文字,我们自己写A ...
【贪心+堆】XMU 1584 小明的烦恼
题目链接: http://acm.xmu.edu.cn/JudgeOnline/problem.php?id=1584 题目大意: 给n(n<=100 000)个任务的耗时和截至时间,问最少不能 ...
cf700A As Fast As Possible
On vacations n pupils decided to go on excursion and gather all together. They need to overcome the ...
一般处理程序中使用Session出现未将对象引用设置到对象的实例
遇到问题:未将对象引用设置到对象的实例那就在你的一般处理程序中加入红色背景的代码吧 using System; using System.Collections.Generic; using Sys ...
BitmapFactory.decodeByteArray() 返回null，分析与解决
问题描述:用android自带的Camera获取图片,上传至远程数据库中(mysql),以BLOB格式存储, 但在提取图片时,始终无法在android界面显示,示例代码如下: ..... .... ...

hadoop实现共同出现的单词(Word co-occurrence)

hadoop实现共同出现的单词(Word co-occurrence)的更多相关文章

随机推荐

热门专题