package com.uniclick.dapa.dstest;

import java.io.IOException;
import java.net.URI; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WordCount {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
String inputFilePath = "/user/zhouyuanlong/wordcount/input/wordTest*.txt";
String outputFilePath = "/user/zhouyuanlong/wordcount/output/";
String queue = "default";
String jobName = "wordCount";
if(args == null || args.length < 2){
System.out.println("[-INPUT <inputFilePath>"
+ "[-OUTPUT <outputFilePath>");
}else{
for(int i=0;i<args.length;i++){
if("-Q".equals(args[i])){
queue = args[++i];
}
}
}
Configuration conf = new Configuration();
conf.set("mapred.job.queue.name", queue);
Job job = new Job(conf, jobName);
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCountMapper.class);
// job.setCombinerClass(cls);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(inputFilePath));
Path path = new Path(outputFilePath);
FileSystem fs = FileSystem.get(URI.create(outputFilePath), conf);
if(fs.exists(path)){
// fs.delete(path);
fs.delete(path, true);
}
FileOutputFormat.setOutputPath(job, new Path(outputFilePath));
System.exit(job.waitForCompletion(true) ? 1 : 0);
} public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
private Text kt = new Text();
private final static IntWritable vt = new IntWritable(1); public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] arr = value.toString().split("\t");
for(int i = 0; i < arr.length; i++){
kt.set(arr[i]);
context.write(kt, vt);
}
}
} public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
private IntWritable vt = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException{
int sum = 0;
for(IntWritable intVal : values){
sum += intVal.get();
}
vt.set(sum);
context.write(key, vt);
}
} }

input目录中文件wordTest1.txt的内容(每行以table键分隔):

hello    world
hello    hadoop
hello    mapredruce

input目录中文件wordTest2.txt的内容(每行以table键分隔):

hello    world
hello    hadoop
hello    mapredruce

hdfs输出结果:

web     2
mapredruce      1
python  1
hadoop  1
hello   6
clojure 2
world   1
java    2

PS:对Hadoop自带的wordcount的例子略有改变

Hadoop最基本的wordcount(统计词频)的更多相关文章

  1. Hadoop上的中文分词与词频统计实践 (有待学习 http://www.cnblogs.com/jiejue/archive/2012/12/16/2820788.html)

    解决问题的方案 Hadoop上的中文分词与词频统计实践 首先来推荐相关材料:http://xiaoxia.org/2011/12/18/map-reduce-program-of-rmm-word-c ...

  2. Hadoop入门实例——WordCount统计单词

    首先要说明的是运行Hadoop需要jdk1.6或以上版本,如果你还没有搭建好Hadoop集群,请参考我的另一篇文章: Linux环境搭建Hadoop伪分布模式 马上进入正题. 1.启动Hadoop集群 ...

  3. 使用SparkSQL编写wordCount的词频统计

    # 使用SparkSQL编写wordCount的词频统计 ## word.txt```hello hello scala sparkjava sql html java hellojack jack ...

  4. Hadoop实战3:MapReduce编程-WordCount统计单词个数-eclipse-java-ubuntu环境

    之前习惯用hadoop streaming环境编写python程序,下面总结编辑java的eclipse环境配置总结,及一个WordCount例子运行. 一 下载eclipse安装包及hadoop插件 ...

  5. Hadoop版Helloworld之wordcount运行示例

    1.编写一个统计单词数量的java程序,并命名为wordcount.java,代码如下: import java.io.IOException; import java.util.StringToke ...

  6. 执行hadoop自带的WordCount实例

    hadoop 自带的WordCount实例可以统计一批文本文件中各单词出现的次数.下面介绍如何执行WordCount实例. 1.启动hadoop [root@hadoop ~]# start-all. ...

  7. Excel中COUNTIFS函数统计词频个数出现次数

    Excel中COUNTIFS函数统计词频个数出现次数   在Excel中经常需要实现如下需求:在某一列单元格中有不同的词语,有些词语相同,有的不同(如图1所示).需要统计Excel表格中每个词语出现的 ...

  8. hadoop自带例子wordcount的具体运行步骤

    1.在hadoop所在目录“usr/local”下创建一个文件夹input root@ubuntu:/usr/local# mkdir input 2.在文件夹input中创建两个文本文件file1. ...

  9. HADOOP :: java.lang.ClassNotFoundException: WordCount

    I am using eclipse to export the jar file of a map-reduce program. When i am run the jar using comma ...

随机推荐

  1. gnome中文翻译之po

    文件类型: po: 用msginit分析pot文件,生成各语言对应的po文件,比如中文的zh_CN.po. mo: 用msgfmt将po文件编译生成mo文件,这是二进制文件,不能直接编辑. gmo: ...

  2. Android Material Design NavigationView 及 Palette 颜色提取器

    DrawerLayout + NavigationView DrawerLayout布局,通常在里面添加两个子控件,程序主界面添加到NavitagionView前面. <android.supp ...

  3. 配置nginx1.7.8支持pathinfo模式

    vi nginx/conf/nginx.conf 1.修改正则 set $real_script_name $fastcgi_script_name; if ($fastcgi_script_name ...

  4. yii2源码学习笔记(十五)

    这几天有点忙今天好些了,继续上次的module来吧 /** * Returns the directory that contains the controller classes according ...

  5. TCP带外数据读写

    #include <sys/socket.h> #include <netinet/in.h> #include <arpa/inet.h> #include &l ...

  6. java + spring (jython\python\script) Error:SyntaxError: no viable alternative at character '\n'

    使用Jython结合java和Python开发功能时,要是遇到如下情况: 2016-03-10 16:16:49 DEBUG [com.freedom.orion.configs.JyhtonConf ...

  7. 深入了解一下PYTHON中关于SOCKETSERVER的模块-C

    同时处理多个客户端请求,并且为不同的CLIENT开不同的线程处理. 这个东东,就显然实用性稍强了一些.(FORK和THREAD方式均可,但各有应用) #!/usr/bin/env python fro ...

  8. 14.5.5 Deadlocks in InnoDB

    14.5.5 Deadlocks in InnoDB 14.5.5.1 An InnoDB Deadlock Example 14.5.5.2 Deadlock Detection and Rollb ...

  9. Android MonkeyRunner自动拨打电话

    from com.android.monkeyrunner import MonkeyRunner, MonkeyDevice import time device = MonkeyRunner.wa ...

  10. WordPress Design Approval System插件‘step’参数跨站脚本漏洞

    漏洞名称: WordPress Design Approval System插件‘step’参数跨站脚本漏洞 CNNVD编号: CNNVD-201309-084 发布时间: 2013-09-11 更新 ...