转:http://blog.csdn.net/jediael_lu/article/details/38705371

以下程序在hadoop1.2.1上测试成功。

本例先将源代码呈现,然后详细说明执行步骤,最后对源代码及执行过程进行分析。

一、源代码

 package org.jediael.hadoopdemo.wordcount;  

 import java.io.IOException;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; public class WordCount { public static class WordCountMap extends
Mapper<LongWritable, Text, Text, IntWritable> { private final IntWritable one = new IntWritable(1);
private Text word = new Text(); public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer token = new StringTokenizer(line);
while (token.hasMoreTokens()) {
word.set(token.nextToken());
context.write(word, one);
}
}
} public static class WordCountReduce extends
Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
} public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(WordCount.class);
job.setJobName("wordcount"); job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class); job.setMapperClass(WordCountMap.class);
job.setReducerClass(WordCountReduce.class); job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true);
}
}

二、执行程序

1、从eclipse中导出至wordcount.jar,并上传至hadoop服务器,本例中,将程序上传至/home/jediael/project。

2、安装hadoop伪分布模式,可参考Hadoop1.2.1伪分布模式安装指南,本实例将运行在hadoop的伪分布环境中。

3、在HDFS中创建目录wcinput,用作输入目录,并将需要分析的文件复制到目录下。

  1. [root@jediael conf]# hadoop fs -mkdir wcinput
    [root@jediael conf]# hadoop fs -copyFromLocal * wcinput
    [root@jediael conf]# hadoop fs -ls wcinput
    Found 26 items
    -rw-r--r-- 1 root supergroup 1524 2014-08-20 12:29 /user/root/wcinput/automaton-urlfilter.txt
    -rw-r--r-- 1 root supergroup 1311 2014-08-20 12:29 /user/root/wcinput/configuration.xsl
    -rw-r--r-- 1 root supergroup 131090 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xml
    -rw-r--r-- 1 root supergroup 4649 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xsd
    -rw-r--r-- 1 root supergroup 824 2014-08-20 12:29 /user/root/wcinput/domain-urlfilter.txt
    -rw-r--r-- 1 root supergroup 3368 2014-08-20 12:29 /user/root/wcinput/gora-accumulo-mapping.xml
    -rw-r--r-- 1 root supergroup 3279 2014-08-20 12:29 /user/root/wcinput/gora-cassandra-mapping.xml
    -rw-r--r-- 1 root supergroup 3447 2014-08-20 12:29 /user/root/wcinput/gora-hbase-mapping.xml
    -rw-r--r-- 1 root supergroup 2677 2014-08-20 12:29 /user/root/wcinput/gora-sql-mapping.xml
    -rw-r--r-- 1 root supergroup 2993 2014-08-20 12:29 /user/root/wcinput/gora.properties
    -rw-r--r-- 1 root supergroup 983 2014-08-20 12:29 /user/root/wcinput/hbase-site.xml
    -rw-r--r-- 1 root supergroup 3096 2014-08-20 12:29 /user/root/wcinput/httpclient-auth.xml
    -rw-r--r-- 1 root supergroup 3948 2014-08-20 12:29 /user/root/wcinput/log4j.properties
    -rw-r--r-- 1 root supergroup 511 2014-08-20 12:29 /user/root/wcinput/nutch-conf.xsl
    -rw-r--r-- 1 root supergroup 42610 2014-08-20 12:29 /user/root/wcinput/nutch-default.xml
    -rw-r--r-- 1 root supergroup 753 2014-08-20 12:29 /user/root/wcinput/nutch-site.xml
    -rw-r--r-- 1 root supergroup 347 2014-08-20 12:29 /user/root/wcinput/parse-plugins.dtd
    -rw-r--r-- 1 root supergroup 3016 2014-08-20 12:29 /user/root/wcinput/parse-plugins.xml
    -rw-r--r-- 1 root supergroup 857 2014-08-20 12:29 /user/root/wcinput/prefix-urlfilter.txt
    -rw-r--r-- 1 root supergroup 2484 2014-08-20 12:29 /user/root/wcinput/regex-normalize.xml
    -rw-r--r-- 1 root supergroup 1736 2014-08-20 12:29 /user/root/wcinput/regex-urlfilter.txt
    -rw-r--r-- 1 root supergroup 18969 2014-08-20 12:29 /user/root/wcinput/schema-solr4.xml
    -rw-r--r-- 1 root supergroup 6020 2014-08-20 12:29 /user/root/wcinput/schema.xml
    -rw-r--r-- 1 root supergroup 1766 2014-08-20 12:29 /user/root/wcinput/solrindex-mapping.xml
    -rw-r--r-- 1 root supergroup 1044 2014-08-20 12:29 /user/root/wcinput/subcollections.xml
    -rw-r--r-- 1 root supergroup 1411 2014-08-20 12:29 /user/root/wcinput/suffix-urlfilter.txt

4、运行程序

  1. [root@jediael project]# hadoop org.jediael.hadoopdemo.wordcount.WordCount wcinput wcoutput3
    14/08/20 12:50:25 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
    14/08/20 12:50:26 INFO input.FileInputFormat: Total input paths to process : 26
    14/08/20 12:50:26 INFO util.NativeCodeLoader: Loaded the native-hadoop library
    14/08/20 12:50:26 WARN snappy.LoadSnappy: Snappy native library not loaded
    14/08/20 12:50:26 INFO mapred.JobClient: Running job: job_201408191134_0005
    14/08/20 12:50:27 INFO mapred.JobClient: map 0% reduce 0%
    14/08/20 12:50:38 INFO mapred.JobClient: map 3% reduce 0%
    14/08/20 12:50:39 INFO mapred.JobClient: map 7% reduce 0%
    14/08/20 12:50:50 INFO mapred.JobClient: map 15% reduce 0%
    14/08/20 12:50:57 INFO mapred.JobClient: map 19% reduce 0%
    14/08/20 12:50:58 INFO mapred.JobClient: map 23% reduce 0%
    14/08/20 12:51:00 INFO mapred.JobClient: map 23% reduce 5%
    14/08/20 12:51:04 INFO mapred.JobClient: map 30% reduce 5%
    14/08/20 12:51:06 INFO mapred.JobClient: map 30% reduce 10%
    14/08/20 12:51:11 INFO mapred.JobClient: map 38% reduce 10%
    14/08/20 12:51:16 INFO mapred.JobClient: map 38% reduce 11%
    14/08/20 12:51:18 INFO mapred.JobClient: map 46% reduce 11%
    14/08/20 12:51:19 INFO mapred.JobClient: map 46% reduce 12%
    14/08/20 12:51:22 INFO mapred.JobClient: map 46% reduce 15%
    14/08/20 12:51:25 INFO mapred.JobClient: map 53% reduce 15%
    14/08/20 12:51:31 INFO mapred.JobClient: map 53% reduce 17%
    14/08/20 12:51:32 INFO mapred.JobClient: map 61% reduce 17%
    14/08/20 12:51:39 INFO mapred.JobClient: map 69% reduce 17%
    14/08/20 12:51:40 INFO mapred.JobClient: map 69% reduce 20%
    14/08/20 12:51:45 INFO mapred.JobClient: map 73% reduce 20%
    14/08/20 12:51:46 INFO mapred.JobClient: map 76% reduce 23%
    14/08/20 12:51:52 INFO mapred.JobClient: map 80% reduce 23%
    14/08/20 12:51:53 INFO mapred.JobClient: map 84% reduce 23%
    14/08/20 12:51:55 INFO mapred.JobClient: map 84% reduce 25%
    14/08/20 12:51:59 INFO mapred.JobClient: map 88% reduce 25%
    14/08/20 12:52:00 INFO mapred.JobClient: map 92% reduce 25%
    14/08/20 12:52:02 INFO mapred.JobClient: map 92% reduce 29%
    14/08/20 12:52:06 INFO mapred.JobClient: map 96% reduce 29%
    14/08/20 12:52:07 INFO mapred.JobClient: map 100% reduce 29%
    14/08/20 12:52:11 INFO mapred.JobClient: map 100% reduce 30%
    14/08/20 12:52:15 INFO mapred.JobClient: map 100% reduce 100%
    14/08/20 12:52:17 INFO mapred.JobClient: Job complete: job_201408191134_0005
    14/08/20 12:52:18 INFO mapred.JobClient: Counters: 29
    14/08/20 12:52:18 INFO mapred.JobClient: Job Counters
    14/08/20 12:52:18 INFO mapred.JobClient: Launched reduce tasks=1
    14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=192038
    14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
    14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
    14/08/20 12:52:18 INFO mapred.JobClient: Launched map tasks=26
    14/08/20 12:52:18 INFO mapred.JobClient: Data-local map tasks=26
    14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=95814
    14/08/20 12:52:18 INFO mapred.JobClient: File Output Format Counters
    14/08/20 12:52:18 INFO mapred.JobClient: Bytes Written=123950
    14/08/20 12:52:18 INFO mapred.JobClient: FileSystemCounters
    14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_READ=352500
    14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_READ=247920
    14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_WRITTEN=2177502
    14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=123950
    14/08/20 12:52:18 INFO mapred.JobClient: File Input Format Counters
    14/08/20 12:52:18 INFO mapred.JobClient: Bytes Read=244713
    14/08/20 12:52:18 INFO mapred.JobClient: Map-Reduce Framework
    14/08/20 12:52:18 INFO mapred.JobClient: Map output materialized bytes=352650
    14/08/20 12:52:18 INFO mapred.JobClient: Map input records=7403
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce shuffle bytes=352650
    14/08/20 12:52:18 INFO mapred.JobClient: Spilled Records=45210
    14/08/20 12:52:18 INFO mapred.JobClient: Map output bytes=307281
    14/08/20 12:52:18 INFO mapred.JobClient: Total committed heap usage (bytes)=3398606848
    14/08/20 12:52:18 INFO mapred.JobClient: CPU time spent (ms)=14400
    14/08/20 12:52:18 INFO mapred.JobClient: Combine input records=0
    14/08/20 12:52:18 INFO mapred.JobClient: SPLIT_RAW_BYTES=3207
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce input records=22605
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce input groups=6749
    14/08/20 12:52:18 INFO mapred.JobClient: Combine output records=0
    14/08/20 12:52:18 INFO mapred.JobClient: Physical memory (bytes) snapshot=4799041536
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce output records=6749
    14/08/20 12:52:18 INFO mapred.JobClient: Virtual memory (bytes) snapshot=19545337856
    14/08/20 12:52:18 INFO mapred.JobClient: Map output records=22605

5、查看结果

  1. [root@jediael project]# hadoop fs -ls wcoutput3
    Found 3 items
    -rw-r--r-- 1 root supergroup 0 2014-08-20 12:52 /user/root/wcoutput3/_SUCCESS
    drwxr-xr-x - root supergroup 0 2014-08-20 12:50 /user/root/wcoutput3/_logs
    -rw-r--r-- 1 root supergroup 123950 2014-08-20 12:52 /user/root/wcoutput3/part-r-00000
    [root@jediael project]# hadoop fs -cat wcoutput3/part-r-00000
    !! 2
    !ci.*.*.us 1
    !co.*.*.us 1
    !town.*.*.us 1
    "AS 22
    "Accept" 1
    "Accept-Language" 1
    "License"); 22
    "NOW" 1
    "WiFi" 1
    "Z" 1
    "all" 1
    "content" 1
    "delete 1
    "delimiter" 1

三、程序分析

1、WordCountMap类继承了org.apache.hadoop.mapreduce.Mapper,4个泛型类型分别是map函数输入key的类型,输入value的类型,输出key的类型,输出value的类型。
 
2、WordCountReduce类继承了org.apache.hadoop.mapreduce.Reducer,4个泛型类型含义与map类相同。
 
3、map的输出类型与reduce的输入类型相同,而一般情况下,map的输出类型与reduce的输出类型相同,因此,reduce的输入类型与输出类型相同。
 
4、hadoop根据以下代码确定输入内容的格式:
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat是hadoop默认的输入格式,它继承自FileInputFormat。它将数据集切割成小数据集InputSplit,每一个InputSplit由一个mapper处理。此外,InputFormat还提供了一个RecordReader的实现,将一个InputSplit解析成<key,value>的形式,并提供给map函数:
key:该行数据的起始位置在文件中的字节偏移量,数据类型是LongWritable。
value:每行数据的内容,类型是Text。
因此,在本例中,map函数的key/value类型是LongWritable与Text。
 
5、Hadoop根据以下代码确定输出内容的格式:
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat是hadoop默认的输出格式,它会将每条记录以一行的形式存入文本文件,如
the 30
happy 23
……

Hadoop入门经典:WordCount的更多相关文章

  1. Hadoop入门经典:WordCount 分类: A1_HADOOP 2014-08-20 14:43 2514人阅读 评论(0) 收藏

    以下程序在hadoop1.2.1上测试成功. 本例先将源代码呈现,然后详细说明执行步骤,最后对源代码及执行过程进行分析. 一.源代码 package org.jediael.hadoopdemo.wo ...

  2. Hadoop入门程序WordCount的执行过程

    首先编写WordCount.java源文件,分别通过map和reduce方法统计文本中每个单词出现的次数,然后按照字母的顺序排列输出, Map过程首先是多个map并行提取多个句子里面的单词然后分别列出 ...

  3. Hadoop入门实例——WordCount统计单词

    首先要说明的是运行Hadoop需要jdk1.6或以上版本,如果你还没有搭建好Hadoop集群,请参考我的另一篇文章: Linux环境搭建Hadoop伪分布模式 马上进入正题. 1.启动Hadoop集群 ...

  4. 初识Hadoop入门介绍

    初识hadoop入门介绍 Hadoop一直是我想学习的技术,正巧最近项目组要做电子商城,我就开始研究Hadoop,虽然最后鉴定Hadoop不适用我们的项目,但是我会继续研究下去,技多不压身. < ...

  5. hadoop 入门实例【转】

    原文链接:http://www.cnblogs.com/xia520pi/archive/2012/06/04/2534533.html 1.数据去重  "数据去重"主要是为了掌握 ...

  6. 一.hadoop入门须知

    目录: 1.hadoop入门须知 2.hadoop环境搭建 3.hadoop mapreduce之WordCount例子 4.idea本地调试hadoop程序 5.hadoop 从mysql中读取数据 ...

  7. Hadoop Mapreduce 案例 wordcount+统计手机流量使用情况

    mapreduce设计思想 概念:它是一个分布式并行计算的应用框架它提供相应简单的api模型,我们只需按照这些模型规则编写程序,即可实现"分布式并行计算"的功能. 案例一:word ...

  8. 大数据初级笔记二:Hadoop入门之Hadoop集群搭建

    Hadoop集群搭建 把环境全部准备好,包括编程环境. JDK安装 版本要求: 强烈建议使用64位的JDK版本,这样的优势在于JVM的能够访问到的最大内存就不受限制,基于后期可能会学习到Spark技术 ...

  9. 转 Kafka入门经典教程

    Kafka入门经典教程 http://www.aboutyun.com/thread-12882-1-1.html 问题导读 1.Kafka独特设计在什么地方?2.Kafka如何搭建及创建topic. ...

随机推荐

  1. Weak is not weak,Strong is not strong

    问题 今天做浏览器Controller的时候,碰到了一个奇怪的问题:每次pop浏览器controller之后,等几秒,总会碰到类似下面的错误(其中的xxxController就是浏览器或继承他的子类C ...

  2. 如何使用MASM来编译、连接、调试汇编语言

    先声明下,本人绝非大虾,也只是菜鸟一个,写此文的目的只是为了加深我对知识的理解罢了.好,进入正题.我是把masm解压后发在D盘中的一个叫masm的文件里,在masm文件里新建个记事本(记事本功能是很强 ...

  3. MongoDB之bson的介绍

    MongoDB之bson的介绍 1. 什么是bson BSON是一种类json的一种二进制形式的存储格式,简称Binary JSON,它和JSON一样,支持内嵌的文档对象和数组对象,但是BSON有JS ...

  4. vagrant up时提示 Authentication failure. Retrying

    vagrant up时提示 Authentication failure. Retrying 如图,启动时就报这个错误,virtualbox启动正常 用vagrant的账号密码也可以登录 就是不能使用 ...

  5. iframe 动态onload事件处理方式

    转自:http://w3help.org/zh-cn/causes/SD9022 标准参考 关于 HTML 4.01 规范中 BODY 标记的 onload 属性说明: http://www.w3.o ...

  6. mysql zip 版本配置方法

    -\bin 指 C:\Program Files\MySQL\MySQL Server 5.6\bin 1.增加环境变量 "PATH"-"-\bin" 2.修改 ...

  7. controller传值view

    400错误是请求错误 Model是map格式 @Controller public class HelloController { //view的值传给controller @RequestMappi ...

  8. 如何修改ubuntu系统的电脑名(主机名)

    在按照ubuntu系统时,会提示你给电脑填写一个名字,可能当时你没有想好,就随便填写了一个,可是以后就又有新的想法,想重新更换一个名字,该怎么办呢? 其实很简单.按照下面的步骤即可. 进去后,修改完, ...

  9. Apache服务器常规操作

    导读 Apache是世界上排名第一的Web服务器,50%以上的Web服务器都在使用Apache,它几乎可以在所有计算机平台上运行.下面就由我给大家说说Apache服务器的一些常规操作. Apache服 ...

  10. NGUI无限滑动

    http://www.unity蛮牛.com/blog-9383-1391.html 最近由于工作需要,就开始研究NGUI滑动.刚开始参考NGUI自带的循环滑动,利用隐藏和显示,提高GPU的渲染,但是 ...