Hadoop入门经典:WordCount 分类: A1_HADOOP 2014-08-20 14:43 2514人阅读 评论(0) 收藏
以下程序在hadoop1.2.1上测试成功。
本例先将源代码呈现,然后详细说明执行步骤,最后对源代码及执行过程进行分析。
一、源代码
package org.jediael.hadoopdemo.wordcount; import java.io.IOException;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; public class WordCount { public static class WordCountMap extends
Mapper<LongWritable, Text, Text, IntWritable> { private final IntWritable one = new IntWritable(1);
private Text word = new Text(); public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer token = new StringTokenizer(line);
while (token.hasMoreTokens()) {
word.set(token.nextToken());
context.write(word, one);
}
}
} public static class WordCountReduce extends
Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
} public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(WordCount.class);
job.setJobName("wordcount"); job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class); job.setMapperClass(WordCountMap.class);
job.setReducerClass(WordCountReduce.class); job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true);
}
}
二、执行程序
1、从eclipse中导出至wordcount.jar,并上传至hadoop服务器,本例中,将程序上传至/home/jediael/project。
2、安装hadoop伪分布模式,可参考Hadoop1.2.1伪分布模式安装指南,本实例将运行在hadoop的伪分布环境中。
3、在HDFS中创建目录wcinput,用作输入目录,并将需要分析的文件复制到目录下。
[root@jediael conf]# hadoop fs -mkdir wcinput
[root@jediael conf]# hadoop fs -copyFromLocal * wcinput
[root@jediael conf]# hadoop fs -ls wcinput
Found 26 items
-rw-r--r-- 1 root supergroup 1524 2014-08-20 12:29 /user/root/wcinput/automaton-urlfilter.txt
-rw-r--r-- 1 root supergroup 1311 2014-08-20 12:29 /user/root/wcinput/configuration.xsl
-rw-r--r-- 1 root supergroup 131090 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xml
-rw-r--r-- 1 root supergroup 4649 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xsd
-rw-r--r-- 1 root supergroup 824 2014-08-20 12:29 /user/root/wcinput/domain-urlfilter.txt
-rw-r--r-- 1 root supergroup 3368 2014-08-20 12:29 /user/root/wcinput/gora-accumulo-mapping.xml
-rw-r--r-- 1 root supergroup 3279 2014-08-20 12:29 /user/root/wcinput/gora-cassandra-mapping.xml
-rw-r--r-- 1 root supergroup 3447 2014-08-20 12:29 /user/root/wcinput/gora-hbase-mapping.xml
-rw-r--r-- 1 root supergroup 2677 2014-08-20 12:29 /user/root/wcinput/gora-sql-mapping.xml
-rw-r--r-- 1 root supergroup 2993 2014-08-20 12:29 /user/root/wcinput/gora.properties
-rw-r--r-- 1 root supergroup 983 2014-08-20 12:29 /user/root/wcinput/hbase-site.xml
-rw-r--r-- 1 root supergroup 3096 2014-08-20 12:29 /user/root/wcinput/httpclient-auth.xml
-rw-r--r-- 1 root supergroup 3948 2014-08-20 12:29 /user/root/wcinput/log4j.properties
-rw-r--r-- 1 root supergroup 511 2014-08-20 12:29 /user/root/wcinput/nutch-conf.xsl
-rw-r--r-- 1 root supergroup 42610 2014-08-20 12:29 /user/root/wcinput/nutch-default.xml
-rw-r--r-- 1 root supergroup 753 2014-08-20 12:29 /user/root/wcinput/nutch-site.xml
-rw-r--r-- 1 root supergroup 347 2014-08-20 12:29 /user/root/wcinput/parse-plugins.dtd
-rw-r--r-- 1 root supergroup 3016 2014-08-20 12:29 /user/root/wcinput/parse-plugins.xml
-rw-r--r-- 1 root supergroup 857 2014-08-20 12:29 /user/root/wcinput/prefix-urlfilter.txt
-rw-r--r-- 1 root supergroup 2484 2014-08-20 12:29 /user/root/wcinput/regex-normalize.xml
-rw-r--r-- 1 root supergroup 1736 2014-08-20 12:29 /user/root/wcinput/regex-urlfilter.txt
-rw-r--r-- 1 root supergroup 18969 2014-08-20 12:29 /user/root/wcinput/schema-solr4.xml
-rw-r--r-- 1 root supergroup 6020 2014-08-20 12:29 /user/root/wcinput/schema.xml
-rw-r--r-- 1 root supergroup 1766 2014-08-20 12:29 /user/root/wcinput/solrindex-mapping.xml
-rw-r--r-- 1 root supergroup 1044 2014-08-20 12:29 /user/root/wcinput/subcollections.xml
-rw-r--r-- 1 root supergroup 1411 2014-08-20 12:29 /user/root/wcinput/suffix-urlfilter.txt
4、运行程序
[root@jediael project]# hadoop org.jediael.hadoopdemo.wordcount.WordCount wcinput wcoutput3
14/08/20 12:50:25 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
14/08/20 12:50:26 INFO input.FileInputFormat: Total input paths to process : 26
14/08/20 12:50:26 INFO util.NativeCodeLoader: Loaded the native-hadoop library
14/08/20 12:50:26 WARN snappy.LoadSnappy: Snappy native library not loaded
14/08/20 12:50:26 INFO mapred.JobClient: Running job: job_201408191134_0005
14/08/20 12:50:27 INFO mapred.JobClient: map 0% reduce 0%
14/08/20 12:50:38 INFO mapred.JobClient: map 3% reduce 0%
14/08/20 12:50:39 INFO mapred.JobClient: map 7% reduce 0%
14/08/20 12:50:50 INFO mapred.JobClient: map 15% reduce 0%
14/08/20 12:50:57 INFO mapred.JobClient: map 19% reduce 0%
14/08/20 12:50:58 INFO mapred.JobClient: map 23% reduce 0%
14/08/20 12:51:00 INFO mapred.JobClient: map 23% reduce 5%
14/08/20 12:51:04 INFO mapred.JobClient: map 30% reduce 5%
14/08/20 12:51:06 INFO mapred.JobClient: map 30% reduce 10%
14/08/20 12:51:11 INFO mapred.JobClient: map 38% reduce 10%
14/08/20 12:51:16 INFO mapred.JobClient: map 38% reduce 11%
14/08/20 12:51:18 INFO mapred.JobClient: map 46% reduce 11%
14/08/20 12:51:19 INFO mapred.JobClient: map 46% reduce 12%
14/08/20 12:51:22 INFO mapred.JobClient: map 46% reduce 15%
14/08/20 12:51:25 INFO mapred.JobClient: map 53% reduce 15%
14/08/20 12:51:31 INFO mapred.JobClient: map 53% reduce 17%
14/08/20 12:51:32 INFO mapred.JobClient: map 61% reduce 17%
14/08/20 12:51:39 INFO mapred.JobClient: map 69% reduce 17%
14/08/20 12:51:40 INFO mapred.JobClient: map 69% reduce 20%
14/08/20 12:51:45 INFO mapred.JobClient: map 73% reduce 20%
14/08/20 12:51:46 INFO mapred.JobClient: map 76% reduce 23%
14/08/20 12:51:52 INFO mapred.JobClient: map 80% reduce 23%
14/08/20 12:51:53 INFO mapred.JobClient: map 84% reduce 23%
14/08/20 12:51:55 INFO mapred.JobClient: map 84% reduce 25%
14/08/20 12:51:59 INFO mapred.JobClient: map 88% reduce 25%
14/08/20 12:52:00 INFO mapred.JobClient: map 92% reduce 25%
14/08/20 12:52:02 INFO mapred.JobClient: map 92% reduce 29%
14/08/20 12:52:06 INFO mapred.JobClient: map 96% reduce 29%
14/08/20 12:52:07 INFO mapred.JobClient: map 100% reduce 29%
14/08/20 12:52:11 INFO mapred.JobClient: map 100% reduce 30%
14/08/20 12:52:15 INFO mapred.JobClient: map 100% reduce 100%
14/08/20 12:52:17 INFO mapred.JobClient: Job complete: job_201408191134_0005
14/08/20 12:52:18 INFO mapred.JobClient: Counters: 29
14/08/20 12:52:18 INFO mapred.JobClient: Job Counters
14/08/20 12:52:18 INFO mapred.JobClient: Launched reduce tasks=1
14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=192038
14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
14/08/20 12:52:18 INFO mapred.JobClient: Launched map tasks=26
14/08/20 12:52:18 INFO mapred.JobClient: Data-local map tasks=26
14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=95814
14/08/20 12:52:18 INFO mapred.JobClient: File Output Format Counters
14/08/20 12:52:18 INFO mapred.JobClient: Bytes Written=123950
14/08/20 12:52:18 INFO mapred.JobClient: FileSystemCounters
14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_READ=352500
14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_READ=247920
14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_WRITTEN=2177502
14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=123950
14/08/20 12:52:18 INFO mapred.JobClient: File Input Format Counters
14/08/20 12:52:18 INFO mapred.JobClient: Bytes Read=244713
14/08/20 12:52:18 INFO mapred.JobClient: Map-Reduce Framework
14/08/20 12:52:18 INFO mapred.JobClient: Map output materialized bytes=352650
14/08/20 12:52:18 INFO mapred.JobClient: Map input records=7403
14/08/20 12:52:18 INFO mapred.JobClient: Reduce shuffle bytes=352650
14/08/20 12:52:18 INFO mapred.JobClient: Spilled Records=45210
14/08/20 12:52:18 INFO mapred.JobClient: Map output bytes=307281
14/08/20 12:52:18 INFO mapred.JobClient: Total committed heap usage (bytes)=3398606848
14/08/20 12:52:18 INFO mapred.JobClient: CPU time spent (ms)=14400
14/08/20 12:52:18 INFO mapred.JobClient: Combine input records=0
14/08/20 12:52:18 INFO mapred.JobClient: SPLIT_RAW_BYTES=3207
14/08/20 12:52:18 INFO mapred.JobClient: Reduce input records=22605
14/08/20 12:52:18 INFO mapred.JobClient: Reduce input groups=6749
14/08/20 12:52:18 INFO mapred.JobClient: Combine output records=0
14/08/20 12:52:18 INFO mapred.JobClient: Physical memory (bytes) snapshot=4799041536
14/08/20 12:52:18 INFO mapred.JobClient: Reduce output records=6749
14/08/20 12:52:18 INFO mapred.JobClient: Virtual memory (bytes) snapshot=19545337856
14/08/20 12:52:18 INFO mapred.JobClient: Map output records=22605
5、查看结果
[root@jediael project]# hadoop fs -ls wcoutput3
Found 3 items
-rw-r--r-- 1 root supergroup 0 2014-08-20 12:52 /user/root/wcoutput3/_SUCCESS
drwxr-xr-x - root supergroup 0 2014-08-20 12:50 /user/root/wcoutput3/_logs
-rw-r--r-- 1 root supergroup 123950 2014-08-20 12:52 /user/root/wcoutput3/part-r-00000
[root@jediael project]# hadoop fs -cat wcoutput3/part-r-00000
!! 2
!ci.*.*.us 1
!co.*.*.us 1
!town.*.*.us 1
"AS 22
"Accept" 1
"Accept-Language" 1
"License"); 22
"NOW" 1
"WiFi" 1
"Z" 1
"all" 1
"content" 1
"delete 1
"delimiter" 1
………………
三、程序分析
版权声明:本文为博主原创文章,未经博主允许不得转载。
Hadoop入门经典:WordCount 分类: A1_HADOOP 2014-08-20 14:43 2514人阅读 评论(0) 收藏的更多相关文章
- 【solr专题之一】Solr快速入门 分类: H4_SOLR/LUCENCE 2014-07-02 14:59 2403人阅读 评论(0) 收藏
一.Solr学习相关资料 1.官方材料 (1)快速入门:http://lucene.apache.org/solr/4_9_0/tutorial.html,以自带的example项目快速介绍发Solr ...
- IIS上虚拟站点的web.config与主站点的web.config冲突解决方法 分类: ASP.NET 2015-06-15 14:07 60人阅读 评论(0) 收藏
IIS上在主站点下搭建虚拟目录后,子站点中的<system.web>节点与主站点的<system.web>冲突解决方法: 在主站点的<system.web>上一级添 ...
- C/C++中const的用法 分类: C/C++ 2015-07-05 00:43 85人阅读 评论(0) 收藏
const是C语言的关键字,经C++进行扩充,变得功能强大,用法复杂.const用于定义一个常变量(只读变量),当const与指针,引用,函数等结合起来使用时,情况会变得复杂的多.下面将从五个方面总结 ...
- C++实现不能被继承的类——终结类 分类: C/C++ 2015-04-06 14:48 64人阅读 评论(0) 收藏
1. 问题 C++如何实现不能被继承的类,即终结类.Java中有final关键字修饰,C#中有sealed关键字修饰,而C++目前还没有类似的关键字来修饰类实现终结类,需编程人员手动实现. ...
- Dungeon Master 分类: 搜索 POJ 2015-08-09 14:25 4人阅读 评论(0) 收藏
Dungeon Master Time Limit: 1000MS Memory Limit: 65536K Total Submissions: 20995 Accepted: 8150 Descr ...
- Codeforces 343D Water Tree 分类: Brush Mode 2014-10-05 14:38 98人阅读 评论(0) 收藏
Mad scientist Mike has constructed a rooted tree, which consists of n vertices. Each vertex is a res ...
- Beautiful People 分类: Brush Mode 2014-10-01 14:33 100人阅读 评论(0) 收藏
Beautiful People Time Limit: 10000/5000MS (Java/Others) Memory Limit: 128000/64000KB (Java/Others) ...
- Hdu 1009 FatMouse' Trade 分类: Translation Mode 2014-08-04 14:07 74人阅读 评论(0) 收藏
FatMouse' Trade Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (Java/Others) ...
- sql 视图 按where条件多个字段取一个 分类: SQL Server 2014-12-01 14:09 308人阅读 评论(0) 收藏
首先介绍一下 Case ..When...Then..End 的用法: CASEJiXiaoFind_RowID WHEN '1' THENJiXiao_Money1 WHEN '2' THEN ...
随机推荐
- 【试水CAS-4.0.3】第07节_CASclient配置单点登录
完整版见https://jadyer.github.io/2015/07/26/sso-cas-client-login/ 本文源代码下载:http://download.csdn.net/detai ...
- Codeforces Round #316 (Div. 2) A B C
A. Elections time limit per test 1 second memory limit per test 256 megabytes input standard input o ...
- [NOI.AC#35]string 缩点+拓扑排序
链接 因为有交换相邻字母,因此给你字符串就相当于给你了这个字符串的所有排列 把等价的串映射到整数范围,再根据 \(m\) 种魔法连边,缩点后在 DAG 上DP即可 无耻地用了int128 #inclu ...
- if 的理解
1. if 实现集合的划分 比如著名的 Prim 算法(最小生成树),从某一确定的点出发,每次新加入的点,都是在已访问过的结点(u∈U)和未访问过(v∈V−U)的结点之间的边.这里的未被访问(V−U) ...
- monkey基础知识(二)
- HDU——T 3579 Hello Kiki
http://acm.hdu.edu.cn/showproblem.php?pid=3579 Time Limit: 2000/1000 MS (Java/Others) Memory Limi ...
- CODEVS——T1332 上白泽慧音 || 洛谷——P1726 上白泽慧音
http://codevs.cn/problem/1332/|| https://www.luogu.org/problem/show?pid=1726#sub 时间限制: 1 s 空间限制: 1 ...
- Google Web Toolkit(GWT) 在windows下环境搭建
1.什么是GWT? Google Web Toolkit(简称GWT,读作/ˈɡwɪt/),是一个前端使用JavaScript,后端使用Java的AJAX framework,以Apache许可证2. ...
- 内网使用 IPV6 之 TunnelBroker隧道(6in4)篇
内网使用 IPV6 之 TunnelBroker隧道(6in4)篇 据非专业网民推测 tunnelbroker isatap 和 6to4 貌似都需要公网,但有网民测试这位大作的方法可行.特别之处是 ...
- hashlib —— Python 的 md5 和 sha1 加密
python的md5和sha1加密 0. md5 与 sha1 MD5 的全称是 Message-Digest Algorithm 5(信息-摘要算法).128 位长度.目前 MD5 是一种不可逆算法 ...