Implementing Multi-File Output in Hadoop
For example, suppose the input file word.txt contains:
aaa bbb aba abc
bba bbd bbbc
cc ccd cce
The goal is to group the words by their first letter and write each group to its own output file.
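With that sample input, the job below should end up producing output files roughly like the following (the comma separator and the counts come from the custom output format and the word-count reducer shown later):

a.txt: aaa,1 / aba,1 / abc,1
b.txt: bba,1 / bbb,1 / bbbc,1 / bbd,1
c.txt: cc,1 / ccd,1 / cce,1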
The code is as follows:
LineRecordWriter
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * Writes one "key<separator>value" line per record, the same way
 * TextOutputFormat's built-in LineRecordWriter does.
 */
public class LineRecordWriter<K, V> extends RecordWriter<K, V> {

    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    // Text is written as raw bytes; anything else falls back to toString().
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}
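To get a feel for what LineRecordWriter emits, it can be driven outside MapReduce against an in-memory stream. The following is only a sketch for illustration; the demo class name is made up and is not part of the job itself:

package com.hadoop.multi;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical standalone demo, not part of the original post.
public class LineRecordWriterDemo {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        LineRecordWriter<Text, IntWritable> writer =
                new LineRecordWriter<Text, IntWritable>(new DataOutputStream(buffer), ",");
        writer.write(new Text("aaa"), new IntWritable(1));
        writer.write(new Text("abc"), new IntWritable(2));
        // close() only closes the underlying stream, so null is fine for a local test.
        writer.close(null);
        // Prints "aaa,1" and "abc,2" on separate lines.
        System.out.print(buffer.toString("UTF-8"));
    }
}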
MultipleOutputFormat
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * An OutputFormat that routes each record to a file whose name is derived
 * from the record itself via generateFileNameForKeyValue().
 */
public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /** Subclasses decide which file a given key/value pair goes to. */
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);

    public class MultiRecordWriter extends RecordWriter<K, V> {

        // One underlying RecordWriter per target file name.
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            // Work out which output file this record belongs to.
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass =
                        getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(
                        new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}
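The only thing a concrete subclass must supply is generateFileNameForKeyValue(), which maps each record to a target file name; the Configuration argument lets that mapping be driven by job settings. As a sketch of that idea (not part of the original code, and the multi.output.prefix.length property name is invented), a subclass could split files by a configurable key-prefix length instead of by first letter:

package com.hadoop.multi;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical example subclass; "multi.output.prefix.length" is an assumed property name.
public class PrefixLengthOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
    @Override
    protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
        int len = conf.getInt("multi.output.prefix.length", 1);
        String k = key.toString();
        String prefix = k.length() > len ? k.substring(0, len) : k;
        return prefix + ".txt";
    }
}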
MultiFileOutPut
package com.hadoop.multi;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.hadoop.multi.MultipleOutputFormat;

public class MultiFileOutPut {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Routes each word to "<first letter>.txt", or to "other.txt" if the word
    // does not start with a letter.
    public static class AlphabetOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
            char c = key.toString().toLowerCase().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return c + ".txt";
            }
            return "other.txt";
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(MultiFileOutPut.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(AlphabetOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
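One caveat worth adding (my own note, not from the original code): the driver relies on the default of a single reduce task. With more reducers, words that share a first letter can be sent to different reduce tasks, so two tasks may both create a file named a.txt and clash when their outputs are committed to the final output directory. For a small example like this, the simplest safeguard is to pin the reducer count explicitly in main():

// Assumption/caveat, not in the original post: keep one reducer so that
// per-letter file names cannot collide across reduce tasks.
job.setNumReduceTasks(1);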