Hadoop: Implementing Multi-File Output
For example, suppose word.txt contains the following:
aaa bbb aba abc
bba bbd bbbc
cc ccd cce
The requirement is to group the words by their first letter and write each group to its own output file.
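To make the requirement concrete, the sample input above should end up split roughly like this (the layout is inferred from the code below, which names each output file after the word's first letter and joins word and count with a "," separator; it is not copied from an actual run, and in the real files each word,count pair sits on its own line):

a.txt  ->  aaa,1   aba,1   abc,1
b.txt  ->  bba,1   bbb,1   bbbc,1   bbd,1
c.txt  ->  cc,1    ccd,1   cce,1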
The code is as follows:
LineRecordWriter
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * Writes each key/value pair as one text line: key, separator, value, newline.
 * This mirrors the LineRecordWriter nested inside Hadoop's TextOutputFormat.
 */
public class LineRecordWriter<K, V> extends RecordWriter<K, V> {

    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    // Text objects are written as raw bytes; anything else falls back to toString().
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}
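LineRecordWriter can also be exercised outside a job, which makes its formatting easy to verify. The following standalone sketch is not part of the original post; it assumes the local file system and an illustrative path /tmp/demo.txt:

package com.hadoop.multi;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical demo class, not referenced by the job below.
public class LineRecordWriterDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        FSDataOutputStream out = fs.create(new Path("/tmp/demo.txt"), true);

        LineRecordWriter<Text, IntWritable> writer =
                new LineRecordWriter<Text, IntWritable>(out, ",");
        writer.write(new Text("aaa"), new IntWritable(1)); // writes the line "aaa,1"
        writer.write(new Text("bbb"), new IntWritable(2)); // writes the line "bbb,2"
        writer.close(null); // close() only closes the stream, so a null context is fine here
    }
}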
MultipleOutputFormat
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * An OutputFormat that routes each key/value pair to a file whose name is
 * produced by generateFileNameForKeyValue().
 */
public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /** Subclasses decide which file a given key/value pair goes to. */
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);

    public class MultiRecordWriter extends RecordWriter<K, V> {

        // One underlying LineRecordWriter per output file, keyed by file name
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            // Get the output file name for this key/value pair
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass =
                        getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(
                        new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}
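The only piece a subclass has to supply is generateFileNameForKeyValue(). As an illustration (not from the original post), a hypothetical format that buckets words by their length rather than by first letter could look like this:

package com.hadoop.multi;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical alternative subclass: routes each word to a file named after its length.
public class LengthOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
    @Override
    protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
        return "len-" + key.toString().length() + ".txt"; // e.g. "aaa" -> len-3.txt
    }
}

The AlphabetOutputFormat used by the actual job below follows exactly the same pattern.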
MultiFileOutPut
package com.hadoop.multi;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.hadoop.multi.MultipleOutputFormat;

public class MultiFileOutPut {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static class AlphabetOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
            char c = key.toString().toLowerCase().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return c + ".txt";
            }
            return "other.txt";
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(MultiFileOutPut.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(AlphabetOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
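Once the three classes are packaged into a jar (the jar name below is only illustrative), the job is submitted like any other MapReduce program, for example:

hadoop jar multi-output.jar com.hadoop.multi.MultiFileOutPut /input/word.txt /output

With the default single reduce task, the per-letter files (a.txt, b.txt, and c.txt for the sample input) should appear under the output directory after the job commits, in place of the usual part-r-00000 file.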