Implementing Multi-File Output in Hadoop
For example, suppose the input file word.txt contains:
aaa bbb aba abc
bba bbd bbbc
cc ccd cce
The goal is to group the words by their first letter and write each group to its own output file.
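With that sample input, the job below should end up producing output files roughly like the following (the comma separator and the counts come from the custom output format and the word-count reducer shown later):

a.txt: aaa,1 / aba,1 / abc,1
b.txt: bba,1 / bbb,1 / bbbc,1 / bbd,1
c.txt: cc,1 / ccd,1 / cce,1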
The code is as follows:
LineRecordWriter
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * Writes one "key<separator>value" line per record, the same way
 * TextOutputFormat's built-in LineRecordWriter does.
 */
public class LineRecordWriter<K, V> extends RecordWriter<K, V> {

    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    // Text is written as raw bytes; anything else falls back to toString().
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}
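To get a feel for what LineRecordWriter emits, it can be driven outside MapReduce against an in-memory stream. The following is only a sketch for illustration; the demo class name is made up and is not part of the job itself:

package com.hadoop.multi;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical standalone demo, not part of the original post.
public class LineRecordWriterDemo {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        LineRecordWriter<Text, IntWritable> writer =
                new LineRecordWriter<Text, IntWritable>(new DataOutputStream(buffer), ",");
        writer.write(new Text("aaa"), new IntWritable(1));
        writer.write(new Text("abc"), new IntWritable(2));
        // close() only closes the underlying stream, so null is fine for a local test.
        writer.close(null);
        // Prints "aaa,1" and "abc,2" on separate lines.
        System.out.print(buffer.toString("UTF-8"));
    }
}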
MultipleOutputFormat
package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * An OutputFormat that routes each record to a file whose name is derived
 * from the record itself via generateFileNameForKeyValue().
 */
public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /** Subclasses decide which file a given key/value pair goes to. */
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);

    public class MultiRecordWriter extends RecordWriter<K, V> {

        // One underlying RecordWriter per target file name.
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            // Work out which output file this record belongs to.
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass =
                        getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(
                        new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}
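The only thing a concrete subclass must supply is generateFileNameForKeyValue(), which maps each record to a target file name; the Configuration argument lets that mapping be driven by job settings. As a sketch of that idea (not part of the original code, and the multi.output.prefix.length property name is invented), a subclass could split files by a configurable key-prefix length instead of by first letter:

package com.hadoop.multi;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical example subclass; "multi.output.prefix.length" is an assumed property name.
public class PrefixLengthOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
    @Override
    protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
        int len = conf.getInt("multi.output.prefix.length", 1);
        String k = key.toString();
        String prefix = k.length() > len ? k.substring(0, len) : k;
        return prefix + ".txt";
    }
}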
MultiFileOutPut
package com.hadoop.multi;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.hadoop.multi.MultipleOutputFormat;

public class MultiFileOutPut {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Routes each word to "<first letter>.txt", or to "other.txt" if the word
    // does not start with a letter.
    public static class AlphabetOutputFormat extends MultipleOutputFormat<Text, IntWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {
            char c = key.toString().toLowerCase().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return c + ".txt";
            }
            return "other.txt";
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(MultiFileOutPut.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(AlphabetOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
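One caveat worth adding (my own note, not from the original code): the driver relies on the default of a single reduce task. With more reducers, words that share a first letter can be sent to different reduce tasks, so two tasks may both create a file named a.txt and clash when their outputs are committed to the final output directory. For a small example like this, the simplest safeguard is to pin the reducer count explicitly in main():

// Assumption/caveat, not in the original post: keep one reducer so that
// per-letter file names cannot collide across reduce tasks.
job.setNumReduceTasks(1);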