hadoop 使用map合并小文件到SequenceFile
上一例是直接用SequenceFile的createWriter来实现,本例采用mapreduce的方式。
1、把小文件整体读入需要自定义InputFormat格式,自定义InputFormat格式需要先定义RecordReader读取方式,为了整体读入,RecordReader使用一次性读入所有字节。
1.1 继承RecordReader泛型,重写这个类。
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit; import java.io.IOException; public class WholeFileRecordReader extends RecordReader<NullWritable,BytesWritable> {
private FileSplit fileSplit;
private Configuration conf;
private BytesWritable value = new BytesWritable();
private boolean processed = false;
/**
* Called once at initialization.
*
* @param split the split that defines the range of records to read
* @param context the information about the task
* @throws IOException
* @throws InterruptedException
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.fileSplit = (FileSplit) split;
this.conf = context.getConfiguration();
} /**
* Read the next key, value pair.
*
* @return true if a key/value pair was read
* @throws IOException
* @throws InterruptedException
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if(!processed){
byte[] contents = new byte[(int)fileSplit.getLength()];
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
try {
in = fs.open(file);
IOUtils.readFully(in,contents,0,contents.length);//一次全部读取
value.set(contents,0,contents.length);
}finally {
IOUtils.closeStream(in);
}
processed = true;
return true;
} return false;
} /**
* Get the current key
*
* @return the current key or null if there is no current key
* @throws IOException
* @throws InterruptedException
*/
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
} /**
* Get the current value.
*
* @return the object that was read
* @throws IOException
* @throws InterruptedException
*/
@Override
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
return value;
} /**
* The current progress of the record reader through its data.
*
* @return a number between 0.0 and 1.0 that is the fraction of the data read
* @throws IOException
* @throws InterruptedException
*/
@Override
public float getProgress() throws IOException, InterruptedException {
return processed ? 1.0f:0.0f;
} /**
* Close the record reader.
*/
@Override
public void close() throws IOException { }
}
1.2 继承FileInputFormat泛型,重写文件输入格式
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import java.io.IOException; public class WholeFileInputFormat extends FileInputFormat<NullWritable,BytesWritable> {
/**
* Is the given filename splittable? Usually, true, but if the file is
* stream compressed, it will not be.
* <p>
* The default implementation in <code>FileInputFormat</code> always returns
* true. Implementations that may deal with non-splittable files <i>must</i>
* override this method.
* <p>
* <code>FileInputFormat</code> implementations can override this and return
* <code>false</code> to ensure that individual input files are never split-up
* so that process entire files.
*
* @param context the job context
* @param filename the file name to check
* @return is this file splitable?
*/
@Override
protected boolean isSplitable(JobContext context, Path filename) {
return false;//文件不分片,为了整体读入
} /**
* Create a record reader for a given split. The framework will call
* {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
* the split is used.
*
* @param split the split to be read
* @param context the information about the task
* @return a new record reader
* @throws IOException
* @throws InterruptedException
*/
@Override
public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
WholeFileRecordReader recordReader = new WholeFileRecordReader();
recordReader.initialize(split,context);
return recordReader;
}
}
2、MAPPER,不要写reduce,本例只是合并文件。
public class SequenceFileMapper extends Mapper<NullWritable,BytesWritable,Text,BytesWritable> {
enum FileCounter {
FILENUM
}
private Text filenameKey;
/**
* Called once at the beginning of the task.
*
* @param context
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
InputSplit split = context.getInputSplit();
Path path = ((FileSplit)split).getPath();
filenameKey = new Text(path.toString());
} /**
* Called once for each key/value pair in the input split. Most applications
* should override this, but the default is the identity function.
*
* @param key
* @param value
* @param context
*/
@Override
protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
context.write(filenameKey,value);
//自定义计数器
context.getCounter(FileCounter.FILENUM).increment(1);
//动态计数器
context.getCounter("FileNameList",filenameKey.toString()).increment(1);
}
}
3、执行job,使用辅助类Tool,也可以不用,直接写job执行就可以。
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; public class SmallFilesToSequenceFileConverter extends Configured implements Tool { /**
* Execute the command with the given arguments.
*
* @param args command specific arguments.
* @return exit code.
* @throws Exception
*/
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
if(conf==null){
return -1;
} Path outPath = new Path(args[1]);
FileSystem fileSystem = outPath.getFileSystem(conf);
//删除输出路径
if(fileSystem.exists(outPath))
{
fileSystem.delete(outPath,true);
} Job job = Job.getInstance(conf,"SmallFilesToSequenceFile");
job.setJarByClass(SmallFilesToSequenceFileConverter.class); job.setMapperClass(SequenceFileMapper.class); job.setInputFormatClass(WholeFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class); FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1])); return job.waitForCompletion(true) ? 0:1;
} public static void main(String[] args) throws Exception{
long startTime = System.currentTimeMillis(); int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(),args);
System.exit(exitCode); long endTime = System.currentTimeMillis();
long timeSpan = endTime - startTime;
System.out.println("运行耗时:"+timeSpan+"毫秒。");
}
}
4、上传集群运行,打包成jar包的时候把META-INF目录和src目录放同级,防止找不到函数入口。
#手动调整reduce数量为2,运算后会生成两个part
[hadoop@bigdata-senior01 ~]$ hadoop jar SmallFilesToSequenceFileConverter.jar -D mapreduce.job.reduces=2 /demo /output3 ...
[hadoop@bigdata-senior01 ~]$ hadoop fs -ls /output3
Found 3 items
-rw-r--r-- 1 hadoop supergroup 0 2019-02-18 16:17 /output3/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 60072 2019-02-18 16:17 /output3/part-r-00000
-rw-r--r-- 1 hadoop supergroup 28520 2019-02-18 16:17 /output3/part-r-00001
hadoop 使用map合并小文件到SequenceFile的更多相关文章
- [转载]mapreduce合并小文件成sequencefile
mapreduce合并小文件成sequencefile http://blog.csdn.net/xiao_jun_0820/article/details/42747537
- Hadoop合并小文件的几种方法
1.Hadoop HAR 将众多小文件打包成一个大文件进行存储,并且打包后原来的文件仍然可以通过Map-Reduce进行操作,打包后的文件由索引和存储两大部分组成: 缺点: 一旦创建就不能修改,也不支 ...
- Hadoop HDFS编程 API入门系列之合并小文件到HDFS(三)
不多说,直接上代码. 代码 package zhouls.bigdata.myWholeHadoop.HDFS.hdfs7; import java.io.IOException;import ja ...
- Hadoop实战项目:小文件合并
项目背景 在实际项目中,输入数据往往是由许多小文件组成,这里的小文件是指小于HDFS系统Block大小的文件(默认128M),早期的版本所定义的小文件是64M,这里的hadoop-2.2.0所定义的小 ...
- Hadoop记录-hive merge小文件
1. Map输入合并小文件对应参数:set mapred.max.split.size=256000000; #每个Map最大输入大小set mapred.min.split.size.per.no ...
- HDFS 07 - HDFS 性能调优之 合并小文件
目录 1 - 为什么要合并小文件 2 - 合并本地的小文件,上传到 HDFS 3 - 合并 HDFS 的小文件,下载到本地 4 - 通过 Java API 实现文件合并和上传 版权声明 1 - 为什么 ...
- Hive 利用 on tez 引擎 合并小文件
Hive 利用 on tez 引擎 合并小文件 标签(空格分隔): Hive \[f(N) + \sum_{i=2}^N f(N-i+1)*X_i\] SET hive.exec.dynamic.pa ...
- 用Hadoop AVRO进行大量小文件的处理(转)
使用 使用使用 使用 HDFS 保存大量小文件的缺点:1.Hadoop NameNode 在内存中保存所有文件的“元信息”数据.据统计,每一个文件需要消耗 NameNode600 字节内存.如果需要保 ...
- iceberg合并小文件冲突测试
基于iceberg的master分支的9b6b5e0d2(2022-2-9). 参数说明 1.PARTIAL_PROGRESS_ENABLED(partial-progress.enabled) 默认 ...
随机推荐
- 成都Uber优步司机奖励政策(3月10日)
滴快车单单2.5倍,注册地址:http://www.udache.com/ 如何注册Uber司机(全国版最新最详细注册流程)/月入2万/不用抢单:http://www.cnblogs.com/mfry ...
- 成都Uber优步司机奖励政策(1月21日)
滴快车单单2.5倍,注册地址:http://www.udache.com/ 如何注册Uber司机(全国版最新最详细注册流程)/月入2万/不用抢单:http://www.cnblogs.com/mfry ...
- Android stado 运行项目,apk does not exist on disk.
报错如下: 03/12 21:38:56: Launching iReader The APK file F:\git\iReader_nubia\iReader\build\outputs\apk\ ...
- SQL计算出百分比
有clients和lead_sources俩表.mysql数据库. lead_sources表结构类似: clients表中的lead_source_id是外键.现在要统计某时间段内client内每种 ...
- iOS的内存分配
iOS中的内存大致可以分为代码区,全局/静态区,常量区,堆区,栈区. 1.代码区 代码段是用来存放可执行文件的操作指令(存放函数的二进制代码),也就是说是它是可执行程序在内存中的镜像.代码段需要防止在 ...
- 问题:Visual Studio 2017 无法推送到github:The requested URL returned error: 403
问题: Visual Studio 2017 无法推送到github:The requested URL returned error: 403 原因分析: Visual Studio 2017记录的 ...
- 使用postman实现半自动化
前些日子项目要上一个活动,其中有一个功能是幸运大转盘,用户可以随机抽奖,奖品有多种满减券及多种商品,但是奖品都是有抽中概率的,且有的商品还设置有库存,所以测试点便是测试抽奖的概率和库存.接下来拆分一下 ...
- LeetCode - 412. Fizz Buzz - ( C++ ) - 解题报告 - to_string
1.题目大意 Write a program that outputs the string representation of numbers from 1 to n. But for multip ...
- FFM
转载自http://blog.csdn.net/jediael_lu/ https://blog.csdn.net/jediael_lu/article/details/77772565 点击率预估算 ...
- lvs+keepalived详解
常用软件安装及使用目录 资源链接:https://pan.baidu.com/s/15rFjO-EnTOyiTM7YRkbxuA 网盘分享的文件在此 官网:http://www.linuxvir ...