关于MapReduce中自定义分组类(三)
/**
* Define the comparator that controls which keys are grouped together
* for a single call to
* {@link Reducer#reduce(Object, Iterable,
* org.apache.hadoop.mapreduce.Reducer.Context)}
* @param cls the raw comparator to use
* @throws IllegalStateException if the job is submitted
* @see #setCombinerKeyGroupingComparatorClass(Class)
*/
publicvoid setGroupingComparatorClass(Class<? extends RawComparator> cls
) throws IllegalStateException{
ensureState(JobState.DEFINE);
conf.setOutputValueGroupingComparator(cls);
}
/**
* Set the user defined {@link RawComparator} comparator for
* grouping keys in the input to the reduce.
*
* <p>This comparator should be provided if the equivalence rules for keys
* for sorting the intermediates are different from those for grouping keys
* before each call to
* {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.</p>
*
* <p>For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
* in a single call to the reduce function if K1 and K2 compare as equal.</p>
*
* <p>Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
* how keys are sorted, this can be used in conjunction to simulate
* <i>secondary sort on values</i>.</p>
*
* <p><i>Note</i>: This is not a guarantee of the reduce sort being
* <i>stable</i> in any sense. (In any case, with the order of available
* map-outputs to the reduce being non-deterministic, it wouldn't make
* that much sense.)</p>
*
* @param theClass the comparator class to be used for grouping keys.
* It should implement <code>RawComparator</code>.
* @see #setOutputKeyComparatorClass(Class)
* @see #setCombinerKeyGroupingComparator(Class)
*/
publicvoid setOutputValueGroupingComparator(
Class<? extends RawComparator> theClass){
setClass(JobContext.GROUP_COMPARATOR_CLASS,
theClass,RawComparator.class);
}
/**
* Get the user defined {@link WritableComparable} comparator for
* grouping keys of inputs to the reduce.
*
* @return comparator set by the user for grouping values.
* @see #setOutputValueGroupingComparator(Class) for details.
*/
publicRawComparator getOutputValueGroupingComparator(){
Class<? extends RawComparator> theClass = getClass(
JobContext.GROUP_COMPARATOR_CLASS, null,RawComparator.class);
if(theClass == null){
return getOutputKeyComparator();
}
returnReflectionUtils.newInstance(theClass,this);
}
RawComparator comparator = job.getOutputValueGroupingComparator();
if(useNewApi){
runNewReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
}else{
runOldReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
}
private<INKEY,INVALUE,OUTKEY,OUTVALUE>void runNewReducer(JobConf job,
final TaskUmbilicalProtocol umbilical,
final TaskReporter reporter,
RawKeyValueIterator rIter,
RawComparator<INKEY> comparator,
Class<INKEY> keyClass,
Class<INVALUE> valueClass
) throws IOException,InterruptedException,
ClassNotFoundException{
// wrap value iterator to report progress.
final RawKeyValueIterator rawIter = rIter;
rIter =newRawKeyValueIterator(){
publicvoid close() throws IOException{
rawIter.close();
}
publicDataInputBuffer getKey() throws IOException{
return rawIter.getKey();
}
publicProgress getProgress(){
return rawIter.getProgress();
}
publicDataInputBuffer getValue() throws IOException{
return rawIter.getValue();
}
public boolean next() throws IOException{
boolean ret = rawIter.next();
reporter.setProgress(rawIter.getProgress().getProgress());
return ret;
}
};
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(), reporter);
// make a reducer
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer =
(org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> trackedRW =
newNewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
job.setBoolean("mapred.skip.on", isSkipping());
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.Reducer.Context
reducerContext = createReduceContext(reducer, job, getTaskID(),
rIter, reduceInputKeyCounter,
reduceInputValueCounter,
trackedRW,
committer,
reporter, comparator, keyClass,
valueClass);
try{
reducer.run(reducerContext);
} finally {
trackedRW.close(reducerContext);
}
}
@SuppressWarnings("unchecked")
protectedstatic<INKEY,INVALUE,OUTKEY,OUTVALUE>
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
createReduceContext(org.apache.hadoop.mapreduce.Reducer
<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer,
Configuration job,
org.apache.hadoop.mapreduce.TaskAttemptID taskId,
RawKeyValueIterator rIter,
org.apache.hadoop.mapreduce.Counter inputKeyCounter,
org.apache.hadoop.mapreduce.Counter inputValueCounter,
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> output,
org.apache.hadoop.mapreduce.OutputCommitter committer,
org.apache.hadoop.mapreduce.StatusReporter reporter,
RawComparator<INKEY> comparator,
Class<INKEY> keyClass,Class<INVALUE> valueClass
) throws IOException,InterruptedException{
org.apache.hadoop.mapreduce.ReduceContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
reduceContext =
newReduceContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, taskId,
rIter,
inputKeyCounter,
inputValueCounter,
output,
committer,
reporter,
comparator,
keyClass,
valueClass);
publicReduceContextImpl(Configuration conf,TaskAttemptID taskid,
RawKeyValueIterator input,
Counter inputKeyCounter,
Counter inputValueCounter,
RecordWriter<KEYOUT,VALUEOUT> output,
OutputCommitter committer,
StatusReporter reporter,
RawComparator<KEYIN> comparator,
Class<KEYIN> keyClass,
Class<VALUEIN> valueClass
) throws InterruptedException,IOException{
super(conf, taskid, output, committer, reporter);
this.input = input;
this.inputKeyCounter = inputKeyCounter;
this.inputValueCounter = inputValueCounter;
this.comparator = comparator;
this.serializationFactory =newSerializationFactory(conf);
this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
this.keyDeserializer.open(buffer);
this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
this.valueDeserializer.open(buffer);
hasMore = input.next();
this.keyClass = keyClass;
this.valueClass = valueClass;
this.conf = conf;
this.taskid = taskid;
}
/**
* Advance to the next key/value pair.
*/
@Override
public boolean nextKeyValue() throws IOException,InterruptedException{
if(!hasMore){
key = null;
value = null;
returnfalse;
}
firstValue =!nextKeyIsSame;
DataInputBuffer nextKey = input.getKey();
currentRawKey.set(nextKey.getData(), nextKey.getPosition(),
nextKey.getLength()- nextKey.getPosition());
buffer.reset(currentRawKey.getBytes(),0, currentRawKey.getLength());
key = keyDeserializer.deserialize(key);
DataInputBuffer nextVal = input.getValue();
buffer.reset(nextVal.getData(), nextVal.getPosition(), nextVal.getLength()
- nextVal.getPosition());
value = valueDeserializer.deserialize(value);
currentKeyLength = nextKey.getLength()- nextKey.getPosition();
currentValueLength = nextVal.getLength()- nextVal.getPosition();
if(isMarked){
backupStore.write(nextKey, nextVal);
}
hasMore = input.next();
if(hasMore){
nextKey = input.getKey();
nextKeyIsSame = comparator.compare(currentRawKey.getBytes(),0,
currentRawKey.getLength(),
nextKey.getData(),
nextKey.getPosition(),
nextKey.getLength()- nextKey.getPosition()
)==0;
}else{
nextKeyIsSame =false;
}
inputValueCounter.increment(1);
returntrue;
}
if(theClass == null){
return getOutputKeyComparator();
}
/**
* Get the {@link RawComparator} comparator used to compare keys.
*
* @return the {@link RawComparator} comparator used to compare keys.
*/
publicRawComparator getOutputKeyComparator(){
Class<? extends RawComparator> theClass = getClass(
JobContext.KEY_COMPARATOR, null,RawComparator.class);
if(theClass != null)
returnReflectionUtils.newInstance(theClass,this);
returnWritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class),this);
}
returnReflectionUtils.newInstance(theClass,this);
关于MapReduce中自定义分组类(三)的更多相关文章
- 关于MapReduce中自定义分区类(四)
MapTask类 在MapTask类中找到run函数 if(useNewApi){ runNewMapper(job, splitMetaInfo, umbilical, reporter ...
- 关于MapReduce中自定义Combine类(一)
MRJobConfig public static fina COMBINE_CLASS_ATTR 属性COMBINE_CLASS_ATTR = "mapreduce.j ...
- 2 weekend110的hadoop的自定义排序实现 + mr程序中自定义分组的实现
我想得到按流量来排序,而且还是倒序,怎么达到实现呢? 达到下面这种效果, 默认是根据key来排, 我想根据value里的某个排, 解决思路:将value里的某个,放到key里去,然后来排 下面,开始w ...
- 关于MapReduce中自定义带比较key类、比较器类(二)——初学者从源码查看其原理
Job类 /** * Define the comparator that controls * how the keys are sorted before they * are pa ...
- flask中自定义日志类
一:项目架构 二:自定义日志类 1. 建立log.conf的配置文件 log.conf [log] LOG_PATH = /log/ LOG_NAME = info.log 2. 定义日志类 LogC ...
- python3.4中自定义数组类(即重写数组类)
'''自定义数组类,实现数组中数字之间的四则运算,内积运算,大小比较,数组元素访问修改及成员测试等功能''' class MyArray: '''保证输入值为数字元素(整型,浮点型,复数)''' de ...
- 一脸懵逼学习Hadoop中的MapReduce程序中自定义分组的实现
1:首先搞好实体类对象: write 是把每个对象序列化到输出流,readFields是把输入流字节反序列化,实现WritableComparable,Java值对象的比较:一般需要重写toStrin ...
- 读取SequenceFile中自定义Writable类型值
1)hadoop允许程序员创建自定义的数据类型,如果是key则必须要继承WritableComparable,因为key要参与排序,而value只需要继承Writable就可以了.以下定义一个Doub ...
- Java中自定义注解类,并加以运用
在Java框架中,经常会使用注解,而且还可以省很多事,来了解下自定义注解. 注解是一种能被添加到java代码中的元数据,类.方法.变量.参数和包都可以用注解来修饰.注解对于它所修饰的代码并没有直接的影 ...
随机推荐
- 【推荐】CentOS安装Subversion-1.8.11+HTTP协议支持配置
注:以下所有操作均在CentOS 6.5 x86_64位系统下完成. 我们需要搭建一个自己的SVN服务器. 此外,搭建好的SVN服务器除了需要支持svn协议外,最好还需要支持HTTP协议和HTTPS协 ...
- Windows 10 虚拟桌面切换
从Windows 10开始,终于有了和Mac一样的虚拟桌面了.但总感觉用着非常的别扭.在Mac中,切换虚拟桌面的操作可谓方便至极:除了触控板和Magic Mouse原生的支持外,通过罗技M557/55 ...
- Which language is best, C, C++, Python or Java?什么编程语言最好
Either you fuck the life or the life fucks you. 转载自 quora 大致翻译一下,不喜勿喷,谢谢支持!以下是内容: I have used each o ...
- BZOJ3160万径人踪灭
Description Input & Output & Sample Input & Sample Output HINT 题解: 题意即求不连续但间隔长度对称的回文串个数. ...
- jquery validate表单验证插件-推荐
1 表单验证的准备工作 在开启长篇大论之前,首先将表单验证的效果展示给大家. 1.点击表单项,显示帮助提示 2.鼠标离开表单项时,开始校验元素 3.鼠标离开后的正确.错误提示及鼠标移入时的帮 ...
- November 1st 2016 Week 45th Tuesday
Difficult circumstances serve as a textbook of life for people. 艰难坎坷是人们的生活教科书. It would be better if ...
- [开源].NET数据库访问框架Chloe.ORM
扯淡 13年毕业之际,进入第一家公司实习,接触了 EntityFramework,当时就觉得这东西太牛了,访问数据库都可以做得这么轻松.优雅!毕竟那时还年轻,没见过世面.工作之前为了拿个实习机会混个工 ...
- [LeetCode] Longest Substring with At Most Two Distinct Characters 最多有两个不同字符的最长子串
Given a string S, find the length of the longest substring T that contains at most two distinct char ...
- mac mysql5.7重置root密码
先停止mysql服务 //停止表权限 cd /usr/local/mysql/bin/ ./mysqld_safe --skip-grant-tables & 直接mysql 进入数据库 up ...
- Python文件读写
读写文件是最常见的IO操作.Python内置了读写文件的函数,用法和C是兼容的 文件打开模式 模式 意义 r 文本只读 w 文本只写 rb 二进制读 rw 二进制写 打开文件 选择一个模式打开一个文件 ...