public static class Map extends Mapper<LongWritable, Text, IntPair, IntWritable>
public static class Reduce extends Reducer<IntPair, IntWritable, Text, IntWritable>
1 首先说一下工作原理:
在map阶段,使用job.setInputFormatClass定义的InputFormat将输入的数据集分割成小数据块(splits),同时InputFormat提供一个RecordReader的实现。本例子中使用的是TextInputFormat,它提供的RecordReader会将文本中每一行在文件中的字节偏移量作为key,这一行的文本作为value。这就是自定义Map的输入是<LongWritable, Text>的原因。然后调用自定义Map的map方法,将一个个<LongWritable, Text>对输入给Map的map方法。注意输出应该符合自定义Map中定义的输出<IntPair, IntWritable>。最终是生成一个List<IntPair, IntWritable>。在map阶段的最后,会先调用job.setPartitionerClass设置的分区函数类对这个List进行分区,每个分区映射到一个reducer。每个分区内又调用job.setSortComparatorClass设置的key比较函数类排序。可以看到,这本身就是一个二次排序。如果没有通过job.setSortComparatorClass设置key比较函数类,则使用key实现的compareTo方法。在第一个例子中,使用了IntPair实现的compareTo方法,而在下一个例子中,专门定义了key比较函数类。
2 二次排序就是首先按照第一字段排序,然后再对第一字段相同的行按照第二字段排序,注意不能破坏第一次排序的结果 。例如
20 21
50 51
50 52
50 53
50 54
60 51
60 53
60 52
60 56
60 57
70 58
60 61
70 54
70 55
70 56
70 57
70 58
1 2
3 4
5 6
7 82
203 21
50 512
50 522
50 53
530 54
40 511
20 53
20 522
60 56
60 57
740 58
63 61
730 54
71 55
71 56
73 57
74 58
12 211
31 42
50 62
7 8
1 2
3 4
5 6
7 8
7 82
12 211
20 21
20 53
20 522
31 42
40 511
50 51
50 52
50 53
50 53
50 54
50 62
50 512
50 522
60 51
60 52
60 53
60 56
60 56
60 57
60 57
60 61
63 61
70 54
70 55
70 56
70 57
70 58
70 58
71 55
71 56
73 57
74 58
203 21
530 54
730 54
740 58
3 具体步骤:
- //反序列化,从流中的二进制转换成IntPair
- public void readFields(DataInput in) throws IOException
- //序列化,将IntPair转化成使用流传送的二进制
- public void write(DataOutput out)
- //key的比较
- public int compareTo(IntPair o)
- //另外新定义的类应该重写的两个方法
- //The hashCode() method is used by the HashPartitioner (the default partitioner in MapReduce)
- public int hashCode()
- public boolean equals(Object right)
- public static class FirstPartitioner extends Partitioner<IntPair,IntWritable>
- public static class KeyComparator extends WritableComparator
必须有一个构造函数,并且重写 public int compare(WritableComparable w1, WritableComparable w2)
另一种方法是 实现接口RawComparator。
- public static class GroupingComparator extends WritableComparator
分组函数类也必须有一个构造函数,并且重写 public int compare(WritableComparable w1, WritableComparable w2)
4 代码。
- package secondarySort;
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
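- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.io.WritableComparator;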
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Partitioner;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
- public class SecondarySort {
- /**
-  * Composite key made of two ints. A user-defined key class should implement
-  * the WritableComparable interface, which is why this class provides
-  * readFields, write and compareTo.
-  */
- public static class IntPair implements WritableComparable<IntPair> {
-     int first;
-     int second;
-     /**
-      * Sets both components of the pair.
-      *
-      * @param left  value stored as the first component
-      * @param right value stored as the second component
-      */
-     public void set(int left, int right) {
-         this.first = left;
-         this.second = right;
-     }
-     /** @return the first component */
-     public int getFirst() {
-         return this.first;
-     }
-     /** @return the second component */
-     public int getSecond() {
-         return this.second;
-     }
-     /**
-      * Deserialization: restores the pair from the binary stream, reading
-      * the first component and then the second.
-      */
-     @Override
-     public void readFields(DataInput in) throws IOException {
-         this.first = in.readInt();
-         this.second = in.readInt();
-     }
-     /**
-      * Serialization: writes the pair to the binary stream, first component
-      * followed by the second (the order readFields expects).
-      */
-     @Override
-     public void write(DataOutput out) throws IOException {
-         out.writeInt(this.first);
-         out.writeInt(this.second);
-     }
-     /**
-      * Key comparison: orders by the first component, and by the second
-      * component when the first components are equal.
-      *
-      * @return -1, 0 or 1
-      */
-     @Override
-     public int compareTo(IntPair o) {
-         if (this.first == o.first) {
-             if (this.second == o.second) {
-                 return 0;
-             }
-             return this.second < o.second ? -1 : 1;
-         }
-         return this.first < o.first ? -1 : 1;
-     }
-     // hashCode and equals are the two methods a newly defined key class should override.
-     /**
-      * The hashCode() method is used by the HashPartitioner (the default
-      * partitioner in MapReduce).
-      */
-     @Override
-     public int hashCode() {
-         return 157 * this.first + this.second;
-     }
-     /**
-      * Two pairs are equal when both components match; null and objects of
-      * any other type are never equal to a pair.
-      */
-     @Override
-     public boolean equals(Object right) {
-         if (this == right) {
-             return true;
-         }
-         // instanceof is false for null, so this guard also rejects null.
-         if (!(right instanceof IntPair)) {
-             return false;
-         }
-         IntPair other = (IntPair) right;
-         return this.first == other.first && this.second == other.second;
-     }
- }
- /**
-  * Partitioner class. The partition is chosen from the first component of the
-  * key only, so all keys sharing the same first value land in the same partition.
-  */
- public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
-     /**
-      * Computes the partition index for a record.
-      *
-      * @param key           map output key; only its first component is used
-      * @param value         map output value (ignored)
-      * @param numPartitions number of partitions to spread keys over
-      * @return an index in the range [0, numPartitions)
-      */
-     @Override
-     public int getPartition(IntPair key, IntWritable value, int numPartitions) {
-         // Clear the sign bit rather than calling Math.abs: the product can
-         // overflow to Integer.MIN_VALUE, for which Math.abs stays negative and
-         // would yield a negative (invalid) partition index.
-         return ((key.getFirst() * 127) & Integer.MAX_VALUE) % numPartitions;
-     }
- }
- /**
- * 分组函数类。只要first相同就属于同一个组。
- */
- /*//第一种方法,实现接口RawComparator
- public static class GroupingComparator implements RawComparator<IntPair> {
- @Override
- public int compare(IntPair o1, IntPair o2) {
- int l = o1.getFirst();
- int r = o2.getFirst();
- return l == r ? 0 : (l < r ? -1 : 1);
- }
- @Override
- //一个字节一个字节的比,直到找到一个不相同的字节,然后比这个字节的大小作为两个字节流的大小比较结果。
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
- // TODO Auto-generated method stub
- return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,
- b2, s2, Integer.SIZE/8);
- }
- }*/
- /**
-  * Second approach: extend WritableComparator. Keys are grouped by their
-  * first component only; the second component is ignored.
-  */
- public static class GroupingComparator extends WritableComparator {
-     /** Registers IntPair as the key class and passes true as the second constructor argument. */
-     protected GroupingComparator() {
-         super(IntPair.class, true);
-     }
-     /**
-      * Compares two WritableComparables, both of which are cast to IntPair.
-      *
-      * @return 0 when the first components are equal, otherwise -1 or 1
-      */
-     @Override
-     public int compare(WritableComparable w1, WritableComparable w2) {
-         int leftFirst = ((IntPair) w1).getFirst();
-         int rightFirst = ((IntPair) w2).getFirst();
-         if (leftFirst == rightFirst) {
-             return 0;
-         }
-         if (leftFirst < rightFirst) {
-             return -1;
-         }
-         return 1;
-     }
- }
- /**
-  * Custom mapper. Parses up to two whitespace-separated integers from each
-  * input line and emits an IntPair key built from both integers together with
-  * the second integer as the value.
-  */
- public static class Map extends Mapper<LongWritable, Text, IntPair, IntWritable> {
-     // The same key/value objects are refilled and written for every record.
-     private final IntPair intkey = new IntPair();
-     private final IntWritable intvalue = new IntWritable();
-     /**
-      * Handles one input line.
-      *
-      * Lines without any token produce no output. A line with a single token
-      * is emitted with 0 as its second number. Tokens after the second are
-      * ignored.
-      *
-      * @param key     input key supplied by the input format (not used here)
-      * @param value   the text of the line
-      * @param context sink for the emitted key/value pair
-      * @throws NumberFormatException if a token is not a valid int
-      */
-     @Override
-     public void map(LongWritable key, Text value, Context context)
-             throws IOException, InterruptedException {
-         StringTokenizer tokenizer = new StringTokenizer(value.toString());
-         if (!tokenizer.hasMoreTokens()) {
-             return;
-         }
-         int left = Integer.parseInt(tokenizer.nextToken());
-         int right = 0;
-         if (tokenizer.hasMoreTokens()) {
-             right = Integer.parseInt(tokenizer.nextToken());
-         }
-         intkey.set(left, right);
-         intvalue.set(right);
-         context.write(intkey, intvalue);
-     }
- }
- // 自定义reduce
- //
- public static class Reduce extends
- Reducer<IntPair, IntWritable, Text, IntWritable> {
- private final Text left = new Text();
- private static final Text SEPARATOR =
- new Text("------------------------------------------------");
- public void reduce(IntPair key, Iterable<IntWritable> values,
- Context context) throws IOException, InterruptedException {
- context.write(SEPARATOR, null);
- left.set(Integer.toString(key.getFirst()));
- for (IntWritable val : values) {
- context.write(left, val);
- }
- }
- }
- /**
-  * Configures and runs the secondary-sort job, then exits with status 0 when
-  * the job succeeds and 1 when it fails.
-  *
-  * @param args args[0] is the HDFS input path, args[1] is the HDFS output path
-  */
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
-     // Fail with a usage message instead of an ArrayIndexOutOfBoundsException
-     // when a path argument is missing.
-     if (args.length < 2) {
-         System.err.println("Usage: SecondarySort <input path> <output path>");
-         System.exit(2);
-     }
-     // Load the Hadoop configuration.
-     Configuration conf = new Configuration();
-     // Create the job.
-     Job job = new Job(conf, "secondarysort");
-     job.setJarByClass(SecondarySort.class);
-     // Mapper class.
-     job.setMapperClass(Map.class);
-     // No combiner is set: Reduce cannot serve as one, because its output type
-     // <Text, IntWritable> does not match the reducer input type <IntPair, IntWritable>.
-     // Reducer class.
-     job.setReducerClass(Reduce.class);
-     // Partitioner.
-     job.setPartitionerClass(FirstPartitioner.class);
-     // Grouping comparator.
-     job.setGroupingComparatorClass(GroupingComparator.class);
-     // Map output key type.
-     job.setMapOutputKeyClass(IntPair.class);
-     // Map output value type.
-     job.setMapOutputValueClass(IntWritable.class);
-     // Reduce output key type; Text, because the output format is TextOutputFormat.
-     job.setOutputKeyClass(Text.class);
-     // Reduce output value type.
-     job.setOutputValueClass(IntWritable.class);
-     // Input format: splits the input data set into smaller chunks and
-     // provides a RecordReader implementation.
-     job.setInputFormatClass(TextInputFormat.class);
-     // Output format: provides a RecordWriter implementation that writes the results.
-     job.setOutputFormatClass(TextOutputFormat.class);
-     // HDFS input path.
-     FileInputFormat.setInputPaths(job, new Path(args[0]));
-     // HDFS output path.
-     FileOutputFormat.setOutputPath(job, new Path(args[1]));
-     // Submit the job and wait for it to finish.
-     System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- }
