

  1. 如果我们有10亿条数据,Mapper会生成10亿个键值对并在网络间进行传输;但如果我们只是对数据求最大值,那么很明显,Mapper只需要输出它所见到的最大值即可。这样做不仅可以减轻网络压力,还可以大幅度提高程序效率。
  2. 以专利数据中的“国家”一项为例来阐述数据倾斜(data skew)这个概念。这样的数据远远不是均匀的或者说平衡分布的:由于大多数专利的国家都是美国,不仅Mapper输出的键值对、中间阶段(shuffle)的键值对大多属于同一个键,而且这些键值对最终会聚集于一个单一的Reducer之上,压垮这个Reducer,从而大大降低程序的性能。






  1. package com;
  3. import java.io.IOException;
  5. import org.apache.hadoop.conf.Configuration;
  6. import org.apache.hadoop.conf.Configured;
  7. import org.apache.hadoop.fs.Path;
  8. import org.apache.hadoop.io.DoubleWritable;
  9. import org.apache.hadoop.io.LongWritable;
  10. import org.apache.hadoop.io.Text;
  11. import org.apache.hadoop.mapreduce.Job;
  12. import org.apache.hadoop.mapreduce.Mapper;
  13. import org.apache.hadoop.mapreduce.Reducer;
  14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  15. import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
  16. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  17. import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
  18. import org.apache.hadoop.util.Tool;
  19. import org.apache.hadoop.util.ToolRunner;
  21. public class AveragingWithCombiner extends Configured implements Tool {
  23. public static class MapClass extends Mapper<LongWritable,Text,Text,Text> {
  25. static enum ClaimsCounters { MISSING, QUOTED };
  26. // Map Method
  27. public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  28. String fields[] = value.toString().split(",", -20);
  29. String country = fields[4];
  30. String numClaims = fields[8];
  32. if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
  33. context.write(new Text(country), new Text(numClaims + ",1"));
  34. }
  35. }
  36. }
  38. public static class Reduce extends Reducer<Text,Text,Text,DoubleWritable> {
  40. // Reduce Method
  41. public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
  42. double sum = 0;
  43. int count = 0;
  44. for (Text value : values) {
  45. String fields[] = value.toString().split(",");
  46. sum += Double.parseDouble(fields[0]);
  47. count += Integer.parseInt(fields[1]);
  48. }
  49. context.write(key, new DoubleWritable(sum/count));
  50. }
  51. }
  53. public static class Combine extends Reducer<Text,Text,Text,Text> {
  55. // Reduce Method
  56. public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
  57. double sum = 0;
  58. int count = 0;
  59. for (Text value : values) {
  60. String fields[] = value.toString().split(",");
  61. sum += Double.parseDouble(fields[0]);
  62. count += Integer.parseInt(fields[1]);
  63. }
  64. context.write(key, new Text(sum+","+count));
  65. }
  66. }
  68. // run Method
  69. public int run(String[] args) throws Exception {
  70. // Create and Run the Job
  71. Job job = new Job();
  72. job.setJarByClass(AveragingWithCombiner.class);
  74. FileInputFormat.addInputPath(job, new Path(args[0]));
  75. FileOutputFormat.setOutputPath(job, new Path(args[1]));
  77. job.setJobName("AveragingWithCombiner");
  78. job.setMapperClass(MapClass.class);
  79. job.setCombinerClass(Combine.class);
  80. job.setReducerClass(Reduce.class);
  81. job.setInputFormatClass(TextInputFormat.class);
  82. job.setOutputFormatClass(TextOutputFormat.class);
  84. job.setOutputKeyClass(Text.class);
  85. job.setOutputValueClass(Text.class);
  87. System.exit(job.waitForCompletion(true) ? 0 : 1);
  88. return 0;
  89. }
  91. public static void main(String[] args) throws Exception {
  92. int res = ToolRunner.run(new Configuration(), new AveragingWithCombiner(), args);
  93. System.exit(res);
  94. }
  96. }

