MapReduce (3)

MapReduce (3):

1. Top-10 descending sort

   1) TreeMap: sorts by key

   2) TreeSet: you insert an object, and the set orders it by the compareTo method of its class

2. Writing a MapReduce job template

3. MapReduce partitioning

   1) Manual partitioning

   2) Automatic partitioning (custom Partitioner)

4. Custom grouping


----------------------------------------------------------------------------------------------------------------------------------

I. Top-10 descending sort

Sort the words of a document by how many times each one occurs, in descending order, and keep only the top ten.
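Before the full jobs, here is the core idea in isolation (a standalone sketch with made-up data; the class name TopNSketch is illustrative and not part of the original code): a TreeMap built with Collections.reverseOrder() keeps its largest key first, so trimming it back to ten entries after every insert leaves exactly the ten largest counts.

    package com.huhu.day03;

    import java.util.Collections;
    import java.util.Map;
    import java.util.TreeMap;

    // Standalone sketch of the top-10 idea: count -> word, kept in descending order of count.
    public class TopNSketch {
        public static void main(String[] args) {
            TreeMap<Long, String> top = new TreeMap<>(Collections.reverseOrder());
            long[] counts = { 80, 78, 70, 5, 93, 12, 40, 33, 21, 8, 66, 3 };
            for (int i = 0; i < counts.length; i++) {
                top.put(counts[i], "word" + i);
                if (top.size() > 10) {
                    // lastKey() is the smallest count because the map is reversed.
                    top.remove(top.lastKey());
                }
            }
            for (Map.Entry<Long, String> e : top.entrySet()) {
                System.out.println(e.getValue() + "\t" + e.getKey());
            }
        }
    }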

1)TreeMap

    package com.huhu.day03;

    import java.io.IOException;
    import java.util.Collections;
    import java.util.Map;
    import java.util.StringTokenizer;
    import java.util.TreeMap;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * a 80, b 78, r 70, ... - sort by value.
     *
     * @author huhu_k
     */
    public class Top10_1 extends ToolRunner implements Tool {

        private Configuration conf;

        public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                // Emit (word, 1) for every token on the line.
                StringTokenizer st = new StringTokenizer(value.toString());
                while (st.hasMoreTokens()) {
                    context.write(new Text(st.nextToken()), new IntWritable(1));
                }
            }
        }

        public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            // TreeMap in reverse (descending) order: the largest count is the first key.
            private TreeMap<Long, String> map = new TreeMap<>(Collections.reverseOrder());

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }
                map.put(Long.valueOf(sum), key.toString());
                if (map.size() > 10) {
                    // Keep only the ten largest counts: lastKey() is the smallest one here.
                    map.remove(map.lastKey());
                }
            }

            @Override
            protected void cleanup(Context context) throws IOException, InterruptedException {
                for (Map.Entry<Long, String> m : map.entrySet()) {
                    context.write(new Text(m.getValue()), new IntWritable(m.getKey().intValue()));
                }
            }
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration conf = getConf();
            Job job = Job.getInstance(conf);
            job.setJarByClass(Top10_1.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            Top10_1 t = new Top10_1();
            String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            ToolRunner.run(t.getConf(), t, other);
        }
    }
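One caveat with this version: the word count is used as the TreeMap key, so two words that happen to have the same count overwrite each other and only one of them survives. If that matters, a small change (a sketch, not part of the original listing) keeps a list of words per count:

    // Inside MyReduce, assuming extra imports of java.util.ArrayList and java.util.List:
    private TreeMap<Long, List<String>> map = new TreeMap<>(Collections.reverseOrder());

    // ...and in reduce(), instead of map.put(...):
    map.computeIfAbsent(Long.valueOf(sum), k -> new ArrayList<>()).add(key.toString());
    if (map.size() > 10) {
        map.remove(map.lastKey());
    }

The cleanup loop would then iterate over each list of words for a given count.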

The job is run on the cluster as before; if it fails, check the job logs.
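For example (assuming the classes are packaged into a jar named day03.jar - the jar name is illustrative), the job can be submitted with `hadoop jar day03.jar com.huhu.day03.Top10_1 <in> <out>`, and the logs of a failed run can be pulled with `yarn logs -applicationId <application id>`.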

2)TreeSet

    package com.huhu.day03;

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.WritableComparable;

    public class WCWritable implements WritableComparable<WCWritable> {

        private String word;
        private int count;

        public WCWritable() {
            super();
        }

        public WCWritable(String word, int count) {
            super();
            this.word = word;
            this.count = count;
        }

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public int getCount() {
            return count;
        }

        public void setCount(int count) {
            this.count = count;
        }

        @Override
        public String toString() {
            return "WCWritable [word=" + word + ", count=" + count + "]";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + count;
            result = prime * result + ((word == null) ? 0 : word.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            WCWritable other = (WCWritable) obj;
            if (count != other.count)
                return false;
            if (word == null) {
                if (other.word != null)
                    return false;
            } else if (!word.equals(other.word))
                return false;
            return true;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.word = in.readUTF();
            this.count = in.readInt();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(word);
            out.writeInt(count);
        }

        @Override
        public int compareTo(WCWritable o) {
            if (this.count == o.count) {
                // Same count: fall back to lexicographic order of the word.
                return this.word.compareTo(o.word);
                // return this.word.length() - o.word.length();
            }
            // Descending by count.
            return o.count - this.count;
        }
    }
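As a quick check of the ordering that compareTo defines, the following standalone sketch (the class name and the sample words/counts are made up, not part of the job) prints the highest count first and breaks ties lexicographically:

    package com.huhu.day03;

    import java.util.TreeSet;

    // Standalone check of WCWritable ordering in a TreeSet.
    public class WCWritableOrderDemo {
        public static void main(String[] args) {
            TreeSet<WCWritable> set = new TreeSet<WCWritable>();
            set.add(new WCWritable("pear", 7));
            set.add(new WCWritable("apple", 3));
            set.add(new WCWritable("fig", 3));
            // Prints pear (7) first, then apple and fig (both 3) in lexicographic order.
            for (WCWritable w : set) {
                System.out.println(w);
            }
        }
    }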

    package com.huhu.day03;

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.TreeSet;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * a 80, b 78, r 70, ... - sort by value, using a TreeSet.
     *
     * @author huhu_k
     */
    public class Top10_2 extends ToolRunner implements Tool {
        private Configuration conf;

        static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String[] line = value.toString().split(" ");
                for (String s : line) {
                    context.write(new Text(s), new IntWritable(1));
                }
            }
        }

        static class MyReducer extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
            // TreeSet ordered by WCWritable.compareTo: descending count, then lexicographic.
            private TreeSet<WCWritable> set;
            private final int KEY = 10;

            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                set = new TreeSet<WCWritable>();
            }

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                WCWritable w = new WCWritable();
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }
                w.setWord(key.toString());
                w.setCount(sum);

                set.add(w);

                // Keep at most the top 10 entries.
                if (KEY < set.size()) {
                    set.remove(set.last());
                }
            }

            @Override
            protected void cleanup(Context context) throws IOException, InterruptedException {
                Iterator<WCWritable> iterator = set.iterator();
                while (iterator.hasNext()) {
                    context.write(iterator.next(), NullWritable.get());
                }
            }
        }

        public static void main(String[] args) throws Exception {
            Top10_2 t = new Top10_2();
            Configuration con = t.getConf();
            String[] other = new GenericOptionsParser(con, args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            int run = ToolRunner.run(con, t, other);
            System.exit(run);
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration con = getConf();
            Job job = Job.getInstance(con);
            job.setJarByClass(Top10_2.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Default partitioner, shown explicitly.
            job.setPartitionerClass(HashPartitioner.class);

            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(WCWritable.class);
            job.setOutputValueClass(NullWritable.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }
    }


II. Writing a MapReduce job template

    package util;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class Frame extends ToolRunner implements Tool {

        private Configuration conf;

        public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                // Parse the input line here and write (key, value) pairs to the context.
                String[] line = value.toString().split(" ");
            }
        }

        public static class MyReduce extends Reducer<Text, Text, Text, Text> {

            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                // One-time initialisation per reduce task.
            }

            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                // Aggregate the values for one key here.
            }

            @Override
            protected void cleanup(Context context) throws IOException, InterruptedException {
                // One-time tear-down per reduce task.
            }
        }

        public static void main(String[] args) throws Exception {
            Frame t = new Frame();
            Configuration conf = t.getConf();
            String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            int run = ToolRunner.run(conf, t, other);
            System.exit(run);
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            // Store the configuration handed in by ToolRunner.
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration con = getConf();
            Job job = Job.getInstance(con);
            job.setJarByClass(Frame.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            // Default partitioner.
            job.setPartitionerClass(HashPartitioner.class);

            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }

    }
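The Tool/ToolRunner plumbing is kept in the template even though the map and reduce bodies are empty: GenericOptionsParser is what lets the job pick up generic command-line options such as -D key=value, -files or -libjars without any extra code.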

III. MapReduce partitioning

1) Manual partitioning
    package com.huhu.day03;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * Manual "partitioning": the reducer routes each key to a named output.
     *
     * @author huhu_k
     */
    public class ManualPartition extends ToolRunner implements Tool {

        private Configuration conf;

        public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String[] line = value.toString().split(" ");
                for (String s : line) {
                    context.write(new Text(s), new IntWritable(1));
                }
            }
        }

        public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

            private MultipleOutputs<Text, IntWritable> mos;

            @Override
            protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                    throws IOException, InterruptedException {
                mos = new MultipleOutputs<>(context);
            }

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }

                // Route each word to a named output based on its first character.
                if (key.toString().substring(0, 1).matches("[a-z]")) {
                    mos.write("az", key.toString(), new IntWritable(sum));
                } else if (key.toString().substring(0, 1).matches("[A-Z]")) {
                    mos.write("AZ", key.toString(), new IntWritable(sum));
                } else if (key.toString().substring(0, 1).matches("[0-9]")) {
                    mos.write("09", key.toString(), new IntWritable(sum));
                } else {
                    mos.write("default", key.toString(), new IntWritable(sum));
                }
            }

            @Override
            protected void cleanup(Context context) throws IOException, InterruptedException {
                // Important: MultipleOutputs keeps its own buffered writers, so it must be
                // closed here to flush everything out to HDFS.
                mos.close();
            }
        }

        public static void main(String[] args) throws Exception {
            ManualPartition t = new ManualPartition();
            Configuration conf = t.getConf();
            String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            int run = ToolRunner.run(conf, t, other);
            System.exit(run);
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration con = getConf();
            Job job = Job.getInstance(con);
            job.setJarByClass(ManualPartition.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Default partitioner.
            job.setPartitionerClass(HashPartitioner.class);

            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));
            // Manual "partitioning": one named output per character class.
            MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, IntWritable.class);
            MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, IntWritable.class);
            MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, IntWritable.class);
            MultipleOutputs.addNamedOutput(job, "default", TextOutputFormat.class, Text.class, IntWritable.class);
            return job.waitForCompletion(true) ? 0 : 1;
        }

    }
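Because everything is written through MultipleOutputs, the job's regular output still produces an empty part-r-00000 file. If that is unwanted, one common remedy (a sketch of an optional change, not in the original listing) is to register the regular output format lazily so the file is only created when something is actually written to it:

    // Extra imports for run():
    // import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
    // import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    // In run(), instead of relying on the default output format:
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);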

2) Automatic partitioning (with a custom Partitioner)
    package com.huhu.day03.partitioner;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    public class WordCountAUTOPartitioner extends Partitioner<Text, IntWritable> {

        @Override
        public int getPartition(Text key, IntWritable value, int numPartitioner) {
            // Route each key to a partition based on its first character.
            // Note: [A-G] shares partition 0 with [a-g], and partition 4 is never used.
            String firstChar = key.toString().substring(0, 1);
            if (firstChar.matches("[a-g]")) {
                return 0 % numPartitioner;
            } else if (firstChar.matches("[h-z]")) {
                return 1 % numPartitioner;
            } else if (firstChar.matches("[0-5]")) {
                return 2 % numPartitioner;
            } else if (firstChar.matches("[6-9]")) {
                return 3 % numPartitioner;
            } else if (firstChar.matches("[A-G]")) {
                return 0 % numPartitioner;
            } else if (firstChar.matches("[H-Z]")) {
                return 5 % numPartitioner;
            } else {
                // Everything else (punctuation, non-ASCII, ...) goes to partition 6.
                return 6 % numPartitioner;
            }
        }

    }
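The partitioner can be exercised on its own by calling getPartition directly. The following standalone sketch (the class name and sample words are made up) prints which partition each key would land in when the job runs with 40 reduce tasks:

    package com.huhu.day03.partitioner;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    // Standalone check (not part of the job): which partition does each sample key go to?
    public class PartitionerDemo {
        public static void main(String[] args) {
            WordCountAUTOPartitioner p = new WordCountAUTOPartitioner();
            int numReduceTasks = 40;
            for (String word : new String[] { "apple", "zoo", "3cats", "7up", "Apple", "Zoo", "#tag" }) {
                System.out.println(word + " -> partition "
                        + p.getPartition(new Text(word), new IntWritable(1), numReduceTasks));
            }
        }
    }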

    package com.huhu.day03;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    import com.huhu.day03.partitioner.WordCountAUTOPartitioner;

    public class AutomaticPartition extends ToolRunner implements Tool {

        private Configuration conf;

        public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String[] line = value.toString().split(" ");
                for (String s : line) {
                    context.write(new Text(s), new IntWritable(1));
                }
            }
        }

        public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
            }

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }
                context.write(key, new IntWritable(sum));
            }

            @Override
            protected void cleanup(Context context) throws IOException, InterruptedException {
            }
        }

        public static void main(String[] args) throws Exception {
            AutomaticPartition t = new AutomaticPartition();
            Configuration conf = t.getConf();
            String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            int run = ToolRunner.run(conf, t, other);
            System.exit(run);
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration con = getConf();
            Job job = Job.getInstance(con);
            job.setJarByClass(AutomaticPartition.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Default partitioner:
            // job.setPartitionerClass(HashPartitioner.class);

            // Custom partitioner.
            job.setPartitionerClass(WordCountAUTOPartitioner.class);
            // 40 reduce tasks, i.e. 40 partitions / output files.
            job.setNumReduceTasks(40);

            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }

    }


Only the partitions that the partitioner actually returns contain data. With 40 reduce tasks the job still writes 40 output files (part-r-00000 through part-r-00039), but everything outside partitions 0, 1, 2, 3, 5 and 6 stays empty, because no rule in the partitioner ever routes a key there.


IV. Custom grouping
    package com.huhu.day03;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    import com.huhu.day03.group.ClassGroupSort;
    import com.huhu.day03.pojo.Student;

    public class StudentAutoGroup extends ToolRunner implements Tool {

        private Configuration conf;

        public static class MyMapper extends Mapper<LongWritable, Text, Student, Student> {

            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                // Input line format: name group score
                String[] line = value.toString().split(" ");
                Student s = new Student(line[0], line[1], Integer.parseInt(line[2]));
                context.write(s, s);
            }
        }

        public static class MyReduce extends Reducer<Student, Student, Text, IntWritable> {

            @Override
            protected void reduce(Student key, Iterable<Student> values, Context context)
                    throws IOException, InterruptedException {
                // All students of one group arrive in a single reduce() call; sum their scores.
                int sum = 0;
                for (Student s : values) {
                    sum += s.getSccore();
                }
                context.write(new Text(key.getGroup()), new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws Exception {
            StudentAutoGroup t = new StudentAutoGroup();
            Configuration conf = t.getConf();
            String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (other.length != 2) {
                System.err.println("wrong number of arguments: expected <in> <out>");
                System.exit(2);
            }
            int run = ToolRunner.run(conf, t, other);
            System.exit(run);
        }

        @Override
        public Configuration getConf() {
            if (conf != null) {
                return conf;
            }
            return new Configuration();
        }

        @Override
        public void setConf(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public int run(String[] other) throws Exception {
            Configuration con = getConf();
            Job job = Job.getInstance(con);
            job.setJarByClass(StudentAutoGroup.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Student.class);
            job.setMapOutputValueClass(Student.class);

            // Grouping: the reducer-side grouping comparator is what merges different
            // Student keys into one reduce() call. (setCombinerKeyGroupingComparatorClass
            // only takes effect when a combiner is configured, which is not the case here.)
            job.setGroupingComparatorClass(ClassGroupSort.class);
            // job.setCombinerKeyGroupingComparatorClass(ClassGroupSort.class);
            // Default partitioner.
            job.setPartitionerClass(HashPartitioner.class);
            job.setNumReduceTasks(1);

            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(other[0]));
            FileOutputFormat.setOutputPath(job, new Path(other[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }

    }
    package com.huhu.day03.pojo;

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.WritableComparable;

    public class Student implements WritableComparable<Student> {

        private String name;
        private String group;
        private int sccore;

        public Student() {
            super();
        }

        public Student(String name, String group, int sccore) {
            super();
            this.name = name;
            this.group = group;
            this.sccore = sccore;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public String getGroup() {
            return group;
        }

        public void setGroup(String group) {
            this.group = group;
        }

        public int getSccore() {
            return sccore;
        }

        public void setSccore(int sccore) {
            this.sccore = sccore;
        }

        @Override
        public String toString() {
            return "Student [name=" + name + ", group=" + group + ", sccore=" + sccore + "]";
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.name = in.readUTF();
            this.group = in.readUTF();
            this.sccore = in.readInt();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(name);
            out.writeUTF(group);
            out.writeInt(sccore);
        }

        @Override
        public int compareTo(Student o) {
            // Sort (and, by default, group) keys by the group field only.
            return this.group.compareTo(o.group);
        }

    }
    package com.huhu.day03.group;

    import org.apache.hadoop.io.RawComparator;
    import org.apache.hadoop.io.WritableComparator;

    import com.huhu.day03.pojo.Student;

    public class ClassGroupSort implements RawComparator<Student> {

        @Override
        public int compare(Student o1, Student o2) {
            // Two students belong to the same reduce group when their group fields match.
            return o1.getGroup().equals(o2.getGroup()) ? 0 : 1;
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            // The framework actually calls this raw version during the reduce-side merge.
            // It compares only the first 8 bytes of each serialized Student, i.e. a prefix
            // of the name field written by writeUTF(), not the group field - see the
            // WritableComparator-based alternative below.
            return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
        }
    }
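Because the raw-byte compare above only looks at a fixed 8-byte prefix of the serialized key, it does not really compare the group field. A safer variant (a sketch of an alternative, not the code used above; the class name GroupComparator is illustrative) extends WritableComparator and lets it deserialize the two keys before comparing:

    package com.huhu.day03.group;

    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;

    import com.huhu.day03.pojo.Student;

    // Alternative grouping comparator: WritableComparator deserializes the two keys
    // for us, so we can compare the actual group field instead of raw bytes.
    public class GroupComparator extends WritableComparator {

        public GroupComparator() {
            // true => create Student instances to deserialize into
            super(Student.class, true);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            Student s1 = (Student) a;
            Student s2 = (Student) b;
            return s1.getGroup().compareTo(s2.getGroup());
        }
    }

With this in place, the driver would call job.setGroupingComparatorClass(GroupComparator.class) instead.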



To sum up:

Partitioning in MapReduce splits the data by key, for example by whether the key starts with [a-z], [A-Z], [0-9] and so on. The partitioner looks at each map output key and decides which reduce task, and therefore which output file, the record goes to.

Grouping, on the other hand, does not compare the whole key: the grouping comparator compares keys on a chosen field (here the group attribute of the Student object), so even though different key objects arrive, all of those that agree on that field are delivered to a single reduce() call and can be processed together.

Partitioning and grouping are used together partly to organise and clean the data, and partly to get it sorted the way we want.
