示例文件同sample join analysis

之前的示例是使用map端的join.这次使用reduce端的join.

根据源的类别写不同的mapper,处理不同的文件,输出的key都是studentno.value是其他的信息同时加上类别信息。

然后使用multipleinputs不同的路径注册不同的mapper.

reduce端相同的studentno的学生信息和考试成绩分配给同一个reduce,而且value中包含了这些信息,

把这些信息抽取出来,再做笛卡尔积即可。

下面的示例代码中,我没有使用multipleinputs来处理,自己修改了TextInputFormat的一些信息,使用返回文件名和当前行的信息。

根据文件名我在mapper中处理两个不同文件的信息,加上不同的类别送出去。

下面的代码中还有很多可以优化的地方,以后再更新。

package myexamples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader; public class reducejoin { public static class MyTextInputFormat extends FileInputFormat<Text, Text> { @Override
public MyLineRecordReader createRecordReader(InputSplit split,
TaskAttemptContext context) {
return new MyLineRecordReader();
} @Override
protected boolean isSplitable(JobContext context, Path file) {
CompressionCodec codec = new CompressionCodecFactory(
context.getConfiguration()).getCodec(file);
return codec == null;
} } public static class MyLineRecordReader extends RecordReader<Text, Text> {
private static final Log LOG = LogFactory
.getLog(LineRecordReader.class); private CompressionCodecFactory compressionCodecs = null;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private Text key = null;
private Text value = null; Text filename = null; public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt(
"mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
key = new Text(file.getName());
compressionCodecs = new CompressionCodecFactory(job);
final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if (codec != null) {
in = new LineReader(codec.createInputStream(fileIn), job);
end = Long.MAX_VALUE;
} else {
if (start != 0) {
skipFirstLine = true;
--start;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if (skipFirstLine) { // skip first line and re-establish "start".
start += in.readLine(new Text(), 0,
(int) Math.min((long) Integer.MAX_VALUE, end - start));
}
this.pos = start;
} public boolean nextKeyValue() throws IOException {
if (key == null) { } if (value == null) {
value = new Text();
}
int newSize = 0;
while (pos < end) {
newSize = in.readLine(value, maxLineLength, Math.max(
(int) Math.min(Integer.MAX_VALUE, end - pos),
maxLineLength));
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
} // line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos "
+ (pos - newSize));
}
if (newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
} @Override
public Text getCurrentKey() {
return key;
} @Override
public Text getCurrentValue() {
return value;
} /**
* Get the progress within the split
*/
public float getProgress() {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
} public synchronized void close() throws IOException {
if (in != null) {
in.close();
}
}
} public static class studentMapper extends Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
Text newvalue = null;
String strv = value.toString().substring(
value.toString().indexOf(","));
if (key.toString().contains("student")) // student file
newvalue = new Text("student" + strv);
else
newvalue = new Text("score" + strv);
Text newkey = new Text(value.toString().substring(0,
value.toString().indexOf(",")));
context.write(newkey, newvalue);
}
} public static class studentReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
List<String> students = new ArrayList<String>();
List<String> scores = new ArrayList<String>();
for (Text value : values)
if (value.toString().startsWith("student"))
students.add(value.toString().substring(8));
else
scores.add(value.toString().substring(6));
// split real results
for (String student : students)
for (String score : scores)
context.write(key, new Text(student + "," + score));
}
} public static void main(String[] args) throws Exception {
args = "hdfs://namenode:9000/user/hadoop/student/ hdfs://namenode:9000/user/hadoop/reducejoinout"
.split(" "); Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
} myUtils.myUtils.DeleteFolder(conf, otherArgs[1]);
conf.set("io.sort.mb", "10");
Job job = new Job(conf, "reduce join");
job.setInputFormatClass(MyTextInputFormat.class);
// job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setJarByClass(reducejoin.class);
job.setMapperClass(studentMapper.class);
job.setReducerClass(studentReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Reducejoin sample的更多相关文章

  1. MapReduce 示例:减少 Hadoop MapReduce 中的侧连接

    摘要:在排序和reducer 阶段,reduce 侧连接过程会产生巨大的网络I/O 流量,在这个阶段,相同键的值被聚集在一起. 本文分享自华为云社区<MapReduce 示例:减少 Hadoop ...

  2. Linux下UPnP sample分析

        一.UPnP简介   UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...

  3. cocos2d-x for android配置 & 运行 Sample on Linux OS

    1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...

  4. android studio2.2 的Find Sample Code点击没有反应

    1 . 出现的问题描述:           右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...

  5. jmeter(四)Sample之http请求

    启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...

  6. jcaptcha sample 制作验证码

    Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...

  7. Python 对不均衡数据进行Over sample(重抽样)

    需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...

  8. Basic linux command-with detailed sample

    Here I will list some parameters which people use very ofen, I will attach the output of the command ...

  9. 例子:RSS Reader Sample

    本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...

随机推荐

  1. 重新想象 Windows 8 Store Apps (49) - 输入: 获取输入设备信息, 虚拟键盘, Tab 导航, Pointer, Tap, Drag, Drop

    [源码下载] 重新想象 Windows 8 Store Apps (49) - 输入: 获取输入设备信息, 虚拟键盘, Tab 导航, Pointer, Tap, Drag, Drop 作者:weba ...

  2. Little Jumper---(三分)

    Description Little frog Georgie likes to jump. Recently he have discovered the new playground that s ...

  3. [moka同学摘录]Yii2 csv数据导出扩展

    yii2-thecsv(Yii2框架csv数据导出扩展) github: https://github.com/13552277443/yii2-thecsv 1.安装 运行 php composer ...

  4. mysql 64 zip download

    open the url  ::  http://dev.mysql.com/downloads/file/?id=461109 and click the location "no tha ...

  5. PHP学习笔记:万能随机字符串生成函数(已经封装好)

    做验证码用到的,然后就把这个函数封装起来,使用时候要设置2个参数: $str设置里要被采集的字符串,比如: $str='efasfgzsrhftjxjxjhsrth'; 则在函数里面生成的字符串就回从 ...

  6. virtualenv and virtualenvwrapper on Ubuntu 14.04

    In this post I’ll go over my attempt to setup virtual environments for Python development. Most Pyth ...

  7. Linux 学习手记(3):Linux基本的文件管理操作

    复制文件和目录 在Linux中使用命令cp来复制文件或者目录,使用方式: cp 源文件(文件夹) 目标文件(文件夹) cp命令常用参数: -r 递归复制整个目录 -v 显示详细信息 移动.重命名一个文 ...

  8. ruby 操作数据库语句

    1.多对多 user role u = User.first role = Role.first 插入 u.roles << role u.save 更新 u.roles = [] u.r ...

  9. ECMAScript 6学习笔记(二):let和块级作用域

    同步发布于:https://mingjiezhang.github.io/(转载请说明此出处). ES6中加入了let,也让JavaScript拥有了块级作用域. 没有块级作用域的JavaScript ...

  10. Q:关于Outlook for CRM 中预览记录窗体的设置

    问题: 如何在Outlook for CRM中,将实体记录的预览窗口的信息做调整? 解决方案: 在Outlook里,在打开实体后选择View=>Customize Reading Pane,这里 ...