Pig's built-in PigStorage cannot specify a record (row) delimiter, so I wrote a simple UDF class that lets you specify both the field delimiter and the record delimiter. I had looked at a simpler approach before,

http://blog.csdn.net/ruishenh/article/details/12048067

but it had significant drawbacks, so this is a rewrite.
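For reference, the built-in loader only accepts a field delimiter; there is no constructor argument for the record delimiter, which is the gap this UDF fills. An illustrative line (not from the original post):

grunt> a = load 'student' using PigStorage(',');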

Usage: build the jar, upload it to the server, and register it in grunt:

grunt> register  /home/pig/pig-0.11.0/udflib/myStorage.jar

grunt> cat student;
1,xiaohouzi,25/2,xiaohouzi2,24/3,xiaohouzi3,23

grunt> a = load 'student' using com.hcr.hadoop.pig.MyStorage(',','/');

grunt> dump a;

(1,xiaohouzi,25)
(2,xiaohouzi2,24)
(3,xiaohouzi3,23)

grunt> store a into 'myStorageOut' using com.hcr.hadoop.pig.MyStorage(',','/');

After the store reports success, check the result:

grunt> cat myStorageOut
1,xiaohouzi,25/2,xiaohouzi2,24/3,xiaohouzi3,23/

The source code:

package com.hcr.hadoop.pig;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.StorageUtil;

public class MyStorage extends LoadFunc implements StoreFuncInterface, LoadMetadata {

private static final Log LOG = LogFactory.getLog(MyStorage.class);
private static final String utf8 = "UTF-8";
private static String fieldDel = "\t";
private static String recordDel = "\n";
protected RecordReader recordReader = null;
protected RecordWriter writer = null;

public MyStorage() {
}

public MyStorage(String fieldDel) {
this(fieldDel, "\n");
}

public MyStorage(String fieldDel, String recordDel) {
this.fieldDel = fieldDel;
this.recordDel = recordDel;
}

@Override
public void setLocation(String s, Job job) throws IOException {
FileInputFormat.setInputPaths(job, s);
}

@Override
public InputFormat getInputFormat() throws IOException {
return new MyStorageInputFormat(recordDel);
}

@Override
public void prepareToRead(RecordReader recordReader, PigSplit pigSplit)
throws IOException {
this.recordReader = recordReader;
}

@Override
public Tuple getNext() throws IOException {
try {
boolean flag = recordReader.nextKeyValue();
if (!flag) {
return null;
}
Text value = (Text) recordReader.getCurrentValue();
String[] strArray = value.toString().split(fieldDel);
List<String> lst = new ArrayList<String>();
int i = 0;
for (String singleItem : strArray) {
lst.add(i++, singleItem);
}
return TupleFactory.getInstance().newTuple(lst);
} catch (InterruptedException e) {
throw new ExecException("Read data error",
PigException.REMOTE_ENVIRONMENT, e);
}
}

@Override
public String relToAbsPathForStoreLocation(String location, Path curDir)
throws IOException {
return LoadFunc.getAbsolutePath(location, curDir);
}

@Override
public OutputFormat getOutputFormat() throws IOException {
return new MyStorageOutputFormat(StorageUtil.parseFieldDel(fieldDel),
this.recordDel);
}

@Override
public void setStoreLocation(String location, Job job) throws IOException {
job.getConfiguration().set("mapred.textoutputformat.separator", "");
FileOutputFormat.setOutputPath(job, new Path(location));
if ("true".equals(job.getConfiguration().get(
"output.compression.enabled"))) {
FileOutputFormat.setCompressOutput(job, true);
String codec = job.getConfiguration().get(
"output.compression.codec");
try {
FileOutputFormat.setOutputCompressorClass(job,
(Class<? extends CompressionCodec>) Class
.forName(codec));
} catch (ClassNotFoundException e) {
throw new RuntimeException("Class not found: " + codec);
}
} else {
// This makes it so that storing to a directory ending with ".gz" or
// ".bz2" works.
setCompression(new Path(location), job);
}
}

private void setCompression(Path path, Job job) {
String location = path.getName();
if (location.endsWith(".bz2") || location.endsWith(".bz")) {
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
} else if (location.endsWith(".gz")) {
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
} else {
FileOutputFormat.setCompressOutput(job, false);
}
}

@Override
public void checkSchema(ResourceSchema s) throws IOException {
// TODO Auto-generated method stub
}

@Override
public void prepareToWrite(RecordWriter writer) throws IOException {
this.writer = writer;
}

@Override
public void putNext(Tuple t) throws IOException {
try {
writer.write(null, t);
} catch (InterruptedException e) {
throw new IOException(e);
}
}

@Override
public void setStoreFuncUDFContextSignature(String signature) {
// TODO Auto-generated method stub
}

@Override
public void cleanupOnFailure(String location, Job job) throws IOException {
StoreFunc.cleanupOnFailureImpl(location, job);
}

@Override
public void cleanupOnSuccess(String location, Job job) throws IOException {
// TODO Auto-generated method stub
}

@Override
public ResourceSchema getSchema(String location, Job job)
throws IOException {
ResourceSchema rs=new ResourceSchema();
FieldSchema c1 = new FieldSchema("c1", DataType.INTEGER);
FieldSchema c2 = new FieldSchema("c2", DataType.INTEGER);
FieldSchema c3 = new FieldSchema("c3", DataType.DOUBLE);
ResourceFieldSchema fs1 =new ResourceFieldSchema(c1);
ResourceFieldSchema fs2 =new ResourceFieldSchema(c2);
ResourceFieldSchema fs3 =new ResourceFieldSchema(c3);
rs.setFields(new ResourceFieldSchema[]{fs1,fs2,fs3});
return rs;
}

@Override
public ResourceStatistics getStatistics(String location, Job job)
throws IOException {
// TODO Auto-generated method stub
return null;
}

@Override
public String[] getPartitionKeys(String location, Job job)
throws IOException {
// TODO Auto-generated method stub
return null;
}

@Override
public void setPartitionFilter(Expression partitionFilter)
throws IOException {
// TODO Auto-generated method stub
}
}

class MyStorageInputFormat extends TextInputFormat {

private final String recordDel;

public MyStorageInputFormat(String recordDel) {
this.recordDel = recordDel;
}

@Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
String delimiter = context.getConfiguration().get(
"textinputformat.record.delimiter");
if (recordDel != null) {
delimiter = recordDel;
}
byte[] recordDelimiterBytes = null;
if (null != delimiter){
try {
recordDelimiterBytes = decode(delimiter).getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
return new LineRecordReader(recordDelimiterBytes);
}
/**
* The delimiter passed in from the workflow may be a special character, expressed in octal or hexadecimal.
* @throws IOException
*/
public static String decode(String str) throws IOException {
String re = str;
if (str != null && str.startsWith("\\")) {
str = str.substring(1, str.length());
String[] chars = str.split("\\\\");
byte[] bytes = new byte[chars.length];
for (int i = 0; i < chars.length; i++) {
if (chars[i].equals("t")) {
bytes[i] = 9;
} else if (chars[i].equals("r")) {
bytes[i] = 13;
} else if (chars[i].equals("n")) {
bytes[i] = 10;
} else if (chars[i].equals("b")) {
bytes[i] = 8;
} else {
bytes[i] = Byte.decode(chars[i]);
}
}
try {
re = new String(bytes, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new IOException(str, e);
}
}
return re;
}
}

class MyStorageOutputFormat extends TextOutputFormat<WritableComparable, Tuple> {

private final byte fieldDel;
private final String recordDel;

public MyStorageOutputFormat(byte delimiter) {
this(delimiter, "\n");
}

public MyStorageOutputFormat(byte delimiter, String recordDel) {
this.fieldDel = delimiter;
this.recordDel = recordDel;
}

protected static class MyRecordWriter extends
TextOutputFormat.LineRecordWriter<WritableComparable, Tuple> {

private static byte[] newline;
private final byte fieldDel;

public MyRecordWriter(DataOutputStream out, byte fieldDel)
throws UnsupportedEncodingException {
this(out, fieldDel, "\n".getBytes("UTF-8"));
}

public MyRecordWriter(DataOutputStream out, byte fieldDel, byte[] record) {
super(out);
this.fieldDel = fieldDel;
this.newline = record;
}

public synchronized void write(WritableComparable key, Tuple value)
throws IOException {
int sz = value.size();
for (int i = 0; i < sz; i++) {
StorageUtil.putField(out, value.get(i));
if (i != sz - 1) {
out.writeByte(fieldDel);
}
}
out.write(newline);
}
}

@Override
public RecordWriter<WritableComparable, Tuple> getRecordWriter(
TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
boolean isCompressed = getCompressOutput(job);
CompressionCodec codec = null;
String extension = "";
if (isCompressed) {
Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(
job, GzipCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass,
conf);
extension = codec.getDefaultExtension();
}
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
if (!isCompressed) {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyRecordWriter(fileOut, fieldDel,
this.recordDel.getBytes());
} else {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyRecordWriter(new DataOutputStream(
codec.createOutputStream(fileOut)), fieldDel,
this.recordDel.getBytes());
}
}
}
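The decode() helper above is what lets a delimiter arrive in escaped form, such as the '\\001' used in the session that follows: named escapes like \t and \n are mapped explicitly, and anything else after the backslash goes through Byte.decode, so a leading zero is read as octal. As a rough illustration (not part of the original post), a tiny standalone check could look like the class below; it assumes the code above is compiled and on the classpath, and it sits in the same package because MyStorageInputFormat is package-private.

package com.hcr.hadoop.pig;

public class DecodeCheck {
    public static void main(String[] args) throws Exception {
        // "\\001" in Java source is the four-character string \001, exactly what
        // the Pig script passes with MyStorage('\\001','\\002'); Byte.decode("001")
        // treats the leading zero as octal, so this yields the byte 0x01.
        String ctrlA = MyStorageInputFormat.decode("\\001");
        System.out.println((int) ctrlA.charAt(0)); // expected: 1

        // Named escapes are handled explicitly: \t -> 9, \n -> 10, \r -> 13, \b -> 8
        System.out.println((int) MyStorageInputFormat.decode("\\t").charAt(0)); // expected: 9

        // Anything that does not start with a backslash is returned unchanged
        System.out.println(MyStorageInputFormat.decode("/")); // expected: /
    }
}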
grunt> register  /home/pig/pig-0.11.0/udflib/myStorage.jar
grunt> cat X;
keyDataKNZKCZY:ZDKJS:616150:AFS:3842708d_20131219194420-642464756keyDataKNZKCZY:ZDKJS:616614:AFS:3843920d_20131219194420-642464756keyDataKNZKCZY:ZDKJS:616661:AFS:3844040d_20131219194420-642464756
grunt> a = load 'X' using com.hcr.hadoop.pig.MyStorage('\\001','\\002');
grunt> dump a;
(keyData,KNZKCZY:ZDKJS:616150:AFS:3842708,d_20131219194420-642464756)
(keyData,KNZKCZY:ZDKJS:616614:AFS:3843920,d_20131219194420-642464756)
(keyData,KNZKCZY:ZDKJS:616661:AFS:3844040,d_20131219194420-642464756)
grunt>
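One more behavior worth noting from setStoreLocation/setCompression above: if the store location ends in .gz or .bz2, output compression is switched on automatically with the matching codec. A hypothetical store (not from the original post) would look like:

grunt> store a into 'myStorageOut.bz2' using com.hcr.hadoop.pig.MyStorage(',','/');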

Sometimes you don't want to spell out a concrete schema at load time (for example there are too many fields, or the schema ought to be shared), and would rather have the loader supply one.

To do that, implement the LoadMetadata interface and override:

@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
ResourceSchema rs = new ResourceSchema();
FieldSchema c1 = new FieldSchema("c1", DataType.INTEGER);
FieldSchema c2 = new FieldSchema("c2", DataType.INTEGER);
FieldSchema c3 = new FieldSchema("c3", DataType.DOUBLE);
ResourceFieldSchema fs1 = new ResourceFieldSchema(c1);
ResourceFieldSchema fs2 = new ResourceFieldSchema(c2);
ResourceFieldSchema fs3 = new ResourceFieldSchema(c3);
rs.setFields(new ResourceFieldSchema[] { fs1, fs2, fs3 });
return rs;
}

In this simple example the loader supplies the schema directly, so the script can use it without an 'as' clause:

grunt> register  /home/pig/pig-0.11.0/udflib/myStorage.jar

grunt> a = load 'student' using com.hcr.hadoop.pig.MyStorage(',','/');

grunt> describe a;
a: {c1: int,c2: int,c3: double}

grunt> b = foreach  a generate c1,c2,c3;

grunt> describe b;

b: {c1: int,c2: int,c3: double}

Source: http://blog.csdn.net/ruishenh/article/details/12192391
