storm-hdfs and SequenceFile in Detail
This post walks through storm-hdfs, SequenceFile, and their Trident counterparts. Without further ado, the code:
pom.xml
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
<exclusions>
<exclusion>
<artifactId>asm</artifactId>
<groupId>asm</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.2</version>
</dependency>
My pom.xml pulls in quite a few dependencies; the entries above are only the relevant ones I copied out, so something may be missing, but every class below lists its full imports, so there is no need to worry.
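One artifact the list above leaves out, but which the org.apache.storm.hdfs.* classes below need at compile time, is storm-hdfs. A likely entry (the version is assumed to match storm-core) would be:
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-hdfs</artifactId>
<version>1.1.2</version>
</dependency>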
HDFS:
HdfsReadSpout: a spout that reads data from HDFS
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
* @author cwc
* @date 2018-05-25
* @description: spout that reads lines from HDFS
* @version 1.0.0
*/
public class HdfsReadSpout extends BaseRichSpout{
private static final long serialVersionUID = 1L;
private SpoutOutputCollector collector;
private String url="/testData/mytest/app_hdfs-bolt-2-0-1528096963805.log";
private ArrayList<String> arrayList;
private boolean finished = false;//nextTuple() is called in a loop, so guard against re-reading and re-emitting the file forever
@Override
public void nextTuple() {
if (finished) return;
hdfsData(url);
finished = true;
}
@Override
public void open(Map conf, TopologyContext arg1, SpoutOutputCollector collector) {
this.collector =collector;
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("lines"));
}
private void hdfsData(String FileName){
System.out.println("开始拿数据");
Configuration conf=new Configuration();
conf.set("fs.defaultFS","hdfs://172.18.130.100:8020");
BufferedReader in = null;
FSDataInputStream dis;
String line;
try {
FileSystem hdfs = FileSystem.get(conf);
dis = hdfs.open(new Path(FileName));
in = new BufferedReader(new InputStreamReader(dis, "UTF-8"));
while ((line = in.readLine()) != null){
// System.out.println("Read line: "+line);
this.collector.emit(new Values(line));
}
} catch (IllegalArgumentException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (in != null) {
try { in.close(); } catch (IOException e) { e.printStackTrace(); }
}
}
}
}
Main storm-hdfs demo class:
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.RecordFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.TimedRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.TimedRotationPolicy.TimeUnit;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.spout.HdfsSpout;
import org.apache.storm.hdfs.spout.TextFileReader;
import org.apache.storm.topology.TopologyBuilder;
import com.sunsheen.jfids.bigdata.storm.demo.count.TestSpout;
/**
*
* @ClassName: HDFSMain
* @Description: writing Storm data to HDFS and reading it back
* @author cwc
* @date 2018-06-04 15:28:12
* @version 2.0.0 (second revision)
*/
public class HDFSMain{
public static void main(String[] args) {
writeHdfs(args);
// readHdfs(args);
// selectHdfs(args);
}
/**
* Write data from Storm into HDFS
* @param args command-line arguments
*/
public static void writeHdfs(String[] args){
//Configure HDFS bolt
RecordFormat format = new DelimitedRecordFormat().withFieldDelimiter("\t"); // field delimiter for the output records
SyncPolicy syncPolicy = new CountSyncPolicy(1000);// sync to HDFS every 1000 tuples
FileRotationPolicy rotationPolicy = new TimedRotationPolicy(1.0f, TimeUnit.MINUTES); // rotate to a new file every minute
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/").withPrefix("app_").withExtension(".log"); // output directory, file name prefix and extension
HdfsBolt hdfsBolt = new HdfsBolt()
.withFsUrl("hdfs://172.18.130.100:8020")
.withFileNameFormat(fileNameFormat)
.withRecordFormat(format)
.withRotationPolicy(rotationPolicy)
.withSyncPolicy(syncPolicy);
// configure & build topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("kafka-write", new TestSpout(), 5);
builder.setBolt("hdfs-write", hdfsBolt, 2).shuffleGrouping("kafka-write");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
/**
* Read data from HDFS
* @param args command-line arguments
*/
public static void readHdfs(String[] args){
// configure & build topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("hdfs-reader", new HdfsReadSpout(), 3);
builder.setBolt("hdfs-read", new HDFSReadBolt(), 2).shuffleGrouping("hdfs-reader");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
/**
* This uses the higher-level API: it reads every HDFS file under a directory, in batch.
* @param args
*/
public static void selectHdfs(String[] args){
// Instantiate spout to read text files
HdfsSpout textReaderSpout = new HdfsSpout().setReaderType("text")
.withOutputFields(TextFileReader.defaultFields)
.setHdfsUri("hdfs://172.18.130.100:8020") // HDFS URI
.setSourceDir("/testData/mytest") // directory to read from
.setArchiveDir("/testData/mytest/done") // files are moved here once fully processed
.setBadFilesDir("/testData/mytest/badfiles"); // files that fail to read are moved here
// Create topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("hdfsspout", textReaderSpout, 2);
builder.setBolt("hdfs-read", new HDFSReadBolt(), 2).shuffleGrouping("hdfsspout");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
}
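HDFSReadBolt: the readHdfs and selectHdfs topologies above wire in an HDFSReadBolt that the post never lists. A minimal sketch of such a bolt, assuming it only needs to print and ack each incoming line, could look like this:
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
/**
 * Minimal sketch of the HDFSReadBolt referenced above: it prints and acks
 * every tuple coming from the HDFS spouts; the real bolt may do more.
 */
public class HDFSReadBolt extends BaseRichBolt {
    private static final long serialVersionUID = 1L;
    private OutputCollector collector;
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
    @Override
    public void execute(Tuple tuple) {
        // the HDFS spouts above emit the file line as the first tuple field
        System.out.println("HDFSReadBolt got: " + tuple.getString(0));
        collector.ack(tuple);
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal bolt: it emits nothing
    }
}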
Trident high-level API: before reading this part, first look into how Trident differs from the plain Storm API and why you would use it. Keep thinking.
package com.sunsheen.jfids.bigdata.storm.demo.hdfs.trident;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.hdfs.trident.HdfsState;
import org.apache.storm.hdfs.trident.HdfsStateFactory;
import org.apache.storm.hdfs.trident.HdfsUpdater;
import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.trident.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.trident.format.FileNameFormat;
import org.apache.storm.hdfs.trident.format.RecordFormat;
import org.apache.storm.hdfs.trident.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.trident.rotation.FileSizeRotationPolicy;
import org.apache.storm.trident.Stream;
import org.apache.storm.trident.TridentState;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.tuple.Fields;
import com.sunsheen.jfids.bigdata.storm.demo.count.TestSpout;
import com.sunsheen.jfids.bigdata.storm.demo.hdfs.HDFSMain;
/**
* @author cwc
* @date 2018-06-04
* @description: writing to HDFS through the Trident API
* @version 1.0.0
*/
public class HdfsTridents {
public static void main(String[] args){
Config conf = new Config();
conf.setMaxSpoutPending(3);
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
//submit to the cluster, taking the topology name from the first argument
try {
StormSubmitter.submitTopology(args[0], conf, buildTopology());
} catch (AlreadyAliveException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTopologyException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
//run locally
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("test", conf, buildTopology());
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
/**
* Build the StormTopology
* @return
*/
public static StormTopology buildTopology(){
//Note: if the written files come out empty, the usual cause is the fields emitted by the spout or a wrong delimiter (e.g. "/t" instead of "\t")
TestSpout spout = new TestSpout();
TridentTopology topology = new TridentTopology();
Stream stream = topology.newStream("stream", spout);//create the stream
Fields hdfsFields = new Fields("line","values"); //fields taken from the spout tuples
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPrefix("trident")//文件前缀
.withExtension(".txt")//文件后缀
.withPath("/testData/mytest/trident/");//文件目录
RecordFormat recordFormat = new DelimitedRecordFormat()
.withFieldDelimiter("/t")//分割符
.withFields(hdfsFields);
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);//设置一个大小条件,到达后就写入
HdfsState.Options options = new HdfsState.HdfsFileOptions()
.withFileNameFormat(fileNameFormat)
.withRecordFormat(recordFormat)
.withRotationPolicy(rotationPolicy)
.withFsUrl("hdfs://172.18.130.100:8020");//主机名
StateFactory factory = new HdfsStateFactory().withOptions(options);//创建工厂
TridentState state = stream
.partitionPersist(factory, hdfsFields, new HdfsUpdater(), new Fields());//这步是写入最重要的一步
return topology.build();
}
}
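TestSpout (referenced but not shown): both writeHdfs above and this Trident topology rely on TestSpout from another package of the demo project. Whether the output files end up empty depends on it emitting exactly the fields the topology asks for ("line" and "values" in the Trident case). Purely as an assumption about what that spout does, a hypothetical stand-in might look like this:
import java.util.Map;
import java.util.Random;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
 * Hypothetical stand-in for TestSpout: it emits two string fields, "line" and
 * "values", matching the Fields declared in buildTopology() above.
 */
public class TestSpout extends BaseRichSpout {
    private static final long serialVersionUID = 1L;
    private SpoutOutputCollector collector;
    private Random random = new Random();
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }
    @Override
    public void nextTuple() {
        // emit a dummy record; the real TestSpout presumably generates its own test data
        collector.emit(new Values("line-" + random.nextInt(1000), String.valueOf(System.currentTimeMillis())));
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("line", "values"));
    }
}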
SequenceFile section:
Spout class:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: reads SequenceFile data
*/
public class SquenceSpout extends BaseRichSpout {
private static final long serialVersionUID = 1L;
private SpoutOutputCollector collector;
private String fileUrl ="hdfs://172.18.130.100:8020/testData/mytest/trident/Topology-1-0-1532415869703.seq";
/**
* Constructor taking the file path
* @param fileUrl
*/
public SquenceSpout(String fileUrl){
this.fileUrl=fileUrl;//the path can be passed in directly by the caller
}
@Override
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
// TODO Auto-generated method stub
this.collector =collector;
}
private boolean finished = false;//nextTuple() is called in a loop, so only read the file once
@Override
public void nextTuple() {
if (finished) return;
getSquence();
finished = true;
}
/**
* Read records from the SequenceFile and emit them
*/
public void getSquence(){
Configuration conf = new Configuration();
Path path = new Path(this.fileUrl);
SequenceFile.Reader.Option option1 = Reader.file(path);
// SequenceFile.Reader.Option option2 = Reader.length(30);//limits how many bytes are read; leave it off unless you know the length, otherwise it throws
SequenceFile.Reader reader = null;
try {
reader = new SequenceFile.Reader(conf,option1);
Writable key = (Writable) ReflectionUtils.newInstance(
reader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(
reader.getValueClass(), conf);
long position = reader.getPosition();
while (reader.next(key, value)) {
String syncSeen = reader.syncSeen() ? "*" : "";//marks whether a sync point was just passed (empty for my data)
System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key,
value);
System.out.println(position+"---------"+syncSeen+"=========="+key+"————————————————"+value);
this.collector.emit(new Values(position,syncSeen,key,value));//emit the record downstream; would a collection be better here?
position = reader.getPosition(); //remember the position of the next record
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
IOUtils.closeStream(reader);
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// TODO Auto-generated method stub
declarer.declare(new Fields("he","llo","wo","rd"));
}
}
Bolt class:
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.storm.hdfs.bolt.AbstractHdfsBolt;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.SequenceFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.common.AbstractHDFSWriter;
import org.apache.storm.hdfs.common.Partitioner;
import org.apache.storm.hdfs.common.SequenceFileWriter;
import org.apache.storm.hdfs.common.rotation.RotationAction;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: included for readers whose storm-hdfs build does not ship this class
*/
public class SequenceFileBolt extends AbstractHdfsBolt {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(SequenceFileBolt.class);//logger
private SequenceFormat format;
private SequenceFile.CompressionType compressionType = SequenceFile.CompressionType.RECORD;
private transient SequenceFile.Writer writer;
private String compressionCodec = "default";
private transient CompressionCodecFactory codecFactory;
public SequenceFileBolt() {
}
public SequenceFileBolt withCompressionCodec(String codec) {
this.compressionCodec = codec;
return this;
}
public SequenceFileBolt withFsUrl(String fsUrl) {
this.fsUrl = fsUrl;
return this;
}
public SequenceFileBolt withConfigKey(String configKey) {
this.configKey = configKey;
return this;
}
public SequenceFileBolt withFileNameFormat(FileNameFormat fileNameFormat) {
this.fileNameFormat = fileNameFormat;
return this;
}
public SequenceFileBolt withSequenceFormat(SequenceFormat format) {
this.format = format;
return this;
}
public SequenceFileBolt withSyncPolicy(SyncPolicy syncPolicy) {
this.syncPolicy = syncPolicy;
return this;
}
public SequenceFileBolt withRotationPolicy(FileRotationPolicy rotationPolicy) {
this.rotationPolicy = rotationPolicy;
return this;
}
public SequenceFileBolt withCompressionType(SequenceFile.CompressionType compressionType) {
this.compressionType = compressionType;
return this;
}
public SequenceFileBolt withTickTupleIntervalSeconds(int interval) {
this.tickTupleInterval = interval;
return this;
}
public SequenceFileBolt addRotationAction(RotationAction action) {
this.rotationActions.add(action);
return this;
}
public SequenceFileBolt withRetryCount(int fileRetryCount) {
this.fileRetryCount = fileRetryCount;
return this;
}
public SequenceFileBolt withPartitioner(Partitioner partitioner) {
this.partitioner = partitioner;
return this;
}
public SequenceFileBolt withMaxOpenFiles(int maxOpenFiles) {
this.maxOpenFiles = maxOpenFiles;
return this;
}
@Override
public void doPrepare(Map conf, TopologyContext topologyContext, OutputCollector collector) throws IOException {
LOG.info("Preparing Sequence File Bolt...");
if (this.format == null) throw new IllegalStateException("SequenceFormat must be specified.");
this.fs = FileSystem.get(URI.create(this.fsUrl), hdfsConfig);
this.codecFactory = new CompressionCodecFactory(hdfsConfig);
}
@Override
protected String getWriterKey(Tuple tuple) {
return "CONSTANT";
}
@Override
protected AbstractHDFSWriter makeNewWriter(Path path, Tuple tuple) throws IOException {
SequenceFile.Writer writer = SequenceFile.createWriter(
this.hdfsConfig,
SequenceFile.Writer.file(path),
SequenceFile.Writer.keyClass(this.format.keyClass()),
SequenceFile.Writer.valueClass(this.format.valueClass()),
SequenceFile.Writer.compression(this.compressionType, this.codecFactory.getCodecByName(this.compressionCodec))
);
return new SequenceFileWriter(this.rotationPolicy, path, writer, this.format);
}
}
SequenceFileTopology:
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.io.SequenceFile;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DefaultSequenceFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.common.rotation.MoveFileAction;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.yaml.snakeyaml.Yaml;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: writes SequenceFiles to HDFS
*/
public class SequenceFileTopology {
static final String SENTENCE_SPOUT_ID = "sentence-spout";
static final String BOLT_ID = "Topology";//used as the prefix of the generated files
static final String TOPOLOGY_NAME = "test-topology";
public static void main(String[] args) throws Exception {
Config config = new Config();
config.setNumWorkers(1);
SentenceSpout spout = new SentenceSpout();
// sync the file system after every 1000 tuples
SyncPolicy syncPolicy = new CountSyncPolicy(1000);
// rotate to a new file once it reaches 5 MB
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, Units.MB);
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/trident/")//存储位置
.withExtension(".seq");//后缀名
// create sequence format instance.
DefaultSequenceFormat format = new DefaultSequenceFormat("timestamp", "sentence");//key and value fields taken from the spout tuples
// Yaml yaml = new Yaml();//loads extra HDFS settings for the config, presumably from a YAML file
// InputStream in = new FileInputStream("hdfs://172.18.130.100:8020");//should this be a local YAML path rather than a URL?
// Map<String, Object> yamlConf = (Map<String, Object>) yaml.load(in);
// in.close();
Map<String, Object> yamlConf =new HashMap<String, Object>();
config.put("hdfs.config", yamlConf);//加载自定义的参数
SequenceFileBolt bolt = new SequenceFileBolt()
.withFsUrl("hdfs://172.18.130.100:8020")//hdfs
.withConfigKey("hdfs.config")
.withFileNameFormat(fileNameFormat)
.withSequenceFormat(format)
.withRotationPolicy(rotationPolicy)
.withSyncPolicy(syncPolicy)
.withCompressionType(SequenceFile.CompressionType.RECORD)
.withCompressionCodec("deflate")//compression codec
.addRotationAction(new MoveFileAction().toDestination("/testData/mytest/seqOne/"));//after each rotation, move the finished file to this directory
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout(SENTENCE_SPOUT_ID, spout, 1);
// SentenceSpout --> SequenceFileBolt
builder.setBolt(BOLT_ID, bolt, 4)
.shuffleGrouping(SENTENCE_SPOUT_ID);
String topoName = TOPOLOGY_NAME;
//cluster submission
// if (args.length == 3) {
// topoName = args[2];
// } else if (args.length > 3) {
// System.out.println("Usage: SequenceFileTopology [hdfs url] [hdfs yaml config file] <topology name>");
// return;
// }
// StormSubmitter.submitTopology(topoName, config, builder.createTopology());
//run locally for testing
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(topoName, config, builder.createTopology());
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void waitForSeconds(int seconds) {
try {
Thread.sleep(seconds * 1000);
} catch (InterruptedException e) {
}
}
public static class SentenceSpout extends BaseRichSpout {
private static final long serialVersionUID = 1L;
private ConcurrentHashMap<UUID, Values> pending;
private SpoutOutputCollector collector;
private String[] sentences = {
"my dog has fleas",
"i like cold beverages",
"the dog ate my homework",
"don't have a cow man",
"i don't think i like fleas"
};
private int index = 0;
private int count = 0;
private long total = 0L;
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("sentence", "timestamp"));
}
public void open(Map config, TopologyContext context,
SpoutOutputCollector collector) {
this.collector = collector;
this.pending = new ConcurrentHashMap<UUID, Values>();
}
public void nextTuple() {
Values values = new Values(sentences[index], System.currentTimeMillis());
UUID msgId = UUID.randomUUID();
this.pending.put(msgId, values);
this.collector.emit(values, msgId);
index++;
if (index >= sentences.length) {
index = 0;
}
count++;
total++;
if (count > 20000) {
count = 0;
System.out.println("Pending count: " + this.pending.size() + ", total: " + this.total);
}
Thread.yield();
}
public void ack(Object msgId) {
// System.out.println("ACK");
this.pending.remove(msgId);
}
public void fail(Object msgId) {
System.out.println("**** RESENDING FAILED TUPLE");
this.collector.emit(this.pending.get(msgId), msgId);
}
}
public static class MyBolt extends BaseRichBolt {
private static final long serialVersionUID = 1L;
// private HashMap<String, Long> counts = null;
private HashMap<String, String> counts = null;
private OutputCollector collector;
public void prepare(Map config, TopologyContext context, OutputCollector collector) {
// this.counts = new HashMap<String, Long>();
this.counts = new HashMap<String,String>();
this.collector = collector;
}
public void execute(Tuple tuple) {
collector.ack(tuple);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// this bolt does not emit anything
}
@Override
public void cleanup() {
}
}
}
Sequence Trident topology:
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.hdfs.common.rotation.MoveFileAction;
import org.apache.storm.hdfs.trident.HdfsState;
import org.apache.storm.hdfs.trident.HdfsStateFactory;
import org.apache.storm.hdfs.trident.HdfsUpdater;
import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.trident.format.DefaultSequenceFormat;
import org.apache.storm.hdfs.trident.format.FileNameFormat;
import org.apache.storm.hdfs.trident.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.trident.rotation.FileSizeRotationPolicy;
import org.apache.storm.trident.Stream;
import org.apache.storm.trident.TridentState;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.tuple.Fields;
import com.sunsheen.jfids.bigdata.storm.demo.hdfs.SequenceFileTopology.SentenceSpout;
/**
* @author cwc
* @date 2018-06-05
* @description: writes SequenceFiles through the high-level Trident API
* @version 1.0.0
*/
public class TridentSequenceTopology {
/**
* Compared with a plain topology, the Trident version differs in its return type and in its use of a state factory
* @param hdfsUrl
* @return
*/
public static StormTopology buildTopology(String hdfsUrl){
SentenceSpout spout =new SentenceSpout();
TridentTopology topology = new TridentTopology();
Stream stream = topology.newStream("spout1", spout);
Fields hdfsFields = new Fields("sentence", "timestamp");
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/trident/")
.withPrefix("trident")
.withExtension(".seq");
//file rotation policy
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);
HdfsState.Options seqOpts = new HdfsState.SequenceFileOptions()
.withFileNameFormat(fileNameFormat)
.withSequenceFormat(new DefaultSequenceFormat("timestamp", "sentence"))
.withRotationPolicy(rotationPolicy)
.withFsUrl(hdfsUrl)
// .withConfigKey("hdfs.config")
.addRotationAction(new MoveFileAction().toDestination("/testData/mytest/trident111/"));
StateFactory factory = new HdfsStateFactory().withOptions(seqOpts);
TridentState state = stream
.partitionPersist(factory, hdfsFields, new HdfsUpdater(), new Fields());
return topology.build();}
public static void main(String[] args) {
String hdfsUrl ="hdfs://172.18.130.100:8020";
Config conf = new Config();
conf.setMaxSpoutPending(5);
if (args != null && args.length > 0) {
//submit to the cluster, taking the topology name from the first argument
try {
StormSubmitter.submitTopology(args[0], conf, buildTopology(hdfsUrl));
} catch (AlreadyAliveException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTopologyException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
else{
//run locally
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("test", conf, buildTopology(hdfsUrl));
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
If anything is unclear, reread the code and keep thinking it through. Good luck, everyone.