storm-hdfs and SequenceFile in Detail
This post walks through storm-hdfs, SequenceFile, and their Trident counterparts. Without further ado, the code:
pom.xml
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
<exclusions>
<exclusion>
<artifactId>asm</artifactId>
<groupId>asm</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.2</version>
</dependency>
My pom.xml pulls in quite a few dependencies; the entries above are only the relevant ones I copied out, so something may be missing, but every class below lists its full imports, so there is no need to worry.
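One artifact the list above leaves out, but which the org.apache.storm.hdfs.* classes below need at compile time, is storm-hdfs. A likely entry (the version is assumed to match storm-core) would be:
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-hdfs</artifactId>
<version>1.1.2</version>
</dependency>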
HDFS:
HdfsReadSpout: a spout that reads data from HDFS
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
* @author cwc
* @date 2018-05-25
* @description: spout that reads lines from HDFS
* @version 1.0.0
*/
public class HdfsReadSpout extends BaseRichSpout{
private static final long serialVersionUID = 1L;
private SpoutOutputCollector collector;
private String url="/testData/mytest/app_hdfs-bolt-2-0-1528096963805.log";
private ArrayList<String> arrayList;
private boolean finished = false;//nextTuple() is called in a loop, so guard against re-reading and re-emitting the file forever
@Override
public void nextTuple() {
if (finished) return;
hdfsData(url);
finished = true;
}
@Override
public void open(Map conf, TopologyContext arg1, SpoutOutputCollector collector) {
this.collector =collector;
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("lines"));
}
private void hdfsData(String FileName){
System.out.println("开始拿数据");
Configuration conf=new Configuration();
conf.set("fs.defaultFS","hdfs://172.18.130.100:8020");
BufferedReader in = null;
FSDataInputStream dis;
String line;
try {
FileSystem hdfs = FileSystem.get(conf);
dis = hdfs.open(new Path(FileName));
in = new BufferedReader(new InputStreamReader(dis, "UTF-8"));
while ((line = in.readLine()) != null){
// System.out.println("Read line: "+line);
this.collector.emit(new Values(line));
}
} catch (IllegalArgumentException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (in != null) {
try { in.close(); } catch (IOException e) { e.printStackTrace(); }
}
}
}
}
Main storm-hdfs demo class:
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.RecordFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.TimedRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.TimedRotationPolicy.TimeUnit;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.spout.HdfsSpout;
import org.apache.storm.hdfs.spout.TextFileReader;
import org.apache.storm.topology.TopologyBuilder;
import com.sunsheen.jfids.bigdata.storm.demo.count.TestSpout;
/**
*
* @ClassName: HDFSMain
* @Description: writing Storm data to HDFS and reading it back
* @author cwc
* @date 2018-06-04 15:28:12
* @version 2.0.0 (second revision)
*/
public class HDFSMain{
public static void main(String[] args) {
writeHdfs(args);
// readHdfs(args);
// selectHdfs(args);
}
/**
* Write data from Storm into HDFS
* @param args command-line arguments
*/
public static void writeHdfs(String[] args){
//Configure HDFS bolt
RecordFormat format = new DelimitedRecordFormat().withFieldDelimiter("\t"); // field delimiter for the output records
SyncPolicy syncPolicy = new CountSyncPolicy(1000);// sync to HDFS every 1000 tuples
FileRotationPolicy rotationPolicy = new TimedRotationPolicy(1.0f, TimeUnit.MINUTES); // rotate to a new file every minute
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/").withPrefix("app_").withExtension(".log"); // output directory, file name prefix and extension
HdfsBolt hdfsBolt = new HdfsBolt()
.withFsUrl("hdfs://172.18.130.100:8020")
.withFileNameFormat(fileNameFormat)
.withRecordFormat(format)
.withRotationPolicy(rotationPolicy)
.withSyncPolicy(syncPolicy);
// configure & build topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("kafka-write", new TestSpout(), 5);
builder.setBolt("hdfs-write", hdfsBolt, 2).shuffleGrouping("kafka-write");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
/**
* Read data from HDFS
* @param args command-line arguments
*/
public static void readHdfs(String[] args){
// configure & build topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("hdfs-reader", new HdfsReadSpout(), 3);
builder.setBolt("hdfs-read", new HDFSReadBolt(), 2).shuffleGrouping("hdfs-reader");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
/**
* This uses the higher-level API: it reads every HDFS file under a directory, in batch.
* @param args
*/
public static void selectHdfs(String[] args){
// Instantiate spout to read text files
HdfsSpout textReaderSpout = new HdfsSpout().setReaderType("text")
.withOutputFields(TextFileReader.defaultFields)
.setHdfsUri("hdfs://172.18.130.100:8020") // HDFS URI
.setSourceDir("/testData/mytest") // directory to read from
.setArchiveDir("/testData/mytest/done") // files are moved here once fully processed
.setBadFilesDir("/testData/mytest/badfiles"); // files that fail to read are moved here
// Create topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("hdfsspout", textReaderSpout, 2);
builder.setBolt("hdfs-read", new HDFSReadBolt(), 2).shuffleGrouping("hdfsspout");
// submit topology
Config conf = new Config();
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
String nimbus = args[0];
conf.put(Config.NIMBUS_HOST, nimbus);
conf.setNumWorkers(3);
try {
StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(name, conf, builder.createTopology());
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
cluster.shutdown();
}
}
}
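HDFSReadBolt: the readHdfs and selectHdfs topologies above wire in an HDFSReadBolt that the post never lists. A minimal sketch of such a bolt, assuming it only needs to print and ack each incoming line, could look like this:
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
/**
 * Minimal sketch of the HDFSReadBolt referenced above: it prints and acks
 * every tuple coming from the HDFS spouts; the real bolt may do more.
 */
public class HDFSReadBolt extends BaseRichBolt {
    private static final long serialVersionUID = 1L;
    private OutputCollector collector;
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
    @Override
    public void execute(Tuple tuple) {
        // the HDFS spouts above emit the file line as the first tuple field
        System.out.println("HDFSReadBolt got: " + tuple.getString(0));
        collector.ack(tuple);
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal bolt: it emits nothing
    }
}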
Trident high-level API: before reading this part, first look into how Trident differs from the plain Storm API and why you would use it. Keep thinking.
package com.sunsheen.jfids.bigdata.storm.demo.hdfs.trident;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.hdfs.trident.HdfsState;
import org.apache.storm.hdfs.trident.HdfsStateFactory;
import org.apache.storm.hdfs.trident.HdfsUpdater;
import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.trident.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.trident.format.FileNameFormat;
import org.apache.storm.hdfs.trident.format.RecordFormat;
import org.apache.storm.hdfs.trident.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.trident.rotation.FileSizeRotationPolicy;
import org.apache.storm.trident.Stream;
import org.apache.storm.trident.TridentState;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.tuple.Fields;
import com.sunsheen.jfids.bigdata.storm.demo.count.TestSpout;
import com.sunsheen.jfids.bigdata.storm.demo.hdfs.HDFSMain;
/**
* @author cwc
* @date 2018-06-04
* @description: writing to HDFS through the Trident API
* @version 1.0.0
*/
public class HdfsTridents {
public static void main(String[] args){
Config conf = new Config();
conf.setMaxSpoutPending(3);
String name = HDFSMain.class.getSimpleName();
if (args != null && args.length > 0) {
//submit to the cluster, taking the topology name from the first argument
try {
StormSubmitter.submitTopology(args[0], conf, buildTopology());
} catch (AlreadyAliveException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTopologyException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
//run locally
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("test", conf, buildTopology());
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
/**
* Build the StormTopology
* @return
*/
public static StormTopology buildTopology(){
//Note: if the written files come out empty, the usual cause is the fields emitted by the spout or a wrong delimiter (e.g. "/t" instead of "\t")
TestSpout spout = new TestSpout();
TridentTopology topology = new TridentTopology();
Stream stream = topology.newStream("stream", spout);//create the stream
Fields hdfsFields = new Fields("line","values"); //fields taken from the spout tuples
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPrefix("trident")//文件前缀
.withExtension(".txt")//文件后缀
.withPath("/testData/mytest/trident/");//文件目录
RecordFormat recordFormat = new DelimitedRecordFormat()
.withFieldDelimiter("/t")//分割符
.withFields(hdfsFields);
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);//设置一个大小条件,到达后就写入
HdfsState.Options options = new HdfsState.HdfsFileOptions()
.withFileNameFormat(fileNameFormat)
.withRecordFormat(recordFormat)
.withRotationPolicy(rotationPolicy)
.withFsUrl("hdfs://172.18.130.100:8020");//主机名
StateFactory factory = new HdfsStateFactory().withOptions(options);//创建工厂
TridentState state = stream
.partitionPersist(factory, hdfsFields, new HdfsUpdater(), new Fields());//这步是写入最重要的一步
return topology.build();
}
}
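TestSpout (referenced but not shown): both writeHdfs above and this Trident topology rely on TestSpout from another package of the demo project. Whether the output files end up empty depends on it emitting exactly the fields the topology asks for ("line" and "values" in the Trident case). Purely as an assumption about what that spout does, a hypothetical stand-in might look like this:
import java.util.Map;
import java.util.Random;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
 * Hypothetical stand-in for TestSpout: it emits two string fields, "line" and
 * "values", matching the Fields declared in buildTopology() above.
 */
public class TestSpout extends BaseRichSpout {
    private static final long serialVersionUID = 1L;
    private SpoutOutputCollector collector;
    private Random random = new Random();
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }
    @Override
    public void nextTuple() {
        // emit a dummy record; the real TestSpout presumably generates its own test data
        collector.emit(new Values("line-" + random.nextInt(1000), String.valueOf(System.currentTimeMillis())));
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("line", "values"));
    }
}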
SequenceFile section:
Spout class:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: reads SequenceFile data
*/
public class SquenceSpout extends BaseRichSpout {
private static final long serialVersionUID = 1L;
private SpoutOutputCollector collector;
private String fileUrl ="hdfs://172.18.130.100:8020/testData/mytest/trident/Topology-1-0-1532415869703.seq";
/**
* Constructor taking the file path
* @param fileUrl
*/
public SquenceSpout(String fileUrl){
this.fileUrl=fileUrl;//the path can be passed in directly by the caller
}
@Override
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
// TODO Auto-generated method stub
this.collector =collector;
}
private boolean finished = false;//nextTuple() is called in a loop, so only read the file once
@Override
public void nextTuple() {
if (finished) return;
getSquence();
finished = true;
}
/**
* Read records from the SequenceFile and emit them
*/
public void getSquence(){
Configuration conf = new Configuration();
Path path = new Path(this.fileUrl);
SequenceFile.Reader.Option option1 = Reader.file(path);
// SequenceFile.Reader.Option option2 = Reader.length(30);//limits how many bytes are read; leave it off unless you know the length, otherwise it throws
SequenceFile.Reader reader = null;
try {
reader = new SequenceFile.Reader(conf,option1);
Writable key = (Writable) ReflectionUtils.newInstance(
reader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(
reader.getValueClass(), conf);
long position = reader.getPosition();
while (reader.next(key, value)) {
String syncSeen = reader.syncSeen() ? "*" : "";//marks whether a sync point was just passed (empty for my data)
System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key,
value);
System.out.println(position+"---------"+syncSeen+"=========="+key+"————————————————"+value);
this.collector.emit(new Values(position,syncSeen,key,value));//emit the record downstream; would a collection be better here?
position = reader.getPosition(); //remember the position of the next record
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
IOUtils.closeStream(reader);
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// TODO Auto-generated method stub
declarer.declare(new Fields("he","llo","wo","rd"));
}
}
Bolt class:
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.storm.hdfs.bolt.AbstractHdfsBolt;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.SequenceFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.common.AbstractHDFSWriter;
import org.apache.storm.hdfs.common.Partitioner;
import org.apache.storm.hdfs.common.SequenceFileWriter;
import org.apache.storm.hdfs.common.rotation.RotationAction;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: included for readers whose storm-hdfs build does not ship this class
*/
public class SequenceFileBolt extends AbstractHdfsBolt {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(SequenceFileBolt.class);//logger
private SequenceFormat format;
private SequenceFile.CompressionType compressionType = SequenceFile.CompressionType.RECORD;
private transient SequenceFile.Writer writer;
private String compressionCodec = "default";
private transient CompressionCodecFactory codecFactory;
public SequenceFileBolt() {
}
public SequenceFileBolt withCompressionCodec(String codec) {
this.compressionCodec = codec;
return this;
}
public SequenceFileBolt withFsUrl(String fsUrl) {
this.fsUrl = fsUrl;
return this;
}
public SequenceFileBolt withConfigKey(String configKey) {
this.configKey = configKey;
return this;
}
public SequenceFileBolt withFileNameFormat(FileNameFormat fileNameFormat) {
this.fileNameFormat = fileNameFormat;
return this;
}
public SequenceFileBolt withSequenceFormat(SequenceFormat format) {
this.format = format;
return this;
}
public SequenceFileBolt withSyncPolicy(SyncPolicy syncPolicy) {
this.syncPolicy = syncPolicy;
return this;
}
public SequenceFileBolt withRotationPolicy(FileRotationPolicy rotationPolicy) {
this.rotationPolicy = rotationPolicy;
return this;
}
public SequenceFileBolt withCompressionType(SequenceFile.CompressionType compressionType) {
this.compressionType = compressionType;
return this;
}
public SequenceFileBolt withTickTupleIntervalSeconds(int interval) {
this.tickTupleInterval = interval;
return this;
}
public SequenceFileBolt addRotationAction(RotationAction action) {
this.rotationActions.add(action);
return this;
}
public SequenceFileBolt withRetryCount(int fileRetryCount) {
this.fileRetryCount = fileRetryCount;
return this;
}
public SequenceFileBolt withPartitioner(Partitioner partitioner) {
this.partitioner = partitioner;
return this;
}
public SequenceFileBolt withMaxOpenFiles(int maxOpenFiles) {
this.maxOpenFiles = maxOpenFiles;
return this;
}
@Override
public void doPrepare(Map conf, TopologyContext topologyContext, OutputCollector collector) throws IOException {
LOG.info("Preparing Sequence File Bolt...");
if (this.format == null) throw new IllegalStateException("SequenceFormat must be specified.");
this.fs = FileSystem.get(URI.create(this.fsUrl), hdfsConfig);
this.codecFactory = new CompressionCodecFactory(hdfsConfig);
}
@Override
protected String getWriterKey(Tuple tuple) {
return "CONSTANT";
}
@Override
protected AbstractHDFSWriter makeNewWriter(Path path, Tuple tuple) throws IOException {
SequenceFile.Writer writer = SequenceFile.createWriter(
this.hdfsConfig,
SequenceFile.Writer.file(path),
SequenceFile.Writer.keyClass(this.format.keyClass()),
SequenceFile.Writer.valueClass(this.format.valueClass()),
SequenceFile.Writer.compression(this.compressionType, this.codecFactory.getCodecByName(this.compressionCodec))
);
return new SequenceFileWriter(this.rotationPolicy, path, writer, this.format);
}
}
SequenceFileTopology:
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.io.SequenceFile;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DefaultSequenceFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.hdfs.common.rotation.MoveFileAction;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.yaml.snakeyaml.Yaml;
/**
* @author cwc
* @date 2018-07-24
* @version 1.0.0
* @description: writes SequenceFiles to HDFS
*/
public class SequenceFileTopology {
static final String SENTENCE_SPOUT_ID = "sentence-spout";
static final String BOLT_ID = "Topology";//used as the prefix of the generated files
static final String TOPOLOGY_NAME = "test-topology";
public static void main(String[] args) throws Exception {
Config config = new Config();
config.setNumWorkers(1);
SentenceSpout spout = new SentenceSpout();
// sync the file system after every 1000 tuples
SyncPolicy syncPolicy = new CountSyncPolicy(1000);
// rotate to a new file once it reaches 5 MB
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, Units.MB);
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/trident/")//存储位置
.withExtension(".seq");//后缀名
// create sequence format instance.
DefaultSequenceFormat format = new DefaultSequenceFormat("timestamp", "sentence");//key and value fields taken from the spout tuples
// Yaml yaml = new Yaml();//loads extra HDFS settings for the config, presumably from a YAML file
// InputStream in = new FileInputStream("hdfs://172.18.130.100:8020");//should this be a local YAML path rather than a URL?
// Map<String, Object> yamlConf = (Map<String, Object>) yaml.load(in);
// in.close();
Map<String, Object> yamlConf =new HashMap<String, Object>();
config.put("hdfs.config", yamlConf);//加载自定义的参数
SequenceFileBolt bolt = new SequenceFileBolt()
.withFsUrl("hdfs://172.18.130.100:8020")//hdfs
.withConfigKey("hdfs.config")
.withFileNameFormat(fileNameFormat)
.withSequenceFormat(format)
.withRotationPolicy(rotationPolicy)
.withSyncPolicy(syncPolicy)
.withCompressionType(SequenceFile.CompressionType.RECORD)
.withCompressionCodec("deflate")//compression codec
.addRotationAction(new MoveFileAction().toDestination("/testData/mytest/seqOne/"));//after each rotation, move the finished file to this directory
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout(SENTENCE_SPOUT_ID, spout, 1);
// SentenceSpout --> SequenceFileBolt
builder.setBolt(BOLT_ID, bolt, 4)
.shuffleGrouping(SENTENCE_SPOUT_ID);
String topoName = TOPOLOGY_NAME;
//cluster submission
// if (args.length == 3) {
// topoName = args[2];
// } else if (args.length > 3) {
// System.out.println("Usage: SequenceFileTopology [hdfs url] [hdfs yaml config file] <topology name>");
// return;
// }
// StormSubmitter.submitTopology(topoName, config, builder.createTopology());
//run locally for testing
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(topoName, config, builder.createTopology());
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void waitForSeconds(int seconds) {
try {
Thread.sleep(seconds * 1000);
} catch (InterruptedException e) {
}
}
public static class SentenceSpout extends BaseRichSpout {
private static final long serialVersionUID = 1L;
private ConcurrentHashMap<UUID, Values> pending;
private SpoutOutputCollector collector;
private String[] sentences = {
"my dog has fleas",
"i like cold beverages",
"the dog ate my homework",
"don't have a cow man",
"i don't think i like fleas"
};
private int index = 0;
private int count = 0;
private long total = 0L;
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("sentence", "timestamp"));
}
public void open(Map config, TopologyContext context,
SpoutOutputCollector collector) {
this.collector = collector;
this.pending = new ConcurrentHashMap<UUID, Values>();
}
public void nextTuple() {
Values values = new Values(sentences[index], System.currentTimeMillis());
UUID msgId = UUID.randomUUID();
this.pending.put(msgId, values);
this.collector.emit(values, msgId);
index++;
if (index >= sentences.length) {
index = 0;
}
count++;
total++;
if (count > 20000) {
count = 0;
System.out.println("Pending count: " + this.pending.size() + ", total: " + this.total);
}
Thread.yield();
}
public void ack(Object msgId) {
// System.out.println("ACK");
this.pending.remove(msgId);
}
public void fail(Object msgId) {
System.out.println("**** RESENDING FAILED TUPLE");
this.collector.emit(this.pending.get(msgId), msgId);
}
}
public static class MyBolt extends BaseRichBolt {
private static final long serialVersionUID = 1L;
// private HashMap<String, Long> counts = null;
private HashMap<String, String> counts = null;
private OutputCollector collector;
public void prepare(Map config, TopologyContext context, OutputCollector collector) {
// this.counts = new HashMap<String, Long>();
this.counts = new HashMap<String,String>();
this.collector = collector;
}
public void execute(Tuple tuple) {
collector.ack(tuple);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// this bolt does not emit anything
}
@Override
public void cleanup() {
}
}
}
Sequence Trident topology:
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.hdfs.common.rotation.MoveFileAction;
import org.apache.storm.hdfs.trident.HdfsState;
import org.apache.storm.hdfs.trident.HdfsStateFactory;
import org.apache.storm.hdfs.trident.HdfsUpdater;
import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.trident.format.DefaultSequenceFormat;
import org.apache.storm.hdfs.trident.format.FileNameFormat;
import org.apache.storm.hdfs.trident.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.trident.rotation.FileSizeRotationPolicy;
import org.apache.storm.trident.Stream;
import org.apache.storm.trident.TridentState;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.tuple.Fields;
import com.sunsheen.jfids.bigdata.storm.demo.hdfs.SequenceFileTopology.SentenceSpout;
/**
* @author cwc
* @date 2018-06-05
* @description: writes SequenceFiles through the high-level Trident API
* @version 1.0.0
*/
public class TridentSequenceTopology {
/**
* Compared with a plain topology, the Trident version differs in its return type and in its use of a state factory
* @param hdfsUrl
* @return
*/
public static StormTopology buildTopology(String hdfsUrl){
SentenceSpout spout =new SentenceSpout();
TridentTopology topology = new TridentTopology();
Stream stream = topology.newStream("spout1", spout);
Fields hdfsFields = new Fields("sentence", "timestamp");
FileNameFormat fileNameFormat = new DefaultFileNameFormat()
.withPath("/testData/mytest/trident/")
.withPrefix("trident")
.withExtension(".seq");
//file rotation policy
FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);
HdfsState.Options seqOpts = new HdfsState.SequenceFileOptions()
.withFileNameFormat(fileNameFormat)
.withSequenceFormat(new DefaultSequenceFormat("timestamp", "sentence"))
.withRotationPolicy(rotationPolicy)
.withFsUrl(hdfsUrl)
// .withConfigKey("hdfs.config")
.addRotationAction(new MoveFileAction().toDestination("/testData/mytest/trident111/"));
StateFactory factory = new HdfsStateFactory().withOptions(seqOpts);
TridentState state = stream
.partitionPersist(factory, hdfsFields, new HdfsUpdater(), new Fields());
return topology.build();}
public static void main(String[] args) {
String hdfsUrl ="hdfs://172.18.130.100:8020";
Config conf = new Config();
conf.setMaxSpoutPending(5);
if (args != null && args.length > 0) {
//submit to the cluster, taking the topology name from the first argument
try {
StormSubmitter.submitTopology(args[0], conf, buildTopology(hdfsUrl));
} catch (AlreadyAliveException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTopologyException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (AuthorizationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
else{
//run locally
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("test", conf, buildTopology(hdfsUrl));
try {
Thread.sleep(100000);
cluster.shutdown();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
If anything is unclear, reread the code and keep thinking it through. Good luck, everyone.