Spark Streaming 实现接收网络传输数据进行WordCount功能

package iie.udps.example.operator.spark;

import scala.Tuple2;

import org.apache.spark.SparkConf;

import org.apache.spark.api.java.function.FlatMapFunction;

import org.apache.spark.api.java.function.Function2;

import org.apache.spark.api.java.function.PairFunction;

import org.apache.spark.streaming.Duration;

import org.apache.spark.streaming.api.java.JavaDStream;

import org.apache.spark.streaming.api.java.JavaPairDStream;

import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;

import org.apache.spark.streaming.api.java.JavaStreamingContext;

import org.apache.spark.streaming.Time;

import java.io.File;

import java.io.IOException;

import java.nio.charset.Charset;

import java.util.Arrays;

import java.util.List;

import com.google.common.io.Files;

import org.apache.spark.api.java.JavaPairRDD;

import com.google.common.base.Optional;

/**
 * Word count over text lines received from a TCP socket, using Spark Streaming.
 *
 * <p>
 * This example receives the data produced by a Netcat server, runs a word
 * count on it, and appends two results to separate local files: the counts of
 * each individual batch, and the cumulative (historical) counts since the job
 * started.
 *
 * <p>
 * To run this on your local machine, first start a Netcat server:
 *
 * <pre>
 * $ nc -lk 9999
 * </pre>
 *
 * and then submit the example, for instance:
 *
 * <pre>
 * spark-submit --class iie.udps.example.operator.spark.JavaNetworkWordCount
 *   --master local[2] /home/xdf/test2.jar localhost 9999 /user/test/checkpoint/
 *   /home/xdf/outputFile /home/xdf/totalOutputFile
 * </pre>
 *
 * <p>
 * In local mode the master must be {@code local[n]} with n of at least 2 (see
 * the usage text printed by {@link #main(String[])}): one thread is taken by
 * the socket receiver, so a plain {@code local} master leaves none to process
 * the batches.
 */
public final class JavaNetworkWordCount {

	/** Number of command-line arguments the job requires. */
	private static final int EXPECTED_ARG_COUNT = 5;

	/** Length of one streaming batch, in milliseconds. */
	private static final long BATCH_INTERVAL_MS = 1000L;

	/**
	 * Encoding of both output files. Fixed instead of the platform default so
	 * that the files look the same regardless of the machine the driver runs
	 * on.
	 */
	private static final Charset OUTPUT_CHARSET = Charset.forName("UTF-8");

	/**
	 * Entry point: validates the arguments, builds the streaming pipeline and
	 * blocks until the streaming context terminates.
	 *
	 * @param args
	 *            exactly five values: hostname, port, checkpoint directory,
	 *            per-batch output file, cumulative output file. On a wrong
	 *            argument count or an unusable port the usage text is printed
	 *            and the JVM exits with status 1.
	 */
	@SuppressWarnings("serial")
	public static void main(String[] args) {
		if (args.length != EXPECTED_ARG_COUNT) {
			printUsage(args);
			System.exit(1);
		}

		// Validate the port before touching the file system or starting Spark,
		// so that a typo does not delete previous results and then crash.
		final String hostname = args[0];
		final int port = parsePort(args[1]);
		if (port < 0) {
			System.err.println("Invalid <port>: " + args[1]);
			printUsage(args);
			System.exit(1);
		}

		final String checkpointDirectory = args[2]; // checkpoint directory
		final String curOutputPath = args[3]; // file receiving the per-batch word counts
		final String totalOutputPath = args[4]; // file receiving the cumulative word counts

		System.out.println("Creating new context");

		// Start from empty result files on every run.
		final File curOutputFile = new File(curOutputPath);
		deleteStaleOutput(curOutputFile);
		final File totalOutputFile = new File(totalOutputPath);
		deleteStaleOutput(totalOutputFile);

		// Create a StreamingContext. The checkpoint directory is required by
		// the stateful updateStateByKey operation used below.
		SparkConf conf = new SparkConf().setAppName("NetworkWordCount");
		final JavaStreamingContext jssc = new JavaStreamingContext(conf,
				new Duration(BATCH_INTERVAL_MS));
		jssc.checkpoint(checkpointDirectory);

		// Create a DStream that will connect to hostname:port, like
		// localhost:9999
		JavaReceiverInputDStream<String> lines = jssc.socketTextStream(
				hostname, port);

		// Split each line into words on single spaces.
		JavaDStream<String> words = lines
				.flatMap(new FlatMapFunction<String, String>() {
					@Override
					public Iterable<String> call(String line) {
						return Arrays.asList(line.split(" "));
					}
				});

		// Turn every word into a (word, 1) pair; both aggregations below
		// share this stream.
		JavaPairDStream<String, Integer> pairs = words
				.mapToPair(new PairFunction<String, String, Integer>() {
					@Override
					public Tuple2<String, Integer> call(String word)
							throws Exception {
						return new Tuple2<String, Integer>(word, 1);
					}
				});

		// Count each word within each batch.
		JavaPairDStream<String, Integer> batchCounts = pairs
				.reduceByKey(new Function2<Integer, Integer, Integer>() {
					@Override
					public Integer call(Integer left, Integer right)
							throws Exception {
						return left + right;
					}
				});
		appendCountsTo(batchCounts, curOutputFile);

		// Adds the occurrences seen in the current batch to the total kept
		// for the word so far (0 when the word is new).
		Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction = new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
			@Override
			public Optional<Integer> call(List<Integer> values,
					Optional<Integer> state) {
				int sum = state.or(0);
				for (Integer value : values) {
					sum += value;
				}
				return Optional.of(sum);
			}
		};

		// Cumulative count of each word across all batches.
		JavaPairDStream<String, Integer> totalCounts = pairs
				.updateStateByKey(updateFunction);
		appendCountsTo(totalCounts, totalOutputFile);

		jssc.start(); // Start the computation
		jssc.awaitTermination(); // Wait for the computation to terminate
		System.exit(0);
	}

	/**
	 * Prints the received arguments and the usage text to standard error.
	 *
	 * @param args
	 *            the arguments the program was started with
	 */
	private static void printUsage(String[] args) {
		System.err.println("You arguments were " + Arrays.asList(args));
		System.err
				.println("Usage: JavaNetworkWordCount <hostname> <port> <checkpoint-directory>\n"
						+ "     <output-file> <total-output-file>. <hostname> and <port> describe the TCP server that Spark\n"
						+ "     Streaming would connect to receive data. <checkpoint-directory> directory to\n"
						+ "     HDFS-compatible file system which checkpoint data <output-file> file to which\n"
						+ "     the word counts will be appended\n"
						+ "     <total-output-file> file to which the total word counts will be appended\n"
						+ "\n"
						+ "In local mode, <master> should be 'local[n]' with n > 1\n"
						+ "Both <checkpoint-directory> and <output-file> and <total-output-file> must be absolute paths");
	}

	/**
	 * Parses a TCP port number.
	 *
	 * @param text
	 *            the port as given on the command line
	 * @return the port, or -1 if {@code text} is not an integer in the range
	 *         1..65535
	 */
	private static int parsePort(String text) {
		try {
			int port = Integer.parseInt(text);
			return (port >= 1 && port <= 65535) ? port : -1;
		} catch (NumberFormatException ignored) {
			// Not a number at all; reported to the caller as -1.
			return -1;
		}
	}

	/**
	 * Removes an output file left over from a previous run. A failed deletion
	 * is reported but not fatal: the job still runs and appends to the old
	 * content.
	 *
	 * @param file
	 *            the output file to remove if it exists
	 */
	private static void deleteStaleOutput(File file) {
		if (file.exists() && !file.delete()) {
			System.err.println("Warning: could not delete existing output file "
					+ file.getAbsolutePath()
					+ "; new results will be appended to it");
		}
	}

	/**
	 * Registers an output operation that, for every batch, collects the
	 * stream's RDD, prints the counts and appends them as one line to the
	 * given file.
	 *
	 * @param counts
	 *            stream of (word, count) pairs to report
	 * @param outputFile
	 *            local file the report lines are appended to
	 */
	@SuppressWarnings("serial")
	private static void appendCountsTo(JavaPairDStream<String, Integer> counts,
			final File outputFile) {
		counts.foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
			@Override
			public Void call(JavaPairRDD<String, Integer> rdd, Time time)
					throws IOException {
				String report = "Counts at time " + time + " "
						+ rdd.collect();
				System.out.println(report);
				System.out.println("Appending to "
						+ outputFile.getAbsolutePath());
				Files.append(report + "\n", outputFile, OUTPUT_CHARSET);
				return null;
			}
		});
	}

}

spark streaming 实现接收网络传输数据进行WordCount功能的更多相关文章

Spark Streaming 数据接收过程
SparkStreaming 源码分析一节中从源码角度,描述了Streaming执行时代码的调用过程.下边就接收转化阶段过程再简单分析一下,为分析backpressure作准备. SparkStre ...
Spark Streaming与kafka整合实践之WordCount
本次实践使用kafka console作为消息的生产者,Spark Streaming作为消息的消费者,具体实践代码如下首先启动kafka server .\bin\windows\kafka-se ...
Spark Streaming的接收KAFKA的数据
https://github.com/lw-lin/CoolplaySpark/blob/master/Spark%20Streaming%20%E6%BA%90%E7%A0%81%E8%A7%A3% ...
Spark Streaming源码解读之流数据不断接收全生命周期彻底研究和思考
本期内容 : 数据接收架构设计模式数据接收源码彻底研究一.Spark Streaming数据接收设计模式 Spark Streaming接收数据也相似MVC架构: 1. Mode相当于Rece ...
Spark Streaming源码解读之流数据不断接收和全生命周期彻底研究和思考
本节的主要内容: 一.数据接受架构和设计模式二.接受数据的源码解读 Spark Streaming不断持续的接收数据,具有Receiver的Spark 应用程序的考虑. Receiver和Drive ...
Spark入门实战系列--7.Spark Streaming（上）--实时流计算Spark Streaming原理介绍
[注]该系列文章以及使用到安装包/测试数据可以在<倾情大奉送--Spark入门实战系列>获取 .Spark Streaming简介 1.1 概述 Spark Streaming 是Spa ...
Spark Streaming简介及原理
简介: SparkStreaming是一套框架. SparkStreaming是Spark核心API的一个扩展,可以实现高吞吐量的,具备容错机制的实时流数据处理. 支持多种数据源获取数据: Spark ...
.Spark Streaming（上）--实时流计算Spark Streaming原理介
Spark入门实战系列--7.Spark Streaming(上)--实时流计算Spark Streaming原理介绍 http://www.cnblogs.com/shishanyuan/p/474 ...
spark streaming的理解和应用
1.Spark Streaming简介官方网站解释:http://spark.apache.org/docs/latest/streaming-programming-guide.html 该博客转 ...

随机推荐

一个漂亮灵活的PHP图片验证码
<?php class Imagecode{ private $width ; private $height; private $counts; private $distrubcode; p ...
Java对象的序列化和反序列化实践
2013-12-20 14:58 对象序列化的目标是将对象保存在磁盘中,或者允许在网络中直接传输对象.对象序列化机制允许把内存中的Java对象转换成平台无关的二进制流,从而允许把这种二进制流持久的保存 ...
Linux - gcc和g++的区别
一般linux系统都自带了gcc编译器的,你可以用你的安装光盘去安装,如果你是觉得自带的gcc版本太低了,可以去gcc的官方网站可以下载到,编译需要很长的时间,如果你只编译C或者C++可以只下载gcc ...
为什么要进行傅立叶变换？傅立叶变换究竟有何意义？如何用Matlab实现快速傅立叶变换
写在最前面:本文是我阅读了多篇相关文章后对它们进行分析重组整合而得,绝大部分内容非我所原创.在此向多位原创作者致敬!!!一.傅立叶变换的由来关于傅立叶变换,无论是书本还是在网上可以很容易找到关于傅立叶 ...
推荐一款好用轻便的在线UML画图工具
刚接触UML时间不长,看了N多教学视频,下载好了几个软件各种不习惯当我遇见了ProcessOn 从此我彻底“爱上”了它! http://www.processon.com/ UML各类例图它几乎全 ...
dialog参数、方法以及事件
参数(options) DOM方式初始化dialog的,推荐使用集合属性data-options定义参数,如果使用data属性定义参数,注意转换成对应的名称. 名称类型默认值描述 id stri ...
Windows Azure 实操 —— 迁移本地SharePoint服务器到Azure
博客地址 http://blog.csdn.net/foxdave 注意:如果你是第二代虚拟机,那就别看这个了,老老实实在Azure上重新创建吧,Azure不支持第二代虚拟机. 写在之前,对Azure ...
Ubuntu 14.10 下SSH执行远程命令
有些时候需要在远程机器上执行命令,如果每次都等进去挺麻烦的,所以用脚本执行会方便很多.下面介绍一下在shell脚本中执行远程命令. 1,首先写好要运行的脚本 run-command.sh, 加上执行权 ...
Ettus Research USRP B200/B210 simple case
FR #1题解
A. 建图跑最小费用最大流.分类讨论每种情况如何连边,费用怎么定. #include<iostream> #include<cstdio> #include<cstrin ...

spark streaming 实现接收网络传输数据进行WordCount功能

spark streaming 实现接收网络传输数据进行WordCount功能的更多相关文章

随机推荐

热门专题