Spark Streaming之窗口函数和状态转换函数
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming._
// ClickStream: builds a Kafka direct stream on the "clickstream" topic, derives a keyed
// stream from it, reduces it by key over a sliding window and prints each window's result.
// NOTE(review): this listing appears to have lost text during extraction (line breaks, array
// indexes, receivers of several calls, literal values and all closing braces are missing).
// The code tokens below are left exactly as found; restore the gaps from the original program.
import org.apache.spark.{SparkContext, SparkConf} object ClickStream {
def main (args: Array[String]){
// Keep unnecessary log output off the terminal
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
// Create the SparkConf and set the application name; the name is visible in the monitoring UI while the program runs
val conf = new SparkConf().setAppName("ClickStream").setMaster("local[*]")
val sc = new SparkContext(conf)
// The batch interval set here is the basic time unit in which Spark Streaming generates jobs;
// the window length and the slide interval must be integer multiples of this batch interval.
// NOTE(review): `args()` has no index -- presumably the batch interval in seconds is args(0); confirm
val ssc = new StreamingContext(sc, Seconds(args().toLong))
// Window functions reuse RDDs from earlier batches, so checkpointing is mandatory;
// note that the reused RDDs are otherwise unrelated to each other.
// NOTE(review): `args()` has no index -- presumably the checkpoint directory argument; confirm
ssc.checkpoint(args()) val topics = Set("clickstream") // Kafka topic the data is read from
// NOTE(review): the broker list looks redacted (only the separators remain) -- fill in host:port entries
val brokers = ",,,"
// NOTE(review): the parameter key is an empty string -- presumably "metadata.broker.list"; confirm
val kafkaParams = Map[String, String]("" -> brokers)
//val offset = "largest" // values: smallest, largest; controls whether the newest or the older data is read; the default is largest
// Since Spark 1.3, data can be fetched efficiently from Kafka in the following way
val kvsTemp = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
// The first element of each record is the key (null); the second element is the data we need, a String.
// NOTE(review): the receiver and lambda parameter are missing -- presumably `kvsTemp.map(line => line._2)`; confirm
val kvs = => line._2)
// Clean and transform the incoming data as required.
// NOTE(review): the head of the chain and every index/literal inside `_()`, `()` and `(_, )` are missing;
// presumably tab-split fields are filtered on a "finance" column, a URL column is selected, its query
// string is dropped, iframe URLs are excluded and each URL is paired with a count -- confirm against the original
val data ="\\t")).filter(_() == "finance").map(_()).map(_.split("\\?")()).filter(! _.contains("iframe")).map((_, ))
// Window length 1 hour, slide interval 10 minutes: this yields the url -> pv mapping for the past hour
//val pvWindow = data.reduceByKeyAndWindow((v1: Int, v2: Int) => v1+v2, Minutes(60), Minutes(10))
// Window length 1 hour, slide interval 10 minutes: this also yields the url -> pv mapping for the past hour,
// but incrementally ("add the new, subtract the old"): the first function adds the new values,
// the second function subtracts the values of the previous batch.
// NOTE(review): both `Minutes()` calls lack an argument -- the comments above describe 60 and 10; confirm
val pvWindow = data.reduceByKeyAndWindow(_ + _, _ - _, Minutes(), Minutes())
pvWindow.print() ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
ssc.stop(true, true) // Stop gracefully
/**
 * Return a new DStream in which each RDD contains the count of distinct elements in
 * RDDs in a sliding window over this DStream. Hash partitioning is used to generate
 * the RDDs with `numPartitions` partitions (Spark's default number of partitions if
 * `numPartitions` not specified).
 *
 * Each element is paired with `1L` and the pairs are reduced incrementally per window:
 * counts entering the window are added, counts leaving it are subtracted, and keys whose
 * count has dropped to zero are filtered out of the result.
 *
 * @param windowDuration width of the window; must be a multiple of this DStream's
 *                       batching interval
 * @param slideDuration  sliding interval of the window (i.e., the interval after which
 *                       the new DStream will generate RDDs); must be a multiple of this
 *                       DStream's batching interval
 * @param numPartitions  number of partitions of each RDD in the new DStream.
 * @return a DStream of (element, number of occurrences in the window) pairs
 */
def countByValueAndWindow(
    windowDuration: Duration,
    slideDuration: Duration,
    numPartitions: Int = ssc.sc.defaultParallelism)
    (implicit ord: Ordering[T] = null)
    : DStream[(T, Long)] = ssc.withScope {
  this.map((_, 1L)).reduceByKeyAndWindow(
    (x: Long, y: Long) => x + y,
    (x: Long, y: Long) => x - y,
    windowDuration,
    slideDuration,
    numPartitions,
    // Keep only the keys that still occur in the current window.
    (x: (T, Long)) => x._2 != 0L
  )
}
/**
 * Return a new DStream in which each RDD has a single element generated by reducing all
 * elements in a sliding window over this DStream. However, the reduction is done incrementally
 * using the old window's reduced value:
 *  1. reduce the new values that entered the window (e.g., adding new counts)
 *  2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)
 * This is more efficient than reduceByWindow without "inverse reduce" function.
 * However, it is applicable to only "invertible reduce functions".
 *
 * @param reduceFunc     associative and commutative reduce function
 * @param invReduceFunc  inverse reduce function; such that for all y, invertible x:
 *                       `invReduceFunc(reduceFunc(x, y), x) = y`
 * @param windowDuration width of the window; must be a multiple of this DStream's
 *                       batching interval
 * @param slideDuration  sliding interval of the window (i.e., the interval after which
 *                       the new DStream will generate RDDs); must be a multiple of this
 *                       DStream's batching interval
 * @return a DStream whose RDDs each hold the reduced value of the current window
 */
def reduceByWindow(
    reduceFunc: (T, T) => T,
    invReduceFunc: (T, T) => T,
    windowDuration: Duration,
    slideDuration: Duration
  ): DStream[T] = ssc.withScope {
  // Key every element with the same constant so the keyed incremental window reduction
  // can be reused with a single partition; the dummy key is dropped again afterwards.
  this.map((1, _))
    .reduceByKeyAndWindow(reduceFunc, invReduceFunc, windowDuration, slideDuration, 1)
    .map(_._2)
}
/**
 * Return a new DStream in which each RDD has a single element generated by counting the number
 * of elements in a sliding window over this DStream. Hash partitioning is used to generate
 * the RDDs with Spark's default number of partitions.
 *
 * Every element is mapped to `1L` and the ones are summed incrementally: values entering
 * the window are added and values leaving it are subtracted.
 *
 * @param windowDuration width of the window; must be a multiple of this DStream's
 *                       batching interval
 * @param slideDuration  sliding interval of the window (i.e., the interval after which
 *                       the new DStream will generate RDDs); must be a multiple of this
 *                       DStream's batching interval
 * @return a DStream whose RDDs each hold the element count of the current window
 */
def countByWindow(
    windowDuration: Duration,
    slideDuration: Duration): DStream[Long] = ssc.withScope {
  this.map(_ => 1L).reduceByWindow(_ + _, _ - _, windowDuration, slideDuration)
}
