spark-streaming-kafka-0-8 和 0-10的使用区别
一、spark-streaming-kafka-0-8_2.11-2.0.2.jar
1、pom.xml
<!--
  Dependencies for the spark-streaming-kafka-0-8 example.
  Scope is "compile" (the Maven default) rather than "runtime": the consumer
  class imports these packages directly, and runtime-scoped artifacts are not
  placed on the compile classpath, so the build would fail on the imports.
  When the job is submitted to a cluster that already ships Spark, spark-core
  and spark-streaming may be switched to "provided"; the Kafka connector must
  stay "compile" so that it is packaged with the application.
-->
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
2、Kafka Consumer类
- package com.spark.main;
- import java.util.Arrays;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Map;
- import java.util.Set;
- import org.apache.spark.SparkConf;
- import org.apache.spark.api.java.JavaRDD;
- import org.apache.spark.api.java.function.Function;
- import org.apache.spark.api.java.function.VoidFunction;
- import org.apache.spark.streaming.Durations;
- import org.apache.spark.streaming.api.java.JavaDStream;
- import org.apache.spark.streaming.api.java.JavaPairInputDStream;
- import org.apache.spark.streaming.api.java.JavaStreamingContext;
- import org.apache.spark.streaming.kafka.KafkaUtils;
- import kafka.serializer.StringDecoder;
- import scala.Tuple2;
- public class KafkaConsumer{
- public static void main(String[] args) throws InterruptedException{
- /**
- * SparkConf sparkConf = new SparkConf().setAppName("KafkaConsumer").setMaster("local[2]");
- * setMaster("local[2]"),至少要指定两个线程,一条用于用于接收消息,一条线程用于处理消息
- * Durations.seconds(2)每两秒读取一次kafka
- */
- SparkConf sparkConf = new SparkConf().setAppName("KafkaConsumer").setMaster("local[2]");
- JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(500));
- jssc.checkpoint("hdfs://192.168.168.200:9000/checkpoint/KafkaConsumer");
- /**
- * 配置连接kafka的相关参数
- */
- Set<String> topicsSet = new HashSet<String>(Arrays.asList("TestTopic"));
- Map<String, String> kafkaParams = new HashMap<String, String>();
- kafkaParams.put("metadata.broker.list", "192.168.168.200:9092");
- kafkaParams.put("auto.offset.reset", "smallest");//smallest:从最初开始;largest :从最新开始
- kafkaParams.put("fetch.message.max.bytes", "524288");
- JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
- StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
- /**
- * _2()获取第二个对象的值
- */
- JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
- public String call(Tuple2<String, String> tuple2) {
- return tuple2._2();
- }
- });
- lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
- public void call(JavaRDD<String> rdd) throws Exception {
- rdd.foreach(new VoidFunction<String>() {
- public void call(String s) throws Exception {
- System.out.println(s);
- }
- });
- }
- });
- // Start the computation
- jssc.start();
- jssc.awaitTermination();
- }
- }
二、spark-streaming-kafka-0-10_2.11-2.0.2.jar
1、pom.xml
<!--
  Dependencies for the spark-streaming-kafka-0-10 example.
  Scope is "compile" (the Maven default) rather than "runtime": the consumer
  class imports these packages directly, and runtime-scoped artifacts are not
  placed on the compile classpath, so the build would fail on the imports.
  When the job is submitted to a cluster that already ships Spark, spark-core
  and spark-streaming may be switched to "provided"; the Kafka connector must
  stay "compile" so that it is packaged with the application.
-->
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.0.2</version>
    <scope>compile</scope>
</dependency>
2、Kafka Consumer类
- package com.spark.main;
- import java.util.Arrays;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Map;
- import java.util.Set;
- import org.apache.kafka.clients.consumer.ConsumerRecord;
- import org.apache.kafka.common.serialization.StringDeserializer;
- import org.apache.spark.SparkConf;
- import org.apache.spark.api.java.JavaRDD;
- import org.apache.spark.api.java.function.Function;
- import org.apache.spark.api.java.function.VoidFunction;
- import org.apache.spark.streaming.Durations;
- import org.apache.spark.streaming.api.java.JavaDStream;
- import org.apache.spark.streaming.api.java.JavaInputDStream;
- import org.apache.spark.streaming.api.java.JavaPairInputDStream;
- import org.apache.spark.streaming.api.java.JavaStreamingContext;
- import org.apache.spark.streaming.kafka010.ConsumerStrategies;
- import org.apache.spark.streaming.kafka010.KafkaUtils;
- import org.apache.spark.streaming.kafka010.LocationStrategies;
- import kafka.serializer.StringDecoder;
- import scala.Tuple2;
- public class Kafka10Consumer{
- public static void main(String[] args) throws InterruptedException{
- /**
- * SparkConf sparkConf = new SparkConf().setAppName("KafkaConsumer").setMaster("local[2]");
- * setMaster("local[2]"),至少要指定两个线程,一条用于用于接收消息,一条线程用于处理消息
- * Durations.seconds(2)每两秒读取一次kafka
- */
- SparkConf sparkConf = new SparkConf().setAppName("Kafka10Consumer").setMaster("local[2]");
- JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(500));
- jssc.checkpoint("hdfs://192.168.168.200:9000/checkpoint/Kafka10Consumer");
- /**
- * 配置连接kafka的相关参数
- */
- Set<String> topicsSet = new HashSet<String>(Arrays.asList("TestTopic"));
- Map<String, Object> kafkaParams = new HashMap<String, Object>();
- kafkaParams.put("bootstrap.servers", "192.168.168.200:9092");
- kafkaParams.put("key.deserializer", StringDeserializer.class);
- kafkaParams.put("value.deserializer", StringDeserializer.class);
- kafkaParams.put("group.id", "Kafka10Consumer");
- kafkaParams.put("auto.offset.reset", "earliest");//earliest : 从最早开始;latest :从最新开始
- kafkaParams.put("enable.auto.commit", false);
- //通过KafkaUtils.createDirectStream(...)获得kafka数据,kafka相关参数由kafkaParams指定
- JavaInputDStream<ConsumerRecord<Object,Object>> messages = KafkaUtils.createDirectStream(
- jssc,
- LocationStrategies.PreferConsistent(),
- ConsumerStrategies.Subscribe(topicsSet, kafkaParams)
- );
- /**
- * _2()获取第二个对象的值
- */
- JavaDStream<String> lines = messages.map(new Function<ConsumerRecord<Object,Object>, String>() {
- @Override
- public String call(ConsumerRecord<Object, Object> consumerRecord) throws Exception {
- // TODO Auto-generated method stub
- return consumerRecord.value().toString();
- }
- });
- lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
- public void call(JavaRDD<String> rdd) throws Exception {
- rdd.foreach(new VoidFunction<String>() {
- public void call(String s) throws Exception {
- System.out.println(s);
- }
- });
- }
- });
- // Start the computation
- jssc.start();
- jssc.awaitTermination();
- }
- }
spark-streaming-kafka-0-8 和 0-10的使用区别的更多相关文章
- Spark Streaming + Kafka整合(Kafka broker版本0.8.2.1+)
这篇博客是基于Spark Streaming整合Kafka-0.8.2.1官方文档. 本文主要讲解了Spark Streaming如何从Kafka接收数据.Spark Streaming从Kafka接 ...
- Spark踩坑记——Spark Streaming+Kafka
[TOC] 前言 在WeTest舆情项目中,需要对每天千万级的游戏评论信息进行词频统计,在生产者一端,我们将数据按照每天的拉取时间存入了Kafka当中,而在消费者一端,我们利用了spark strea ...
- Spark Streaming+Kafka
Spark Streaming+Kafka 前言 在WeTest舆情项目中,需要对每天千万级的游戏评论信息进行词频统计,在生产者一端,我们将数据按照每天的拉取时间存入了Kafka当中,而在消费者一端, ...
- spark streaming kafka example
// scalastyle:off println package org.apache.spark.examples.streaming import kafka.serializer.String ...
- spark streaming - kafka updateStateByKey 统计用户消费金额
场景 餐厅老板想要统计每个用户来他的店里总共消费了多少金额,我们可以使用updateStateByKey来实现 从kafka接收用户消费json数据,统计每分钟用户的消费情况,并且统计所有时间所有用户 ...
- Spark踩坑记:Spark Streaming+kafka应用及调优
前言 在WeTest舆情项目中,需要对每天千万级的游戏评论信息进行词频统计,在生产者一端,我们将数据按照每天的拉取时间存入了Kafka当中,而在消费者一端,我们利用了spark streaming从k ...
- Spark streaming + Kafka 流式数据处理,结果存储至MongoDB、Solr、Neo4j(自用)
KafkaStreaming.scala文件 import kafka.serializer.StringDecoder import org.apache.spark.SparkConf impor ...
- IDEA Spark Streaming Kafka数据源-Consumer
import org.apache.spark.SparkConf import org.apache.spark.streaming.kafka.KafkaUtils import org.apac ...
- 4、spark streaming+kafka
一.Receiver模式 1. receiver模式原理图 在SparkStreaming程序运行起来后,Executor中会有receiver tasks接收kafka推送过来的数据.数据会被持久化 ...
- spark.streaming.kafka.maxRatePerPartition的理解
spark.streaming.kafka.maxRatePerPartition设定对目标topic每个partition每秒钟拉取的数据条数. 假设此项设为1,批次间隔为10s,目标topic只有 ...
随机推荐
- 下载从网页里面提取出来的图片(将url指向的图片下载并保存、从命名)
import os #创建文件夹 from urllib import request #下载图片 if not os.path.exists('文件夹名字'): #创建文件夹名字 os.mkdir( ...
- Python 带有参数的装饰器
def wrapper_out(flag): # 装饰器本身的参数 def wrapper(fn): # 目标函数 def inner(*args, **kwargs): # 目标函数执行需要的参数 ...
- Httpclient的学习(一)
1.名词解释 抓包: 抓包(packet capture)就是将网络传输发送与接收的数据包进行截获.重发.编辑.转存等操作,也用来检查网络安全.抓包也经常被用来进行数据截取等. Httpclient: ...
- J - FatMouse's Speed
p的思路不一定要到最后去找到ans:也可以设置成在中间找到ans:比如J - FatMouse's Speed 这个题,如果要是让dp[n]成为最终答案的话,即到了i,最差的情况也是dp[i-1],就 ...
- markdown使用问题
1.配置自定义的markdown.css https://github.com/sameer1994kiki/markdown-css 2.代码块 一行`` 多行 ``` <code>&l ...
- freeSSHd (Auth fail)错误!以及Xmanager的(ssh服务器拒绝了密码,请再试一次)错误!
参考文档:http://blog.csdn.net/zhangliang_571/article/details/45598939 (Auth fail) 以及(ssh服务器拒绝了密码,请再试一次) ...
- 02 http,servlet,servletconfig,HttpServletRequest ,HttpServletResponse
Http协议 协议:双方在交互.通讯的时候, 遵守的一种规范.规则.http协议:针对网络上的客户端 与 服务器端在执行http请求的时候,遵守的一种规范. 其实就是规定了客户端在访问服务器端的时候, ...
- hdu4965 Fast Matrix Calculation 矩阵快速幂
One day, Alice and Bob felt bored again, Bob knows Alice is a girl who loves math and is just learni ...
- mysql的utf8与utf8mb4 异同;utf8mb4_unicode_ci 与 utf8mb4_general_ci 如何选择
如图,一般使用如下配置 utf8mb4是4个字节.utf8是3个字节.utf8mb4兼容性更好,占用空间更大. 主要从排序准确性和性能两方面看: 准确性utf8mb4_unicode_ci 是基于标准 ...
- day 04 Java并发多线程
http://www.cnblogs.com/hellocsl/p/3969768.html?utm_source=tuicool&utm_medium=referralPS:而JVM 每遇到 ...