[Spark Streaming Study Notes 4] Spark Streaming + Kafka: managing consumer offsets
Environment
Virtual machine: VMware 10
Linux version: CentOS-6.5-x86_64
Client: Xshell4
FTP: Xftp4
jdk1.8
scala-2.10.4 (depends on jdk1.8)
spark-1.6
The offsets of messages consumed from Kafka can be stored in ZooKeeper, MySQL, or HBase and managed actively by the application itself.
The following example stores and manages the offsets in ZooKeeper:
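All the state managed by the code below lives under one fixed path in ZooKeeper: one znode per partition, holding the committed offset as its value. For illustration (assuming a topic with three partitions), the layout used by the classes in this post looks like this:

/consumers/zhy/offsets/mytopic/0   -> committed offset of partition 0 (a JSON-encoded long)
/consumers/zhy/offsets/mytopic/1   -> committed offset of partition 1
/consumers/zhy/offsets/mytopic/2   -> committed offset of partition 2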
package manageoffset;

import java.util.Map;

import kafka.common.TopicAndPartition;
import manageoffset.getOffset.GetTopicOffsetFromKafkaBroker;
import manageoffset.getOffset.GetTopicOffsetFromZookeeper;

import org.apache.log4j.Logger;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class UseZookeeperManageOffset {
    /**
     * log4j logger; UseZookeeperManageOffset.class sets the class the log entries are attributed to.
     */
    static final Logger logger = Logger.getLogger(UseZookeeperManageOffset.class);

    public static void main(String[] args) {
        // Load the log4j configuration file so logging is available.
        ProjectUtil.loadLogConfig();
        logger.info("project is starting...");

        /**
         * Ask the Kafka cluster for the largest produced offset of each partition of the topic.
         */
        Map<TopicAndPartition, Long> topicOffsets = GetTopicOffsetFromKafkaBroker.getTopicOffsets("node1:9092,node2:9092,node3:9092", "mytopic");

        /**
         * Read from ZooKeeper the offset each partition's consumer last committed.
         */
        Map<TopicAndPartition, Long> consumerOffsets =
                GetTopicOffsetFromZookeeper.getConsumerOffsets("node3:2181,node4:2181,node5:2181", "zhy", "mytopic");

        /**
         * Merge the two offset maps. The idea is:
         * if ZooKeeper holds committed consumer offsets, those take precedence;
         * otherwise this consumer group is reading the topic for the first time,
         * so each partition starts from the latest offset in the topic.
         */
        if (null != consumerOffsets && consumerOffsets.size() > 0) {
            topicOffsets.putAll(consumerOffsets);
        }

        /**
         * Uncomment the block below to reset every partition's offset in topicOffsets
         * to 0, i.e. to consume the topic from the beginning.
         */
//        for (Map.Entry<TopicAndPartition, Long> item : topicOffsets.entrySet()) {
//            item.setValue(0L);
//        }

        /**
         * Build the Spark Streaming job and start consuming from the merged offsets.
         */
        JavaStreamingContext jsc = SparkStreamingDirect.getStreamingContext(topicOffsets, "zhy");
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}
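Concretely, suppose the broker query returns {partition 0 -> 1000, partition 1 -> 1200} (hypothetical numbers) while ZooKeeper only holds a committed offset for partition 0, say 850. After putAll the merged map is {partition 0 -> 850, partition 1 -> 1200}: partition 0 resumes where the group left off, and partition 1, which the group has never consumed, starts from its latest offset.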
package manageoffset.getOffset;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import com.google.common.collect.ImmutableMap;

import kafka.api.PartitionOffsetRequestInfo;
import kafka.cluster.Broker;
import kafka.common.TopicAndPartition;
import kafka.javaapi.OffsetRequest;
import kafka.javaapi.OffsetResponse;
import kafka.javaapi.PartitionMetadata;
import kafka.javaapi.TopicMetadata;
import kafka.javaapi.TopicMetadataRequest;
import kafka.javaapi.TopicMetadataResponse;
import kafka.javaapi.consumer.SimpleConsumer;

/**
 * Kafka must be running before this test is executed.
 * @author root
 */
public class GetTopicOffsetFromKafkaBroker {

    public static void main(String[] args) {
        Map<TopicAndPartition, Long> topicOffsets = getTopicOffsets("node1:9092,node2:9092,node3:9092", "mytopic");
        for (Entry<TopicAndPartition, Long> entry : topicOffsets.entrySet()) {
            TopicAndPartition topicAndPartition = entry.getKey();
            Long offset = entry.getValue();
            String topic = topicAndPartition.topic();
            int partition = topicAndPartition.partition();
            System.out.println("topic = " + topic + ", partition = " + partition + ", offset = " + offset);
        }
    }

    /**
     * Query the Kafka cluster for the latest produced offset of each partition of the topic.
     * @param kafkaBrokerServers comma-separated host:port broker list
     * @param topic the topic to query
     * @return map from (topic, partition) to the latest offset
     */
    public static Map<TopicAndPartition, Long> getTopicOffsets(String kafkaBrokerServers, String topic) {
        Map<TopicAndPartition, Long> retVals = new HashMap<TopicAndPartition, Long>();
        // Iterate over every broker.
        for (String broker : kafkaBrokerServers.split(",")) {
            // Query the broker with a SimpleConsumer.
            SimpleConsumer simpleConsumer = new SimpleConsumer(broker.split(":")[0], Integer.valueOf(broker.split(":")[1]), 64 * 10000, 1024, "consumer");
            TopicMetadataRequest topicMetadataRequest = new TopicMetadataRequest(Arrays.asList(topic));
            TopicMetadataResponse topicMetadataResponse = simpleConsumer.send(topicMetadataRequest);
            // Walk the returned topic metadata ...
            for (TopicMetadata metadata : topicMetadataResponse.topicsMetadata()) {
                // ... and the metadata of every partition.
                for (PartitionMetadata part : metadata.partitionsMetadata()) {
                    Broker leader = part.leader();
                    if (leader != null) {
                        TopicAndPartition topicAndPartition = new TopicAndPartition(topic, part.partitionId());
                        // LatestTime() requests the most recent offsets; they come back
                        // newest first, so offsets[0] below is the latest offset.
                        PartitionOffsetRequestInfo partitionOffsetRequestInfo = new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.LatestTime(), 10000);
                        OffsetRequest offsetRequest = new OffsetRequest(ImmutableMap.of(topicAndPartition, partitionOffsetRequestInfo), kafka.api.OffsetRequest.CurrentVersion(), simpleConsumer.clientId());
                        OffsetResponse offsetResponse = simpleConsumer.getOffsetsBefore(offsetRequest);
                        if (!offsetResponse.hasError()) {
                            long[] offsets = offsetResponse.offsets(topic, part.partitionId());
                            retVals.put(topicAndPartition, offsets[0]);
                        }
                    }
                }
            }
            simpleConsumer.close();
        }
        return retVals;
    }
}
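As a quick cross-check of what getTopicOffsets returns, Kafka 0.8 also ships a small command-line tool (a sketch assuming a stock 0.8 install; adjust the path to your layout). Passing --time -1 requests the latest offsets, matching kafka.api.OffsetRequest.LatestTime() above:

bin/kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list node1:9092,node2:9092,node3:9092 --topic mytopic --time -1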
package manageoffset.getOffset;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.RetryUntilElapsed;

import com.fasterxml.jackson.databind.ObjectMapper;

import kafka.common.TopicAndPartition;

public class GetTopicOffsetFromZookeeper {

    /**
     * Read from ZooKeeper the offset each partition's consumer last committed.
     * @param zkServers comma-separated ZooKeeper connect string
     * @param groupID the consumer group
     * @param topic the topic
     * @return map from (topic, partition) to the committed offset
     */
    public static Map<TopicAndPartition, Long> getConsumerOffsets(String zkServers, String groupID, String topic) {
        Map<TopicAndPartition, Long> retVals = new HashMap<TopicAndPartition, Long>();
        ObjectMapper objectMapper = new ObjectMapper();
        CuratorFramework curatorFramework = CuratorFrameworkFactory.builder()
                .connectString(zkServers)
                .connectionTimeoutMs(1000)
                .sessionTimeoutMs(10000)
                .retryPolicy(new RetryUntilElapsed(1000, 1000))
                .build();
        curatorFramework.start();
        try {
            String nodePath = "/consumers/" + groupID + "/offsets/" + topic;
            if (curatorFramework.checkExists().forPath(nodePath) != null) {
                // One child znode per partition; each holds the committed offset.
                List<String> partitions = curatorFramework.getChildren().forPath(nodePath);
                for (String partition : partitions) {
                    int partitionId = Integer.valueOf(partition);
                    Long offset = objectMapper.readValue(curatorFramework.getData().forPath(nodePath + "/" + partition), Long.class);
                    TopicAndPartition topicAndPartition = new TopicAndPartition(topic, partitionId);
                    retVals.put(topicAndPartition, offset);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            curatorFramework.close();
        }
        return retVals;
    }

    public static void main(String[] args) {
        Map<TopicAndPartition, Long> consumerOffsets = getConsumerOffsets("node3:2181,node4:2181,node5:2181", "zhy", "mytopic");
        for (Entry<TopicAndPartition, Long> entry : consumerOffsets.entrySet()) {
            TopicAndPartition topicAndPartition = entry.getKey();
            String topic = topicAndPartition.topic();
            int partition = topicAndPartition.partition();
            Long offset = entry.getValue();
            System.out.println("topic = " + topic + ", partition = " + partition + ", offset = " + offset);
        }
    }
}
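The committed offsets can also be inspected by hand with ZooKeeper's own shell (a sketch assuming the stock zkCli.sh from the ZooKeeper distribution). The value of each partition znode is the Jackson-serialized long written by SparkStreamingDirect below, i.e. plain ASCII digits:

zkCli.sh -server node3:2181
ls /consumers/zhy/offsets/mytopic
get /consumers/zhy/offsets/mytopic/0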
package manageoffset;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.RetryUntilElapsed;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import com.fasterxml.jackson.databind.ObjectMapper;

public class SparkStreamingDirect {

    /**
     * Build a direct stream that starts reading Kafka from the given offsets.
     * @param topicOffsets starting offset for each (topic, partition)
     * @param groupID consumer group under which offsets are committed to ZooKeeper
     * @return the streaming context
     */
    public static JavaStreamingContext getStreamingContext(Map<TopicAndPartition, Long> topicOffsets, final String groupID) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkStreamingOnKafkaDirect");
        // Throttle each partition to at most 10 records per second.
        conf.set("spark.streaming.kafka.maxRatePerPartition", "10");
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
        jsc.checkpoint("/checkpoint");

        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "node1:9092,node2:9092,node3:9092");
        // The direct stream does not commit offsets itself; "group.id" here is only a
        // Kafka client setting. The offsets below are stored in ZooKeeper under groupID.
        kafkaParams.put("group.id", "MyFirstConsumerGroup");

        for (Map.Entry<TopicAndPartition, Long> entry : topicOffsets.entrySet()) {
            System.out.println(entry.getKey().topic() + "\t" + entry.getKey().partition() + "\t" + entry.getValue());
        }

        JavaInputDStream<String> message = KafkaUtils.createDirectStream(
                jsc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                String.class,
                kafkaParams,
                topicOffsets,
                new Function<MessageAndMetadata<String, String>, String>() {
                    private static final long serialVersionUID = 1L;

                    public String call(MessageAndMetadata<String, String> v1) throws Exception {
                        return v1.message();
                    }
                }
        );

        final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();

        // Capture each batch's offset ranges before any downstream transformation.
        JavaDStream<String> lines = message.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
                OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                offsetRanges.set(offsets);
                return rdd;
            }
        });

        // Register foreachRDD on the transformed stream (not on message), so the
        // transform above has already populated offsetRanges for the current batch.
        lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaRDD<String> t) throws Exception {
                ObjectMapper objectMapper = new ObjectMapper();
                CuratorFramework curatorFramework = CuratorFrameworkFactory.builder()
                        .connectString("node3:2181,node4:2181,node5:2181")
                        .connectionTimeoutMs(1000)
                        .sessionTimeoutMs(10000)
                        .retryPolicy(new RetryUntilElapsed(1000, 1000))
                        .build();
                curatorFramework.start();
                for (OffsetRange offsetRange : offsetRanges.get()) {
                    long fromOffset = offsetRange.fromOffset();   // first offset of the batch
                    long untilOffset = offsetRange.untilOffset(); // offset after the last message of the batch
                    final byte[] offsetBytes = objectMapper.writeValueAsBytes(offsetRange.untilOffset());
                    String nodePath = "/consumers/" + groupID + "/offsets/" + offsetRange.topic() + "/" + offsetRange.partition();
                    System.out.println("nodePath = " + nodePath);
                    System.out.println("fromOffset = " + fromOffset + ", untilOffset = " + untilOffset);
                    // Record the consumed offset of this topic partition in ZooKeeper.
                    if (curatorFramework.checkExists().forPath(nodePath) != null) {
                        curatorFramework.setData().forPath(nodePath, offsetBytes);
                    } else {
                        curatorFramework.create().creatingParentsIfNeeded().forPath(nodePath, offsetBytes);
                    }
                }
                curatorFramework.close();
            }
        });

        lines.print();
        return jsc;
    }
}
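Two things are worth noting about this class. First, the batch size is bounded: with spark.streaming.kafka.maxRatePerPartition = 10 and 5-second batches, each partition contributes at most 10 × 5 = 50 records per batch, so a topic with, say, three partitions (an assumption for illustration) never feeds more than 150 records into one batch. Second, the offset commit and the downstream output (here lines.print()) are separate output operations, so a failure between them can leave the two out of sync; downstream processing must tolerate replayed or skipped batches, and the pattern is not exactly-once by itself.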
package manageoffset;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

public class ProjectUtil {
    /**
     * Logger configured via log4j.
     */
    static final Logger logger = Logger.getLogger(UseZookeeperManageOffset.class);

    /**
     * Load the log4j.properties configuration. By default log4j reads it from the
     * classpath (src); if the file lives elsewhere it has to be loaded by hand.
     */
    public static void loadLogConfig() {
        PropertyConfigurator.configure("d:/eclipse4.7WS/SparkStreaming_Kafka_Manage/resource/log4j.properties");
    }

    /**
     * Load config.properties; the directory containing it must be marked as a resource folder.
     * @return the loaded properties (empty if the file is missing)
     */
    public static Properties loadProperties() {
        Properties props = new Properties();
        InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("config.properties");
        if (null != inputStream) {
            try {
                props.load(inputStream);
            } catch (IOException e) {
                logger.error("Failed to load config.properties from the classpath", e);
            }
        }
        return props;
    }

    public static void main(String[] args) {
        Properties props = loadProperties();
        String value = props.getProperty("hello");
        System.out.println(value);
    }
}
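For completeness, a minimal log4j.properties that loadLogConfig() could point at (a sketch, not the author's actual file):

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1} - %m%n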