The Complete Kafka Operations Guide: Optimization, Monitoring, and Troubleshooting
broker.id=
listeners=SASL_PLAINTEXT://hostip:9092
#broker.id=
listeners=SASL_PLAINTEXT://:9092
zookeeper.connect=zkip1:2181,zkip2:2181,zkip3:2181/kafka
delete.topic.enable=true
# Timeout in ms for connecting to ZooKeeper
zookeeper.connection.timeout.ms=
zookeeper.session.timeout.ms=
controlled.shutdown.enable=true
#Important (see the fault-handling notes below)
unclean.leader.election.enable=true
auto.create.topics.enable=false
#Number of replica fetcher threads
num.replica.fetchers=
auto.leader.rebalance.enable=true
leader.imbalance.per.broker.percentage=
leader.imbalance.check.interval.seconds=
#Minimum size of a replica fetch request: 1 MB
replica.fetch.min.bytes=1048576
#Maximum size of a replica fetch request: 20 MB
replica.fetch.max.bytes=20971520
#Maximum wait time between replica fetch requests
replica.fetch.wait.max.ms=
#Socket timeout after which a replica drops out of the ISR
replica.socket.timeout.ms=
#replica.fetch.wait.max.ms=
#Socket receive buffer size
replica.socket.receive.buffer.bytes=
num.network.threads=
num.io.threads=
#Flush data to disk every 10,000 messages written by producers
log.flush.interval.messages=10000
#Flush data to disk every 1 second
log.flush.interval.ms=1000
socket.receive.buffer.bytes=
socket.send.buffer.bytes=
queued.max.requests=
sasl.enabled.mechanisms=PLAIN
sasl.mechanism.inter.broker.protocol=PLAIN
allow.everyone.if.no.acl.found=false
super.users=User:admin
authorizer.class.name = kafka.security.auth.SimpleAclAuthorizer
security.inter.broker.protocol=SASL_PLAINTEXT
nohup kafka-server-start.sh /usr/local/kafka/config/server.properties >/dev/null 2>&1 &
$KAFKA_HOME/bin/kafka-topics.sh --create --topic logstash-yarnnodelog --replication-factor --partitions --zookeeper zkip:2181/kafka
$KAFKA_HOME/bin/kafka-topics.sh --list --zookeeper zkip:2181
$KAFKA_HOME/bin/kafka-console-consumer.sh --zookeeper zkip:2181 --topic topic-test --from-beginning
kafka-console-consumer.sh --bootstrap-server brokerip:9092 --from-beginning --topic logstash --new-consumer --consumer.config=/opt/beh/core/kafka/config/consumer.properties
$KAFKA_HOME/bin/kafka-console-producer.sh --broker-list brokerip:9092 --topic topic-test
$KAFKA_HOME/bin/kafka-topics.sh --zookeeper zkip:2181 --delete --topic topic-test
$KAFKA_HOME/bin/kafka-topics.sh --describe --zookeeper zkip:2181/ --topic test20160807
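The same produce/consume smoke test can be scripted. Below is a minimal sketch that assumes the kafka-python client library is installed and that a broker is reachable at the placeholder address brokerip:9092:

# pip install kafka-python  (assumed client library; any Kafka client works)
from kafka import KafkaProducer, KafkaConsumer

BROKERS = ["brokerip:9092"]  # placeholder broker address

# Write a few test messages to topic-test
producer = KafkaProducer(bootstrap_servers=BROKERS)
for i in range(3):
    producer.send("topic-test", ("hello-%d" % i).encode("utf-8"))
producer.flush()
producer.close()

# Read them back from the beginning of the topic
consumer = KafkaConsumer(
    "topic-test",
    bootstrap_servers=BROKERS,
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,  # stop iterating once no new messages arrive
)
for record in consumer:
    print(record.topic, record.partition, record.offset, record.value)
consumer.close()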
zookeeper.connect=zkip1:2181,zkip2:2181,zkip3:2181
# Timeout in ms for connecting to ZooKeeper
zookeeper.connection.timeout.ms=
listeners=SASL_PLAINTEXT://:9092
security.inter.broker.protocol=SASL_PLAINTEXT
sasl.enabled.mechanisms=PLAIN
sasl.mechanism.inter.broker.protocol=PLAIN
auto.create.topics.enable=false
allow.everyone.if.no.acl.found=false
delete.topic.enable=true
super.users=User:admin
authorizer.class.name = kafka.security.auth.SimpleAclAuthorizer
KafkaServer {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="admin"
password="admin"
user_admin="admin"
user_hadoop="hadoop"
user_producer1="producer1_test"
user_consumer1="consumer1_test"
user_producer2="producer2_test"
user_consumer2="consumer2_test";
};
vi kafka_client_consumer_jaas.conf
KafkaClient {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="consumer1"
password="consumer1_test";
};
vi kafka_client_producer_jaas.conf
KafkaClient {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="producer1"
password="producer1_test";
};
Append the SASL client settings to producer.properties and consumer.properties:
echo security.protocol=SASL_PLAINTEXT >> producer.properties
echo sasl.mechanism=PLAIN >> producer.properties
echo security.protocol=SASL_PLAINTEXT >> consumer.properties
echo sasl.mechanism=PLAIN >> consumer.properties
vi producer.properties
security.protocol=SASL_PLAINTEXT
sasl.mechanism=PLAIN
vi consumer.properties
security.protocol=SASL_PLAINTEXT
sasl.mechanism=PLAIN
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_server_jaas.conf"
nohup kafka-server-start.sh /opt/beh/core/kafka/config/server.properties &
if [ "x$KAFKA_OPTS" ]; then
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_client_jaas.conf"
fi
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-Xmx512M"
fi
exec $(dirname $0)/kafka-run-class.sh kafka.tools.ConsoleProducer "$@"
vi kafka-console-consumer.sh
if [ "x$KAFKA_OPTS" = "x" ]; then
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_client_jaas.conf"
fi if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-Xmx512M"
fi
exec $(dirname $0)/kafka-run-class.sh kafka.tools.ConsoleConsumer "$@"
nohup kafka-server-start.sh /opt/beh/core/kafka/config/server.properties &
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:admin --operation ClusterAction --cluster --add (grant permission to update cluster metadata)
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:admin --cluster --add
$KAFKA_HOME/bin/kafka-topics.sh --create --topic topic-test1 --replication-factor --partitions --zookeeper localhost:2181
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --add --allow-principal User:Bob --allow-principal User:Alice --allow-host xxx.xx.xx. --allow-host xxx.xx.xx. --operation Read --operation Write --topic Test-topic
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=topic-test --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=test1 --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --consumer --topic=topic-test --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=topic-test1 --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --producer --topic=topic-test1 --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=topic-test1 --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=topic-test1 --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=* --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=* --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=* --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --topic=* --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=* --group=topic-test --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=* --group=* --add
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=data-rt-dev02:2181/kafka_test10 --remove --allow-principal User:Bob --allow-principal User:Alice --allow-host xxx.xx.xx. --allow-host xxx.xx.xx. --operation Read --operation Write --topic test
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181 User:hadoop
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181 --topic=topic-test1
$KAFKA_HOME/bin/kafka-console-producer.sh --broker-list broker1:9092 --topic topic-test --producer.config=/opt/beh/core/kafka/config/producer.properties
kafka-console-consumer.sh --bootstrap-server broker1:9092 --from-beginning --topic topic-test --new-consumer --consumer.config=/opt/beh/core/kafka/config/consumer.properties
put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"consumer1\" password=\"consumer1_test\";");
put("security.protocol", "SASL_PLAINTEXT");
put("sasl.mechanism", "PLAIN");
- Estimate the data volume: ask the development team to estimate, in advance, the total data a topic will accumulate over one full retention cycle.
- Calculate total disk capacity: for example, with 825 GB per disk, 20 disks per node, and 10 nodes, total capacity is 165,000 GB.
- Estimate the actual storage ratio: the full-cycle data volume of all topics as a percentage of total disk capacity. If it exceeds 60%, ask the development team to shorten the retention period.
- Count the total number of disks: 20 disks per node and 10 nodes gives 200 disks.
- Pre-partition sensibly: make the partition count an integer multiple of the total disk count. For example, if all topics together hold 50,000 GB and there are 200 disks, the total partition count could be 200, 400, or 600, with the exact figure depending on the business. At 400 partitions, each partition holds roughly 125 GB, so a topic such as cbss001 with an estimated 210 GB would be split into two partitions. With Kafka's replica placement policy this keeps storage as evenly balanced across the disks as possible (see the sketch after this list).
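The arithmetic behind these rules can be captured in a short script; the figures below are the example numbers from the list above, not measurements:

import math

DISK_GB = 825           # capacity of a single disk
DISKS_PER_NODE = 20
NODES = 10
TOTAL_DATA_GB = 50000   # estimated full-cycle data volume across all topics

total_capacity_gb = DISK_GB * DISKS_PER_NODE * NODES        # 165000 GB
total_disks = DISKS_PER_NODE * NODES                        # 200 disks
usage_ratio = TOTAL_DATA_GB / float(total_capacity_gb)      # ~0.30, under the 60% ceiling

# Total partitions as an integer multiple of the disk count
partitions_total = 2 * total_disks                          # 400
gb_per_partition = TOTAL_DATA_GB / float(partitions_total)  # 125 GB

# Partitions for one topic, e.g. cbss001 at an estimated 210 GB
topic_partitions = int(math.ceil(210 / gb_per_partition))   # 2

print(total_capacity_gb, total_disks, usage_ratio)
print(partitions_total, gb_per_partition, topic_partitions)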
- A failed disk can bring a node down; replace the bad disk promptly and restart the node.
- unclean.leader.election.enable: set to true on a topic it can cause duplicate consumption; set to false it can cause the broker to drop connections on port 9092 and the Kafka process to hang in a zombie state.
- Out-of-memory errors can prevent a node's replicas from rejoining the ISR.
- Process and open-file limits can also cause node errors; tuning values are given in the optimization notes that follow.
- A follower replica that cannot keep up with its leader times out and drops out of the ISR.
- Consumer offset out of range: first restart the node; if the error persists, locate the partition whose offset is out of range, delete a few messages, and check again, repeating until the error stops (a sketch for inspecting the valid offset range follows this list).
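For the offset-out-of-range case, here is a minimal sketch (assuming the kafka-python package) for inspecting a partition's valid offset range; the topic, partition number, and broker address are placeholders:

from kafka import KafkaConsumer, TopicPartition

tp = TopicPartition("logstash", 0)                 # partition suspected of the offset error
consumer = KafkaConsumer(bootstrap_servers=["brokerip:9092"])
consumer.assign([tp])

earliest = consumer.beginning_offsets([tp])[tp]
latest = consumer.end_offsets([tp])[tp]
print("valid offset range: [%d, %d)" % (earliest, latest))
consumer.close()

# A committed group offset outside this range raises OffsetOutOfRange on the consumer;
# resetting the offset into the range (or relying on auto.offset.reset) is the usual fix.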
vi topics-to-move.json
{"topics": [{"topic": "foo1"}, {"topic": "foo2"}], "version": }
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 --topics-to-move-json-file topics-to-move.json --broker-list "5,6" --generate
Current partition replica assignment
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]}
]
}
Proposed partition reassignment configuration
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo1","partition":,"replicas":[,]},
{"topic":"foo2","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]},
{"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]}
]
}
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file expand-cluster-reassignment.json --execute
Current partition replica assignment
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}
] }
Save this to use as the --reassignment-json-file option during rollback
Successfully started reassignment of partitions
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}
]
}
Verify the reassignment with --verify:
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file expand-cluster-reassignment.json --verify
Status of partition reassignment:
Reassignment of partition [foo1,] completed successfully
Reassignment of partition [foo2,] completed successfully
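If a reassignment plan has to be written by hand instead of generated, a short script can emit the JSON in the expected format. This is a sketch under assumed values: partitions 0 and 1 of foo1 and foo2, with target brokers 5 and 6 to match the --broker-list used above:

import json

target_brokers = [5, 6]   # assumed destination brokers
plan = {
    "version": 1,
    "partitions": [
        {"topic": topic, "partition": p, "replicas": target_brokers}
        for topic in ("foo1", "foo2")
        for p in (0, 1)   # assumed partition ids
    ],
}

with open("expand-cluster-reassignment.json", "w") as f:
    json.dump(plan, f, indent=2)
print(json.dumps(plan, indent=2))

The resulting file is what the --execute and --verify steps consume.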
log.retention.bytes (applied per partition, so a topic's total size limit = number of partitions * log.retention.bytes)
log.retention.minutes
Old data is deleted as soon as either log.retention.bytes or log.retention.minutes is reached, whichever comes first; a small worked example follows.
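For instance, sizing the per-partition limit from a per-topic budget (the figures are illustrative assumptions, not values from this cluster):

topic_limit_gb = 500   # desired cap for one topic (assumed figure)
partitions = 4         # partition count of that topic (assumed figure)

# log.retention.bytes is enforced per partition
log_retention_bytes = topic_limit_gb * 1024 ** 3 // partitions
print("log.retention.bytes = %d" % log_retention_bytes)   # 134217728000
# Old segments are also deleted once log.retention.minutes is exceeded,
# whichever of the two limits is reached first.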
kafka-configs.sh --zookeeper zkip1:2181 --describe --entity-type topics --entity-name CdrNormal
Configs for topics:CdrNormal are retention.ms=
#!/usr/bin/python
#_*_coding:utf-8_*_
# Nagios-style check: count the live brokers registered in ZooKeeper under /brokers/ids
import sys
import zookeeper

zk = zookeeper.init("zkip1:2181")
t = zookeeper.get_children(zk, "/brokers/ids")
d = 0
for i in t:
    d = d + 1
b = 16 - d   # 16 brokers are expected in this cluster
if d == 16:
    print "OK cb realtime kafka1: all broker nodes alive"
    sys.exit(0)
else:
    print "Critical cb realtime kafka1:", b, "broker node(s) down"
    sys.exit(2)
#!/usr/bin/python
#_*_coding:utf-8_*_
# Check disk usage on the Kafka nodes over SSH and report mount points above a threshold
# Usage: <script> <ssh_user> <ssh_password> <usage_percent_threshold>
import paramiko
import sys

hostname = ['IP1', 'IP2']
username = sys.argv[1]
password = sys.argv[2]
percent = sys.argv[3]
disk = {}
error = ""
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
for i in range(0, len(hostname)):
    ssh.connect(hostname[i], 22, username, password)
    # Collect the mount points whose Use% exceeds the threshold
    stdin, stdout, stderr = ssh.exec_command("df -TPh|awk '+$6>%s {print $7}'" % percent)
    path = stdout.readlines()
    disk[hostname[i]] = path
ssh.close()

if not disk:
    print("No cluster information was collected!")
    sys.exit()
else:
    for i in disk.keys():
        if not disk.get(i):
            continue
        else:
            error += "Node " + i + ": "
            for j in range(0, len(disk[i])):
                if j == len(disk[i]) - 1:
                    error += disk[i][j].encode('utf-8') + ". "
                else:
                    error += disk[i][j].encode('utf-8') + ", "
if not error:
    print("cb_rt_kafka data collection cluster: disk usage is normal")
    sys.exit()
else:
    print("cb_rt_kafka data collection cluster: %s disk usage exceeds %s%%" % (error.replace("\n", ""), percent))
    sys.exit()