Spark Streaming + Kafka stream processing, with results stored to MongoDB, Solr and Neo4j (personal notes)
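The code below relies on several third-party libraries. A minimal build.sbt sketch is given here for reference; the exact versions and the Spark 1.x / Kafka 0.8 artifact coordinates are assumptions based on the direct-stream API used in this post, so adjust them to your own cluster.
// build.sbt — dependency sketch (versions are assumptions, adjust as needed)
name := "kafka-streaming-mongo-solr-neo4j"
scalaVersion := "2.10.6"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming" % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3", // Kafka 0.8 direct stream
  "org.mongodb" %% "casbah" % "3.1.1", // MongoDB Scala driver (Casbah)
  "org.json4s" %% "json4s-jackson" % "3.2.11",
  "com.alibaba" % "fastjson" % "1.2.47",
  "org.neo4j.driver" % "neo4j-java-driver" % "1.4.5", // Bolt driver, v1 API
  "org.scalaj" %% "scalaj-http" % "2.3.0"
)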
KafkaStreaming.scala
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{KafkaManagerAdd, KafkaUtils}
import org.json4s.DefaultFormats
import org.json4s.jackson.Json
import com.mongodb.casbah.{MongoClient, MongoClientURI, MongoCollection}
import scala.collection.mutable.ArrayBuffer
/**
* Created by zty on 2017/12/20.
*/
object KafkaStreaming {
// MongoDB collection written to by the data manager (replace IP, database name and collection name with your own)
var journalArticleClass = new JournalArticleDataManagerAdd(MongoClient(MongoClientURI("mongodb://IP:27017"))("databaseName")("collectionName"))
def main(args: Array[String]): Unit = {
run()
}
def run(): Unit = {
// Kafka topic name
val topicsJournalArticle = "JouArt"
// Kafka broker list
val brokers = "IP1:9092,IP2:9092,IP3:9092"
// Spark configuration: cap the ingest rate at 5 messages per second per partition
val sparkconf = new SparkConf().setAppName("kafkastreaming").set("spark.streaming.kafka.maxRatePerPartition", "5")
val ssc = new StreamingContext(sparkconf, Seconds(30))
val topicsJournalArticleSet = topicsJournalArticle.split(",").toSet
val journalArticlekafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "group.id" -> "journalArticledataManager",
"fetch.message.max.bytes" -> "20971520", "auto.offset.reset" -> "smallest")
val journalArticleManager = new KafkaManagerAdd(journalArticlekafkaParams)
val jsonsjournalArticleLines = journalArticleManager.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, journalArticlekafkaParams, topicsJournalArticleSet)
// update the consumed offsets in ZooKeeper after each batch
jsonsjournalArticleLines.foreachRDD(rdd => {
journalArticleManager.updateZKOffsets(rdd)
})
val jsonsjournalArticle = jsonsjournalArticleLines.map(_._2)
val arrayjournalArticle = ArrayBuffer[String]()
jsonsjournalArticle.foreachRDD(rdd => {
// pull the whole batch back to the driver and buffer it
rdd.collect().foreach(x => {
arrayjournalArticle += x
})
kafkaProducerSendJournalArticle(arrayjournalArticle)
arrayjournalArticle.clear()
})
ssc.start()
ssc.awaitTermination()
}
def kafkaProducerSendJournalArticle(args: ArrayBuffer[String]) {
if (args.nonEmpty) {
args.foreach(line => {
val json: Option[Any] = scala.util.parsing.json.JSON.parseFull(line.toString)
val maps: List[Any] = json.get.asInstanceOf[List[Any]]
maps.foreach(langMap => {
val listJson: Map[String, Any] = langMap.asInstanceOf[Map[String, Any]]
val jsonOut = Json(DefaultFormats).write(listJson)
if (jsonOut != null && jsonOut.nonEmpty) {
journalArticleClass.MongoDBJournalArticleAdd(jsonOut)
journalArticleClass.Neo4jSolrJournalArticleAdd()
}
})
})
}
}
}
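For local testing, a minimal producer sketch is shown below. It assumes the kafka-clients library is on the classpath and that each message body is a JSON array of article objects, which is what kafkaProducerSendJournalArticle parses; the topic and broker placeholders mirror the ones above.
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
// Test producer sketch: publishes one JSON-array message to the JouArt topic.
object JouArtTestProducer {
def main(args: Array[String]): Unit = {
val props = new Properties()
props.put("bootstrap.servers", "IP1:9092,IP2:9092,IP3:9092")
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
val producer = new KafkaProducer[String, String](props)
// The streaming job expects each message body to be a JSON array of article objects.
val payload = """[{"title":{"title":"example article","language":"eng"}}]"""
producer.send(new ProducerRecord[String, String]("JouArt", payload))
producer.close()
}
}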
JournalArticleDataManagerAdd.scala
import java.text.SimpleDateFormat
import java.util.Date
import org.joda.time.DateTime
import com.mongodb.DBObject
import com.mongodb.casbah.{MongoClient, MongoClientURI, MongoCollection}
import com.mongodb.util.JSON
import org.neo4j.driver.v1.{AuthTokens, GraphDatabase, StatementResult}
/**
* Created by zty on 2017/02/01.
*/
class JournalArticleDataManagerAdd (collectionString : MongoCollection) {
// helper to connect to a MongoDB collection
def createDatabase(url: String, dbName: String, coll: String): MongoCollection = {
MongoClient(MongoClientURI(url))(dbName)(coll)
}
// Shared state: set by MongoDBJournalArticleAdd and reused by Neo4jSolrJournalArticleAdd
var jsonString = ""
def MongoDBJournalArticleAdd(JsonString: String): Unit = {
jsonString = JsonString
try{
val bson: DBObject = JSON.parse(jsonString).asInstanceOf[DBObject]
collectionString.insert(bson)
}catch {
case e: ArithmeticException => println(e)
case ex: Throwable =>println(ex)
}
}
// write the current document to Neo4j and Solr
def Neo4jSolrJournalArticleAdd(): Unit = {
val driver = GraphDatabase.driver("bolt://IP:7687", AuthTokens.basic("neo4j", "******"))
val session = driver.session
try {
// parse the stored JSON string into a fastjson object
var json = com.alibaba.fastjson.JSON.parseObject(jsonString)
// ID
var GUID = json.getJSONObject("_id").get("$oid")
// title
var titleObject = json.getJSONObject("title")
var titlevalue = titleObject.get("title")
// document language
var language = titleObject.get("language")
var languagevalue=if(language!=null) language.toString else ""
// alternative-language titles
var title_alternative = json.getJSONArray("title_alternative")
var title_alternativevalue=""
for( a <-0 to title_alternative.toArray.length-1){
var item=title_alternative.toArray.apply(a)
title_alternativevalue += com.alibaba.fastjson.JSON.parseObject(item.toString).get("title_alternative") + ","
}
if(title_alternativevalue!="") title_alternativevalue=title_alternativevalue.substring(0,title_alternativevalue.length-1) else title_alternativevalue=""
// first contributor id
var first_id = json.get("first_contributer_id")
if(first_id==null) first_id="" else first_id=first_id.toString
// contributors
var contributer_meta = json.getJSONArray("contributer_meta")
var contributer_metavalue = ""
var contributer_idvalue = ""
var contributer_orgvalue = ""
var contributer_org_idvalue = ""
var conid = ""
var conorgid = ""
for (a <- 0 to contributer_meta.toArray.length - 1) {
var item = contributer_meta.toArray.apply(a)
var itemJson = com.alibaba.fastjson.JSON.parseObject(item.toString)
contributer_metavalue += itemJson.get("contributer_name") + ","
var contributer_id = itemJson.getJSONArray("contributer_URI").toArray
if(contributer_id.length != 0){
conid = com.alibaba.fastjson.JSON.parseObject(contributer_id.apply(0).toString).get("contributer_URI").toString
if (conid.length!=0) contributer_idvalue += conid.substring(7,conid.length) + "','" else contributer_idvalue += "','"}
var organization_list = itemJson.getJSONArray("organization_list")
for (b <- 0 to organization_list.toArray.length - 1) {
var list = organization_list.toArray.apply(b)
contributer_orgvalue += com.alibaba.fastjson.JSON.parseObject(list.toString).get("name") + ","
var contributer_org_id = com.alibaba.fastjson.JSON.parseObject(list.toString).getJSONArray("organization_URI").toArray
if(contributer_org_id.length != 0){
conorgid = contributer_org_id.apply(0).toString
if (conorgid.length!=0) contributer_org_idvalue += conorgid.substring(13,conorgid.length) + "','" else contributer_org_idvalue += "','"}
}
}
if(contributer_metavalue!="") contributer_metavalue = contributer_metavalue.substring(0, contributer_metavalue.length - 1) else contributer_metavalue=""
if(contributer_idvalue!="") contributer_idvalue = "['"+contributer_idvalue.substring(0, contributer_idvalue.length - 2)+"]" else contributer_idvalue="[]"
if(contributer_orgvalue!="") contributer_orgvalue = contributer_orgvalue.substring(0, contributer_orgvalue.length - 1) else contributer_orgvalue=""
if(contributer_org_idvalue!="") contributer_org_idvalue = "['"+contributer_org_idvalue.substring(0, contributer_org_idvalue.length - 2)+"]" else contributer_org_idvalue="[]"
// abstract
var abstractvalue = json.getJSONObject("abstractvalue").get("abstractvalue")
var abstractvaluevalue=if(abstractvalue==null) "" else abstractvalue.toString
// alternative-language abstracts
var abstract_alternative = json.getJSONArray("abstract_alternative")
var abstract_alternativevalue=""
for( a <-0 to abstract_alternative.toArray.length-1){
var item=abstract_alternative.toArray.apply(a)
abstract_alternativevalue += com.alibaba.fastjson.JSON.parseObject(item.toString).get("abstract_alternative") + ","
}
if(abstract_alternativevalue!="") abstract_alternativevalue=abstract_alternativevalue.substring(0,abstract_alternativevalue.length-1) else abstract_alternativevalue=""
// subject container: CLC classification and keywords
val subject_list = json.getJSONObject("subject_meta").getJSONArray("subject_list")
var CLCtitle=""
var CLCcode=""
var keywords=""
for (a <- 0 to subject_list.toArray.length - 1) {
var item = com.alibaba.fastjson.JSON.parseObject(subject_list.toArray.apply(a).toString)
var source=item.get("source")
var types=item.get("type")
if(source=="CLC" || source=="clc"){
CLCtitle += item.get("subject_title") + ","
CLCcode += item.get("subject_code") + ","
}
if(types=="Keyword" || types=="keyword") keywords+= item.get("subject_title") + ","
}
if(CLCtitle!="") CLCtitle=CLCtitle.substring(0,CLCtitle.length-1) else CLCtitle=""
if(CLCcode!="") CLCcode=CLCcode.substring(0,CLCcode.length-1) else CLCcode=""
if(keywords!="") keywords=keywords.substring(0,keywords.length-1) else keywords=""
// funding projects
var funding_list = json.getJSONObject("funding_list").getJSONArray("funding_meta")
var funding_listvalue=""
var funding_list_idvalue=""
var funid = ""
for( a <-0 to funding_list.toArray.length-1){
var item=com.alibaba.fastjson.JSON.parseObject(funding_list.toArray.apply(a).toString)
funding_listvalue += item.get("title") + ","
funding_list_idvalue += item.get("_id") +"','"
}
if(funding_listvalue!="") funding_listvalue=funding_listvalue.substring(0,funding_listvalue.length-1) else funding_listvalue=""
if(funding_list_idvalue!="") funding_list_idvalue = "['"+funding_list_idvalue.substring(0, funding_list_idvalue.length - 2)+"]" else funding_list_idvalue="[]"
// indexing (holding) categories
var holding_meta = json.getJSONArray("holding_meta").toArray
var holding_metavalue=""
for( a <-0 to holding_meta.length-1){
var item=holding_meta.apply(a)
holding_metavalue += com.alibaba.fastjson.JSON.parseObject(item.toString).get("holding_code") + ","
}
if(holding_metavalue!="") holding_metavalue=holding_metavalue.substring(0,holding_metavalue.length-1) else holding_metavalue=""
// journal id
var journal_id=json.get("journal_URI")
if(journal_id==null) journal_id="[]" else journal_id="['"+journal_id.toString.substring(8,journal_id.toString.length)+"']"
// journal title
var journal_title=json.get("journal_title")
if(journal_title==null) journal_title="" else journal_title=journal_title.toString
// publication year
var publication_year=json.get("publication_year")
if(publication_year==null) publication_year="" else publication_year=publication_year.toString
// volume
var volume=json.get("volume")
if(volume==null) volume="" else volume=volume.toString
// issue
var issue=json.get("issue")
if(issue==null) issue="" else issue=issue.toString
// publication date
var publication_datevalue=json.getJSONObject("publication_date")
val dateTimep = new DateTime(publication_datevalue.get("$date").asInstanceOf[Number].longValue).toString("yyyy-MM-dd")
var publication_date = "0001-01-01T00:00:00Z"
if(dateTimep!=null) publication_date=dateTimep+ "T00:00:00Z"
// format the current time in the Solr Date format
val now: Date = new Date()
val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
val date = dateFormat.format(now)
val timeFormat: SimpleDateFormat = new SimpleDateFormat("HH:mm:ssZ")
val time = timeFormat.format(now)
var createTime=date+"T"+time+"Z"
// build the JSON string stored into Solr
var text = ""
var data = "{'id':'"+GUID.toString+
"','text':'"+text+
"','title':'"+titlevalue.toString+
"','title_alternative':'"+title_alternativevalue+
"','first_contributer_id':'"+first_id.toString+
"','contributer_id':"+contributer_idvalue+
",'contributer_name':'"+contributer_metavalue+
"','contributer_org_id':"+contributer_org_idvalue+
",'contributer_org':'"+contributer_orgvalue+
"','abstractvalue':'"+abstractvaluevalue+
"','abstract_alternative':'"+abstract_alternativevalue+
"','funding_list_id':"+funding_list_idvalue+
",'funding_list':'"+funding_listvalue+
"','holding_code':'"+holding_metavalue+
"','journal_id':"+journal_id.toString+
",'journal_title':'"+journal_title.toString+
"','volume':'"+volume+
"','issue':'"+issue+
"','CLCcode':'"+CLCcode+
"','CLCtitle':'"+CLCtitle+
"','keywords':'"+keywords+
"','language':'"+languagevalue+
"','publication_year':'"+publication_year+
"','publication_date':'"+publication_date+
"','createTime':'"+ createTime+
"','updateTime':'"+createTime+"'}"
var zty = new SolrAdd()
zty.postToSolr("JournalArticle", data)
// store the node into Neo4j
val script = s"CREATE (:journalArticle {guid:'" + GUID +
"',title:'"+titlevalue.toString+
"',title_alternative:'"+title_alternativevalue+
"',contributer_name:'"+contributer_metavalue+
"',contributer_org:'"+contributer_orgvalue+
"',abstractvalue:'"+abstractvaluevalue+
"',abstract_alternative:'"+abstract_alternativevalue+
"',funding_list:'"+funding_listvalue+
"',holding_code:'"+holding_metavalue+
"',journal_title:'"+journal_title.toString+
"',volume:'"+volume+
"',issue:'"+issue+
"',CLCcode:'"+CLCcode+
"',CLCtitle:'"+CLCtitle+
"',keywords:'"+keywords+
"',language:'"+languagevalue+
"',publication_year:'"+publication_year+
"',publication_date:'"+publication_date+
"'})"
val result: StatementResult = session.run(script)
result.consume().counters().nodesCreated()
session.close()
driver.close()
}catch {
case e: ArithmeticException => println(e)
case ex: Throwable =>println(ex)
}
}
}
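The Cypher statement above is assembled by string concatenation, so a single quote inside any field value will break the query. A safer variant, sketched below as an assumption on the same Neo4j Java driver v1 API (Neo4j 3.1+ parameter syntax, only a few fields shown), passes the values as query parameters instead:
import org.neo4j.driver.v1.{AuthTokens, GraphDatabase, Values}
// Sketch: parameterized CREATE, unaffected by quotes in the data (subset of fields shown).
object Neo4jParameterizedAddSketch {
def addArticle(guid: String, title: String, journalTitle: String): Unit = {
val driver = GraphDatabase.driver("bolt://IP:7687", AuthTokens.basic("neo4j", "******"))
val session = driver.session
try {
session.run(
"CREATE (:journalArticle {guid: $guid, title: $title, journal_title: $journalTitle})",
Values.parameters("guid", guid, "title", title, "journalTitle", journalTitle))
} finally {
session.close()
driver.close()
}
}
}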
KafkaManagerAdd.scala
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import scala.reflect.ClassTag
// Note: KafkaCluster is private[spark], so this file must be compiled inside the
// org.apache.spark.streaming.kafka package (as the import in KafkaStreaming.scala implies).
/**
* Created by zty.
*/
class KafkaManagerAdd(val kafkaParams: Map[String, String]) extends Serializable {
private val kc = new KafkaCluster(kafkaParams)
/**
* Create a direct Kafka stream
* @param ssc
* @param kafkaParams
* @param topics
* @tparam K
* @tparam V
* @tparam KD
* @tparam VD
* @return
*/
def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
ssc: StreamingContext,
kafkaParams: Map[String, String],
topics: Set[String]): InputDStream[(K, V)] = {
val groupId = kafkaParams.get("group.id").get
// before reading offsets from ZooKeeper, reconcile them with what is actually available in Kafka
setOrUpdateOffsets(topics, groupId)
// consume messages starting from the offsets stored in ZooKeeper
val messages = {
val partitionsE = kc.getPartitions(topics)
if (partitionsE.isLeft)
throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
val partitions = partitionsE.right.get
val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
if (consumerOffsetsE.isLeft)
throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
val consumerOffsets = consumerOffsetsE.right.get
KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
}
messages
}
/**
* Before creating the stream, reconcile the consumer offsets with what has actually been consumed
* @param topics
* @param groupId
*/
private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
topics.foreach(topic => {
var hasConsumed = true
val partitionsE = kc.getPartitions(Set(topic))
if (partitionsE.isLeft)
throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
val partitions = partitionsE.right.get
val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
if (consumerOffsetsE.isLeft) hasConsumed = false
if (hasConsumed) { // this group has consumed before
/**
* If the streaming job throws kafka.common.OffsetOutOfRangeException, the offsets saved in
* ZooKeeper are stale: Kafka's retention policy has already deleted the log segments that
* contained them. In that case, compare the ZooKeeper consumerOffsets with the
* earliestLeaderOffsets; if consumerOffsets are smaller, they are out of date and must be
* reset to earliestLeaderOffsets.
*/
val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
if (earliestLeaderOffsetsE.isLeft)
throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
val consumerOffsets = consumerOffsetsE.right.get
// possibly only some partitions have stale consumerOffsets, so reset only those to earliestLeaderOffsets
var offsets: Map[TopicAndPartition, Long] = Map()
consumerOffsets.foreach({ case(tp, n) =>
val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
if (n < earliestLeaderOffset) {
println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
" offsets已经过时,更新为" + earliestLeaderOffset)
offsets += (tp -> earliestLeaderOffset)
}
})
if (offsets.nonEmpty) {
kc.setConsumerOffsets(groupId, offsets)
}
} else { // this group has never consumed from these topics
val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
if (reset == Some("smallest")) {
val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
if (leaderOffsetsE.isLeft)
throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}")
leaderOffsets = leaderOffsetsE.right.get
} else {
val leaderOffsetsE = kc.getLatestLeaderOffsets(partitions)
if (leaderOffsetsE.isLeft)
throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}")
leaderOffsets = leaderOffsetsE.right.get
}
val offsets = leaderOffsets.map {
case (tp, offset) => (tp, offset.offset)
}
kc.setConsumerOffsets(groupId, offsets)
}
})
}
/**
* Update the consumer offsets stored in ZooKeeper
* @param rdd
*/
def updateZKOffsets(rdd: RDD[(String, String)]) : Unit = {
val groupId = kafkaParams.get("group.id").get
val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
for (offsets <- offsetsList) {
val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
if (o.isLeft) {
println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
}
}
}
}
SolrAdd.scala
import scalaj.http.Http
// POST data to Solr
//Author:zty
class SolrAdd () {
// dataType: name of the Solr collection; jsonString: the document as a JSON string
def postToSolr (dataType:String, jsonString: String): Unit = {
var data = "{'add':{ 'doc':" + jsonString + ",'boost':1.0,'overwrite':true,'commitWithin':1000}}"
val result = Http("http://IP:8985/solr/"+dataType+"/update?wt=json")
.postData(data)
.header("Content-Type", "application/json").asString
println(result)
}
}
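For completeness, a usage sketch of SolrAdd; the collection name JournalArticle matches the call in JournalArticleDataManagerAdd, while the document fields here are an illustrative assumption.
// Usage sketch: push one document into the JournalArticle collection.
object SolrAddExample {
def main(args: Array[String]): Unit = {
val solr = new SolrAdd()
val doc = "{'id':'5a3f0000000000000000abcd','title':'example title','language':'eng'}"
solr.postToSolr("JournalArticle", doc)
}
}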