Spark New Word Discovery

The Spark job below discovers new words (out-of-vocabulary terms) in a Chinese corpus by combining two statistics: cohesion, measured as the pointwise mutual information (PMI) between the two halves of a candidate string, and freedom, measured as the information entropy of the characters immediately to the left and right of the candidate. Candidates whose PMI and entropy clear the thresholds passed on the command line (`path maxLen pmi info shuffle_count [save_path]`) are written to HDFS together with their frequency and document frequency.
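Before the full job, here is a minimal, self-contained sketch (plain Scala, no Spark) of how the two scores are computed for a single candidate. The corpus string, candidate word, and object name are invented purely for illustration; they are not part of the job below.

```scala
// Toy illustration of the two scores used by the job (not the job's own utilities):
// - neighbor entropy H = -sum(p * ln p) over the characters adjacent to a candidate
// - PMI = ln( freq(w) * N / (freq(left) * freq(right)) ) for a split of w into left + right
object NewWordScoresDemo {
  def main(args: Array[String]): Unit = {
    val corpus = "南京市长江大桥通车南京市长江大桥建成" // invented toy corpus
    val w = "长江大桥"                                  // candidate word

    // frequency of a substring in the corpus
    def count(s: String): Int = corpus.sliding(s.length).count(_ == s)

    // right-neighbor entropy of w
    val rightNeighbors = corpus.sliding(w.length + 1)
      .filter(_.startsWith(w)).map(_.last).toSeq
    val total = rightNeighbors.size.toDouble
    val entropy = rightNeighbors.groupBy(identity).values
      .map(g => g.size / total)
      .map(p => -p * math.log(p))
      .sum

    // PMI for the split 长江 | 大桥
    val n = corpus.length
    val pmi = math.log(count(w).toDouble * n / (count("长江") * count("大桥")))

    println(s"entropy = $entropy, pmi = $pmi")
  }
}
```

The job below computes the same quantities at corpus scale with Spark RDDs and DataFrames, keeping only candidates whose PMI and entropy exceed the command-line thresholds.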
```scala
package com.iclick.spark.wordSegment

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import com.iclick.spark.wordSegment.util.CounterMap
import scala.collection.mutable.ArrayBuffer
import com.google.common.collect.Maps
import java.text.SimpleDateFormat
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable.Map
import com.iclick.spark.wordSegment.util.AtomsUitl
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SaveMode
import com.iclick.spark.wordSegment.util.ConterHashSet
import org.apache.commons.lang.StringUtils
import com.mysql.jdbc.Driver
// Input on HDFS, e.g. /tmp/yuming/webtable/ds=16-04-17
object WordSegment {
  def main(args: Array[String]): Unit = {
    // Silence unnecessary logging
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // master (cluster) configuration
    if (args.length < 5) {
      System.err.println("Usage: path maxLen pmi info shuffle_count [save_path]")
      System.exit(1)
    }
    val path = args(0).toString
    val maxLen = args(1).toInt
    val pmi = args(2).toDouble
    val info = args(3).toDouble
    val shuffle_count = args(4).toInt
    val save_path_result = if (args.length >= 6) args(5).toString else "/tmp/wilson/"

    val conf = new SparkConf()
      .set("spark.driver.maxResultSize", "10g")
      .set("spark.sql.shuffle.partitions", s"${shuffle_count}")
      .set("spark.network.timeout", "850s")
      .set("spark.shuffle.compress", "true")
      .set("spark.shuffle.spill.compress", "true")
      .set("spark.shuffle.manager", "sort")
    if (System.getProperty("local") != null) {
      conf.setMaster("local").setAppName("wordSegname")
    }
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // local test configuration:
    /* val conf = new SparkConf().setAppName("wordSegname").setMaster("local[4]")
      .set("spark.sql.shuffle.partitions", "10").set("spark.network.timeout", "30s")
      .set("spark.shuffle.compress", "true").set("spark.shuffle.spill.compress", "true")
      .set("spark.shuffle.manager", "sort")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val path = "D:\\wilson.zhou\\Downloads\\西游记.txt"
    val maxLen = 6
    val path1 = "D:\\temp\\text.txt"
    val pmi = 0
    val info = 0
    val save_path_result = "/tmp/wilson/" */
    // val word = scala.io.Source.fromFile("D:\\wilson.zhou\\Downloads\\红楼梦.txt").getLines().mkString("")

    val sdf = new java.text.SimpleDateFormat("yyyy-MM-dd:HH:mm:ss")
    val start = sdf.format(System.currentTimeMillis())
    // Read the corpus and strip stop words, punctuation, and control characters
    val word1 = sc.textFile(path).map { x =>
      val x_filter = x.replaceAll("[" + AtomsUitl.stopwords + "]", " ").replaceAll("\\p{Punct}", " ").replaceAll("\\pP", " ")
        .replaceAll(" ", " ").replaceAll("\\p{Blank}", " ").replaceAll("\\p{Space}", " ").replaceAll("\\p{Cntrl}", " ")
      x_filter
    }
    val sum_document = word1.count()

    val word_document = word1.zipWithIndex.filter { x => !StringUtils.isBlank(x._1) }.flatMap { x =>
      val arr = ArrayBuffer[(String, Int)]()
      val line = x._1.split(" ")
      for (i <- line) {
        arr += ((i, x._2.toInt))
      }
      arr
    }.map { x => (x._1.trim, x._2) }.filter(x => !StringUtils.isBlank(x._1))

    println("Calculate the document frequency of each term")
    val word_document_caculate = word_document.map { x => ("$" + x._1 + "$", x._2) }.flatMap { x =>
      var arr = ArrayBuffer[(String, Int)]()
      for (y <- 1 to AtomsUitl.len(x._1) - 2) {
        arr += ((AtomsUitl.substring(x._1, y, Math.min(maxLen + y, AtomsUitl.len(x._1))), x._2))
      }
      arr
    }.sortBy(x => x._1)

    println("Document-frequency calculation will start")
    val word_document_result = word_document_caculate.map { x =>
      val first = AtomsUitl.substring(x._1, 0, 1)
      (first, x._1, x._2)
    }.groupBy((f: (String, String, Int)) => f._1).map {
      x => x._2
    }.flatMap { x =>
      val document = Maps.newHashMap[String, ConterHashSet]
      var arrBuff = ArrayBuffer[(String, Int)]()
      for (curr <- x) {
        for (ii <- 1 to AtomsUitl.len(curr._2) - 1) {
          val w1 = AtomsUitl.substring(curr._2, 0, ii)
          if (document.containsKey(w1)) {
            document.get(w1).addelment(curr._3.asInstanceOf[java.lang.Integer])
          } else {
            val cm = new ConterHashSet()
            cm.addelment(curr._3.asInstanceOf[java.lang.Integer])
            document.put(w1, cm)
          }
        }
      }
      val documentIter = document.keySet.iterator
      while (documentIter.hasNext()) {
        val w = documentIter.next()
        val freq = document.get(w).getsize()
        arrBuff += ((w, freq))
      }
      arrBuff
    }
    // word_document_result.take(20).foreach(println)
    // println("word_document_result's count: " + word_document_result.count())
- println("information entropy and information")
- val word=word1.flatMap{x=>
- val line=x.split(" ")
- line
- }.filter(x=> !StringUtils.isBlank(x))
- // //计算左信息熵做准备
- println("Calculate the left word information entropy and information entropy .....")
- val wordleft=word.map(x=>AtomsUitl.reverse(x)).map{x=>"$"+ x +"$"}.flatMap{
- x=> var arr=ArrayBuffer[String]()
- for( y<- 1 to AtomsUitl.len(x)-2){
- // arr+=x.substring(y, Math.min(maxLen + y, x.length()))
- arr+=AtomsUitl.substring(x,y, Math.min(maxLen + y, AtomsUitl.len(x)))
- }
- arr
- }.sortBy(x=>x)
- val wordleft_caculate= wordleft.map{
- s=>
- // val first=s.substring(0, 1).toString()
- val first=AtomsUitl.substring(s, 0,1).toString
- (first,s)
- }.groupBy((f:(String,String))=>f._1).map{
- x=>x._2
- }.flatMap{
- x=>
- val stat = Maps.newHashMap[String, CounterMap]()
- var arrBuff=ArrayBuffer[(String,Double)]()
- for(curr <- x){
- for( ii<- 1 to AtomsUitl.len(curr._2)-1){
- // val w = curr._2.substring(0,ii)
- val w = AtomsUitl.substring(curr._2, 0, ii)
- // val suffix = curr._2.substring(ii).substring(0, 1)
- val suffix= AtomsUitl.substring(AtomsUitl.substring(curr._2,ii),0,1)
- if (stat.containsKey(w)) {
- stat.get(w).incr(suffix)
- } else {
- val cm = new CounterMap()
- cm.incr(suffix)
- stat.put(w, cm)
- }
- }
- }
- var iterator_stat=stat.keySet().iterator()
- while(iterator_stat.hasNext()){
- var w=iterator_stat.next()
- var cm = stat.get(w);
- var freq = 0
- var re = 0.0
- var cm_iter=cm.countAll().keySet().iterator()
- while(cm_iter.hasNext()) {
- freq += cm.get(cm_iter.next())
- }
- var cm_iter1=cm.countAll().keySet().iterator()
- while(cm_iter1.hasNext()) {
- var p = cm.get(cm_iter1.next()) * 1.0 / freq
- re += -1 * Math.log(p) * p
- }
- // print("freq的值是:"+freq+" ")
- // println("re的值是:"+re)
- arrBuff+=((AtomsUitl.reverse(w),re))
- }
- arrBuff
- }
- // wordleft_caculate.take(20).foreach(println)
- // println("左邻信息个个数是:"+wordleft_caculate.count())
- // println(wordleft_caculate.map(x=>x._1).distinct().count())
- // println("wordleft'coutn----->"+wordleft.count)
    // Prepare for the right-neighbor information entropy
    println("Calculate the right-neighbor information entropy .....")
    val wordright = word.map { x => "$" + x + "$" }.flatMap { x =>
      var arr = ArrayBuffer[String]()
      for (y <- 1 to AtomsUitl.len(x) - 2) {
        // arr += x.substring(y, java.lang.Math.min(maxLen + y, x.length()))
        arr += AtomsUitl.substring(x, y, Math.min(maxLen + y, AtomsUitl.len(x)))
      }
      arr
    }.sortBy(x => x)

    // Calculate the right-neighbor information entropy (and the frequency of each candidate)
    val wordright_caculate = wordright.map { s =>
      // val first = s.substring(0, 1).toString()
      val first = AtomsUitl.substring(s, 0, 1).toString()
      (first, s)
    }.groupBy((f: (String, String)) => f._1).map {
      x => x._2
    }.flatMap { x =>
      val stat = Maps.newHashMap[String, CounterMap]()
      var arrBuff = ArrayBuffer[(String, Int, Double)]()
      for (curr <- x) {
        for (i <- 1 to AtomsUitl.len(curr._2) - 1) {
          // val w = curr._2.substring(0, i)
          val w = AtomsUitl.substring(curr._2, 0, i)
          // val suffix = curr._2.substring(i).substring(0, 1)
          val suffix = AtomsUitl.substring(AtomsUitl.substring(curr._2, i), 0, 1).toString
          if (stat.containsKey(w)) {
            stat.get(w).incr(suffix)
          } else {
            val cm = new CounterMap()
            cm.incr(suffix)
            stat.put(w, cm)
          }
        }
      }
      val iterator_stat = stat.keySet().iterator()
      while (iterator_stat.hasNext()) {
        val w = iterator_stat.next()
        val cm = stat.get(w)
        var freq = 0
        var re = 0.0
        val cm_iter = cm.countAll().keySet().iterator()
        while (cm_iter.hasNext()) {
          freq += cm.get(cm_iter.next())
        }
        val cm_iter1 = cm.countAll().keySet().iterator()
        while (cm_iter1.hasNext()) {
          val p = cm.get(cm_iter1.next()) * 1.0 / freq
          re += -1 * Math.log(p) * p
        }
        // print("w = " + w + " ")
        // print("freq = " + freq + " ")
        // println("re = " + re)
        arrBuff += ((w, freq, re))
      }
      arrBuff
    }
    // println("first 20 right-neighbor entropy entries")
    // wordright_caculate.take(20).foreach(println)
    // println("total number of right-neighbor entropy entries: " + wordright_caculate.count())
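    // A candidate is only as good as its weaker side, so the merge below joins the right- and
    // left-entropy tables on the candidate string and keeps min(right_info, left_info) as "info";
    // candidates of length 1 are filtered out.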
    // Merge the left and right entropy tables
    println("Merge will begin ..............")
    import sqlContext.implicits._

    /* Earlier RDD-based merge, replaced by the DataFrame join below:
    val word_caculate_total1 = wordright_caculate.union(wordleft_caculate).sortBy(x => x).groupBy((f: (String, Int, Double)) => f._1, 20).map(x => x._2)
    val word_caculate_total = word_caculate_total1.map { x =>
      val hashtable = new java.util.Hashtable[String, String]()
      hashtable.put("name", "null")
      hashtable.put("freq", "0")
      hashtable.put("e", java.lang.Double.MAX_VALUE.toString())
      for (str <- x) {
        hashtable.put("name", str._1)
        if (str._2 != -20) {
          hashtable.put("freq", String.valueOf(str._2))
        }
        if (str._3 < java.lang.Double.parseDouble(hashtable.get("e"))) {
          hashtable.put("e", String.valueOf(str._3))
        }
      }
      (hashtable.get("name"), hashtable.get("freq").toInt, hashtable.get("e").toDouble)
    }.filter(x => !StringUtils.isBlank(x._1) && x._1.length > 1) */

    val wordright_caculate_todf = wordright_caculate.toDF("right_name", "freq", "right_info")
    val wordleft_caculate_todf = wordleft_caculate.toDF("left_name", "left_info")
    val udf_get_min: ((Double, Double) => Double) = (arg1: Double, arg2: Double) => Math.min(arg1, arg2)
    val sqlfunctin = udf(udf_get_min)
    val word_caculate_total = wordright_caculate_todf.join(wordleft_caculate_todf, wordright_caculate_todf("right_name") === wordleft_caculate_todf("left_name"), "left")
      .withColumn("info", sqlfunctin(col("right_info"), col("left_info"))).drop("right_info")
      .drop("left_name").drop("left_info").filter(length(wordright_caculate_todf("right_name")) > 1).rdd
    // wordright_caculate.union(wordleft_caculate).groupBy((f: (String, Int, Double)) => f._1).map(x => x._2).take(20).foreach(println)

    println("Calculating cohesion (PMI)")
    val size_pmi = wordright_caculate.count()
    println("total size in the final step: " + size_pmi)
    println("map_total is done")
    // Calculate cohesion (PMI)
    val last = word_caculate_total.flatMap { x =>
      var w = x.apply(0).toString
      var f = x.apply(1).toString.toInt
      var e = x.apply(2).toString.toDouble
      var arr = ArrayBuffer[(String, Int, Double, String, String)]()
      for (s <- 1 to AtomsUitl.len(w) - 1) {
        try {
          // split the candidate w into a left part lw and a right part rw
          var lw = AtomsUitl.substring(w, 0, s)
          var rw = AtomsUitl.substring(w, s)
          arr += ((w, f, e, lw, rw))
        } catch {
          case e: Exception => arr += (("", 0, 0.0, "", ""))
        }
      }
      arr
    }.filter(f => !StringUtils.isBlank(f._4) && !StringUtils.isBlank(f._5))

    println("DataFrame merge will begin ..............")
    // last.take(30).foreach(println)
    val df = last.toDF("w_total", "f", "e", "lw", "rw")
    val df1 = wordright_caculate.toDF("w", "freq", "re")
    val df2_drop = df.join(df1, df("lw") === df1("w"), "left").drop("re").drop("w").withColumnRenamed("freq", "lw_freq")
    val df3_drop = df2_drop.join(df1, df2_drop("rw") === df1("w"), "left").drop("re").drop("w").withColumnRenamed("freq", "rw_freq")

    /* Cohesion calculation, RDD version (replaced by the DataFrame version below). Note that this
       version keeps the largest product freq(lw) * freq(rw), i.e. the smallest PMI over all split points:
    val result = df3_drop.rdd.groupBy { f => f(0) }.map { x =>
      val map = new java.util.HashMap[String, String]()
      map.put("max", "1")
      for (i <- x._2) {
        map.put("w_total", i.apply(0).toString)
        map.put("f", i.apply(1).toString)
        map.put("e", i.apply(2).toString)
        var ff: java.lang.Long = try {
          i.apply(5).toString.toLong * i.apply(6).toString.toLong
        } catch {
          case e: Exception => 1l
        }
        if (ff > map.get("max").toLong) {
          map.put("max", ff.toString)
        }
      }
      var pf = map.get("f").toLong * size_pmi * 1.0 / map.get("max").toLong
      var pmi = Math.log(pf)
      var w_total = map.get("w_total")
      var f = map.get("f").toInt
      var e = map.get("e").toDouble
      map.clear()
      (w_total, f, pmi, e, 0)
    }.filter(f => f._3 > pmi && f._4 > info && !StringUtils.isBlank(f._1))
    val resultToDf = result.toDF("name", "freq", "pmi", "info", "zero")
    */
    println("DataFrame join is done")

    // Calculate cohesion with DataFrames instead
    val udf_get_pmi = (arg1: Int, arg2: Int, arg3: Int) => Math.log((arg1.toLong * size_pmi.toLong * 1.0) / (arg2.toLong * arg3.toLong))
    val udf_get_pmi_udf = udf(udf_get_pmi)
    // Sorting by pmi descending and then dropping duplicates is intended to keep the *largest* PMI
    // per candidate, unlike the commented-out RDD version above, which keeps the smallest.
    val resultToDf = df3_drop.withColumn("pmi", udf_get_pmi_udf(col("f"), col("rw_freq"), col("lw_freq"))).withColumn("zero", col("f") * 0)
      .drop("rw_freq").drop("lw_freq").drop("lw").drop("rw").sort($"w_total", $"pmi".desc).dropDuplicates(Array("w_total"))
      .filter($"pmi" > pmi && $"e" > info).withColumnRenamed("w_total", "name").withColumnRenamed("f", "freq").withColumnRenamed("e", "info")
- println("The final result will be caculated")
- val word_document_resultToDf=word_document_result.toDF("name1","document")
- val resultToDf2= resultToDf.join(word_document_resultToDf,word_document_resultToDf("name1")===resultToDf("name"),"left").
- withColumn("documentcount",col("zero")+sum_document).drop("zero").drop("name1")
- // val resultToDf2 =resultToDf1.withColumn("documentcount",col("zero")+sum_document).drop("zero").drop("name1")
- // resultToDf2.show(20)
- // 互信息 凝聚度pmi
- // 左右熵 e
- //把结果存入到hdfs中
- println("Results will stored into HDFS.")
- val sdf1=new SimpleDateFormat("yy-MM-dd")
- val save_path=save_path_result+sdf1.format(System.currentTimeMillis())
- try{
- resultToDf2.rdd.map{
- x=>
- var name=x.apply(0).toString
- var freq=x.apply(1).toString
- var entropy=x.apply(2).toString
- var info=x.apply(3).toString
- var document=x.apply(4).toString
- var documenttotal=x.apply(5).toString
- s"${name},${freq},${info},${entropy},${document},${documenttotal}"
- }.saveAsTextFile(save_path)
- println("....................sucess.............")
- // resultToDf2.rdd.repartition(1).saveAsTextFile(save_path)
- }catch{
- case e:Exception=>println("some errors happend when sava the last datas")
- }
- //把结果插入到mysql数据库中
- /* val driver="com.mysql.jdbc.Driver"
- Class.forName(driver)
- val url ="jdbc:mysql://10.1.1.28:3306/spark"
- val pro=new java.util.Properties
- pro.setProperty("user","usr_dba")
- pro.setProperty("password","4rfv%TGB^YHN")
- pro.setProperty("use_unicode", "true")
- pro.setProperty("characterEncoding", "utf8")
- resultToDf2.write.mode(SaveMode.Overwrite).jdbc(url, "wordsegment",pro)
- */
- println(start)
- println(sdf.format(System.currentTimeMillis()))
- sc.stop()
- }
- }
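Each line of the saved output is a comma-separated record of the form `name,freq,pmi,entropy,documentFrequency,totalDocuments`, written under `<save_path_result>/<yy-MM-dd>`. A minimal sketch for inspecting the result from a spark-shell follows; the save prefix and run-date suffix in the path are assumptions, so adjust them to your own run.

```scala
// Load the saved candidates back and show the highest-PMI words.
// "/tmp/wilson/" is the job's default save prefix; "16-05-20" is a hypothetical run date.
val lines = sc.textFile("/tmp/wilson/16-05-20")
val candidates = lines.map(_.split(","))
  .filter(_.length == 6)
  .map(a => (a(0), a(1).toInt, a(2).toDouble, a(3).toDouble)) // (name, freq, pmi, entropy)
candidates.sortBy(-_._3).take(20).foreach(println)
```

The job itself expects at least five arguments when submitted: the input path, the maximum candidate length, the PMI threshold, the entropy threshold, and the number of shuffle partitions, plus an optional sixth argument for the output prefix.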