代码本身就是最好的解释,不赘述。

文本聚类输出: cluster.py

#!/usr/bin/env python
# coding=utf-8
"""Cluster short texts with TF-IDF vectors and k-means (cluster.py).

Texts are tokenised with jieba, converted to dense TF-IDF vectors through a
gensim dictionary/model, and grouped with scikit-learn's ``KMeans``.
"""
import re
import sys
from random import shuffle

import jieba
from gensim import corpora, models
from sklearn.cluster import KMeans

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec, exactly as
    # the original script did.  ``reload`` is a builtin there; this branch is
    # never executed on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class MyCorpus(object):
    """Iterable over a text file that yields one jieba token stream per line."""

    def __init__(self, fname):
        """Remember the path of the file to read; nothing is opened yet."""
        self.fname = fname

    def __iter__(self):
        """Yield ``jieba.cut(line, cut_all=False)`` for every line of the file."""
        # The context manager closes the handle even when the consumer stops
        # iterating early and the generator is discarded.
        with open(self.fname) as handle:
            for line in handle:
                yield jieba.cut(line, cut_all=False)


class MyCluster(object):
    """Turn documents into TF-IDF vectors and cluster them with k-means."""

    def __init__(self):
        # Matches every character that is not in U+4E00..U+9F5A, an ASCII
        # letter or a digit.  Not used inside this module; kept because it is
        # a public attribute.
        self.CLEAN = re.compile(u"[^\u4e00-\u9f5aA-Za-z0-9]")
        self.dictionary = {}
        self.corpus = []
        # Set by gen_corpus(); doc2vec() needs it.
        self.tfidf = None

    def gen_dataset(self, documents):
        """Fit the dictionary/TF-IDF model on *documents* and vectorise them.

        Returns a list with one dense TF-IDF vector (list of floats) per
        document, in the iteration order of *documents*.
        """
        # Materialise once: the documents are walked twice (fit, then
        # transform), which would silently yield nothing for a generator.
        documents = list(documents)
        self.gen_corpus(documents)
        return [self.doc2vec(doc) for doc in documents]

    def gen_corpus(self, documents):
        """Build ``self.dictionary``, ``self.corpus`` and ``self.tfidf``."""
        texts = [list(jieba.cut(doc)) for doc in documents]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.tfidf = models.TfidfModel(self.corpus)

    def doc2vec(self, doc):
        """Return the dense TF-IDF vector of *doc*.

        The vector has one slot per dictionary entry; ``gen_corpus`` must
        have been called first so that the dictionary and model exist.
        """
        bow = self.dictionary.doc2bow(list(jieba.cut(doc)))
        wordlist = [.0] * len(self.dictionary)
        for token_id, weight in self.tfidf[bow]:
            wordlist[token_id] = weight
        return wordlist

    def kcluster(self, texts, k=3):
        """Cluster *texts* into *k* groups and return one text per group.

        The representative of each cluster is picked at random (the labelled
        texts are shuffled and the first text seen for each label wins), so
        the result differs between runs.  An empty input returns ``[]`` and
        *k* is capped at the number of texts, because ``KMeans`` rejects
        both cases.
        """
        texts = list(texts)
        if not texts:
            return []
        k = min(k, len(texts))

        # Weights are rounded to 5 decimals before fitting, as before.
        data = [[round(x, 5) for x in row] for row in self.gen_dataset(texts)]
        km = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=1,
                    verbose=True)
        km.fit(data)

        # list(...) is required on Python 3, where zip() is a one-shot
        # iterator that shuffle() cannot reorder in place.
        labelled = list(zip(km.labels_, texts))
        shuffle(labelled)

        seen_labels = set()
        res = []
        for label, text in labelled:
            if label not in seen_labels:
                seen_labels.add(label)
                res.append(text)
        return res


if __name__ == "__main__":
    with open('data/python.db') as fr:
        texts = [line for line in fr]
    test = MyCluster()
    res = test.kcluster(texts, k=4)
    print('\n'.join(res))

自动生成主文件: auto_gen_jd.py

#!/usr/bin/env python
# coding=utf-8
"""Generate a job description (JD) for a job title (auto_gen_jd.py).

Usage: ``python auto_gen_jd.py <job name> <number of sentences>``
"""
import codecs
import os
import re
import sys

import simplejson as json
from simhash import Simhash

from cluster import MyCluster
from jd_parser import JdParser

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec, exactly as
    # the original script did.  ``reload`` is a builtin there; this branch is
    # never executed on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# Lines matching this are dropped by clean_jd2(): punctuation followed by
# digits, a line ending in a separator (optionally preceded by a digit), or a
# line starting with a digit directly followed by a CJK character.
# NOTE(review): several character classes in this file contain the same
# character twice (e.g. "::"); full-width variants may have been lost when the
# source was copied -- confirm against the original script.
_NOISY_LINE = re.compile(
    u"[;\\.;。]\\d+|\\d?[,,、::\\.]$|^\\d\\s{0,1}[\u4e00-\u9f5e]")

# Patterns used by _get_sent_score(), each paired with the score bonus it
# grants (gender/age, education/salary, experience).
_SCORE_RULES = (
    (re.compile(u"男|女|男女不限|性别|岁"), 60),
    (re.compile(u"学历|专业|\\d+[kK元]"), 40),
    (re.compile(u"经验"), 20),
)


class AutoGenJD(object):
    """Generate a job description from a job title and a sentence count.

    Given a job name and the number of sentences wanted, produce a list of
    description/requirement sentences taken from the JD database.
    """

    def __init__(self):
        # Removes list numbering: leading "<digits>." style markers, a
        # leading "(<digits>)" and any "<digit>)" / "<digit>】" marker.
        self.CLEAR_NUM = re.compile(
            u"^\\d+[\\.、::]|^[\\(\\(]\\d+[\\)\\)\\.]?|\\d\\s*[\\))】]")
        # Removes a leading punctuation mark together with the run of
        # non-whitespace characters that follows it, or one trailing
        # punctuation mark.
        self.CLEAR_COLO = re.compile(u"^[。\\.)(【】]\\S+|[\\.;:;。]$")
        with codecs.open('data/lagou_jd_clean.json') as fr:
            self.jd_database = json.load(fr)
        # Job titles known to the database; list() keeps it indexable and
        # re-iterable on Python 3 as well.
        self.jobname = list(self.jd_database.keys())
        # Optional BosonNLP client, only needed by get_jd_with_bosonnlp().
        # Assign a configured client here; never hard-code the API token in
        # source control.
        self.bosonnlp = None
        self.jdparser = JdParser()
        self.km = MyCluster()

    def load_json_data(self, fname="../preprocess/data/mini_jd.json",
                       arg1=None, arg2=None):
        """Split a JSON-lines dump into one text file per record.

        Every line of *fname* is parsed as JSON; unparsable lines are printed
        and skipped.  ``data[arg1]`` must be a dict holding "job_title"
        (2 to 16 characters long) and "job_description"; the description is
        written to ``./data/<data[arg1][arg2]>.txt``.

        NOTE(review): the output file is opened with mode 'w', so records
        sharing the same name overwrite each other, and a name containing a
        path separator makes the open fail -- confirm this is intended.
        """
        with codecs.open(fname) as fr:
            for line in fr:
                try:
                    data = json.loads(line)
                except ValueError as e:
                    print(e)
                    continue
                record = data.get(arg1) if isinstance(data, dict) else None
                if not isinstance(record, dict):
                    continue
                if "job_title" not in record or "job_description" not in record:
                    continue
                if not 2 <= len(record["job_title"]) <= 16:
                    continue
                name = record[arg2]
                # Close every output file right away instead of leaking one
                # handle per record.
                with codecs.open('./data/' + name + ".txt", 'w', 'utf-8') as fw:
                    fw.write(record["job_description"].strip() + "\n\n")
                print("writing... %s" % name)

    def clean_jd(self, fname="./data/java.txt"):
        """Strip numbering and stray punctuation from every line of a JD file.

        *fname* may be given with or without the ".txt" extension.  The
        distinct cleaned sentences (longer than 2 characters) are written to
        the matching ".db" file next to it and returned as a set.
        """
        # The original opened fname + ".txt" but sliced three characters off
        # fname for the output name, so neither calling convention produced
        # both paths correctly.  Derive both from one base name instead.
        base, ext = os.path.splitext(fname)
        if ext != ".txt":
            base = fname
        clean_sents = set()
        with codecs.open(base + ".txt", 'r', 'utf-8') as fr:
            for line in fr:
                line = self.CLEAR_NUM.sub("", line.strip())
                line = self.CLEAR_COLO.sub("", line.strip())
                if len(line) > 2:
                    clean_sents.add(line.strip())
        with codecs.open(base + ".db", 'w', 'utf-8') as fw:
            for line in clean_sents:
                fw.write(line + '\n')
        return clean_sents

    def is_most_english(self, line):
        """Return True when more than 70% of *line* is ASCII letters."""
        if not line:
            # An empty line has no letters; avoids a division by zero.
            return False
        en_word = [uchar for uchar in line
                   if u'\u0041' <= uchar <= u'\u005a'
                   or u'\u0061' <= uchar <= u'\u007a']
        return len(en_word) * 1.0 / len(line) > 0.7

    def clean_jd2(self, jdstr):
        """Clean a raw JD string and return its usable sentences as a set.

        The text is split on newlines.  A line is dropped when it is shorter
        than 8 or longer than 32 characters, looks like a heading or a
        numbered fragment, or is mostly English; the remaining lines have
        numbering and surrounding punctuation removed.
        """
        res = set()
        for line in jdstr.split("\n"):
            line = line.strip()
            if len(line) < 12:
                print("line %s" % line)
            if len(line) < 8 or len(line) > 32 or _NOISY_LINE.search(line):
                continue
            if self.is_most_english(line):
                continue
            line = self.CLEAR_NUM.sub("", line)
            line = self.CLEAR_COLO.sub("", line)
            res.add(line)
        return res

    def get_closet_job(self, jobname="java"):
        """Return the known job title closest to *jobname*.

        Closeness is the Simhash distance between the two titles; the five
        nearest candidates are printed with their distance.
        """
        # Hash the query once instead of once per candidate.
        target = Simhash(jobname)
        dis = [(other, target.distance(Simhash(other))) for other in self.jobname]
        sorteddis = sorted(dis, key=lambda x: x[1])
        for k, v in sorteddis[:5]:
            print("%s %s" % (k, v))
        return sorteddis[0][0]

    def norm_jd_num(self, num):
        """Clamp the requested number of JD sentences to the range 1..20."""
        if num < 1:
            return 1
        if num > 20:
            return 20
        return num

    def get_jd_with_snownlp(self, jobname="java", num=5):
        """Summarise the JD of the closest job title with SnowNLP.

        Requires the optional ``snownlp`` package, imported lazily here so
        that the rest of the module works without it.  Returns the result of
        ``SnowNLP(...).summary(num)``.
        """
        from snownlp import SnowNLP

        jobname = self.get_closet_job(jobname)
        sentences = self.clean_jd2(self.jd_database[jobname])
        # clean_jd2() returns a set; join it into one text, one sentence per
        # line, before handing it to SnowNLP.
        return SnowNLP("\n".join(sentences)).summary(num)

    def get_jd_with_bosonnlp(self, jobname="java", num=5):
        """Pick one sentence per BosonNLP cluster for the closest job title.

        Needs ``self.bosonnlp`` to be a client object exposing ``cluster()``.
        At most 80 cleaned sentences are sent; clusters are visited from
        largest to smallest and the sentence indexed by each cluster's
        "_id" is collected.  *num* is accepted for signature parity with the
        other generators but is not used.  Returns a set of sentences.
        """
        if self.bosonnlp is None:
            raise RuntimeError(
                "bosonnlp client is not configured; assign one to self.bosonnlp")
        jobname = self.get_closet_job(jobname)
        # A set cannot be sliced or indexed, so freeze the order in a list.
        jdstr = list(self.clean_jd2(self.jd_database[jobname]))[:80]
        all_cluster = self.bosonnlp.cluster(jdstr)
        sort_all_cluster = sorted(all_cluster, key=lambda x: x['num'],
                                  reverse=True)
        res = set()
        for idx, cluster in enumerate(sort_all_cluster):
            print("%s %s" % (idx + 1, cluster['_id']))
            res.add(jdstr[cluster['_id']])
        return res

    def _get_sent_score(self, line):
        """Return the sort score of a sentence; lower scores sort first.

        The base score is the sentence length plus 100; mentions of
        gender/age, education/salary and experience lower it by 60, 40 and
        20 respectively.
        """
        s = len(line) + 100
        for pattern, bonus in _SCORE_RULES:
            if pattern.search(line):
                s -= bonus
        return s

    def get_jd_with_kmeans(self, jobname='python', num=6):
        """Build a JD by k-means clustering, one sentence per cluster.

        The cleaned sentences of the closest job title are clustered into
        *num* groups (capped at the number of sentences, at least 1) and one
        sentence per group is returned, ordered by ``_get_sent_score``.
        Returns ``[]`` when no sentence survives cleaning.
        """
        jobname = self.get_closet_job(jobname)
        jdstr = list(self.clean_jd2(self.jd_database[jobname]))
        print("jdstr %s" % len(jdstr))
        print(self.jd_database[jobname])
        if not jdstr:
            return []
        num = max(1, min(int(num), len(jdstr)))
        res = self.km.kcluster(jdstr, k=num)
        # key= gives the same ordering as the former cmp= on score
        # differences and also works on Python 3.
        return sorted(res, key=self._get_sent_score)

    def jd_parser(self, jdstr):
        """Parse *jdstr* with the JdParser instance and return its result."""
        return self.jdparser.parser(jdstr)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: auto_gen_jd.py <job name> <number of sentences>")
    test = AutoGenJD()
    jobname = sys.argv[1]
    jdnum = int(sys.argv[2])
    print("job name: %s" % jobname)
    print("demand:")
    demand = test.get_jd_with_kmeans(jobname, jdnum)
    for i, jdstr in enumerate(demand):
        print("%d. %s" % (i + 1, jdstr))

根据职位名,自动生成jd的更多相关文章

  1. Java代码自动生成,生成前端vue+后端controller、service、dao代码,根据表名自动生成增删改查功能

    本项目地址:https://github.com/OceanBBBBbb/ocean-code-generator 项目简介 ocean-code-generator采用(适用):     ,并使用m ...

  2. 转载:C#保存文件时重名自动生成新文件的方法

    /// <summary> /// Generates a new path for duplicate filenames. /// </summary> /// <p ...

  3. c# datagridview禁止自动生成额外列

    在某些时候,出于重用pojo的考虑,我们希望在不同的datagridview之间进行复用,这就涉及到pojo中的字段会比有些datagridview所需要的字段多,默认情况下,.net对于pojo中的 ...

  4. oracle数据库高级应用之《自动生成指定表的insert,update,delete语句》

    /* * 多条记录连接成一条 * tableName 表名 * type 类型:可以是insert/update/select之一 */ create or replace function my_c ...

  5. 懒人小工具:自动生成Model,Insert,Select,Delete以及导出Excel的方法

    在开发的过程中,我们为了节约时间,往往会将大量重复机械的代码封装,考虑代码的复用性,这样我们可以节约很多时间来做别的事情.最近跳槽到一家webform开发的公司,主要是开发自己公司用的ERP.开始因为 ...

  6. 懒人小工具1:winform自动生成Model,Insert,Select,Delete以及导出Excel的方法

       懒人小工具2:T4自动生成Model,Insert,Select,Delete以及导出Excel的方法    github地址:https://github.com/Jimmey-Jiang/J ...

  7. PowerDesigner中表名过长,自动生成的主键名截取的问题

    在PowerDesinger中,若表名过长,自动生成的主键名会被自动截取. 解决如下:DataBase/Edit Current DBMS/Scripts/Objects/PKey/ConstName ...

  8. Linq to Sql自动生成实体类重名情况的处理

    使用Linq to sql自动生成实体类时,如果要生成多个库的实体类,往往会遇到类名重名的情况,也就是表名重名,这样编译会不通过,这种情况下要在自动生成的实体类文件中(.designer.cs后缀)将 ...

  9. eclipse自动生成变量名声明(按方法返回值为本地变量赋值)

    eclipse自动生成变量名声明(按方法返回值为本地变量赋值) ctrl+2+L 这个快捷键可自动补全代码,极大提升编码效率! 注:ctrl和2同时按完以后释放,再快速按L.不能同时按! 比如写这句代 ...

随机推荐

  1. C#实现图片文件到数据流再到图片文件的转换 --转

    /----引入必要的命名空间 using System.IO; using System.Drawing.Imaging; //----代码部分----// private byte[] photo; ...

  2. unity, polygon collider 2D 添加顶点

    正常情况下只要按下了Edit Collider按钮,鼠标停在polygon collider 2D的一条边上,就会出现一个虚拟的新顶点,此时如果按下鼠标,新顶点就创建出来了. 但是我今天遇到一个奇怪的 ...

  3. div圆角和颜色渐变的设置

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...

  4. 关于c语言char类型输入输出的一个bug

    题目 输入一个整数n,接下来n行每一行输入两个用一个空格分隔的字符. 对每一对字符,比较其大小关系并输出比较的结果:1.0.-1. 解决的代码如下: #include<stdio.h> i ...

  5. 【转】DNS劫持和DNS污染的区别

    什么是DNS服务器? 简单来说,DNS服务器就是域名管理系统. DNS(Domain Name System)是域名解析服务器的意思. DNS服务器是干什么的? DNS服务器在互联网的作用是:把域名转 ...

  6. css_样式样式器的分类

    详情:http://www.w3school.com.cn/h.asp 1.标签样式器:此样式器仅对html页面中div标签有效果 div{ background-color: rosybrown; ...

  7. ExtJs学习笔记之TextField

    输入框TextField 一个基本文本框表单项.可以直接代替传统文本输入框, 或者作为许多复杂基本控件的基类({如@link Ext.form.field.TextArea}) 和Ext.form.f ...

  8. jquery点击改变class并toggle

    <html> <head> <meta charset="utf-8"> <title></title> <scr ...

  9. Jquery中的offset()和position()

    今天遇到这个偏移量的问题,特做此记录.以便日后查看. 先看看这两个方法的定义. offset(): 获取匹配元素在当前视口的相对偏移. 返回的对象包含两个整形属性:top 和 left.此方法只对可见 ...

  10. LeetCode "Count of Smaller Number After Self"

    Almost identical to LintCode "Count of Smaller Number before Self". Corner case needs to b ...