根据职位名,自动生成jd
代码本身就是最好的解释,不赘述。
文本聚类输出: cluster.py
#!/usr/bin/env python
# coding=utf-8
"""Pick one representative sentence per k-means cluster of a text collection.

Each text is tokenised with jieba, turned into a dense TF-IDF vector with
gensim, and clustered with scikit-learn's KMeans.  One randomly chosen member
of every cluster is returned.
"""
import io
import random
import re
import sys

import jieba
from gensim import corpora, models
from sklearn.cluster import KMeans

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    # `reload` is a builtin there; this branch never runs on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class MyCorpus(object):
    """Iterable over a UTF-8 text file yielding one jieba token stream per line."""

    def __init__(self, fname):
        """Remember the path of the file to iterate over; nothing is opened yet."""
        self.fname = fname

    def __iter__(self):
        """Yield ``jieba.cut(line, cut_all=False)`` for every line of the file."""
        # The context manager closes the handle even if iteration stops early.
        with io.open(self.fname, encoding='utf-8') as handle:
            for line in handle:
                yield jieba.cut(line, cut_all=False)


class MyCluster(object):
    """TF-IDF vectoriser plus k-means selection of representative texts."""

    def __init__(self):
        """Create an empty model; ``gen_corpus`` fills dictionary/corpus/tfidf."""
        # Matches any character outside the given CJK range, ASCII letters and
        # digits.  Not used by any method visible in this file.
        # NOTE(review): the range ends at \u9f5a; the customary CJK bound is
        # \u9fa5 -- confirm whether this is a typo before relying on it.
        self.CLEAN = re.compile(u"[^\u4e00-\u9f5aA-Za-z0-9]")
        self.dictionary = {}
        self.corpus = []

    def gen_dataset(self, documents):
        """Fit the TF-IDF model on ``documents`` and return their dense vectors.

        :param documents: iterable of text strings.
        :return: list with one dense TF-IDF vector (list of floats) per
            document, in input order.
        """
        # Materialise once so sets and generators are handled consistently.
        documents = list(documents)
        self.gen_corpus(documents)
        # Reuse the bag-of-words built by gen_corpus instead of tokenising
        # every document a second time through doc2vec.
        return [self._to_dense(self.tfidf[bow]) for bow in self.corpus]

    def gen_corpus(self, documents):
        """Tokenise ``documents`` and (re)build dictionary, corpus and TF-IDF model."""
        texts = [list(jieba.cut(doc)) for doc in documents]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.tfidf = models.TfidfModel(self.corpus)

    def _to_dense(self, sparse_vec):
        """Expand ``(token_id, weight)`` pairs into a list of dictionary length."""
        dense = [.0] * len(self.dictionary)
        for token_id, weight in sparse_vec:
            dense[token_id] = weight
        return dense

    def doc2vec(self, doc):
        """Return the dense TF-IDF vector of one text.

        Requires ``gen_corpus`` to have been called so that ``self.dictionary``
        and ``self.tfidf`` exist.
        """
        bow = self.dictionary.doc2bow(list(jieba.cut(doc)))
        return self._to_dense(self.tfidf[bow])

    def kcluster(self, texts, k=3):
        """Cluster ``texts`` into ``k`` groups and return one text per group.

        :param texts: iterable of text strings (a list, set or generator).
        :param k: requested number of clusters; clamped to ``1..len(texts)``
            because KMeans cannot form more clusters than there are samples.
        :return: list of texts, one drawn at random from each non-empty
            cluster; ``[]`` when ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            return []
        k = max(1, min(int(k), len(texts)))

        data = [[round(x, 5) for x in row] for row in self.gen_dataset(texts)]
        km = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=1,
                    verbose=True)
        km.fit(data)

        # Shuffle so the representative of each cluster is a random member
        # rather than always the first one in input order.
        labelled = list(zip(km.labels_, texts))
        random.shuffle(labelled)

        seen = set()
        res = []
        for label, text in labelled:
            label = int(label)
            if label not in seen:
                seen.add(label)
                res.append(text)
        return res


if __name__ == "__main__":
    with io.open('data/python.db', encoding='utf-8') as handle:
        # Drop line terminators and blank lines so the joined output below
        # does not contain empty rows.
        texts = [line.strip() for line in handle if line.strip()]
    test = MyCluster()
    res = test.kcluster(texts, k=4)
    print('\n'.join(res))
自动生成主文件: auto_gen_jd.py
#!/usr/bin/env python
# coding=utf-8 import sys,os
import simplejson as json
import codecs
# from snownlp import SnowNLP
from simhash import Simhash
# from bosonnlp import BosonNLP
from cluster import MyCluster
from jd_parser import JdParser
import re
reload(sys)
sys.setdefaultencoding('utf-8') class AutoGenJD(object):
''' 自动生成JD,输入一个职位名 和句子数,输出一份岗位描述和要求 ''' def __init__(self):
self.CLEAR_NUM = re.compile(u"^\d+[\.、::]|^[\(\(]\d+[\)\)\.]?|\d\s*[\))】]")
self.CLEAR_COLO = re.compile(u"^[。\.)(【】]\S+|[\.;:;。]$")
self.jd_database = json.load(codecs.open('data/lagou_jd_clean.json'))
# self.jobname = [ jobname[:-3] for jobname in os.listdir("data") if jobname.endswith(".db") ]
self.jobname = self.jd_database.keys()
# self.bosonnlp = BosonNLP('UYTG1Csb.3652.5pZ2otkIncEn')
self.jdparser = JdParser()
self.km = MyCluster() def load_json_data(self,fname="../preprocess/data/mini_jd.json",arg1=None,arg2=None):
for line in codecs.open(fname):
try:
data = json.loads(line)
except Exception,e:
print e
continue
if data.get(arg1,False) != False and data[arg1].has_key("job_title") and data[arg1].has_key("job_description"):
if len(data[arg1]["job_title"])<2 or len(data[arg1]["job_title"])>16:
continue
else:
fw = codecs.open('./data/'+data[arg1][arg2]+".txt",'w','utf-8')
fw.write(data[arg1]["job_description"].strip()+"\n\n")
print "writing...",data[arg1][arg2] # 去除 序列号等清洗数据
def clean_jd(self,fname="./data/java.txt"):
clean_sents = set()
with codecs.open(fname+".txt",'r','utf-8') as fr:
for line in fr:
line = self.CLEAR_NUM.sub("",line.strip())
line = self.CLEAR_COLO.sub("",line.strip())
if len(line)>2:
clean_sents.add(line.strip())
with codecs.open(fname[:-3]+"db",'w','utf-8') as fw:
for line in clean_sents:
fw.write(line+'\n')
return clean_sents def is_most_english(self,line):
en_word = [ uchar for uchar in line if (uchar>=u'\u0041' and uchar<=u'\u005a') or (uchar>=u'\u0061' and uchar<=u'\u007a') ]
return float(len(en_word)*1.0/len(line))>0.7 def clean_jd2(self,jdstr):
"""
清洗数据,去除句子前后的标点符合,序号等杂乱数据
"""
res = set()
for line in jdstr.split("\n"):
line = line.strip()
if len(line)<12:
print "line",line
if re.search(u"[;\.;。]\d+|\d?[,,、::\.]$|^\d\s{0,1}[\u4e00-\u9f5e]",line) or len(line)<8 or len(line)>32:continue
if self.is_most_english(line):continue
line = self.CLEAR_NUM.sub("",line)
line = self.CLEAR_COLO.sub("",line)
res.add(line)
return res # 获取和用户输入相似度最近的职位名
def get_closet_job(self,jobname="java"):
dis = [ (other,Simhash(jobname).distance(Simhash(other))) for other in self.jobname ]
sorteddis = sorted(dis,key = lambda x:x[1])
for k,v in sorteddis[:5]:
print k,v
return sorteddis[0][0] # 规范化jd句子数目
def norm_jd_num(self,num):
if num<1:
num=1
elif num>20:
num = 20
return num # 根据职位名和句子数,获得jd
def get_jd_with_snownlp(self,jobname="java",num=5):
jobname = self.get_closet_job(jobname)
# with open("./data/"+jobname+".db") as fr:
# s = SnowNLP(fr.read())
# return s.summary(num)
jdstr = self.clean_jd2(self.jd_database[jobname])
s = SnowNLP(jdstr)
return s.summary(num) def get_jd_with_bosonnlp(self,jobname="java",num=5): res = set()
jobname = self.get_closet_job(jobname)
jdstr = self.clean_jd2(self.jd_database[jobname])[:80]
all_cluster = self.bosonnlp.cluster(jdstr)
sort_all_cluster = sorted(all_cluster,key = lambda x:x['num'],reverse=True)
for idx,cluster in enumerate(sort_all_cluster):
print idx+1,cluster['_id']
res.add(jdstr[cluster['_id']])
return res def _get_sent_score(self,line):
"""
句子得分,最后结果排序使用,分值越小,排序越靠前
"""
s = len(line)+100
if re.search(u"男|女|男女不限|性别|岁",line):
s -= 60
if re.search(u"学历|专业|\d+[kK元]",line):
s -= 40
if re.search(u"经验",line):
s -= 20
return s def get_jd_with_kmeans(self,jobname='python',num=6):
"""
使用kmeans 进行聚类,相同一类只出现一句
"""
jobname = self.get_closet_job(jobname)
jdstr = self.clean_jd2(self.jd_database[jobname])
print "jdstr",len(jdstr)
print self.jd_database[jobname] if len(jdstr)<int(num):
num = len(jdstr)
res = self.km.kcluster(jdstr,k=int(num))
return sorted(res,cmp=lambda x,y:self._get_sent_score(x)-self._get_sent_score(y)) def jd_parser(self,jdstr):
result = self.jdparser.parser(jdstr)
return result if __name__ == "__main__": test = AutoGenJD()
jobname = sys.argv[1]
jdnum = int(sys.argv[2])
print "job name:",jobname
print "demand:"
demand = test.get_jd_with_kmeans(jobname,jdnum)
for i,jdstr in enumerate(demand):
print "%d. %s" %(i+1,jdstr)
根据职位名,自动生成jd的更多相关文章
- Java代码自动生成,生成前端vue+后端controller、service、dao代码,根据表名自动生成增删改查功能
本项目地址:https://github.com/OceanBBBBbb/ocean-code-generator 项目简介 ocean-code-generator采用(适用): ,并使用m ...
- 转载:C#保存文件时重名自动生成新文件的方法
/// <summary> /// Generates a new path for duplicate filenames. /// </summary> /// <p ...
- c# datagridview禁止自动生成额外列
在某些时候,处于重用pojo的考虑,我们希望在不同的datagridview之间进行复用,这就涉及到pojo中的字段会比有些datagridview所需要的字段多,默认情况下,.net对于pojo中的 ...
- oracle数据库高级应用之《自动生成指定表的insert,update,delete语句》
/* * 多条记录连接成一条 * tableName 表名 * type 类型:可以是insert/update/select之一 */ create or replace function my_c ...
- 懒人小工具:自动生成Model,Insert,Select,Delete以及导出Excel的方法
在开发的过程中,我们为了节约时间,往往会将大量重复机械的代码封装,考虑代码的复用性,这样我们可以节约很多时间来做别的事情.最近跳槽到一节webform开发的公司,主要是开发自己公司用的ERP.开始因为 ...
- 懒人小工具1:winform自动生成Model,Insert,Select,Delete以及导出Excel的方法
懒人小工具2:T4自动生成Model,Insert,Select,Delete以及导出Excel的方法 github地址:https://github.com/Jimmey-Jiang/J ...
- PowerDesigner中表名过长,自动生成的主键名截取的问题
在PowerDesinger中,若表名过长,自动生成的主键名会被自动截取. 解决如下:DataBase/Edit Current DBMS/Scripts/Objects/PKey/ConstName ...
- Linq to Sql自动生成实体类重名情况的处理
使用Linq to sql自动生成实体类时,如果要生成多个库的实体类,往往会遇到类名重名的情况,也就是表名重名,这样编译会不通过,这种情况下要在自动生成的实体类文件中(.designer.cs后缀)将 ...
- eclipse自动生成变量名声明(按方法返回值为本地变量赋值)
eclipse自动生成变量名声明(按方法返回值为本地变量赋值) ctrl+2+L 这个快捷键可自动补全代码,极大提升编码效率! 注:ctrl和2同时按完以后释放,再快速按L.不能同时按! 比如写这句代 ...
随机推荐
- Web前端开发笔试&面试_05
>>CW 1.JavaScript的2种变量范围有什么不同? 2.JavaScript 的对象有哪些? 3.
- 设置页面不缓存 no-cache
html中设置方法 <head> <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> <M ...
- apache使用ssl数字证书
apache配置: <VirtualHost *:443> ServerName web.p2 .com ProxyPreserveHost On ProxyRequests Off SS ...
- 大象数据库SQL存储过程(函数)
-- Function: antifraudjudge(character varying) -- DROP FUNCTION antifraudjudge(character varying); C ...
- 【mysql】MySQL存储IP地址
为什么要问如何存储IP 首先就来阐明一下部分人得反问:为什么要问IP得怎样存,直接varchar类型不就得了吗? 其实做任何程序设计都要在功能实现的基础上最大限度的优化性能.而数据库设计是程序设计中不 ...
- Myeclipse SVN错误 443
转:Myeclipse SVN错误:Error validating server certificate for https// Error validating server certificat ...
- IOS开发-视频,音频,录音简单总结
/***** * 1. 视频播放 * * @格式:mp4 mov m4v m2v 3gp 3g2 * * @系统框架使用:#import <MediaPlayer/MediaPlayer.h ...
- 剑指offer系列31-----二叉树的下一个节点
[题目]给定一个二叉树和其中的一个结点,请找出中序遍历顺序的下一个结点并且返回. 注意,树中的结点不仅包含左右子结点,同时包含指向父结点的指针. package com.exe7.offer; /** ...
- Chrome离线下载地址
每当chrome有更新之后,都有不少用户想要下载离线版的安装文件,但苦于找不到下载地址而发愁,其实这个问题很简单,下面我来分享一下方法(仅针对Windows操作系统): 对于稳定版(正式版)Chrom ...
- erlang接入远程shell
两种方式 erl -name aaa@127.0.0.1 -setcookie erl -name bbb@127.0.0.1 -setcookie ctrl + g进入jcl模式 h查看帮助 r ' ...