参考代码
ChineseClean_demo1.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
'''
# file 表示源文件名字,修改此处即可
file="./data/answer_detail_5_15307860968687.xls"
dirs="./result" def read_excel(rows_numb,cols_numb): f = xlwt.Workbook() #创建工作簿
'''
创建第一个sheet:
sheet1
'''
sheet1 = f.add_sheet(u'sheet1_1',cell_overwrite_ok=True) #创建sheet
sheet2 = f.add_sheet(u'sheet1_2',cell_overwrite_ok=True) #创建sheet
row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID'] # 打开文件
workbook = xlrd.open_workbook(file)
sheet0 = workbook.sheet_by_index(0) # sheet索引从0开始
cols = sheet0.col_values(cols_numb)
rows_list_1=[]
rows_list_2=[]
for i in range(1,len(cols)):
if cols[i] == '0':
rows_list_1.append(i)
else:
rows_list_2.append(i) for i in range(0,len(row0)):
sheet1.write(0,i,row0[i])
sheet2.write(0,i,row0[i])
'''
sheet1_1保存0分数据 '''
for j in range(0,len(rows_list_1)):
rows = sheet0.row_values(rows_list_1[j]) # 获取行内容
for i in range(0,len(rows)):
sheet1.write(j+1,i,rows[i])
''' sheet1_2保存非0分数据 '''
for j in range(0,len(rows_list_2)):
rows = sheet0.row_values(rows_list_2[j]) # 获取行内容
for i in range(0,len(rows)):
sheet2.write(j+1,i,rows[i]) f.save('./data/demo1.xls') #保存文件 if __name__ == '__main__':
# 读取文件的行和列
rows_numb=0
cols_numb=6
read_excel(rows_numb,cols_numb) ChineseClean_demo2.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4 '''
# file 表示源文件名字,修改此处即可
file="./data/demo1.xls"


def _copy_rows(source_sheet, row_indexes, target_sheet):
    """Copy the listed source rows into target_sheet, starting below its header row."""
    for out_row, src_row in enumerate(row_indexes, start=1):
        for col, value in enumerate(source_sheet.row_values(src_row)):
            target_sheet.write(out_row, col, value)


def read_excel(rows_numb,cols_numb):
    """Split the first sheet of ``file`` into four sheets by score band.

    Column ``cols_numb`` of every data row (row 0 is skipped as the header) is
    converted with ``float`` and the row is copied to:

    * ``sheet2_1`` when score < 12
    * ``sheet2_2`` when 12 <= score < 16
    * ``sheet2_3`` when 16 <= score < 18
    * ``sheet2_4`` when score >= 18

    The result is saved as ``./data/demo2.xls``.

    Args:
        rows_numb: Unused; kept so existing callers keep working.
        cols_numb: Index of the column holding the score.

    Raises:
        ValueError: If a score cell cannot be converted to ``float``.
    """
    header = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']

    f = xlwt.Workbook()  # output workbook
    sheets = [
        f.add_sheet(name, cell_overwrite_ok=True)
        for name in (u'sheet2_1', u'sheet2_2', u'sheet2_3', u'sheet2_4')
    ]
    for sheet in sheets:
        for col, title in enumerate(header):
            sheet.write(0, col, title)

    sheet0 = xlrd.open_workbook(file).sheet_by_index(0)  # sheet indexes start at 0
    cols = sheet0.col_values(cols_numb)

    buckets = [[] for _ in sheets]
    for i in range(1, len(cols)):
        score = float(cols[i])  # convert once instead of up to six times per row
        if score < 12.0:
            buckets[0].append(i)
        elif score < 16.0:
            buckets[1].append(i)
        elif score < 18.0:
            buckets[2].append(i)
        elif score >= 18.0:
            # The original had leftover debugging here (two prints followed by
            # exit()), which terminated the script on the first top-band row,
            # before anything was saved.
            buckets[3].append(i)

    for row_indexes, sheet in zip(buckets, sheets):
        _copy_rows(sheet0, row_indexes, sheet)
    f.save('./data/demo2.xls')


if __name__ == '__main__':
    # Row and column to read from the source file.
    rows_numb=0
    cols_numb=6
    read_excel(rows_numb,cols_numb)
ChineseClean_demo3.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4 '''
file="./data/answer_detail_5_15307860968687.xls"

# AbilityID value -> position of the output sheet that collects its rows.
_ABILITY_SHEET_INDEX = {
    '100012': 0,
    '100014': 1,
    '100007': 2,
    '100016': 3,
    '100017': 4,
}


def _ability_key(value):
    """Return the cell value as the text key used in ``_ABILITY_SHEET_INDEX``.

    xlrd returns numeric cells as floats, so 100012.0 is mapped to '100012';
    anything else is returned unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return value


def _copy_rows(source_sheet, row_indexes, target_sheet):
    """Copy the listed source rows into target_sheet, starting below its header row."""
    for out_row, src_row in enumerate(row_indexes, start=1):
        for col, value in enumerate(source_sheet.row_values(src_row)):
            target_sheet.write(out_row, col, value)


def read_excel(rows_numb,cols_numb):
    """Split the first sheet of ``file`` into one sheet per AbilityID.

    Column ``cols_numb`` of every data row (row 0 is skipped as the header) is
    looked up in ``_ABILITY_SHEET_INDEX``; matching rows are copied to
    ``sheet1`` .. ``sheet5`` and rows with any other value are dropped. The
    result is saved as ``./data/demo3.xls``.

    Args:
        rows_numb: Unused; kept so existing callers keep working.
        cols_numb: Index of the column holding the AbilityID.
    """
    header = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']

    f = xlwt.Workbook()  # output workbook
    sheets = [
        f.add_sheet(name, cell_overwrite_ok=True)
        for name in (u'sheet1', u'sheet2', u'sheet3', u'sheet4', u'sheet5')
    ]
    for sheet in sheets:
        for col, title in enumerate(header):
            sheet.write(0, col, title)

    sheet0 = xlrd.open_workbook(file).sheet_by_index(0)  # sheet indexes start at 0
    cols = sheet0.col_values(cols_numb)

    buckets = [[] for _ in sheets]
    for i in range(1, len(cols)):
        target = _ABILITY_SHEET_INDEX.get(_ability_key(cols[i]))
        if target is None:
            continue
        if target == 4:
            # Diagnostic output the original emitted for AbilityID 100017 only.
            print(i)
            print(type(cols[i]))
        buckets[target].append(i)

    for row_indexes, sheet in zip(buckets, sheets):
        _copy_rows(sheet0, row_indexes, sheet)
    f.save('./data/demo3.xls')


if __name__ == '__main__':
    # Row and column to read from the source file.
    rows_numb=0
    cols_numb=7
    read_excel(rows_numb,cols_numb)
ChineseClean_demo4or5.py:
同ChineseClean_demo3.py
ChineseClean_answer_QA.py:
# -*- coding:utf-8 -*-
import re
import xlrd
file="./data/demo5.xls"
dirs="./result" def read_excel(rows_numb,cols1_numb):
number='1'
f2 = open(dirs+'./demo5_sheet1_%s.csv'%number, 'a', encoding='utf-8')
# 打开文件
workbook = xlrd.open_workbook(file)
sheet0 = workbook.sheet_by_index(int(number)-1) # sheet索引从0开始
cols1 = sheet0.col_values(cols1_numb[3]) [1:]# 获取列内容 p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
pattern1 = re.compile(p1)
for i in range(len(cols1)):
matcher1 = re.findall(pattern1, cols1[i])
str1=str()
if matcher1:
str1 = ' '.join(matcher1)
f2.write(str1)
f2.write('\n') f2.close() if __name__ == '__main__':
# 读取文件的行和列
rows_numb=0
cols1_numb=[0,1,2,3,4,5,6,7]
read_excel(rows_numb,cols1_numb) qa_test_clean_word.py:
# -*- coding: utf-8 -*-
import jieba
# 创建停用词list
def stopwordslist(filepath):
    """Return the stop words stored one per line in a UTF-8 text file.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace (blank lines yield empty strings).
    """
    # The context manager closes the handle the original left open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
# Segment a sentence into words
def seg_sentence(sentence, stopwords=None):
    """Segment a sentence with jieba and drop stop words.

    Args:
        sentence: Text to segment; surrounding whitespace is stripped first.
        stopwords: Optional container of words to drop. When omitted, the list
            is loaded from ``./test/stopwords.txt`` (the original behaviour);
            pass it explicitly to avoid re-reading the file on every call.

    Returns:
        The kept words, each followed by a single space, as one string.
    """
    if stopwords is None:
        stopwords = stopwordslist('./test/stopwords.txt')  # path of the stop-word file
    sentence_seged = jieba.lcut_for_search(sentence.strip(),HMM=True)
    return ''.join(
        word + " "
        for word in sentence_seged
        if word not in stopwords and word != '\t'
    )


# Load the stop words once; a set makes the per-word membership test O(1).
_stopword_set = set(stopwordslist('./test/stopwords.txt'))
_skipped = 0
# NOTE(review): the output file is opened with the platform default encoding,
# as in the original; lines it cannot encode are skipped and counted instead
# of being silently swallowed by a bare ``except``.
with open('./data/demo5_answer_csv/demo5_sheet5_5.csv', 'r', encoding='utf-8') as inputs, \
        open('./test/demo5_sheet5_5_5.csv', 'w') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line, _stopword_set)
        if not line_seg:
            continue
        try:
            outputs.write(line_seg + '\n')
        except UnicodeEncodeError:
            _skipped += 1
if _skipped:
    print('skipped %d line(s) that could not be encoded' % _skipped)
word_fre.py:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from matplotlib.font_manager import *
import numpy as np


def _gradient_colors(count):
    """Return ``count`` bar colours starting at '#FA8072'.

    Each following colour lowers the last four digits by one, read as a
    decimal number (8072, 8071, ...), exactly as the original did. This only
    yields valid hex colours while the number keeps four digits, which holds
    for the 30 bars drawn here.
    """
    return ['#FA%d' % (8072 - step) for step in range(count)]


def drawStatBarh():
    """Draw a horizontal bar chart of the most frequent words.

    Reads ``./data/word_fre.txt`` (one ``word count`` pair per line), prints
    each word, and plots at most the first 30 entries with a colour gradient
    and the count printed next to each bar.
    """
    fig, ax = plt.subplots()
    myfont = FontProperties(fname='./data/simfang.ttf')
    N = 30
    words = []
    counts = []
    with open('./data/word_fre.txt') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) < 2:
                # Blank or malformed line: nothing to plot.
                continue
            print(fields[0])
            words.append(fields[0])
            counts.append(int(fields[1]))

    # The original always used 30 bar positions and failed with a shape
    # mismatch when the file held fewer words.
    shown = min(N, len(words))
    if shown == 0:
        print('no word frequencies to plot')
        return

    y_pos = np.arange(shown)
    colors = _gradient_colors(shown)
    rects = ax.barh(y_pos, counts[:shown], align='center', color=colors)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(words[:shown],fontproperties=myfont)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_title('报告中的高频词汇',fontproperties=myfont, fontsize=17)
    ax.set_xlabel(u"出现次数",fontproperties=myfont)

    autolabel(rects, ax)
    plt.show()


def autolabel(rects, ax):
    """Write each bar's width as a text label just past the end of the bar.

    Args:
        rects: The bar container returned by ``ax.barh``.
        ax: The axes the bars were drawn on.
    """
    for rect in rects:
        width = rect.get_width()
        ax.text(1.03 * width, rect.get_y() + rect.get_height()/2.,
                '%d' % int(width),ha='center', va='center')


def wordCount(segment_list):
    """Count word frequencies and store them in ``./data/word_fre.txt``.

    The result is not needed for the word cloud; it feeds the frequency bar
    chart drawn by ``drawStatBarh``.

    Args:
        segment_list: Segmented text whose words are separated by whitespace.

    The words are written one per line as ``word count``, most frequent
    first; words with equal counts keep their order of first appearance.
    The sorted list is also printed.
    """
    from collections import Counter  # local import: only used here

    # split() separates on any whitespace. The original split on ' ' only, so
    # the first word of every line carried a leading newline and was counted
    # separately from the same word elsewhere.
    word_dict_sorted = Counter(segment_list.split()).most_common()
    print(word_dict_sorted)
    with open('./data/word_fre.txt','w') as wf2:
        wf2.writelines('%s %d\n' % (word, count) for word, count in word_dict_sorted)


if __name__ == "__main__":
    with open('./data/demo5_sheet5_1_1.csv') as _source:
        segment_list_remove_stopwords = _source.read()
    wordCount(segment_list_remove_stopwords)
    drawStatBarh()
wordcloud_test2.py:
# - * - coding: utf - 8 -*- from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
# jieba.load_userdict("txt\userdict.txt")
# 添加用户词库为主词典,原词典变为非主词典
from wordcloud import WordCloud, ImageColorGenerator # 获取当前文件路径
# __file__ 为当前文件, 在ide中运行此行会报错,可改为
# d = path.dirname('.')
d = path.dirname(__file__) stopwords = {}
isCN = 1 #默认启用中文分词
back_coloring_path = "data/lz1.jpg" # 设置背景图片路径
text_path = 'data/demo5_sheet5_1_1.csv' #设置要分析的文本路径,讲原始文件转化为‘ANSI编码即可’
font_path = 'data/simfang.ttf' # 为matplotlib设置中文字体路径
stopwords_path = 'data/stopwords.txt' # 停用词词表
imgname1 = "data/WordCloudDefautColors.png" # 保存的图片名字1(只按照背景图片形状)
imgname2 = "data/WordCloudColorsByImg.png"# 保存的图片名字2(颜色按照背景图片颜色布局生成) # my_words_list = ['CHENGLEI'] # 在结巴的词库中添加新词 back_coloring = imread(path.join(d, back_coloring_path))# 设置背景图片 # 设置词云属性
wc = WordCloud(font_path=font_path, # 设置字体
background_color="white", # 背景颜色
max_words=2000, # 词云显示的最大词数
mask=back_coloring, # 设置背景图片
max_font_size=100, # 字体最大值
random_state=42,
width=1000, height=860, margin=2,# 设置图片默认的大小,但是如果使用背景图片的话,那么保存的图片大小将会按照其大小保存,margin为词语边缘距离
) # 添加自己的词库分词
# def add_word(list):
# for items in list:
# jieba.add_word(items) # add_word(my_words_list) text = open(path.join(d, text_path)).read() # def jiebaclearText(text):
# mywordlist = []
# seg_list = jieba.cut(text, cut_all=False)
# liststr="/ ".join(seg_list)
# f_stop = open(stopwords_path)
# try:
# f_stop_text = f_stop.read( )
# f_stop_text=unicode(f_stop_text,'utf-8')
# finally:
# f_stop.close( )
# f_stop_seg_list=f_stop_text.split('\n')
# for myword in liststr.split('/'):
# if not(myword.strip() in f_stop_seg_list) and len(myword.strip())>1:
# mywordlist.append(myword)
# return ''.join(mywordlist)
#
# if isCN:
# text = jiebaclearText(text) # 生成词云, 可以用generate输入全部文本(wordcloud对中文分词支持不好,建议启用中文分词),也可以我们计算好词频后使用generate_from_frequencies函数
wc.generate(text)
# wc.generate_from_frequencies(text)
# txt_freq例子为[('词a', 100),('词b', 90),('词c', 80)]
# 从背景图片生成颜色值
image_colors = ImageColorGenerator(back_coloring) plt.figure()
# 以下代码显示图片
plt.imshow(wc)
plt.axis("off")
plt.show()
# 绘制词云 # 保存图片
wc.to_file(path.join(d, imgname1)) image_colors = ImageColorGenerator(back_coloring) plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
# 绘制背景图片为颜色的图片
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
# 保存图片
wc.to_file(path.join(d, imgname2)) lda_test_ok.py:
# coding=utf-8 import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import lda def doc_topic_word():
print(doc_topic[:, :3])#输出文档主题分布情况(前3列)
print(topic_word[:, :3])#输出主题词分布情况(前3列),采用ifidf计算词频 #导出分布图
def plot_1():
# 计算各个主题中单词权重分布的情况
f, ax= plt.subplots(2, 1, figsize=(6, 6), sharex=True)
for i, k in enumerate([0, 9]): #任意选择两个主题
ax[i].stem(topic_word[k,:], linefmt='b-',
markerfmt='bo', basefmt='w-')
ax[i].set_xlim(-2,2000)
ax[i].set_ylim(0, 1)
ax[i].set_ylabel("Prob")
ax[i].set_title("topic {}".format(k)) ax[1].set_xlabel("word")
plt.tight_layout()
plt.show() def plot_2():
# 计算文档具体分布在那个主题,代码如下所示: f, ax= plt.subplots(2, 1, figsize=(8, 8), sharex=True)
for i, k in enumerate([0,9]): #任意选择两个主题
ax[i].stem(doc_topic[k,:], linefmt='r-',
markerfmt='ro', basefmt='w-')
ax[i].set_xlim(-1, 20) #x坐标下标,即主题的取值范围
ax[i].set_ylim(0, 1.2) #y坐标下标
ax[i].set_ylabel("Prob")
ax[i].set_title("Document {}".format(k))
ax[1].set_xlabel("Topic")
plt.tight_layout()
plt.show() if __name__ == "__main__": #存储读取语料 一行预料为一个文档
corpus = []
for line in open('./data/demo5_sheet5_1_1.csv', 'r').readlines():
corpus.append(line.strip()) #将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频
vectorizer = CountVectorizer()
print (vectorizer) X = vectorizer.fit_transform(corpus)
analyze = vectorizer.build_analyzer()
weight = X.toarray()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
print (len(weight))
print (weight[:5, :5]) #LDA算法
print ('LDA:')
model = lda.LDA(n_topics=20, n_iter=50, random_state=1)
# model.fit_transform(X)
model.fit(np.asarray(weight)) # model.fit_transform(X) is also available?
topic_word = model.topic_word_ # model.components_ also works #文档-主题(Document-Topic)分布
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape)) #输出前10篇文章最可能的Topic
label = []
for n in range(10):
topic_most_pr = doc_topic[n].argmax()
label.append(topic_most_pr)
print("doc: {} topic: {}".format(n, topic_most_pr)) #输出主题中的TopN关键词
word = vectorizer.get_feature_names()
n = 6
for i, topic_dist in enumerate(topic_word):
topic_words = np.array(word)[np.argsort(topic_dist)][:-(n+1):-1]
print(u'*Topic {}\n- {}'.format(i, ' '.join(topic_words))) # doc_topic_word()
# plot_1()
plot_2() gensimTopicTest0803.py:
# coding=utf-8
import re
import xlrd
import codecs
import jieba
from gensim import corpora, models, similarities

FILE="demo5"  # workbook to train on: ./data/<FILE>.xls
ID='1'  # ability ID to train on; selects sheet ID-1 of the workbook

# Load the stop-word list.
with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as _stopword_file:
    stopwords = [line.strip() for line in _stopword_file]

_STOPWORD_SET = frozenset(stopwords)  # O(1) membership test per token
# (?:...) is a non-capturing group; the classes keep the CJK range, curly
# double quotes, '-' and ':'.
_TEXT_PATTERN = re.compile(r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+")
_resource_cache = {}  # filled lazily by _load_resources()


def _clean_text(raw):
    """Return only the parts of ``raw`` matched by ``_TEXT_PATTERN``, concatenated."""
    return ''.join(_TEXT_PATTERN.findall(raw))


def _tokenize(text):
    """Segment ``text`` with jieba (accurate mode) and drop stop words and tabs.

    Returns an empty list when nothing is left; the original produced a list
    holding one empty string in that case.
    """
    return [
        word
        for word in jieba.cut(text, cut_all=False)
        if word not in _STOPWORD_SET and word != '\t'
    ]


def _load_resources():
    """Load the trained models, the similarity index and the sheet once per process.

    The original reloaded all of these on every ``questionAnswer`` call.
    """
    if not _resource_cache:
        lda = models.LdaModel.load('./result/ldaModel.pkl')
        dic = corpora.Dictionary.load('./result/dict_v1.dict')
        corpus = corpora.MmCorpus('./result/corpus.mm')
        tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')
        # The index must live in the same vector space as the query. The
        # original indexed the raw bag-of-words corpus but queried it with an
        # LDA topic vector, so topic ids were interpreted as word ids.
        index = similarities.MatrixSimilarity(lda[tfidf[corpus]],
                                              num_features=lda.num_topics)
        workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
        sheet0 = workbook.sheet_by_index(int(ID)-1)  # sheet indexes start at 0
        _resource_cache.update(lda=lda, dic=dic, tfidf=tfidf,
                               index=index, sheet0=sheet0)
    return _resource_cache


def cleanAnswer(cols_numb):
    """Write the cleaned answer text of the sheet to ``./result/<FILE>_sheet<ID>.csv``.

    Args:
        cols_numb: Sequence of column indexes; element 3 selects the answer column.

    One line is appended per data row (the header row is skipped), including
    an empty line for rows without matching text.
    """
    workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
    sheet0 = workbook.sheet_by_index(int(ID)-1)  # sheet indexes start at 0
    cols1 = sheet0.col_values(cols_numb[3])[1:]
    # NOTE(review): append mode duplicates the corpus when this is run twice;
    # left unchanged because that may be intentional.
    with open('./result/%s_sheet%s.csv'%(FILE,ID), 'a', encoding='utf-8') as f1:
        for answer in cols1:
            f1.write(_clean_text(answer).strip())
            f1.write('\n')


def ldaAnaly():
    """Train and save the dictionary, corpus, TF-IDF model and LDA model.

    Reads ``./result/<FILE>_sheet<ID>.csv`` (blank lines are skipped), and
    writes ``dict_v1.dict``, ``dic.csv``, ``corpus.mm``,
    ``corpus.tfidf_model`` and ``ldaModel.pkl`` under ``./result``. Finally
    prints the topic scores of the first document as a demo.
    """
    print("构造分词库-----train-----")
    train = []
    with codecs.open('./result/%s_sheet%s.csv'%(FILE,ID),'r',encoding='utf8') as fp:
        for line in fp:
            line = line.strip()
            if not line:  # skip blank lines
                continue
            train.append(_tokenize(line))

    print("构造分词库,并保存----“dict_v1.dict”----")
    dic = corpora.Dictionary(train)
    dic.save('./result/dict_v1.dict')

    print("保存可读取的分词库----“dic.csv”----")
    # NOTE(review): append mode, as in the original; re-running adds the
    # dictionary again below the previous dump.
    with codecs.open('./result/dic.csv', 'a',encoding = 'utf-8') as fd:
        for word,index in dic.token2id.items():
            fd.write(word +':'+ str(index)+'\n')

    print("生成语料库,并保存-----“corpus.mm”-----")
    corpus = [dic.doc2bow(text) for text in train]
    corpora.MmCorpus.serialize('./result/corpus.mm', corpus)

    print("保存tfidf模型-----“corpus.tfidf_model”-----")
    tfidf = models.TfidfModel(corpus)
    tfidf.save('./result/corpus.tfidf_model')

    print("进行LDA主题分析,并保存-----“ldaModel.pkl”-----")
    # Train on the TF-IDF weighted corpus: 100 topics, 500 iterations.
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=100, iterations=500 )
    lda.save('./result/ldaModel.pkl')
    _resource_cache.clear()  # models on disk changed; force a reload

    print("评估文章属于不同主题的概率,一个词对文章的重要性-----“Demo:评估文章1”-----")
    if corpus:
        for index, score in sorted(lda[corpus_tfidf[0]], key=lambda tup: -1 * tup[1]):
            print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 10)))


def questionAnswer(cols_numb, questionNumber):
    """Predict the score of one answer from its most similar training answers.

    The answer in row ``questionNumber`` is cleaned, segmented, converted to a
    TF-IDF weighted LDA topic vector and compared with every document of the
    saved corpus. The prediction is the similarity-weighted average of the
    scores belonging to the six most similar documents.

    Args:
        cols_numb: Sequence of column indexes; element 3 selects the answer
            column and element 6 the score column.
        questionNumber: Row index of the answer to predict (1 = first data row).

    Returns:
        A tuple ``(predicted_score, actual_score_cell, absolute_error)``.

    Raises:
        ValueError: If the saved corpus contains no documents.
    """
    resources = _load_resources()
    lda = resources['lda']
    dic = resources['dic']
    tfidf = resources['tfidf']
    sheet0 = resources['sheet0']

    raw_answer = sheet0.col_values(cols_numb[3])[questionNumber]
    query = _clean_text(raw_answer)
    query_bow = dic.doc2bow(_tokenize(query))
    lda_vec_tfidf = lda[tfidf[query_bow]]

    # Rank all corpus documents by similarity and keep the best six.
    sims = resources['index'][lda_vec_tfidf]
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    sort_sims_list = sort_sims[0:6]
    if not sort_sims_list:
        raise ValueError('the saved corpus contains no documents')

    cols1 = sheet0.col_values(cols_numb[6])[1:]  # scores without the header row

    # Similarity-weighted average of the neighbours' scores.
    # NOTE(review): the ``- 1`` is kept from the original. Corpus document d
    # appears to correspond to cols1[d] (document 0 currently wraps around to
    # the last row), and ldaAnaly skips blank lines, which shifts the mapping
    # further — confirm the intended row alignment.
    neighbour_scores = [float(cols1[doc_index - 1]) for doc_index, _ in sort_sims_list]
    sumCore1 = 0
    sumPro = 0
    for score, (_, weight) in zip(neighbour_scores, sort_sims_list):
        sumCore1 += score * weight
        sumPro += weight
    if sumPro:
        preCore1 = sumCore1 / sumPro
    else:
        # All similarities are zero: fall back to the plain average instead of
        # dividing by zero.
        preCore1 = sum(neighbour_scores) / len(neighbour_scores)

    actual = cols1[questionNumber-1]
    return preCore1, actual, abs(preCore1 - float(actual))


if __name__ == '__main__':
    cols_numb = [0,1,2,3,4,5,6,7]  # column indexes of the workbook
    # questionNumber = 124  # question to test; must not exceed the number of questions
    # cleanAnswer(cols_numb)  # extract and clean the answers from the workbook
    # ldaAnaly()  # train the topic model on the cleaned answers
    # questionAnswer(cols_numb, questionNumber)  # predict the score of one answer

    # Demo: predict every 8th answer and report the mean absolute error.
    # NOTE(review): 8717 is hard-coded and must not exceed the number of rows.
    total_error = 0
    count = 0
    with codecs.open('./result/pre_v1.csv', 'a', encoding='utf-8') as fp:
        for questionNumber in range(1, 8717, 8):
            a = questionAnswer(cols_numb, questionNumber)
            total_error += a[2]
            count += 1
            # The original logged the loop index after it had been advanced by
            # 8, so every line was labelled with the wrong question number.
            fp.write(str(questionNumber)+":"+str(a) + '\n')
    ave = total_error / count
    print(ave)

  

gensim自然语言处理的更多相关文章

  1. gensim自然语言处理(续)

    上一篇,已经实现了如何将一条语句在一个语料库中比较相似度, 发现运行的时候每次都要编译语料库,通过查找资料,可以一次性编译成语料库,存入文件 编译语料库代码 11_k.py import sysimp ...

  2. Python2和Python3的差异

    之前做Spark大数据分析的时候,考虑要做Python的版本升级,对于Python2和Python3的差异做了一个调研,主要对于语法和第三方工具包支持程度进行了比较. 基本语法差异 核心类差异 Pyt ...

  3. Python2和Python3比较分析

    一直有看到网上有讨论Python2和Python3的比较,最近公司也在考虑是否在spark-python大数据开发环境中升级到python3.通过本篇博文记录Python2.7.13和Python3.5 ...

  4. 面试题之python基础

    基础语法 输入和输出 代码中要修改不可变的数据会出现什么问题,抛出什么异常? 代码不会正常运行,抛出TypeError异常 a = 1,b = 2,不用中间变量交换a和b的值? # 方法1 a = a ...

  5. Python2 和Python3 的差异总结

    一.基本语法差异 1.1 核心类差异 Python3对Unicode字符的原生支持 Python2中使用 ASCII 码作为默认编码方式导致string有两种类型str和unicode,Python3 ...

  6. 巨蟒python全栈开发flask1

    1.整体把握 (1)路飞学城 - RestAPI 前后端分离开发 Django Vue.js - DRF DjangoRestFramework - 线上商城的经验 (2)智能玩具 - RestAPI ...

  7. python2和python3的区别(转)

    基本语法差异 核心类差异 Python3对Unicode字符的原生支持 Python2中使用 ASCII 码作为默认编码方式导致string有两种类型str和unicode,Python3只支持uni ...

  8. python 全栈开发,Day133(玩具与玩具之间的对话,基于jieba gensim pypinyin实现的自然语言处理,打包apk)

    先下载github代码,下面的操作,都是基于这个版本来的! https://github.com/987334176/Intelligent_toy/archive/v1.6.zip 注意:由于涉及到 ...

  9. 自然语言处理--jieba和gensim的分词功能

    一.jieba分词功能 1.主要模式 支持三种分词模式: 精确模式,试图将句子最精确地切开,适合文本分析: 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义: 搜索引擎模 ...

随机推荐

  1. 吴裕雄 python matplotlib 绘图示例

    import matplotlib.pyplot as plt plt.scatter([1,2,3,4],[2,3,2,5])plt.title('My first plot')plt.show() ...

  2. HTTP协议中request和response常用方法

    一.request的常用方法:1.获取请求的方式 getMethod()2.目录的路径 getContextPath()3.获取servlet路径 getServletString()4.获得get请 ...

  3. DOM节点遍历

    "DOM2级遍历和范围"模块定义了两个用于辅助完成顺序遍历DOM结构的类型:NodeIterator 和 TreeWalker .这两个类型能够根据给定的节点对DOM结构进行深度优 ...

  4. Debian 9 Stretch国内常用镜像源

     随着Debian 9的普及,但由于伟大的墙的存在,那就有必要整理一下国内的镜像站点. 1.使用说明 一般情况下,修改/etc/apt/sources.list文件,将Debian的默认源地址改成新的 ...

  5. 针对特定网站scrapy爬虫的性能优化

    在使用scrapy爬虫做性能优化时,一定要根据不同网站的特点来进行优化,不要使用一种固定的模式去爬取一个网站,这个是真理,以下是对58同城的爬取优化策略: 一.先来分析一下影响scrapy性能的set ...

  6. centos7 安装部署gitlab

    Gitlab官网地址:https://about.gitlab.com/downloads/ Linux系统环境: Centos7 gitlab服务安装之前需要安装一些依赖包:yum install ...

  7. vue学习笔记(nvm安装)

    https://github.com/creationix/nvm https://github.com/coreybutler/nvm-windows 慕课网:https://www.imooc.c ...

  8. Xamarin.Android 报错问题

    如果程序无法调试,输出中提示:(无法连接到logcat,GetProcessId 返回了:0) https://yq.aliyun.com/articles/618738

  9. Hbase的基本操作(CDH组件可用)

    HBase创建一张表:    1,创建一个命名空间NameSpace(命名空间NameSpace指的是一个表的逻辑分组 ,同一分组中的各个表有类似的用途,相当于关系型数据库中的DataBase)    ...

  10. Mad Libs游戏

    一. 简单的输入输出 输入代码 name1=input('请输入姓名:') name2=input('请输入一个句子:') name3=input('请输入一个地点:') name4=input('请 ...