1. 词频统计:

 import jieba
# Count how often each multi-character word occurs in the novel and print
# the 15 most frequent words with their counts.
with open("threekingdoms3.txt", "r", encoding='utf-8') as f:
    txt = f.read()
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Skip single-character tokens; only multi-character words are counted
    # (the sample output of this script lists none shorter than two
    # characters).
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# Most frequent first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing (instead of indexing range(15)) avoids an IndexError when the text
# yields fewer than 15 distinct words.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))


曹操 946
孔明 737
将军 622
玄德 585
却说 534
关公 509
荆州 413
二人 410
丞相 405
玄德曰 390
不可 387
孔明曰 374
张飞 358
如此 320
不能 318

进一步改进:我只想统计人物的出场次数,代码如下:

 import jieba
# Count appearances of the main characters: alternative names of the same
# person are merged into one canonical name before counting, and only the
# names listed in `names` are printed.
with open("threekingdoms3.txt", "r", encoding='utf-8') as f:
    txt = f.read()
names = {'曹操','孔明','刘备','关羽','张飞','吕布','赵云','孙权','周瑜','袁绍','黄忠','魏延'}
# Maps each alias (or alias fused with "曰" by the tokenizer) to the
# canonical name it is counted under.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are never counted.
    if len(word) == 1:
        continue
    # Words that are not a known alias are counted under their own text.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
items = list(counts.items())
# Most frequent first.
items.sort(key=lambda x: x[1], reverse=True)
# Only the 40 most frequent entries are examined; slicing avoids an
# IndexError when there are fewer than 40 distinct words.
for word, count in items[:40]:
    if word in names:
        print("{0:<10}{1:>5}".format(word, count))


曹操 1358
孔明 1265
刘备 1251
关羽 783
张飞 358
吕布 300
赵云 278
孙权 257
周瑜 217
袁绍 191


 import jieba
import os
import wordcloud


def getText(file):
    """Return the entire contents of the UTF-8 encoded text file *file*.

    Args:
        file: Path of the text file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file, 'r', encoding= 'UTF-8') as source:
        txt = source.read()
    return txt


# Read a base file name from standard input, render "<name>.txt" as a word
# cloud and save it as "<name>.png".
directoryname = os.getcwd()
filename = input()
txt = getText(filename + '.txt')
# NOTE(review): no font_path is passed here; WordCloud accepts one, which is
# presumably what is needed to render Chinese text correctly -- confirm.
wordclouds = wordcloud.WordCloud(width=1000, height= 800, margin=2).generate(txt)
wordclouds.to_file('{}.png'.format(filename))
# Hands the image file name to the system shell; presumably meant to open the
# picture with the default viewer (Windows behaviour) -- confirm on other
# platforms.
os.system('{}.png'.format(filename))


wordcloud 库处理中文文本时默认会出现乱码(默认字体不支持中文),解决方法参考 https://blog.csdn.net/Dick633/article/details/80261233


