python 去除停用词 结巴分词 import jieba #stopwords = {}.fromkeys([ line.rstrip() for line in open('stopword.txt') ]) stopwords = {}.fromkeys(['的', '附近']) segs = jieba.cut('北京附近的租房', cut_all=False)final = ''for seg in segs: seg = seg.encode('gbk') if se
文章转载:http://blog.csdn.net/xiaoxiangzi222/article/details/53483931 jieba “结巴”中文分词:做最好的 Python 中文分词组件 "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module. Scroll down
# -*- coding:utf8 -*- import os import jieba def splitSentence(inputFile): fin = open(inputFile, 'r') #以读的方式打开文件 global fout #以写得方式打开文件 #print fin global stop for eachLine in fin: #print eachLine line = eachLine.strip()#.decode('utf-8', 'ignore') #去除