



先使用 `pip install nltk` 安装 NLTK 包,然后下载所需的语料与模型数据:


  1. import nltk
  # Called without arguments, this launches NLTK's interactive downloader so
  # the data packages can be picked by hand. The listings in this file rely on
  # the Brown corpus, the English stop-word list and the VADER sentiment
  # analyzer, so make sure the corresponding packages get installed.
  nltk.download()


  1. from nltk.corpus import brown
  # Quick sanity check that the Brown corpus is installed and readable.
  print(brown.categories())  # category names of the Brown corpus
  print(len(brown.sents()))  # number of sentences in the Brown corpus
  print(len(brown.words()))  # number of words in the Brown corpus
  # Output recorded by the author:
  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
  # 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance',
  # 'science_fiction']
  # 57340
  # 1161192








  1. import nltk.classify.util
  2. from nltk.classify import NaiveBayesClassifier
  3. import os
  4. from nltk.corpus import stopwords
  5. import pandas as pd
  def extract_features(word_list):
      """Build a bag-of-words feature dict for the classifier.

      Args:
          word_list: Iterable of hashable tokens (words).

      Returns:
          A dict mapping every distinct token to ``True``; repeated tokens
          collapse into a single key. An empty input yields an empty dict.
      """
      return {word: True for word in word_list}
  # Stop words: NLTK's English list extended with punctuation and suffix
  # fragments that should not count as features either.
  stop = stopwords.words('english')
  stop1 = ['!', ',', '.', '?', '-s', '-ly', ' ', 's', '...']
  stop = stop1 + stop
  print(stop)
  def readtxt(f, path, stop_words=None):
      """Read a UTF-8 text file and return its tokens with stop words removed.

      The file content is split on whitespace; tokens are compared verbatim
      (no lower-casing or punctuation stripping) against the stop-word list.

      Args:
          f: File name. It is appended directly to ``path``.
          path: Directory prefix. It is concatenated as ``path + f``, so it
              has to end with a path separator (e.g. ``'pos/'``).
          stop_words: Optional iterable of tokens to discard. When omitted,
              the module-level ``stop`` list is used.

      Returns:
          A list of the remaining tokens in file order (duplicates kept).
      """
      if stop_words is None:
          stop_words = stop
      # A set gives O(1) membership tests instead of scanning the list per token.
      excluded = set(stop_words)
      # The context manager closes the file even if reading or decoding fails.
      with open(path + f, encoding="utf-8") as handle:
          tokens = handle.read().split()
      return [token for token in tokens if token not in excluded]
  if __name__ == '__main__':
      # Train and evaluate a Naive Bayes sentiment classifier on review files.
      #
      # Each file under 'pos' / 'neg' holds one review. readtxt() removes stop
      # words (very common words such as "i", "am", "you", "a", "this") because
      # they may distort the result.
      #
      # os.listdir() returns names in arbitrary order; sorting makes the
      # train/test split below reproducible between runs and machines.
      positive_fileids = sorted(os.listdir('pos'))  # positive reviews (42 files in the author's data)
      print(type(positive_fileids), len(positive_fileids))
      negative_fileids = sorted(os.listdir('neg'))  # negative reviews (22 files in the author's data)
      print(type(negative_fileids), len(negative_fileids))

      # Turn every review into a (features, label) pair, e.g.
      # [({'works': True, 'well': True, ...}, 'Positive'), ...]
      path = 'pos/'
      features_positive = [
          (extract_features(readtxt(f, path=path)), 'Positive')
          for f in positive_fileids
      ]
      path = 'neg/'
      features_negative = [
          (extract_features(readtxt(f, path=path)), 'Negative')
          for f in negative_fileids
      ]

      # Split each class separately: the first 80% for training, the rest for testing.
      threshold_factor = 0.8
      threshold_positive = int(threshold_factor * len(features_positive))
      threshold_negative = int(threshold_factor * len(features_negative))

      features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
      features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
      print("\n训练数据点的数量:", len(features_train))
      print("测试数据点的数量:", len(features_test))

      # Train the Naive Bayes classifier and measure accuracy on the held-out part.
      classifier = NaiveBayesClassifier.train(features_train)
      print("\n分类器的准确性:", nltk.classify.util.accuracy(classifier, features_test))

      # Each item is a (feature name, feature value) pair; only the name is shown.
      print("\n五大信息最丰富的单词:")
      for item in classifier.most_informative_features(5):
          print(item[0])

      # A few simple reviews to classify.
      input_reviews = [
          "works well with proper preparation.",
      ]

      # Run the classifier and report the most likely label with its probability.
      print("\n预测:")
      for review in input_reviews:
          print("\n评论:", review)
          probdist = classifier.prob_classify(extract_features(review.split()))
          pred_sentiment = probdist.max()
          print("预测情绪:", pred_sentiment)
          print("可能性:", round(probdist.prob(pred_sentiment), 2))
      print("结束")


  1. <class 'list'> 42
  2. <class 'list'> 22
  3. 训练数据点的数量: 50
  4. 测试数据点的数量: 14
  5. 分类器的准确性: 1.0
  6. 五大信息最丰富的单词:
  7. microwave
  8. product
  9. works
  10. ever
  11. service
  12. 预测:
  13. 评论: works well with proper preparation.
  14. 预测情绪: Positive
  15. 可能性: 0.77
  16. 结束


  1. import pandas as pd
  2. from nltk.sentiment.vader import SentimentIntensityAnalyzer
  # Sentence-level sentiment analysis with VADER.
  # Sentiment analysis decides whether a piece of text is positive or negative
  # (sometimes with "neutral" as a third option) and is commonly used to find
  # out what people think about a particular topic.
  if __name__ == '__main__':
      # Sample reviews to score. The original listing first loaded the
      # 'review_body' column of '../data3/hair_dryer1.xlsx' and then replaced
      # it with this list straight away; that unused load (which fails when
      # the spreadsheet is absent) has been dropped.
      input_reviews = [
          "works well with proper preparation.",
          "i hate that opening the door moves the microwave towards you and out of its place. thats my only complaint.",
          "piece of junk. got two years of use and it died. customer service says too bad. whirlpool dishwasher died a few months ago. whirlpool is dead to me.",
          "am very happy with this",
      ]

      # One analyzer instance is enough; there is no need to rebuild it per sentence.
      sid = SentimentIntensityAnalyzer()
      for sentence in input_reviews:
          ss = sid.polarity_scores(sentence)
          print("句子:"+sentence)
          # Print the scores in alphabetical key order on a single line.
          for k in sorted(ss):
              print('{0}: {1}, '.format(k, ss[k]), end='')
          print()
      print("结束")


  1. 句子:works well with proper preparation.
  2. compound: 0.2732, neg: 0.0, neu: 0.656, pos: 0.344,
  3. 句子:i hate that opening the door moves the microwave towards you and out of its place. thats my only complaint.
  4. compound: -0.7096, neg: 0.258, neu: 0.742, pos: 0.0,
  5. 句子:piece of junk. got two years of use and it died. customer service says too bad. whirlpool dishwasher died a few months ago. whirlpool is dead to me.
  6. compound: -0.9432, neg: 0.395, neu: 0.605, pos: 0.0,
  7. 句子:am very happy with this
  8. compound: 0.6115, neg: 0.0, neu: 0.5, pos: 0.5,
  9. 结束







