import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# set that we'll train our classifier with
training_set = featuresets[:1900]
# set that we'll test against
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

######################
Most Informative Features
               insulting = True              neg : pos    =     10.6 : 1.0
               ludicrous = True              neg : pos    =     10.1 : 1.0
                 winslet = True              pos : neg    =      9.0 : 1.0
                 detract = True              pos : neg    =      8.4 : 1.0
            breathtaking = True              pos : neg    =      8.1 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
          excruciatingly = True              neg : pos    =      7.6 : 1.0
                   warns = True              pos : neg    =      7.0 : 1.0
                   tracy = True              pos : neg    =      7.0 : 1.0
                 insipid = True              neg : pos    =      7.0 : 1.0
                 freddie = True              neg : pos    =      7.0 : 1.0
                   damon = True              pos : neg    =      5.9 : 1.0
                  debate = True              pos : neg    =      5.9 : 1.0
                 ordered = True              pos : neg    =      5.8 : 1.0
                    lang = True              pos : neg    =      5.7 : 1.0

#############################
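A caveat on word_features above: list(all_words.keys())[:3000] relies on dictionary key order. In NLTK 3, FreqDist is a collections.Counter subclass, so keys() follows insertion order rather than frequency order. To take the genuinely most frequent 3,000 words, most_common() is explicit; a small substitute for that one line, not the original author's code:

# Take the 3,000 highest-count words explicitly; most_common() returns
# (word, count) pairs sorted by descending frequency.
word_features = [w for (w, count) in all_words.most_common(3000)]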
## Save and restore the model
import pickle

save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
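As an aside, and not part of the original snippet, the same round trip is slightly safer with context managers, which close the files even if pickling raises an exception:

import pickle

# Equivalent save/load using `with`; the file handles are always closed.
with open("naivebayes.pickle", "wb") as f:
    pickle.dump(classifier, f)

with open("naivebayes.pickle", "rb") as f:
    classifier = pickle.load(f)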

Next, we build a voting classifier that inherits from NLTK's ClassifierI and use it to evaluate an ensemble; the ensemble combines NLTK's Naive Bayes classifier with several classification models from sklearn.
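As a standalone sketch of the voting logic used below (the vote values here are made up for illustration), statistics.mode returns the most common element, and the confidence is simply the winning share of the votes:

from statistics import mode

# Seven hypothetical votes, one per classifier in the ensemble. With an
# odd number of voters over two labels there is always a strict majority,
# so mode() never has to break a tie.
votes = ["pos", "neg", "pos", "pos", "neg", "pos", "pos"]
winner = mode(votes)                            # "pos"
confidence = votes.count(winner) / len(votes)   # 5/7 ≈ 0.714
print(winner, confidence)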

The pipeline reads the corpus and collects the 3,000 most frequent words, then builds features for each review by recording whether each of these 3,000 words appears in it; the positive/negative label comes from the category the review was filed under.
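For intuition, the feature dictionary built for a single review looks roughly like this (the words shown are hypothetical examples, assuming they made the top-3,000 list):

# Hypothetical excerpt of the dict find_features() returns for one review:
# every candidate word maps to True/False depending on membership.
features = {
    "plot": True,           # this review mentions "plot"
    "breathtaking": False,  # but not "breathtaking"
    "insipid": False,
    # ... and so on for the remaining candidate words
}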

import nltk
import random
import pickle
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

## Define VoteClassifier, which inherits from ClassifierI
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    ## Return the mode, i.e. the label that received the most votes
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    ## Define a confidence score: the fraction of classifiers
    ## that voted for the winning label
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
## Take the 3,000 most frequent words
word_features = list(all_words.keys())[:3000]

## Mark which of these words each review contains
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

#print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100)

####################################
out: Original Naive Bayes Algo accuracy percent: 66.0
Most Informative Features
                thematic = True              pos : neg    =      9.1 : 1.0
                secondly = True              pos : neg    =      8.5 : 1.0
                narrates = True              pos : neg    =      7.8 : 1.0
                 layered = True              pos : neg    =      7.1 : 1.0
                 rounded = True              pos : neg    =      7.1 : 1.0
                 supreme = True              pos : neg    =      7.1 : 1.0
                  crappy = True              neg : pos    =      6.9 : 1.0
               uplifting = True              pos : neg    =      6.2 : 1.0
                     ugh = True              neg : pos    =      5.3 : 1.0
                 gaining = True              pos : neg    =      5.1 : 1.0
                   mamet = True              pos : neg    =      5.1 : 1.0
                   wanda = True              neg : pos    =      4.9 : 1.0
                   onset = True              neg : pos    =      4.9 : 1.0
               fantastic = True              pos : neg    =      4.5 : 1.0
                   milos = True              pos : neg    =      4.4 : 1.0
MNB_classifier accuracy percent: 67.0
BernoulliNB_classifier accuracy percent: 67.0
LogisticRegression_classifier accuracy percent: 68.0
SGDClassifier_classifier accuracy percent: 57.99999999999999
LinearSVC_classifier accuracy percent: 67.0
NuSVC_classifier accuracy percent: 65.0
voted_classifier accuracy percent: 65.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 85.71428571428571

#########################################
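One caveat about these figures: random.shuffle(documents) reorders the corpus on every run, so the train/test split, and with it every accuracy above, varies between runs. A minimal sketch for reproducible runs (the seed value is arbitrary) is to seed the generator before shuffling:

import random

random.seed(42)            # any fixed seed; 42 is arbitrary
random.shuffle(documents)  # the split, and thus the accuracies, now repeat across runs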
