import nltk
import random
import pickle
from nltk.corpus import movie_reviews

## Build (word_list, category) pairs for every review in the corpus
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)

## Take the 3000 most frequent words as features
## (FreqDist.keys() is not ordered by frequency, so use most_common)
word_features = [w for (w, _) in all_words.most_common(3000)]

## Build a feature dict: for each feature word, whether it appears in the document
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# set that we'll train our classifier with
training_set = featuresets[:1900]
# set that we'll test against
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

######################
Most Informative Features
insulting = True neg : pos = 10.6 : 1.0
ludicrous = True neg : pos = 10.1 : 1.0
winslet = True pos : neg = 9.0 : 1.0
detract = True pos : neg = 8.4 : 1.0
breathtaking = True pos : neg = 8.1 : 1.0
silverstone = True neg : pos = 7.6 : 1.0
excruciatingly = True neg : pos = 7.6 : 1.0
warns = True pos : neg = 7.0 : 1.0
tracy = True pos : neg = 7.0 : 1.0
insipid = True neg : pos = 7.0 : 1.0
freddie = True neg : pos = 7.0 : 1.0
damon = True pos : neg = 5.9 : 1.0
debate = True pos : neg = 5.9 : 1.0
ordered = True pos : neg = 5.8 : 1.0
lang = True pos : neg = 5.7 : 1.0
#############################
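
To classify a new piece of text with the trained model, tokenize it the same way and pass the tokens to find_features. A minimal sketch (the sample_review string is a made-up example):

from nltk.tokenize import word_tokenize  # needs the nltk 'punkt' data downloaded

sample_review = "a breathtaking film , though the ending is ludicrous"
feats = find_features(word_tokenize(sample_review.lower()))
print(classifier.classify(feats))  # prints 'pos' or 'neg'
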
## Save the model with pickle, then restore it
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
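
After reloading, the unpickled classifier behaves exactly like the original; a quick sanity check (reusing testing_set from above):

## The reloaded classifier should score the same as before pickling
print("Reloaded accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)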

Next, use a voting classifier that inherits from NLTK's ClassifierI to evaluate the models as an ensemble; the members include NLTK's Naive Bayes classifier and several scikit-learn classification models.

Read the corpus and keep the 3000 most frequent words, then represent each review as a feature dict recording, for each of those 3000 words, whether it appears in the review; the pos/neg label for each review comes from the pre-labeled corpus. For example, a positive review containing 'breathtaking' but not 'insipid' maps to {'breathtaking': True, 'insipid': False, ...} with label 'pos'.

import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

## Define VoteClassifier, inheriting from nltk's ClassifierI
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    ## Return the mode of the votes, i.e. the label chosen by the most classifiers
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    ## Confidence: the fraction of classifiers that voted for the winning label
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
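
## Note: statistics.mode raises StatisticsError on a tie in Python versions
## before 3.8 (on 3.8+ it returns the first mode encountered). With an odd
## number of binary classifiers, as used below, the vote cannot tie.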

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
## Take the 3000 most frequent words
## (FreqDist.keys() is not ordered by frequency, so use most_common)
word_features = [w for (w, _) in all_words.most_common(3000)]
## Build a feature dict: for each feature word, whether it appears in the document
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

#print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) ##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) voted_classifier = VoteClassifier(classifier,
NuSVC_classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier) print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) ####################################
Output:
Original Naive Bayes Algo accuracy percent: 66.0
Most Informative Features
thematic = True pos : neg = 9.1 : 1.0
secondly = True pos : neg = 8.5 : 1.0
narrates = True pos : neg = 7.8 : 1.0
layered = True pos : neg = 7.1 : 1.0
rounded = True pos : neg = 7.1 : 1.0
supreme = True pos : neg = 7.1 : 1.0
crappy = True neg : pos = 6.9 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.3 : 1.0
gaining = True pos : neg = 5.1 : 1.0
mamet = True pos : neg = 5.1 : 1.0
wanda = True neg : pos = 4.9 : 1.0
onset = True neg : pos = 4.9 : 1.0
fantastic = True pos : neg = 4.5 : 1.0
milos = True pos : neg = 4.4 : 1.0
MNB_classifier accuracy percent: 67.0
BernoulliNB_classifier accuracy percent: 67.0
LogisticRegression_classifier accuracy percent: 68.0
SGDClassifier_classifier accuracy percent: 57.99999999999999
LinearSVC_classifier accuracy percent: 67.0
NuSVC_classifier accuracy percent: 65.0
voted_classifier accuracy percent: 65.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 85.71428571428571
#########################################
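
One practical use of confidence() is to act only on predictions the ensemble largely agrees on. A minimal sketch (the 0.8 threshold is an arbitrary value chosen for illustration):

feats, gold = testing_set[0]
label = voted_classifier.classify(feats)
if voted_classifier.confidence(feats) >= 0.8:
    print("Confident prediction:", label, "- gold label:", gold)
else:
    print("Low agreement, skipping this example")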
