NLTK的探索

import pickle
import random

import nltk
from nltk.corpus import movie_reviews

# Pair each review's token list with its category label, one entry per corpus file.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The list above is grouped by category; shuffle so the later train/test
# slices contain a mix of labels.
random.shuffle(documents)

# Frequency of every lower-cased token in the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as the feature vocabulary.
# most_common() orders by count; slicing keys() does not, because in NLTK 3
# FreqDist is a Counter subclass whose keys() are not sorted by frequency.
# NOTE: any script that loads a model trained on these features must build
# word_features the same way.
word_features = [word for word, _count in all_words.most_common(3000)]

def find_features(document):
    """Map every word in ``word_features`` to whether it occurs in *document*.

    document: an iterable of tokens.
    Returns a dict keyed by the vocabulary words (in ``word_features`` order)
    with boolean values.
    """
    # A set gives constant-time membership tests for the vocabulary scan below.
    present = set(document)
    return {feature: (feature in present) for feature in word_features}

# Sanity check: print the feature dict built for the review 'neg/cv000_29416.txt'.
sample_features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
print(sample_features)

# Turn every (tokens, label) pair into a (feature dict, label) pair.
featuresets = [(find_features(tokens), label) for tokens, label in documents]

# The first 1900 examples train the classifier; the remainder is held out
# for testing.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

# Accuracy is returned as a fraction; scale it to a percentage for display.
accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Classifier accuracy percent:", accuracy * 100)

classifier.show_most_informative_features(15)

######################

Most Informative Features

insulting = True neg : pos = 10.6 : 1.0

ludicrous = True neg : pos = 10.1 : 1.0

winslet = True pos : neg = 9.0 : 1.0

detract = True pos : neg = 8.4 : 1.0

breathtaking = True pos : neg = 8.1 : 1.0

silverstone = True neg : pos = 7.6 : 1.0

excruciatingly = True neg : pos = 7.6 : 1.0

warns = True pos : neg = 7.0 : 1.0

tracy = True pos : neg = 7.0 : 1.0

insipid = True neg : pos = 7.0 : 1.0

freddie = True neg : pos = 7.0 : 1.0

damon = True pos : neg = 5.9 : 1.0

debate = True pos : neg = 5.9 : 1.0

ordered = True pos : neg = 5.8 : 1.0

lang = True pos : neg = 5.7 : 1.0

#############################
##保存和恢复模型

# Persist the trained classifier so later runs can skip retraining.
# The with-blocks close the file even if dump/load raises.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

# Restore the classifier from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# you produced yourself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

自定义一个继承于nltk的ClassifierI的投票器（VoteClassifier）进行集体分类评估，参与投票的模型包括nltk的classifier和sklearn的一些分类模型

读取文本并统计出出现频率最高的前3000个词，然后对每篇事先带有好坏（pos/neg）标记的影评，逐一标记这3000个词是否出现在该影评中（True/False），以此作为分类特征

import nltk

import random

from nltk.corpus import movie_reviews

from nltk.classify.scikitlearn import SklearnClassifier

import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI

from statistics import mode

##定义VoteClassifier继承于ClassifierI

class VoteClassifier(ClassifierI):
    """Ensemble that labels a feature set by majority vote of its members."""

    def __init__(self, *classifiers):
        # Members are stored exactly as passed; each one is later asked for
        # classify(features).
        self._classifiers = classifiers

    def classify(self, features):
        """Return the mode of the members' votes, i.e. the most-voted label."""
        return mode([member.classify(features) for member in self._classifiers])

    def confidence(self, features):
        """Return the fraction of members that voted for the winning label.

        This is a vote share (winning votes / total votes), not a statistical
        confidence interval.
        """
        ballots = [member.classify(features) for member in self._classifiers]
        winner = mode(ballots)
        return ballots.count(winner) / len(ballots)

# Pair each review's token list with its category label, one entry per corpus file.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The list above is grouped by category; shuffle so the later train/test
# slices contain a mix of labels.
random.shuffle(documents)

# Frequency of every lower-cased token in the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Take the 3000 most frequent words as the feature vocabulary.
# most_common() orders by count; slicing keys() does not, because in NLTK 3
# FreqDist is a Counter subclass whose keys() are not sorted by frequency.
# NOTE: a classifier loaded from disk should have been trained with the same
# vocabulary; regenerate naivebayes.pickle if it was built with a different
# word selection.
word_features = [word for word, _count in all_words.most_common(3000)]

##标记词的好坏

def find_features(document):
    """Build the presence/absence feature dict for one document.

    document: an iterable of tokens.
    Returns ``{word: bool}`` for every word in ``word_features``, where the
    value says whether that word occurs in *document*.
    """
    # Deduplicate once so each vocabulary lookup is a set membership test.
    vocabulary_hits = set(document)
    return {word: (word in vocabulary_hits) for word in word_features}

# Turn every (tokens, label) pair into a (feature dict, label) pair.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# The first 1900 examples are for training; the rest is held out for testing.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# Reuse the Naive Bayes model stored in naivebayes.pickle instead of retraining
# (to retrain: classifier = nltk.NaiveBayesClassifier.train(training_set)).
# NOTE: documents are shuffled on every run, so if the pickle came from an
# earlier run, this testing_set is not guaranteed to be disjoint from the data
# that model was trained on.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The with-block closes the file even if loading fails.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

classifier.show_most_informative_features(15)

def _fit_and_report(estimator, message):
    """Wrap a scikit-learn estimator for NLTK, train it, and print its accuracy.

    estimator: an unfitted scikit-learn estimator instance.
    message: the label printed in front of the accuracy percentage.
    Returns the trained SklearnClassifier wrapper.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(message, (nltk.classify.accuracy(wrapped, testing_set)) * 100)
    return wrapped


# Each model is trained on training_set and scored on testing_set, in this order.
MNB_classifier = _fit_and_report(MultinomialNB(), "MNB_classifier accuracy percent:")

BernoulliNB_classifier = _fit_and_report(BernoulliNB(), "BernoulliNB_classifier accuracy percent:")

LogisticRegression_classifier = _fit_and_report(LogisticRegression(), "LogisticRegression_classifier accuracy percent:")

SGDClassifier_classifier = _fit_and_report(SGDClassifier(), "SGDClassifier_classifier accuracy percent:")

# A plain SVC() model was left disabled here by the original author.

LinearSVC_classifier = _fit_and_report(LinearSVC(), "LinearSVC_classifier accuracy percent:")

NuSVC_classifier = _fit_and_report(NuSVC(), "NuSVC_classifier accuracy percent:")

# Combine the pickled Naive Bayes model with the six scikit-learn models;
# seven voters in total.
voted_classifier = VoteClassifier(
    classifier,
    NuSVC_classifier,
    LinearSVC_classifier,
    SGDClassifier_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
)

ensemble_accuracy = nltk.classify.accuracy(voted_classifier, testing_set)
print("voted_classifier accuracy percent:", ensemble_accuracy * 100)

# Show the ensemble's label and vote share for the first six test examples.
for sample_index in range(6):
    sample = testing_set[sample_index][0]
    print("Classification:", voted_classifier.classify(sample), "Confidence %:", voted_classifier.confidence(sample) * 100)

####################################
out：

Original Naive Bayes Algo accuracy percent: 66.0

Most Informative Features

                thematic = True              pos : neg    =      9.1 : 1.0

                secondly = True              pos : neg    =      8.5 : 1.0

                narrates = True              pos : neg    =      7.8 : 1.0

                 layered = True              pos : neg    =      7.1 : 1.0

                 rounded = True              pos : neg    =      7.1 : 1.0

                 supreme = True              pos : neg    =      7.1 : 1.0

                  crappy = True              neg : pos    =      6.9 : 1.0

               uplifting = True              pos : neg    =      6.2 : 1.0

                     ugh = True              neg : pos    =      5.3 : 1.0

                 gaining = True              pos : neg    =      5.1 : 1.0

                   mamet = True              pos : neg    =      5.1 : 1.0

                   wanda = True              neg : pos    =      4.9 : 1.0

                   onset = True              neg : pos    =      4.9 : 1.0

               fantastic = True              pos : neg    =      4.5 : 1.0

                   milos = True              pos : neg    =      4.4 : 1.0

MNB_classifier accuracy percent: 67.0

BernoulliNB_classifier accuracy percent: 67.0

LogisticRegression_classifier accuracy percent: 68.0

SGDClassifier_classifier accuracy percent: 57.99999999999999

LinearSVC_classifier accuracy percent: 67.0

NuSVC_classifier accuracy percent: 65.0

voted_classifier accuracy percent: 65.0

Classification: neg Confidence %: 100.0

Classification: pos Confidence %: 57.14285714285714

Classification: neg Confidence %: 57.14285714285714

Classification: neg Confidence %: 57.14285714285714

Classification: pos Confidence %: 57.14285714285714

Classification: pos Confidence %: 85.71428571428571

#########################################

NLTK的探索的更多相关文章

探索 Python、机器学习和 NLTK 库开发一个应用程序，使用 Python、NLTK 和机器学习对 RSS 提要进行分类
挑战:使用机器学习对 RSS 提要进行分类最近,我接到一项任务,要求为客户创建一个 RSS 提要分类子系统.目标是读取几十个甚至几百个 RSS 提要,将它们的许多文章自动分类到几十个预定义的主题领域 ...
NLTK在自然语言处理
nltk-data.zip 本文主要是总结最近学习的论文.书籍相关知识,主要是Natural Language Processing(自然语言处理,简称NLP)和Python挖掘维基百科Infobox ...
NLTK学习笔记(五):分类和标注词汇
目录词性标注器标注语料库表示已经标注的标识符:nltk.tag.str2tuple('word/类型') 读取已经标注的语料库名词.动词.形容词等尝试找出每个名词类型中最频繁的名词探索已经 ...
使用Python中的NLTK和spaCy删除停用词与文本标准化
概述了解如何在Python中删除停用词与文本标准化,这些是自然语言处理的基本技术探索不同的方法来删除停用词,以及讨论文本标准化技术,如词干化(stemming)和词形还原(lemmatizatio ...
【探索】机器指令翻译成 JavaScript
前言前些时候研究脚本混淆时,打算先学一些「程序流程」相关的概念.为了不因太枯燥而放弃,决定想一个有趣的案例,可以边探索边学. 于是想了一个话题:尝试将机器指令 1:1 翻译成 JavaScript ...
【探索】利用 canvas 实现数据压缩
前言 HTTP 支持 GZip 压缩,可节省不少传输资源.但遗憾的是,只有下载才有,上传并不支持.如果上传也能压缩,那就完美了.特别适合大量文本提交的场合,比如博客园,就是很好的例子. 虽然标准不支持 ...
探索C#之6.0语法糖剖析
阅读目录: 自动属性默认初始化自动只读属性默认初始化表达式为主体的函数表达式为主体的属性(赋值) 静态类导入 Null条件运算符字符串格式化索引初始化异常过滤器when catch和fin ...
Mysql事务探索及其在Django中的实践（二）
继上一篇<Mysql事务探索及其在Django中的实践(一)>交代完问题的背景和Mysql事务基础后,这一篇主要想介绍一下事务在Django中的使用以及实际应用给我们带来的效率提升. 首先 ...
Linux学习之探索文件系统
Linux,一起学习进步- ls With it, we can see directory contents and determine a variety of important file ...

随机推荐

java接口的意义
java当中继承一个接口,要重写他的方法的话,那为什么还要多此一举的去实现一个接口呢? 直接把方法写在类当中不就可以了?就是说去掉类名后面的Implements 接口 ,可以不可以呢? 接口的最主要的 ...
NTP时钟同步学习记录
--1 要点回顾 . 1. NTP唯一配置文件:/etc/ntp.conf . 2. NTP系统日志记录:/var/log/ntp . 3. ntp.conf简要介绍 - 利用 restrict 来管 ...
Django之模板语言(二)-----Filter
1.其他常用的模板语言: 通过模板语言可以让前端页面显示数据,数据可以是基本数据类型,也可以是对象亦或者对象的列表,结合着模板中的for.if等配合使用. 要注意前端页面中,出现没有后端数据的情况,随 ...
架构发展史Spring Cloud
转自:https://www.iteye.com/news/32734 Spring Cloud作为一套微服务治理的框架,几乎考虑到了微服务治理的方方面面,之前也写过一些关于Spring Cloud文 ...
2018-8-10-dotnet-从入门到放弃的-500-篇文章合集
title author date CreateTime categories dotnet 从入门到放弃的 500 篇文章合集 lindexi 2018-08-10 19:16:52 +0800 2 ...
php 随意参数方法的使用
1, 用到的PHP函数: func_get_arg() / func_get_args()/ func_num_args 2, func_get_arg(index) :根据索引取得参数具体值 ...
vector以及array和数组
//比较数组.vector.array #include <iostream> #include <vector> #include <array> #includ ...
HZOI20190828模拟32题解
题面:https://www.cnblogs.com/Juve/articles/11428730.html chinese: 考虑$\sum\limits_{i=0}^{n*m}i*f_i$的意义: ...
软件-浏览器-GoogleChrome：Google Chrome
ylbtech-软件-浏览器-GoogleChrome:Google Chrome Google Chrome是一款由Google公司开发的网页浏览器,该浏览器基于其他开源软件撰写,包括WebKit, ...
Leetcode438.Find All Anagrams in a String找到字符串中所有字母异位词
给定一个字符串 s 和一个非空字符串 p,找到 s 中所有是 p 的字母异位词的子串,返回这些子串的起始索引. 字符串只包含小写英文字母,并且字符串 s 和 p 的长度都不超过 20100. 说明: ...

NLTK的探索

NLTK的探索的更多相关文章

随机推荐

热门专题