from numpy import *

import time
starttime = time.time() def loadDataSet():
postingList = [['my', 'dog', 'has', 'flea',
'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him',
'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute',
'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless',
'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how',
'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food',
'stupid']]
classVec = [0, 1, 0, 1, 0, 1]
return postingList, classVec def createVocabList(dataSet): # dataSet = postingList
vocabSet = set([]) # vocabSet = set(dataSet)
for document in dataSet:
vocabSet = vocabSet | set(document) #
return list(vocabSet) # createVocabList = list(set(dataSet)) def setOfWords2Vec(vocabList, inputSet):
returnVec = [0] * len(vocabList) # [0, 0 , 0 ,0,..] len(vocabList) 0
for word in vocabList:
if word in inputSet:
returnVec[vocabList.index(word)] = 1 + 1.0
else:
returnVec[vocabList.index(word)] = 1.0
print "the word: %s is not in my Vocabulary!" % word
return returnVec def txt2trainxy(filename1, filename2):
import re
reg = re.compile(r'\W*') #
# step 1: loading data...
print "stet 1: loading data..."
from os import listdir
ld1 = listdir('email/' + filename1); ld2 = listdir('email/' + filename2)
filelist = ld1 + ld2
trainy = ((filename1 + '\t') * len(ld1) + (filename2 + '\t') * len(ld2)).split() trainx = []; fulltext = []; i = 0
for File in filelist:
if i < len(ld1):
fr = reg.split(open('email/' + filename1 + '/' + File).readlines()[0].lower())
else:
fr = reg.split(open('email/' + filename2 + '/' + File).readlines()[0].lower())
trainx.append([f for f in fr if len(f) > 2]) #
fulltext.extend([f for f in fr if len(f) > 2]) #
i += 1
fulltext = list(set(fulltext))
# set of words
trainxws = [[list(set(item)).count(strg) + 1.0 for strg in fulltext] for item in trainx]
# bag of words
trainxwb = [[item.count(strg) + 1.0 for strg in fulltext] for item in trainx] return trainxws, trainxwb, trainy, trainx, fulltext def testx2vec(testx, fulltext):
# set of words
testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
# bag of words
testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
for word in testx:
if word not in fulltext:
print "the word: %s is not in my fulltext!" % word
return testxws, testxwb def bayes(testx, trainx, trainy, fulltext):
print "---Getting Prob..."
s = set(trainy); l = len(trainy); r = len(trainx[0])
IDs = [[id for id in range(l) if trainy[id] == item] for item in s]
logproby = [log(array(trainy.count(item)) / float(l)) for item in s]
numbxv = [sum([trainx[id] for id in ids], 0) for ids in IDs]
numbx = [sum([trainx[id] for id in ids]) + 2.0 for ids in IDs] #
probx = [numbxv[i] / float(numbx[i]) for i in range(len(s))]
logprobx = [[log(p[i]) for i in range(r)] for p in probx]
print "---Printing Prob..."
#print probx
print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][0]] # argsort() small to big
print trainy[IDs[0][0]]
print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][1]]
print trainy[IDs[1][0]]
"""
print IDs
print numbxv
print logprobx
""" # step 4: showing the result...
print "---Showing the result..."
# set of words
sumlogpxws = sum(array(logprobx) * testx, 1)
sumlogpxyws = array(sumlogpxws) + array(logproby)
#print logprobx
print sumlogpxws
print sum(array(probx) * testx, 1)
bestyws = trainy[IDs[sumlogpxyws.argmax()][0]]
print "---From set of words: ", bestyws
"""
# bag of words
sumlogpxwb = sum(array(logprobx) * testxwb, 1)
sumlogpxywb = array(sumlogpxwb) + array(logproby)
bestywb = trainy[IDs[sumlogpxywb.argmax()][0]]
print "---From bag of words: ", bestywb
"""
return bestyws def main():
# step 1: loading data...
trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam','ham')
print fulltext # step 2: training...
print "step 2: training..."
pass # step 3: testing...
print "step 3: testing..."
print "---Preparing testdata..."
import random
l = len(trainy)
testid = random.sample(range(l), 20)
testxxx = [trainxws[i] for i in testid]
testyyy = [trainy[i] for i in testid]
testtrainxws = [trainxws[i] for i in range(l) if i not in testid]
testtrainy = [trainy[i] for i in range(l) if i not in testid]
print "---Testing now..."
errorcount = 0; p = len(testid)
for i in range(p):
if bayes(testxxx[i], testtrainxws, testtrainy, fulltext) != testyyy[i]:
errorcount += 1
print errorcount
print p
print "---Errorrate is: ", (errorcount / float(p)) # step 4: showing the result
print "step 4: using..."
testx = ['love', 'my', 'dalmation']
print "the testx is: ", testx
print "---Changing testx into vector..."
testxws, testxwb = testx2vec(testx, fulltext)
#print testxws
bayes(testxws, testtrainxws, testtrainy, fulltext) main() """
trainx, trainy = loadDataSet()
fulltext = createVocabList(trainx)
print fulltext
print setOfWords2Vec(fulltext, trainx[0])
trainxws = []
for t in trainx:
trainxws.append(setOfWords2Vec(fulltext, t))
testEntry1 = ['love', 'my', 'dalmation']
testEntry2 = ['stupid', 'garbage']
bayes(testEntry1, trainxws, trainy, fulltext) """

bayes的更多相关文章

  1. 【十大经典数据挖掘算法】Naïve Bayes

    【十大经典数据挖掘算法】系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 朴素贝叶斯(Naïve Bayes) ...

  2. 最大似然判别法和Bayes公式判别法

    最大似然判别法 Bayes公式判别法

  3. [Machine Learning & Algorithm] 朴素贝叶斯算法(Naive Bayes)

    生活中很多场合需要用到分类,比如新闻分类.病人分类等等. 本文介绍朴素贝叶斯分类器(Naive Bayes classifier),它是一种简单有效的常用分类算法. 一.病人分类的例子 让我从一个例子 ...

  4. Spark MLlib 之 Naive Bayes

    1.前言: Naive Bayes(朴素贝叶斯)是一个简单的多类分类算法,该算法的前提是假设各特征之间是相互独立的.Naive Bayes 训练主要是为每一个特征,在给定的标签的条件下,计算每个特征在 ...

  5. 基于Bayes和KNN的newsgroup 18828文本分类器的Python实现

    向@yangliuy大牛学习NLP,这篇博客是数据挖掘-基于贝叶斯算法及KNN算法的newsgroup18828文本分类器的JAVA实现(上)的Python实现.入门为主,没有太多自己的东西. 1. ...

  6. Microsoft Naive Bayes 算法——三国人物身份划分

    Microsoft朴素贝叶斯是SSAS中最简单的算法,通常用作理解数据基本分组的起点.这类处理的一般特征就是分类.这个算法之所以称为“朴素”,是因为所有属性的重要性是一样的,没有谁比谁更高.贝叶斯之名 ...

  7. Naive Bayes理论与实践

    Naive Bayes: 简单有效的常用分类算法,典型用途:垃圾邮件分类 假设:给定目标值时属性之间相互条件独立 同样,先验概率的贝叶斯估计是 优点: 1. 无监督学习的一种,实现简单,没有迭代,学习 ...

  8. [ML] Naive Bayes for Text Classification

    TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...

  9. 朴素贝叶斯方法(Naive Bayes Method)

        朴素贝叶斯是一种很简单的分类方法,之所以称之为朴素,是因为它有着非常强的前提条件-其所有特征都是相互独立的,是一种典型的生成学习算法.所谓生成学习算法,是指由训练数据学习联合概率分布P(X,Y ...

  10. 数据挖掘十大经典算法(9) 朴素贝叶斯分类器 Naive Bayes

    贝叶斯分类器 贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类.眼下研究较多的贝叶斯分类器主要有四种, ...

随机推荐

  1. -_-#【Canvas】

    context.lineWidth = 0.5 incorrect display lineWidth=1 at html5 canvas canvas.save() canvas.restore() ...

  2. Java多线程的join()

    假设在main线程里又起了一个thread1线程,在调用了thread1.start()之后: 如果在main线程里调用了thread1.join(),那么main线程将会block,直到thread ...

  3. Java学习日记-2.2 增强后的switch-case

    switch-case语句的基本格式 switch(expression){ case condition1: ... break; case condition2: ... break; case ...

  4. [Design Pattern] Filter Pattern 简单案例

    Filter Pattern,即过滤模式,通过不同的过滤标准,或者低耦合将过滤标准组合在一起,对一组对象进行过滤,属于结构类的设计模式. 下面是一个过滤模式的简单案例. Criteria 定义过滤接口 ...

  5. [LeetCode] 187. Repeated DNA Sequences 解题思路

    All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACG ...

  6. Flask-SQLAlchemy获取一个字段里去掉重复的数据

    注意:可排序的列表内元素不可以是字典等复杂数据类型   比较容易记忆的是用内置的set l1 = ['b','c','d','b','c','a','a']l2 = list(set(l1))prin ...

  7. Gradle构建Java Web应用(转)

    转自:http://www.blogjava.net/jiangshachina/archive/2014/02/03/409285.html 本文是发布在java.net上的一篇摘自于<Gra ...

  8. Column count of mysql.proc is wrong. Expected 20, found 16. Created with MySQL 50096, now running 50173.

    IDEA链接mysql提示 Column count of mysql.proc is wrong. Expected 20, found 16. Created with MySQL 50096, ...

  9. java的 IO流之缓冲流(转载)

    java缓冲流本身不具IO功能,只是在别的流上加上缓冲提高效率,像是为别的流装上一种包装.当对文件或其他目标频繁读写或操作效率低,效能差.这时使用缓冲流能够更高效的读写信息.因为缓冲流先将数据缓存起来 ...

  10. Py3快速下载地址

    pip3.exe install 包名称 -i  http://mirrors.aliyun.com/pypi/simple  --trusted-host mirrors.aliyun.com