from numpy import *

import time
starttime = time.time() def loadDataSet():
postingList = [['my', 'dog', 'has', 'flea',
'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him',
'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute',
'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless',
'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how',
'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food',
'stupid']]
classVec = [0, 1, 0, 1, 0, 1]
return postingList, classVec def createVocabList(dataSet): # dataSet = postingList
vocabSet = set([]) # vocabSet = set(dataSet)
for document in dataSet:
vocabSet = vocabSet | set(document) #
return list(vocabSet) # createVocabList = list(set(dataSet)) def setOfWords2Vec(vocabList, inputSet):
returnVec = [0] * len(vocabList) # [0, 0 , 0 ,0,..] len(vocabList) 0
for word in vocabList:
if word in inputSet:
returnVec[vocabList.index(word)] = 1 + 1.0
else:
returnVec[vocabList.index(word)] = 1.0
print "the word: %s is not in my Vocabulary!" % word
return returnVec def txt2trainxy(filename1, filename2):
import re
reg = re.compile(r'\W*') #
# step 1: loading data...
print "stet 1: loading data..."
from os import listdir
ld1 = listdir('email/' + filename1); ld2 = listdir('email/' + filename2)
filelist = ld1 + ld2
trainy = ((filename1 + '\t') * len(ld1) + (filename2 + '\t') * len(ld2)).split() trainx = []; fulltext = []; i = 0
for File in filelist:
if i < len(ld1):
fr = reg.split(open('email/' + filename1 + '/' + File).readlines()[0].lower())
else:
fr = reg.split(open('email/' + filename2 + '/' + File).readlines()[0].lower())
trainx.append([f for f in fr if len(f) > 2]) #
fulltext.extend([f for f in fr if len(f) > 2]) #
i += 1
fulltext = list(set(fulltext))
# set of words
trainxws = [[list(set(item)).count(strg) + 1.0 for strg in fulltext] for item in trainx]
# bag of words
trainxwb = [[item.count(strg) + 1.0 for strg in fulltext] for item in trainx] return trainxws, trainxwb, trainy, trainx, fulltext def testx2vec(testx, fulltext):
# set of words
testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
# bag of words
testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
for word in testx:
if word not in fulltext:
print "the word: %s is not in my fulltext!" % word
return testxws, testxwb def bayes(testx, trainx, trainy, fulltext):
print "---Getting Prob..."
s = set(trainy); l = len(trainy); r = len(trainx[0])
IDs = [[id for id in range(l) if trainy[id] == item] for item in s]
logproby = [log(array(trainy.count(item)) / float(l)) for item in s]
numbxv = [sum([trainx[id] for id in ids], 0) for ids in IDs]
numbx = [sum([trainx[id] for id in ids]) + 2.0 for ids in IDs] #
probx = [numbxv[i] / float(numbx[i]) for i in range(len(s))]
logprobx = [[log(p[i]) for i in range(r)] for p in probx]
print "---Printing Prob..."
#print probx
print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][0]] # argsort() small to big
print trainy[IDs[0][0]]
print [fulltext[i] for i in (-array(probx)).argsort()[:,: 5][1]]
print trainy[IDs[1][0]]
"""
print IDs
print numbxv
print logprobx
""" # step 4: showing the result...
print "---Showing the result..."
# set of words
sumlogpxws = sum(array(logprobx) * testx, 1)
sumlogpxyws = array(sumlogpxws) + array(logproby)
#print logprobx
print sumlogpxws
print sum(array(probx) * testx, 1)
bestyws = trainy[IDs[sumlogpxyws.argmax()][0]]
print "---From set of words: ", bestyws
"""
# bag of words
sumlogpxwb = sum(array(logprobx) * testxwb, 1)
sumlogpxywb = array(sumlogpxwb) + array(logproby)
bestywb = trainy[IDs[sumlogpxywb.argmax()][0]]
print "---From bag of words: ", bestywb
"""
return bestyws def main():
# step 1: loading data...
trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam','ham')
print fulltext # step 2: training...
print "step 2: training..."
pass # step 3: testing...
print "step 3: testing..."
print "---Preparing testdata..."
import random
l = len(trainy)
testid = random.sample(range(l), 20)
testxxx = [trainxws[i] for i in testid]
testyyy = [trainy[i] for i in testid]
testtrainxws = [trainxws[i] for i in range(l) if i not in testid]
testtrainy = [trainy[i] for i in range(l) if i not in testid]
print "---Testing now..."
errorcount = 0; p = len(testid)
for i in range(p):
if bayes(testxxx[i], testtrainxws, testtrainy, fulltext) != testyyy[i]:
errorcount += 1
print errorcount
print p
print "---Errorrate is: ", (errorcount / float(p)) # step 4: showing the result
print "step 4: using..."
testx = ['love', 'my', 'dalmation']
print "the testx is: ", testx
print "---Changing testx into vector..."
testxws, testxwb = testx2vec(testx, fulltext)
#print testxws
bayes(testxws, testtrainxws, testtrainy, fulltext) main() """
trainx, trainy = loadDataSet()
fulltext = createVocabList(trainx)
print fulltext
print setOfWords2Vec(fulltext, trainx[0])
trainxws = []
for t in trainx:
trainxws.append(setOfWords2Vec(fulltext, t))
testEntry1 = ['love', 'my', 'dalmation']
testEntry2 = ['stupid', 'garbage']
bayes(testEntry1, trainxws, trainy, fulltext) """

bayes的更多相关文章

  1. 【十大经典数据挖掘算法】Naïve Bayes

    【十大经典数据挖掘算法】系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 朴素贝叶斯(Naïve Bayes) ...

  2. 最大似然判别法和Bayes公式判别法

    最大似然判别法 Bayes公式判别法

  3. [Machine Learning & Algorithm] 朴素贝叶斯算法(Naive Bayes)

    生活中很多场合需要用到分类,比如新闻分类.病人分类等等. 本文介绍朴素贝叶斯分类器(Naive Bayes classifier),它是一种简单有效的常用分类算法. 一.病人分类的例子 让我从一个例子 ...

  4. Spark MLlib 之 Naive Bayes

    1.前言: Naive Bayes(朴素贝叶斯)是一个简单的多类分类算法,该算法的前提是假设各特征之间是相互独立的.Naive Bayes 训练主要是为每一个特征,在给定的标签的条件下,计算每个特征在 ...

  5. 基于Bayes和KNN的newsgroup 18828文本分类器的Python实现

    向@yangliuy大牛学习NLP,这篇博客是数据挖掘-基于贝叶斯算法及KNN算法的newsgroup18828文本分类器的JAVA实现(上)的Python实现.入门为主,没有太多自己的东西. 1. ...

  6. Microsoft Naive Bayes 算法——三国人物身份划分

    Microsoft朴素贝叶斯是SSAS中最简单的算法,通常用作理解数据基本分组的起点.这类处理的一般特征就是分类.这个算法之所以称为“朴素”,是因为所有属性的重要性是一样的,没有谁比谁更高.贝叶斯之名 ...

  7. Naive Bayes理论与实践

    Naive Bayes: 简单有效的常用分类算法,典型用途:垃圾邮件分类 假设:给定目标值时属性之间相互条件独立 同样,先验概率的贝叶斯估计是 优点: 1. 无监督学习的一种,实现简单,没有迭代,学习 ...

  8. [ML] Naive Bayes for Text Classification

    TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...

  9. 朴素贝叶斯方法(Naive Bayes Method)

        朴素贝叶斯是一种很简单的分类方法,之所以称之为朴素,是因为它有着非常强的前提条件-其所有特征都是相互独立的,是一种典型的生成学习算法.所谓生成学习算法,是指由训练数据学习联合概率分布P(X,Y ...

  10. 数据挖掘十大经典算法(9) 朴素贝叶斯分类器 Naive Bayes

    贝叶斯分类器 贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类.眼下研究较多的贝叶斯分类器主要有四种, ...

随机推荐

  1. -_-#【Canvas】

    context.lineWidth = 0.5 incorrect display lineWidth=1 at html5 canvas canvas.save() canvas.restore() ...

  2. Java多线程的join()

    假设在main线程里又起了一个thread1线程,在调用了thread1.start()之后: 如果在main线程里调用了thread1.join(),那么main线程将会block,直到thread ...

  3. Java学习日记-2.2 增强后的switch-case

    switch-case语句的基本格式 switch(expression){ case condition1: ... break; case condition2: ... break; case ...

  4. [Design Pattern] Filter Pattern 简单案例

    Filter Pattern,即过滤模式,通过不同的过滤标准,或者低耦合将过滤标准组合在一起,对一组对象进行过滤,属于结构类的设计模式. 下面是一个过滤模式的简单案例. Criteria 定义过滤接口 ...

  5. [LeetCode] 187. Repeated DNA Sequences 解题思路

    All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACG ...

  6. Flask-SQLAlchemy获取一个字段里去掉重复的数据

    注意:可排序的列表内元素不可以是字典等复杂数据类型   比较容易记忆的是用内置的set l1 = ['b','c','d','b','c','a','a']l2 = list(set(l1))prin ...

  7. Gradle构建Java Web应用(转)

    转自:http://www.blogjava.net/jiangshachina/archive/2014/02/03/409285.html 本文是发布在java.net上的一篇摘自于<Gra ...

  8. Column count of mysql.proc is wrong. Expected 20, found 16. Created with MySQL 50096, now running 50173.

    IDEA链接mysql提示 Column count of mysql.proc is wrong. Expected 20, found 16. Created with MySQL 50096, ...

  9. java的 IO流之缓冲流(转载)

    java缓冲流本身不具IO功能,只是在别的流上加上缓冲提高效率,像是为别的流装上一种包装.当对文件或其他目标频繁读写或操作效率低,效能差.这时使用缓冲流能够更高效的读写信息.因为缓冲流先将数据缓存起来 ...

  10. Py3快速下载地址

    pip3.exe install 包名称 -i  http://mirrors.aliyun.com/pypi/simple  --trusted-host mirrors.aliyun.com