使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】

from sklearn.feature_extraction.text import CountVectorizer

import os

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

from sklearn import metrics

import matplotlib.pyplot as plt

import numpy as np

from sklearn import svm

from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf

import tflearn

from tflearn.layers.core import input_data, dropout, fully_connected

from tflearn.layers.conv import conv_1d, global_max_pool

from tflearn.layers.conv import conv_2d, max_pool_2d

from tflearn.layers.merge_ops import merge

from tflearn.layers.estimator import regression

from tflearn.data_utils import to_categorical, pad_sequences

from sklearn.neural_network import MLPClassifier

from tflearn.layers.normalization import local_response_normalization

from tensorflow.contrib import learn

# Vocabulary size cap passed as ``max_features`` to CountVectorizer by the
# bag-of-words feature extractors in this file.
max_features = 500

# Per-document sequence length: used as ``maxlen`` when padding samples, as
# the width of the network input layers, and by the vocabulary processor.
max_document_length = 1024

def load_one_file(filename):
    """Return the content of ``filename`` as one single-line string.

    Each line has newline characters and then carriage returns stripped
    from both ends; the stripped lines are concatenated with no separator
    between them.
    """
    with open(filename) as handle:
        pieces = [raw.strip('\n').strip('\r') for raw in handle]
    return "".join(pieces)

def load_files_from_dir(rootdir):
    """Load every regular file located directly inside ``rootdir``.

    Sub-directories are skipped; there is no recursion.

    Args:
        rootdir: path of the directory to scan.

    Returns:
        A list with one string per file, each produced by
        ``load_one_file``, ordered by file name.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and any seeded split made from it) reproducible.
    for entry in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, entry)
        if os.path.isfile(path):
            documents.append(load_one_file(path))
    return documents

def load_all_files():
    """Load the ham and spam mails of the enron1..enron4 folders.

    The folders are read from ``../data/mail/enron<N>/ham/`` and
    ``../data/mail/enron<N>/spam/``; each path is printed before loading.

    Returns:
        A ``(ham, spam)`` tuple of lists of document strings.
    """
    ham_docs = []
    spam_docs = []
    for index in range(1, 5):
        ham_dir = "../data/mail/enron%d/ham/" % index
        print("Load %s" % ham_dir)
        ham_docs.extend(load_files_from_dir(ham_dir))

        spam_dir = "../data/mail/enron%d/spam/" % index
        print("Load %s" % spam_dir)
        spam_docs.extend(load_files_from_dir(spam_dir))
    return ham_docs, spam_docs

def get_features_by_wordbag():
    """Build bag-of-words count features for all loaded mails.

    The vocabulary is capped by the module-level ``max_features``.

    Returns:
        ``(x, y)`` where ``x`` is the dense array obtained from the
        CountVectorizer output and ``y`` is a list of labels: 0 for every
        ham document followed by 1 for every spam document.
    """
    ham, spam = load_all_files()
    documents = ham + spam
    labels = [0] * len(ham) + [1] * len(spam)

    vectorizer_options = dict(
        decode_error='ignore',
        strip_accents='ascii',
        max_features=max_features,
        stop_words='english',
        max_df=1.0,
        min_df=1,
    )
    vectorizer = CountVectorizer(**vectorizer_options)
    print(vectorizer)

    counts = vectorizer.fit_transform(documents).toarray()
    return counts, labels

def show_diffrent_max_features():
    """Plot GaussianNB test accuracy against the bag-of-words vocabulary size.

    Sweeps ``max_features`` over ``range(1000, 20000, 2000)``. For each
    value the bag-of-words features are rebuilt, split 60/40 with
    ``random_state=0``, a GaussianNB model is fitted and its accuracy on
    the test part is recorded. The resulting curve is drawn once after the
    sweep and shown with ``plt.show()``.

    The module-level ``max_features`` is modified during the sweep because
    ``get_features_by_wordbag`` reads it; its previous value is restored
    before plotting, even if the sweep raises.
    """
    global max_features
    previous_max_features = max_features
    sizes = []
    scores = []
    try:
        for size in range(1000, 20000, 2000):
            # get_features_by_wordbag() picks the vocabulary cap up from
            # the module-level variable, so it has to be set globally.
            max_features = size
            print("max_features=%d" % size)
            x, y = get_features_by_wordbag()
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.4, random_state=0)
            gnb = GaussianNB()
            gnb.fit(x_train, y_train)
            y_pred = gnb.predict(x_test)
            sizes.append(size)
            scores.append(metrics.accuracy_score(y_test, y_pred))
    finally:
        max_features = previous_max_features

    # Draw the curve a single time (plotting inside the loop stacked one
    # partial line per iteration) and label it so legend() has an entry.
    plt.plot(sizes, scores, 'r', label="GaussianNB")
    plt.xlabel("max_features")
    plt.ylabel("metrics.accuracy_score")
    plt.title("metrics.accuracy_score VS max_features")
    plt.legend()
    plt.show()

def do_nb_wordbag(x_train, x_test, y_train, y_test):
    """Fit GaussianNB on the training split and report on the test split.

    Prints a banner, the accuracy score and the confusion matrix of the
    predictions for ``x_test`` against ``y_test``.
    """
    print("NB and wordbag")
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    for report in (metrics.accuracy_score, metrics.confusion_matrix):
        print(report(y_test, predicted))

def do_svm_wordbag(x_train, x_test, y_train, y_test):
    """Fit an ``svm.SVC`` with default settings and report on the test split.

    Prints a banner, the accuracy score and the confusion matrix of the
    predictions for ``x_test`` against ``y_test``.
    """
    print("SVM and wordbag")
    classifier = svm.SVC()
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print(accuracy)
    confusion = metrics.confusion_matrix(y_test, predicted)
    print(confusion)

def get_features_by_wordbag_tfidf():
    """Build TF-IDF features on top of binary bag-of-words vectors.

    A CountVectorizer with ``binary=True`` (vocabulary capped by the
    module-level ``max_features``) is fitted first; its dense output is
    then re-weighted by ``TfidfTransformer(smooth_idf=False)``.

    Returns:
        ``(x, y)`` where ``x`` is the dense TF-IDF array and ``y`` is a
        list of labels: 0 for every ham document followed by 1 for every
        spam document.
    """
    ham, spam = load_all_files()
    documents = ham + spam
    labels = [0] * len(ham) + [1] * len(spam)

    vectorizer = CountVectorizer(
        binary=True,
        decode_error='ignore',
        strip_accents='ascii',
        max_features=max_features,
        stop_words='english',
        max_df=1.0,
        min_df=1,
    )
    print(vectorizer)
    binary_counts = vectorizer.fit_transform(documents).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    weighted = transformer.fit_transform(binary_counts)

    return weighted.toarray(), labels

def do_cnn_wordbag(trainX, testX, trainY, testY):
    """Train a multi-branch 1-D convolutional classifier with tflearn.

    Samples go through ``pad_sequences`` with
    ``maxlen=max_document_length`` and the labels are turned into 2-class
    categorical vectors. The network is: embedding (128 outputs), three
    parallel ``conv_1d`` branches with filter sizes 3, 4 and 5, concat,
    global max pooling, dropout, and a 2-unit softmax layer trained with
    Adam on categorical cross-entropy. Training runs for 5 epochs with the
    test split used as validation set.
    """
    print("CNN and tf")

    # Input preparation (same order as the data is later consumed).
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Embedding over the fixed-width input layer.
    embedded = tflearn.embedding(
        input_data(shape=[None, max_document_length], name='input'),
        input_dim=1000000, output_dim=128)

    # One convolution branch per filter size, created in the order 3, 4, 5.
    branches = [
        conv_1d(embedded, 128, filter_size, padding='valid',
                activation='relu', regularizer="L2")
        for filter_size in (3, 4, 5)
    ]
    merged = merge(branches, mode='concat', axis=1)
    pooled = global_max_pool(tf.expand_dims(merged, 2))
    dropped = dropout(pooled, 0.8)
    scores = fully_connected(dropped, 2, activation='softmax')
    target = regression(scores, optimizer='adam', learning_rate=0.001,
                        loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(target, tensorboard_verbose=0)
    fit_options = dict(n_epoch=5, shuffle=True,
                       validation_set=(testX, testY),
                       show_metric=True, batch_size=100, run_id="spam")
    model.fit(trainX, trainY, **fit_options)

def do_rnn_wordbag(trainX, testX, trainY, testY):
    """Train an LSTM classifier with tflearn.

    Samples go through ``pad_sequences`` with
    ``maxlen=max_document_length`` and the labels are turned into 2-class
    categorical vectors. The network is: embedding (128 outputs), one LSTM
    layer with 128 units, and a 2-unit softmax layer trained with Adam on
    categorical cross-entropy. Training runs for 5 epochs with batch size
    10 and the test split used as validation set.
    """
    print("RNN and wordbag")

    # Input preparation.
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    inputs = tflearn.input_data([None, max_document_length])
    embedded = tflearn.embedding(inputs, input_dim=10240000, output_dim=128)
    recurrent = tflearn.lstm(embedded, 128, dropout=0.8)
    scores = tflearn.fully_connected(recurrent, 2, activation='softmax')
    target = tflearn.regression(scores, optimizer='adam', learning_rate=0.001,
                                loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(target, tensorboard_verbose=0)
    fit_options = dict(validation_set=(testX, testY), show_metric=True,
                       batch_size=10, run_id="spm-run", n_epoch=5)
    model.fit(trainX, trainY, **fit_options)

def do_dnn_wordbag(x_train, x_test, y_train, y_testY):
    """Fit a small multi-layer perceptron and report on the test split.

    Args:
        x_train: training feature matrix.
        x_test: test feature matrix.
        y_train: training labels.
        y_testY: test labels. The unusual name is kept so existing callers
            are unaffected.

    Prints a banner, the classifier configuration, the accuracy score and
    the confusion matrix of the predictions for ``x_test``.
    """
    print("DNN and wordbag")

    # Building deep neural network
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)
    print(clf)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Score against the labels that were passed in; the body used to read
    # an undefined local name ``y_test`` and only worked when a global of
    # that name happened to exist.
    print(metrics.accuracy_score(y_testY, y_pred))
    print(metrics.confusion_matrix(y_testY, y_pred))

def get_features_by_tf():
    """Encode all loaded mails with tflearn's VocabularyProcessor.

    The processor is created with
    ``max_document_length=max_document_length`` and ``min_frequency=0``
    and fitted on the ham documents followed by the spam documents.

    Returns:
        ``(x, y)`` where ``x`` is a numpy array built from the processor
        output and ``y`` is a list of labels: 0 for every ham document
        followed by 1 for every spam document.
    """
    ham, spam = load_all_files()
    documents = ham + spam
    labels = [0] * len(ham) + [1] * len(spam)

    processor = tflearn.data_utils.VocabularyProcessor(
        max_document_length=max_document_length,
        min_frequency=0,
        vocabulary=None,
        tokenizer_fn=None)
    encoded = processor.fit_transform(documents, unused_y=None)

    return np.array(list(encoded)), labels

if __name__ == "__main__":
    # Script entry point: extract features, split 60/40 with a fixed seed
    # and train one of the classifiers. The commented-out lines are the
    # alternative feature extractors / models; enable one pair at a time.
    print("Hello spam-mail")

    #print "get_features_by_wordbag"
    #x,y=get_features_by_wordbag()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #print "get_features_by_wordbag_tfidf"
    #x,y=get_features_by_wordbag_tfidf()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #NB
    #do_nb_wordbag(x_train, x_test, y_train, y_test)
    #show_diffrent_max_features()

    #SVM
    #do_svm_wordbag(x_train, x_test, y_train, y_test)

    #DNN
    #do_dnn_wordbag(x_train, x_test, y_train, y_test)

    print("get_features_by_tf")
    # The CNN/RNN models start with an embedding layer over an input of
    # width max_document_length, so they need the word-id sequences from
    # get_features_by_tf(), not the bag-of-words count vectors that were
    # passed here before.
    x, y = get_features_by_tf()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.4, random_state=0)

    #CNN
    do_cnn_wordbag(x_train, x_test, y_train, y_test)

    #RNN
    #do_rnn_wordbag(x_train, x_test, y_train, y_test)

自己编写检测算法时，也要记得多用几种算法进行对比，再选择效果最好的一种。

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】的更多相关文章

Atitti 文本分类以及垃圾邮件判断原理以及贝叶斯算法的应用解决方案
Atitti 文本分类以及垃圾邮件判断原理以及贝叶斯算法的应用解决方案 1.1. 七.什么是贝叶斯过滤器?1 1.2. 八.建立历史资料库2 1.3. 十.联合概率的计算3 1.4. 十一. ...
基于Python的邮件检测工具
邮件快速检测工具概要介绍 mmpi,是一款使用python实现的开源邮件快速检测工具库,基于community框架设计开发.mmpi支持对邮件头.邮件正文.邮件附件的解析检测,并输出json检测报告 ...
CNN实现垃圾邮件分类(行大小不一致要补全)
以下是利用卷积神经网络对某一个句子的处理结构图我们从上图可知,将一句话转化成一个矩阵.我们看到该句话有6个单词和一个标点符号,所以我们可以将该矩阵设置为7行,对于列的话每个单词可以用什么样的数值表示 ...
数据挖掘、目标检测中的cnn和cn---卷积网络和卷积神经网络
content 概述文字识别系统LeNet-5 简化的LeNet-5系统卷积神经网络的实现问题深度神经网路已经在语音识别,图像识别等领域取得前所未有的成功.本人在多年之前也曾接触过神经网络.本系 ...
【深度学习系列】PaddlePaddle垃圾邮件处理实战（二）
PaddlePaddle垃圾邮件处理实战(二) 前文回顾在上篇文章中我们讲了如何用支持向量机对垃圾邮件进行分类,auc为73.3%,本篇讲继续讲如何用PaddlePaddle实现邮件分类,将深度 ...
如何基于TensorFlow使用LSTM和CNN实现时序分类任务
https://www.jiqizhixin.com/articles/2017-09-12-5 By 蒋思源2017年9月12日 09:54 时序数据经常出现在很多领域中,如金融.信号处理.语音识别 ...
Deep Learning模型之：CNN卷积神经网络（一）深度解析CNN
http://m.blog.csdn.net/blog/wu010555688/24487301 本文整理了网上几位大牛的博客,详细地讲解了CNN的基础结构与核心思想,欢迎交流. [1]Deep le ...
postfix反垃圾邮件说明
参考地址:http://guailele.blog.51cto.com/1156442/780223 1.打开 smtp 的认证模块在/etc/postfix/main.cf文件最后加上: sm ...
postfix疯狂外发垃圾邮件
分析一.查找main.cf配置文件 localhost# find / -name main.cf /etc/postfix/main.cf 二.打开/etc/postfix/main.cf来看看. ...

随机推荐

关于Html基础语法学习
晚上做完初赛,好像有点颓,就来学了学html,毕竟博客里面会用到嘛. 首先贴出我所学习的教程 http://www.w3school.com.cn/html/index.asp 我觉得吧,可能以我的记 ...
关于api接口文档RAP和swagger
前言: 在之前的项目中用了将近一年的RAP,RAP是由阿里开源出来的,非常好用.github地址:https://github.com/thx/RAP. 当初在用此工具时,项目成员需要在接口文档在所改 ...
pinpoint体系中,关于如何清理过期hbase数据
版本: pinpoint:1.7.1 hbase:1.2.6 命令行命令: $HBASE_HOME/bin/hbase shell newrestruct.hbase 备注:保留一天半的数据(秒 ...
python第三方模块大杂烩
Python单元测试框架之pytest---如何执行测试用例 unittest单元测试框架实现参数化 (用例有相似参数断言时使用,可以精简代码) python中标示符作用详解一篇文章让你彻底搞清楚P ...
uva 1658 Admiral 【最小费用最大流】
拆点,每个点拆成 i,i' 在i 和i‘之间连一条费用为0,容量为1的边,就可以保证每个点只经过一次特殊的点,1和n之间,,,n和2*n之间连一条费用为0,容量为2的边,可以求出两条路径 #incl ...
PhotoZoom Classic 7有什么用？高品质的放大模糊图片！
PhotoZoom Classic 7专门用于放大照片,同时保持质量.该软件配备了BenVista独特的S-Spline技术,可轻松超越Photoshop的双三次插值等替代解决方案. PhotoZoo ...
Robot Framework（二）测试数据语法
2.1.1文件和目录测试数据的层次结构安排如下: 测试数据在测试数据文件中创建. 测试数据文件会自动创建一个包含该文件中的测试数据的测试套件. 包含测试数据文件的目录构成了更高级别的测试套件.这样的 ...
vue项目布局
1.底部有分类布局类似这种底部有分类的,点击四个tap分别道不同的页面这样的,每个页面都是一个路由,把底部作为一个组件在每一个页面中引入就行.组件就是公用的,能公用的就写成组件.如下 { path: ...
day02_20190106 基础数据类型编码运算符
一.格式化输出 name = input('请输入姓名') age = input('请输入年龄') hobby = input('请输入爱好') job = input('请输入你的工作') # m ...
xshell 连接 ubuntu 16.04报错
outgoing encryption 错误使用xshell和xftp连接 ubuntu 16.04 时出现找不到匹配的 outgoing encryption 算法的错误提示. 问题阐述: 在 ...

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】的更多相关文章

随机推荐

热门专题