Comparing fastText and a CNN for text classification on the Keras IMDB dataset: the CNN is roughly 10x slower per epoch.
The fastText model:
'''This example demonstrates the use of fastText for text classification.

Based on Joulin et al.'s paper:
Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759

Results on the IMDB dataset with uni- and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on an i7 CPU.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on a GTX 980M GPU.
'''
from __future__ import print_function

import json
import warnings

import numpy as np
from keras.preprocessing import sequence
# helper used by the local load_data() below when a maxlen filter is requested
from keras.preprocessing.sequence import _remove_long_seq
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb  # unused: a local load_data() reads imdb.npz directly


def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
"""Loads the IMDB dataset.
# Arguments
path: where to cache the data (relative to `~/.keras/dataset`).
num_words: max number of words to include. Words are ranked
by how often they occur (in the training set) and only
the most frequent words are kept
skip_top: skip the top N most frequently occurring words
(which may not be informative).
maxlen: sequences longer than this will be filtered out.
seed: random seed for sample shuffling.
start_char: The start of a sequence will be marked with this character.
Set to 1 because 0 is usually the padding character.
oov_char: words that were cut out because of the `num_words`
or `skip_top` limit will be replaced with this character.
index_from: index actual words with this index and higher.
# Returns
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
# Raises
ValueError: in case `maxlen` is so low
that no input sequence could be kept.
Note that the 'out of vocabulary' character is only used for
words that were present in the training set but are not included
because they're not making the `num_words` cut here.
Words that were not seen in the training set but are in the test set
have simply been skipped.
"""
# Legacy support
if 'nb_words' in kwargs:
warnings.warn('The `nb_words` argument in `load_data` '
'has been renamed `num_words`.')
num_words = kwargs.pop('nb_words')
if kwargs:
raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) #path = get_file(path,
# origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
# file_hash='599dadb1135973df5b59232a0e9a887c')
with np.load(path) as f:
x_train, labels_train = f['x_train'], f['y_train']
x_test, labels_test = f['x_test'], f['y_test'] np.random.seed(seed)
indices = np.arange(len(x_train))
np.random.shuffle(indices)
x_train = x_train[indices]
labels_train = labels_train[indices] indices = np.arange(len(x_test))
np.random.shuffle(indices)
x_test = x_test[indices]
labels_test = labels_test[indices] xs = np.concatenate([x_train, x_test])
labels = np.concatenate([labels_train, labels_test]) if start_char is not None:
xs = [[start_char] + [w + index_from for w in x] for x in xs]
elif index_from:
xs = [[w + index_from for w in x] for x in xs] if maxlen:
xs, labels = _remove_long_seq(maxlen, xs, labels)
if not xs:
raise ValueError('After filtering for sequences shorter than maxlen=' +
str(maxlen) + ', no sequence was kept. '
'Increase maxlen.')
if not num_words:
num_words = max([max(x) for x in xs]) # by convention, use 2 as OOV word
# reserve 'index_from' (=3 by default) characters:
# 0 (padding), 1 (start), 2 (OOV)
if oov_char is not None:
xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
else:
xs = [[w for w in x if skip_top <= w < num_words] for x in xs] idx = len(x_train)
x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
return (x_train, y_train), (x_test, y_test) def create_ngram_set(input_list, ngram_value=2):
"""
Extract a set of n-grams from a list of integers.
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
{(4, 9), (4, 1), (1, 4), (9, 4)}
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
[(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
"""
return set(zip(*[input_list[i:] for i in range(ngram_value)])) def add_ngram(sequences, token_indice, ngram_range=2):
"""
Augment the input list of list (sequences) by appending n-grams values.
Example: adding bi-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
>>> add_ngram(sequences, token_indice, ngram_range=2)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
Example: adding tri-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
>>> add_ngram(sequences, token_indice, ngram_range=3)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
"""
new_sequences = []
for input_list in sequences:
new_list = input_list[:]
for ngram_value in range(2, ngram_range + 1):
for i in range(len(new_list) - ngram_value + 1):
ngram = tuple(new_list[i:i + ngram_value])
if ngram in token_indice:
new_list.append(token_indice[ngram])
new_sequences.append(new_list) return new_sequences # Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5 print('Loading data...')
# the data, split between train and test sets
#(x_train, y_train), (x_test, y_test) = load_data()
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) if ngram_range > 1:
print('Adding {}-gram features'.format(ngram_range))
# Create set of unique n-gram from the training set.
ngram_set = set()
for input_list in x_train:
for i in range(2, ngram_range + 1):
set_of_ngram = create_ngram_set(input_list, ngram_value=i)
ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer.
# Integer values are greater than max_features in order
# to avoid collision with existing features.
start_index = max_features + 1
token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
indice_token = {token_indice[k]: k for k in token_indice} # max_features is the highest integer that could be found in the dataset.
max_features = np.max(list(indice_token.keys())) + 1 # Augmenting x_train and x_test with n-grams features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape) print('Build model...')
model = Sequential() # we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
embedding_dims,
input_length=maxlen)) # we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D()) # We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy']) model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs, validation_data=(x_test, y_test))
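This run uses the uni-gram configuration (ngram_range = 1). The header docstring quotes 0.9056 test accuracy with bi-grams; switching to that variant should only require changing one parameter before the data is loaded. A sketch, not re-run here:

# Bi-gram variant (sketch, not run in this post): the add_ngram() branch above is
# then taken, and max_features grows to cover the new bi-gram token ids.
ngram_range = 2

The training log below is from the uni-gram run.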
Results:
Train on 25000 samples, validate on 25000 samples
Epoch 1/50
2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227
Epoch 2/50
25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646
Epoch 3/50
25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783
Epoch 4/50
25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853
Epoch 5/50
You can see that one epoch takes only about 10 seconds, which is quite fast. However, when I let it train for 50 epochs, training accuracy reached 100% while validation accuracy stayed at about 86%, so the model is clearly overfitting.
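One way to curb that overfitting is to stop training once the validation loss stops improving. A minimal sketch using Keras's built-in EarlyStopping callback, assuming the same model and data objects as above (the patience value of 3 epochs is just an illustrative choice):

from keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs,
# instead of always running all 50 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=50,
          validation_data=(x_test, y_test),
          callbacks=[early_stopping])

Strictly speaking, early stopping should monitor a held-out validation split rather than the test set, but the call above mirrors the fit() setup already used in this post.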
Now let's look at the conventional CNN:
'''This example demonstrates the use of Convolution1D for text classification.

Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on an Intel i5 2.4GHz CPU.
10s/epoch on a Tesla K40 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb  # unused: the local load_data() from the fastText script is reused
# load_data() is the same local copy of keras.datasets.imdb.load_data defined in the
# fastText script above; reuse that definition here (the create_ngram_set and
# add_ngram helpers are not needed for the CNN).
# set parameters:
max_features = 5000  # note: only the top 5000 words here, vs 20000 in the fastText script
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn word-group filters of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
Results:
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
Epoch 2/5
25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
Epoch 3/5
25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
Epoch 4/5
25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
Epoch 5/5
25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793
As the logs show, the CNN really is about 10x slower: roughly 2 minutes per epoch versus about 10 seconds for fastText on the same CPU, while the best validation accuracy is similar (around 88-89%).
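The per-epoch timings quoted above are read off the Keras progress bar; if you want to record them explicitly for the fastText-vs-CNN comparison, a small custom callback is enough. A minimal sketch (an illustrative addition, not something the original runs used):

import time
from keras.callbacks import Callback

class EpochTimer(Callback):
    """Print how long each epoch took, for a rough fastText-vs-CNN comparison."""
    def on_epoch_begin(self, epoch, logs=None):
        self._start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        print('Epoch {} took {:.1f}s'.format(epoch + 1, time.time() - self._start))

# Usage: pass it to either model's fit() call, e.g.
# model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#           validation_data=(x_test, y_test), callbacks=[EpochTimer()])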