Comparing fastText and a CNN for text classification on the Keras IMDB dataset: the CNN is roughly 10x slower per epoch.
The fastText model:
'''This example demonstrates the use of fastText for text classification.

Based on Joulin et al.'s paper:
Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759

Results on the IMDB dataset with uni- and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on an i7 CPU.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on a GTX 980M GPU.
'''
from __future__ import print_function

import json
import warnings

import numpy as np
from keras.preprocessing import sequence
# helper used by the local load_data() below when a maxlen filter is requested
from keras.preprocessing.sequence import _remove_long_seq
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb  # unused: a local load_data() reads imdb.npz directly


def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
"""Loads the IMDB dataset.
# Arguments
path: where to cache the data (relative to `~/.keras/dataset`).
num_words: max number of words to include. Words are ranked
by how often they occur (in the training set) and only
the most frequent words are kept
skip_top: skip the top N most frequently occurring words
(which may not be informative).
maxlen: sequences longer than this will be filtered out.
seed: random seed for sample shuffling.
start_char: The start of a sequence will be marked with this character.
Set to 1 because 0 is usually the padding character.
oov_char: words that were cut out because of the `num_words`
or `skip_top` limit will be replaced with this character.
index_from: index actual words with this index and higher.
# Returns
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
# Raises
ValueError: in case `maxlen` is so low
that no input sequence could be kept.
Note that the 'out of vocabulary' character is only used for
words that were present in the training set but are not included
because they're not making the `num_words` cut here.
Words that were not seen in the training set but are in the test set
have simply been skipped.
"""
# Legacy support
if 'nb_words' in kwargs:
warnings.warn('The `nb_words` argument in `load_data` '
'has been renamed `num_words`.')
num_words = kwargs.pop('nb_words')
if kwargs:
raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) #path = get_file(path,
# origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
# file_hash='599dadb1135973df5b59232a0e9a887c')
with np.load(path) as f:
x_train, labels_train = f['x_train'], f['y_train']
x_test, labels_test = f['x_test'], f['y_test'] np.random.seed(seed)
indices = np.arange(len(x_train))
np.random.shuffle(indices)
x_train = x_train[indices]
labels_train = labels_train[indices] indices = np.arange(len(x_test))
np.random.shuffle(indices)
x_test = x_test[indices]
labels_test = labels_test[indices] xs = np.concatenate([x_train, x_test])
labels = np.concatenate([labels_train, labels_test]) if start_char is not None:
xs = [[start_char] + [w + index_from for w in x] for x in xs]
elif index_from:
xs = [[w + index_from for w in x] for x in xs] if maxlen:
xs, labels = _remove_long_seq(maxlen, xs, labels)
if not xs:
raise ValueError('After filtering for sequences shorter than maxlen=' +
str(maxlen) + ', no sequence was kept. '
'Increase maxlen.')
if not num_words:
num_words = max([max(x) for x in xs]) # by convention, use 2 as OOV word
# reserve 'index_from' (=3 by default) characters:
# 0 (padding), 1 (start), 2 (OOV)
if oov_char is not None:
xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
else:
xs = [[w for w in x if skip_top <= w < num_words] for x in xs] idx = len(x_train)
x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
return (x_train, y_train), (x_test, y_test) def create_ngram_set(input_list, ngram_value=2):
"""
Extract a set of n-grams from a list of integers.
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
{(4, 9), (4, 1), (1, 4), (9, 4)}
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
[(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
"""
return set(zip(*[input_list[i:] for i in range(ngram_value)])) def add_ngram(sequences, token_indice, ngram_range=2):
"""
Augment the input list of list (sequences) by appending n-grams values.
Example: adding bi-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
>>> add_ngram(sequences, token_indice, ngram_range=2)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
Example: adding tri-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
>>> add_ngram(sequences, token_indice, ngram_range=3)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
"""
new_sequences = []
for input_list in sequences:
new_list = input_list[:]
for ngram_value in range(2, ngram_range + 1):
for i in range(len(new_list) - ngram_value + 1):
ngram = tuple(new_list[i:i + ngram_value])
if ngram in token_indice:
new_list.append(token_indice[ngram])
new_sequences.append(new_list) return new_sequences # Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5 print('Loading data...')
# the data, split between train and test sets
#(x_train, y_train), (x_test, y_test) = load_data()
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) if ngram_range > 1:
print('Adding {}-gram features'.format(ngram_range))
# Create set of unique n-gram from the training set.
ngram_set = set()
for input_list in x_train:
for i in range(2, ngram_range + 1):
set_of_ngram = create_ngram_set(input_list, ngram_value=i)
ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer.
# Integer values are greater than max_features in order
# to avoid collision with existing features.
start_index = max_features + 1
token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
indice_token = {token_indice[k]: k for k in token_indice} # max_features is the highest integer that could be found in the dataset.
max_features = np.max(list(indice_token.keys())) + 1 # Augmenting x_train and x_test with n-grams features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape) print('Build model...')
model = Sequential() # we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
embedding_dims,
input_length=maxlen)) # we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D()) # We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy']) model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs, validation_data=(x_test, y_test))
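This run uses the uni-gram configuration (ngram_range = 1). The header docstring quotes 0.9056 test accuracy with bi-grams; switching to that variant should only require changing one parameter before the data is loaded. A sketch, not re-run here:

# Bi-gram variant (sketch, not run in this post): the add_ngram() branch above is
# then taken, and max_features grows to cover the new bi-gram token ids.
ngram_range = 2

The training log below is from the uni-gram run.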
Results:
Train on 25000 samples, validate on 25000 samples
Epoch 1/50
2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227
Epoch 2/50
25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646
Epoch 3/50
25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783
Epoch 4/50
25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853
Epoch 5/50
You can see that one epoch takes only about 10 seconds, which is quite fast. However, when I let it train for 50 epochs, training accuracy reached 100% while validation accuracy stayed at about 86%, so the model is clearly overfitting.
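One way to curb that overfitting is to stop training once the validation loss stops improving. A minimal sketch using Keras's built-in EarlyStopping callback, assuming the same model and data objects as above (the patience value of 3 epochs is just an illustrative choice):

from keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs,
# instead of always running all 50 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=50,
          validation_data=(x_test, y_test),
          callbacks=[early_stopping])

Strictly speaking, early stopping should monitor a held-out validation split rather than the test set, but the call above mirrors the fit() setup already used in this post.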
Now let's look at the conventional CNN:
'''This example demonstrates the use of Convolution1D for text classification.

Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on an Intel i5 2.4GHz CPU.
10s/epoch on a Tesla K40 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb  # unused: the local load_data() from the fastText script is reused
# load_data() is the same local copy of keras.datasets.imdb.load_data defined in the
# fastText script above; reuse that definition here (the create_ngram_set and
# add_ngram helpers are not needed for the CNN).
# set parameters:
max_features = 5000  # note: only the top 5000 words here, vs 20000 in the fastText script
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn word-group filters of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
Results:
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
Epoch 2/5
25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
Epoch 3/5
25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
Epoch 4/5
25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
Epoch 5/5
25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793
As the logs show, the CNN really is about 10x slower: roughly 2 minutes per epoch versus about 10 seconds for fastText on the same CPU, while the best validation accuracy is similar (around 88-89%).
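The per-epoch timings quoted above are read off the Keras progress bar; if you want to record them explicitly for the fastText-vs-CNN comparison, a small custom callback is enough. A minimal sketch (an illustrative addition, not something the original runs used):

import time
from keras.callbacks import Callback

class EpochTimer(Callback):
    """Print how long each epoch took, for a rough fastText-vs-CNN comparison."""
    def on_epoch_begin(self, epoch, logs=None):
        self._start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        print('Epoch {} took {:.1f}s'.format(epoch + 1, time.time() - self._start))

# Usage: pass it to either model's fit() call, e.g.
# model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#           validation_data=(x_test, y_test), callbacks=[EpochTimer()])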