Implementing Word2vec with TensorFlow
# coding: utf-8
'''
Note: Step 3 is missing from the step numbering below; it was left out intentionally.
'''
from __future__ import absolute_import
from __future__ import print_function

import collections
import math
import numpy as np
import os
import random
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import zipfile

# Step 1: Download the data.
# Download the file if it is not already present, then verify that it was received
# correctly (i.e. its size matches the expected size).
# Returns the filename after the download.
print("Step 1: Download the data.")
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urllib.request.urlretrieve(url + filename, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified', filename)
else:
print(statinfo.st_size)
raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
return filename filename = maybe_download('text8.zip', 31344016) # Read the data into a string.
# Read the file inside the zip archive (text8.zip contains a single file: one long
# line of words separated by spaces).
def read_data(filename):
  """Extract the first file in the zip archive as a list of words."""
  with zipfile.ZipFile(filename) as f:
    return f.read(f.namelist()[0]).split()

words = read_data(filename)
print('Data size', len(words))
print('Sample words: ', words[:10])

# Step 2: Build the dictionary and replace rare words with UNK token.
print("\nStep 2: Build the dictionary and replace rare words with UNK token.")
vocabulary_size = 50000

def build_dataset(words):
  # vocabulary_size is the number of frequent words to keep; every word that does
  # not fall within the top 50000 (vocabulary_size) is treated as UNK.
  # :param words: literally a list of words
  # :return data: list of word indices, with rare words mapped to the UNK index
  # :return count: list of (word, frequency) pairs built with collections.Counter
  # :return dictionary: {"word": index}
  # :return reverse_dictionary: {index: "word"}, e.g. {0: 'UNK', 1: 'the', ...}
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    # len(dictionary) keeps growing, so each word effectively gets the next index
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data: ', data[:10])
print('Sample count: ', count[:10])
print('Sample dict: ', list(dictionary.items())[:10])
print('Sample reverse dict: ', list(reverse_dictionary.items())[:10])

data_index = 0

# Step 4: Function to generate a training batch for the skip-gram model.
print("\nStep 4: Function to generate a training batch for the skip-gram model.")
def generate_batch(batch_size, num_skips, skip_window):
  # Generates a minibatch of (target, context) training pairs.
  # data_index is declared global, so it acts like a static variable here: its value
  # is retained across repeated calls to this function.
  # :param batch_size: size of the minibatch
  # :param num_skips: how many (target, context) pairs to generate per context window
  # :param skip_window: context window radius; the skip-gram model predicts the
  #   surrounding words from the target word, and skip_window defines that range
  # :return batch: minibatch of target word indices
  # :return labels: labels for the minibatch, a 2-D array of shape [batch_size, 1]
  global data_index
  # The batch is built in multiples of num_skips, so batch_size must be divisible by it.
  assert batch_size % num_skips == 0
  # If num_skips == 2 * skip_window, a pair is generated for every position in the
  # context window, so num_skips must not be larger than that.
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  # Deques are a generalization of stacks and queues.
  # The name is pronounced "deck" and is short for "double-ended queue".
  # You can push (append) and pop on both sides.
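  # For example (illustrative only): with maxlen=3,
  #   d = collections.deque(maxlen=3); d.extend([1, 2, 3])  ->  deque([1, 2, 3])
  #   d.append(4)                                           ->  deque([2, 3, 4])
  # Appending to a full deque silently drops the oldest element, which is exactly
  # how the sliding context window below moves through the data.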
  # buffer holds data[data_index:data_index + span], wrapping around the end of data.
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)  # the modulo wraps data_index back to 0

  # Skip-gram is a model that predicts the surrounding context words from the target word.
  # Before training it, the words have to be converted into (target, context) pairs.
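  # For example (illustrative only), with skip_window=1 and num_skips=2, the window
  # [w0, w1, w2] around the target w1 produces the two pairs (w1, w0) and (w1, w2).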
  # The loop below builds batch_size such pairs at a time.
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        # The context word is drawn from the context window at random.
        # However, if num_skips == 2 * skip_window, every context position gets used,
        # so the randomness only affects the order of the pairs.
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

# To see how a batch is constructed, generate one and print it:
print("Generating batch ... ")
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print("Sample batches: ", batch[:10])
print("Sample labels: ", labels[:10])
for i in range(8):
  print(batch[i], '->', labels[i, 0])
  print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]])

# Step 5: Build and train a skip-gram model.
print("\nStep 5: Build and train a skip-gram model.")
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2       # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
# Sample valid_size IDs from the range [0, valid_window);
# in other words, valid_examples is a random selection of 16 IDs from 0 to 99.
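# Note: an equivalent, NumPy-only way to draw the validation IDs (used in later
# versions of this tutorial) would be:
#   valid_examples = np.random.choice(valid_window, valid_size, replace=False)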
num_sampled = 64    # Number of negative examples to sample.

print("valid_examples: ", valid_examples)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of a missing GPU implementation:
  # embedding_lookup has no GPU kernel here, so it has to run on the CPU.
  # Since the default device may be the GPU, the CPU is specified explicitly.
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding matrix (one vector per vocabulary word):
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # From the full embedding matrix, extract only the embedding vectors that the
    # indices in train_inputs (the minibatch) point to.
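    # (Shape check: embeddings is [vocabulary_size, embedding_size], so the lookup
    #  below returns embed with shape [batch_size, embedding_size].)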
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss.
    # The NCE loss is defined in terms of a logistic regression model, so a weight
    # vector and a bias are needed for each word in the vocabulary.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                     num_sampled, vocabulary_size))
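  # (Note, depending on your TensorFlow version: from TF 1.0 on, the signature is
  #  tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes),
  #  i.e. train_labels would be passed before embed, ideally as keyword arguments.)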

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between the minibatch examples (valid_embeddings)
  # and all embeddings. This shows which words are closest to each valid example as
  # training progresses (i.e. it is only used to monitor the training process).
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
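  # (In recent TensorFlow versions this keyword argument is spelled keepdims
  #  instead of keep_dims.)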
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Step 6: Begin training
print("\nStep 6: Begin training")
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  tf.initialize_all_variables().run()
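  # (Note: in TensorFlow >= 1.0, tf.initialize_all_variables() is deprecated in
  #  favor of tf.global_variables_initializer().)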
print("Initialized") average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    # feed_dict feeds the minibatch into the placeholders for this step.
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps).
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)

  final_embeddings = normalized_embeddings.eval()

# Step 7: Visualize the embeddings.
print("\nStep 7: Visualize the embeddings.")
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
  plt.savefig(filename)

try:
  # If you get an error here, update scikit-learn and matplotlib to the latest version.
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn and matplotlib to visualize embeddings.")