Implementing Word2vec with TensorFlow
# coding: utf-8
'''
Note: Step 3 is missing from the step numbering below; it was left out intentionally.
'''
from __future__ import absolute_import
from __future__ import print_function

import collections
import math
import numpy as np
import os
import random
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import zipfile

# Step 1: Download the data.
# Download the file if it is not already present, then verify that it was received
# correctly (i.e. its size matches the expected size).
# Returns the filename after the download.
print("Step 1: Download the data.")
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urllib.request.urlretrieve(url + filename, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified', filename)
else:
print(statinfo.st_size)
raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
return filename filename = maybe_download('text8.zip', 31344016) # Read the data into a string.
# Read the file inside the zip archive (text8.zip contains a single file: one long
# line of words separated by spaces).
def read_data(filename):
  """Extract the first file in the zip archive as a list of words."""
  with zipfile.ZipFile(filename) as f:
    return f.read(f.namelist()[0]).split()

words = read_data(filename)
print('Data size', len(words))
print('Sample words: ', words[:10])

# Step 2: Build the dictionary and replace rare words with UNK token.
print("\nStep 2: Build the dictionary and replace rare words with UNK token.")
vocabulary_size = 50000

def build_dataset(words):
  # vocabulary_size is the number of frequent words to keep; every word that does
  # not fall within the top 50000 (vocabulary_size) is treated as UNK.
  # :param words: literally a list of words
  # :return data: list of word indices, with rare words mapped to the UNK index
  # :return count: list of (word, frequency) pairs built with collections.Counter
  # :return dictionary: {"word": index}
  # :return reverse_dictionary: {index: "word"}, e.g. {0: 'UNK', 1: 'the', ...}
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    # len(dictionary) keeps growing, so each word effectively gets the next index
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data: ', data[:10])
print('Sample count: ', count[:10])
print('Sample dict: ', list(dictionary.items())[:10])
print('Sample reverse dict: ', list(reverse_dictionary.items())[:10])

data_index = 0

# Step 4: Function to generate a training batch for the skip-gram model.
print("\nStep 4: Function to generate a training batch for the skip-gram model.")
def generate_batch(batch_size, num_skips, skip_window):
  # Generates a minibatch of (target, context) training pairs.
  # data_index is declared global, so it acts like a static variable here: its value
  # is retained across repeated calls to this function.
  # :param batch_size: size of the minibatch
  # :param num_skips: how many (target, context) pairs to generate per context window
  # :param skip_window: context window radius; the skip-gram model predicts the
  #   surrounding words from the target word, and skip_window defines that range
  # :return batch: minibatch of target word indices
  # :return labels: labels for the minibatch, a 2-D array of shape [batch_size, 1]
  global data_index
  # The batch is built in multiples of num_skips, so batch_size must be divisible by it.
  assert batch_size % num_skips == 0
  # If num_skips == 2 * skip_window, a pair is generated for every position in the
  # context window, so num_skips must not be larger than that.
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  # Deques are a generalization of stacks and queues.
  # The name is pronounced "deck" and is short for "double-ended queue".
  # You can push (append) and pop on both sides.
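  # For example (illustrative only): with maxlen=3,
  #   d = collections.deque(maxlen=3); d.extend([1, 2, 3])  ->  deque([1, 2, 3])
  #   d.append(4)                                           ->  deque([2, 3, 4])
  # Appending to a full deque silently drops the oldest element, which is exactly
  # how the sliding context window below moves through the data.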
  # buffer holds data[data_index:data_index + span], wrapping around the end of data.
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)  # the modulo wraps data_index back to 0

  # Skip-gram is a model that predicts the surrounding context words from the target word.
  # Before training it, the words have to be converted into (target, context) pairs.
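  # For example (illustrative only), with skip_window=1 and num_skips=2, the window
  # [w0, w1, w2] around the target w1 produces the two pairs (w1, w0) and (w1, w2).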
  # The loop below builds batch_size such pairs at a time.
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        # The context word is drawn from the context window at random.
        # However, if num_skips == 2 * skip_window, every context position gets used,
        # so the randomness only affects the order of the pairs.
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

# To see how a batch is constructed, generate one and print it:
print("Generating batch ... ")
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print("Sample batches: ", batch[:10])
print("Sample labels: ", labels[:10])
for i in range(8):
  print(batch[i], '->', labels[i, 0])
  print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]])

# Step 5: Build and train a skip-gram model.
print("\nStep 5: Build and train a skip-gram model.")
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2       # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
# Sample valid_size IDs from the range [0, valid_window);
# in other words, valid_examples is a random selection of 16 IDs from 0 to 99.
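# Note: an equivalent, NumPy-only way to draw the validation IDs (used in later
# versions of this tutorial) would be:
#   valid_examples = np.random.choice(valid_window, valid_size, replace=False)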
num_sampled = 64    # Number of negative examples to sample.

print("valid_examples: ", valid_examples)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of a missing GPU implementation:
  # embedding_lookup has no GPU kernel here, so it has to run on the CPU.
  # Since the default device may be the GPU, the CPU is specified explicitly.
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding matrix (one vector per vocabulary word):
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # From the full embedding matrix, extract only the embedding vectors that the
    # indices in train_inputs (the minibatch) point to.
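    # (Shape check: embeddings is [vocabulary_size, embedding_size], so the lookup
    #  below returns embed with shape [batch_size, embedding_size].)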
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss.
    # The NCE loss is defined in terms of a logistic regression model, so a weight
    # vector and a bias are needed for each word in the vocabulary.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                     num_sampled, vocabulary_size))
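  # (Note, depending on your TensorFlow version: from TF 1.0 on, the signature is
  #  tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes),
  #  i.e. train_labels would be passed before embed, ideally as keyword arguments.)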

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between the minibatch examples (valid_embeddings)
  # and all embeddings. This shows which words are closest to each valid example as
  # training progresses (i.e. it is only used to monitor the training process).
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
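  # (In recent TensorFlow versions this keyword argument is spelled keepdims
  #  instead of keep_dims.)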
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Step 6: Begin training
print("\nStep 6: Begin training")
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  tf.initialize_all_variables().run()
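  # (Note: in TensorFlow >= 1.0, tf.initialize_all_variables() is deprecated in
  #  favor of tf.global_variables_initializer().)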
print("Initialized") average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    # feed_dict feeds the minibatch into the placeholders for this step.
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps).
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)

  final_embeddings = normalized_embeddings.eval()

# Step 7: Visualize the embeddings.
print("\nStep 7: Visualize the embeddings.")
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
  plt.savefig(filename)

try:
  # If you get an error here, update scikit-learn and matplotlib to the latest version.
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn and matplotlib to visualize embeddings.")