Using TensorFlow for Text Processing: Helper Functions
Code source: TensorFlow Machine Learning Cookbook (Chinese translation by Zeng Yiqiang, September 2017), Chapter 7: Natural Language Processing
Code repository: https://github.com/nfmcclure/tensorflow-cookbook
The following helper functions are reused throughout the skip-gram, CBOW, Word2Vec, and Doc2Vec examples:
- Data loading function
- Text normalization function
- Vocabulary building function
- Word-to-index conversion function
- Batch generation function
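All of the snippets below assume the following imports; this is a minimal set inferred from the function bodies, which the original post omits:

import os
import io
import string
import tarfile
import collections
import urllib.request
import numpy as np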
Data loading function
# Load the movie review data.
# Check if the data was already downloaded; otherwise download it and save it for future use.
def load_movie_data(data_folder_name):
    pos_file = os.path.join(data_folder_name, 'rt-polarity.pos')
    neg_file = os.path.join(data_folder_name, 'rt-polarity.neg')
    # Check if the files are already downloaded
    if os.path.isfile(pos_file):
        pos_data = []
        with open(pos_file, 'r') as temp_pos_file:
            for row in temp_pos_file:
                pos_data.append(row)
        neg_data = []
        with open(neg_file, 'r') as temp_neg_file:
            for row in temp_neg_file:
                neg_data.append(row)
    else:  # If not downloaded, download and save
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
        stream_data = urllib.request.urlopen(movie_data_url)
        tmp = io.BytesIO()
        while True:
            s = stream_data.read(16384)
            if not s:
                break
            tmp.write(s)
        stream_data.close()
        tmp.seek(0)
        tar_file = tarfile.open(fileobj=tmp, mode="r:gz")
        pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
        neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
        # Decode the pos/neg reviews, dropping non-ASCII characters
        pos_data = []
        for line in pos:
            pos_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
        neg_data = []
        for line in neg:
            neg_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
        tar_file.close()
        # Create the data folder if it does not exist yet
        if not os.path.exists(data_folder_name):
            os.makedirs(data_folder_name)
        # Save the files for future runs
        with open(pos_file, "w") as pos_file_handler:
            pos_file_handler.write(''.join(pos_data))
        with open(neg_file, "w") as neg_file_handler:
            neg_file_handler.write(''.join(neg_data))
    texts = pos_data + neg_data
    target = [1] * len(pos_data) + [0] * len(neg_data)
    return texts, target
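A minimal usage sketch, assuming a writable folder named 'temp' (the folder name is illustrative) and that the Cornell download URL is still reachable:

texts, target = load_movie_data('temp')
# The rt-polarity corpus contains 5331 positive and 5331 negative reviews
print(len(texts))             # 10662
print(target[0], target[-1])  # 1 (positive), 0 (negative)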
Text normalization function
# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]
    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    # Remove stopwords
    texts = [' '.join(word for word in x.split() if word not in stops) for x in texts]
    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    return texts
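For example, paired with NLTK's English stopword list (the cookbook uses NLTK here, but any iterable of strings works as stops):

from nltk.corpus import stopwords  # requires a one-time nltk.download('stopwords')

stops = stopwords.words('english')
print(normalize_text(['The movie was GREAT!!! 10/10'], stops))
# ['movie great']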
Vocabulary building function
# Build a dictionary of word -> index pairs. Words that don't make the
# vocabulary-size cut (i.e. unknown words) are mapped to the token 'RARE'.
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (a list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    # Initialize the list of [word, word_count] pairs, starting with the unknown token
    count = [['RARE', -1]]
    # Add the most frequent words, limited to the N most frequent (N = vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # Create the dictionary: each word's index is the dictionary length at the
    # time it is added, so 'RARE' gets 0, the most frequent word gets 1, and so on
    word_dict = {}
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return word_dict
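A toy illustration with a hypothetical two-sentence corpus; vocabulary_size=5 keeps 'RARE' plus the four most frequent words:

sents = ['the cat sat on the mat', 'the dog sat']
word_dict = build_dictionary(sents, vocabulary_size=5)
print(word_dict)
# e.g. {'RARE': 0, 'the': 1, 'sat': 2, 'cat': 3, 'on': 4}
# (ties among words with equal counts are broken by encounter order)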
Word-to-index conversion function
# Turn text data into lists of integer indices from the dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, use its dictionary index, or the 'RARE' index (0) if unknown
        for word in sentence.split():
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data
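Continuing the toy example above, in-vocabulary words get their indices while out-of-vocabulary words ('mat', 'dog') collapse to the 'RARE' index 0:

print(text_to_numbers(sents, word_dict))
# [[1, 3, 2, 4, 1, 0], [1, 0, 2]]  (given the word_dict shown above)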
Batch generation function
# Generate batch data randomly (N words behind, target, N words ahead)
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up the data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # Select a random sentence to start with
        rand_sentence_ix = int(np.random.choice(len(sentences), size=1))
        rand_sentence = sentences[rand_sentence_ix]
        # Generate consecutive windows to look at
        window_sequences = [rand_sentence[max((ix - window_size), 0):(ix + window_size + 1)]
                            for ix, x in enumerate(rand_sentence)]
        # Denote which element of each window is the center word of interest
        label_indices = [ix if ix < window_size else window_size
                         for ix, x in enumerate(window_sequences)]
        # Pull out the center word of interest for each window and create a tuple for each window
        if method == 'skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y + 1):])
                                for x, y in zip(window_sequences, label_indices)]
            # Flatten into a big list of (target word, surrounding word) tuples
            tuple_data = [(x, y_) for x, y in batch_and_labels for y_ in y]
            # Skip sentences too short to yield any pairs
            if len(tuple_data) < 1:
                continue
            batch, labels = [list(x) for x in zip(*tuple_data)]
        elif method == 'cbow':
            batch_and_labels = [(x[:y] + x[(y + 1):], x[y])
                                for x, y in zip(window_sequences, label_indices)]
            # Only keep windows with a consistent size of 2*window_size
            batch_and_labels = [(x, y) for x, y in batch_and_labels if len(x) == 2 * window_size]
            # Skip sentences too short to yield any full window
            if len(batch_and_labels) < 1:
                continue
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
        elif method == 'doc2vec':
            # For doc2vec we keep only the left-hand window to predict the target word
            batch_and_labels = [(rand_sentence[i:i + window_size], rand_sentence[i + window_size])
                                for i in range(0, len(rand_sentence) - window_size)]
            # Skip sentences shorter than window_size + 1
            if len(batch_and_labels) < 1:
                continue
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
            # Append the document index to each batch entry; the doc-index is
            # always the last element of a batch row
            batch = [x + [rand_sentence_ix] for x in batch]
        else:
            raise ValueError('Method {} not implemented yet.'.format(method))
        # Extend batch data with this sentence's examples
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and labels to exactly batch_size
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    # Convert to numpy arrays; labels become a batch_size x 1 column vector
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    return batch_data, label_data
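A quick smoke test with made-up integer sentences standing in for text_to_numbers output. For skip-gram the batch is a vector of target words, for CBOW each row holds 2*window_size context words, and labels are always a batch_size x 1 column:

num_sents = [list(range(10, 30)), list(range(50, 65))]
batch, labels = generate_batch_data(num_sents, batch_size=8, window_size=2, method='skip_gram')
print(batch.shape, labels.shape)  # (8,) (8, 1)
batch, labels = generate_batch_data(num_sents, batch_size=8, window_size=2, method='cbow')
print(batch.shape, labels.shape)  # (8, 4) (8, 1)
batch, labels = generate_batch_data(num_sents, batch_size=8, window_size=2, method='doc2vec')
print(batch.shape, labels.shape)  # (8, 3): 2 context words + the sentence index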