Code source: TensorFlow Machine Learning Cookbook (translated by 曾益强, September 2017), Chapter 7: Natural Language Processing

Code repository: https://github.com/nfmcclure/tensorflow-cookbook

These helper functions are reused throughout the posts on the skip-gram, CBOW, Word2Vec, and Doc2Vec models:

  • Data loading function
  • Text normalization function
  • Vocabulary building function
  • Word index function
  • Batch data generation function

Data loading function

# Imports required by the helper functions in this post
import os
import io
import string
import tarfile
import collections
import urllib.request
import numpy as np

# Load the movie review data
# Check if data was downloaded, otherwise download it and save for future use
def load_movie_data(data_folder_name):
    pos_file = os.path.join(data_folder_name, 'rt-polarity.pos')
    neg_file = os.path.join(data_folder_name, 'rt-polarity.neg')

    # Check if files are already downloaded
    if os.path.isfile(pos_file):
        pos_data = []
        with open(pos_file, 'r') as temp_pos_file:
            for row in temp_pos_file:
                pos_data.append(row)
        neg_data = []
        with open(neg_file, 'r') as temp_neg_file:
            for row in temp_neg_file:
                neg_data.append(row)
    else:  # If not downloaded, download and save
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
        stream_data = urllib.request.urlopen(movie_data_url)
        tmp = io.BytesIO()
        while True:
            s = stream_data.read(16384)
            if not s:
                break
            tmp.write(s)
        stream_data.close()
        tmp.seek(0)

        tar_file = tarfile.open(fileobj=tmp, mode="r:gz")
        pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
        neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
        # Save pos/neg reviews
        pos_data = []
        for line in pos:
            pos_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
        neg_data = []
        for line in neg:
            neg_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
        tar_file.close()
        # Create the data folder if needed, then write both files to it
        if not os.path.exists(data_folder_name):
            os.makedirs(data_folder_name)
        with open(pos_file, "w") as pos_file_handler:
            pos_file_handler.write(''.join(pos_data))
        with open(neg_file, "w") as neg_file_handler:
            neg_file_handler.write(''.join(neg_data))
    texts = pos_data + neg_data
    target = [1] * len(pos_data) + [0] * len(neg_data)
    return texts, target
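
A minimal usage sketch (the folder name 'temp' here is an arbitrary choice, not from the book):

texts, target = load_movie_data('temp')
print(len(texts))    # 10662 sentences: 5331 positive + 5331 negative
print(target[:2])    # [1, 1] -- positive reviews come first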

Text normalization function

# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join(word for word in x.split() if word not in stops) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]

    return texts
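
A common choice for the `stops` argument (and the one the cookbook repository uses) is NLTK's English stopword list; a minimal sketch, assuming nltk and its stopword corpus are installed:

from nltk.corpus import stopwords

stops = stopwords.words('english')
texts = normalize_text(texts, stops)
print(texts[0])   # lower-cased, with punctuation, digits and stopwords removed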

Vocabulary building function

# Build a vocabulary of (word, count) pairs; words that are not frequent
# enough (i.e. unknown words) are mapped to the special token 'RARE'
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]

    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]

    # Now add most frequent words, limited to the N most frequent (N = vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Now create the dictionary
    word_dict = {}
    # For each word that we want in the dictionary, add it with a value
    # equal to the prior dictionary length (so indices are 0, 1, 2, ...)
    for word, word_count in count:
        word_dict[word] = len(word_dict)

    return word_dict
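
A tiny worked example (hypothetical sentences, not from the book) makes the index assignment concrete: 'RARE' always receives index 0, and the remaining words follow in descending frequency.

sentences = ['the movie was great', 'the movie was bad']
word_dict = build_dictionary(sentences, vocabulary_size=4)
print(word_dict)   # {'RARE': 0, 'the': 1, 'movie': 2, 'was': 3}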

Word index function

# Turn text data into lists of integers from the dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, use its index if known, otherwise the rare-word index
        for word in sentence.split():
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data
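
Continuing the tiny example above, out-of-vocabulary words fall back to index 0 ('RARE'):

data = text_to_numbers(sentences, word_dict)
print(data)   # [[1, 2, 3, 0], [1, 2, 3, 0]] -- 'great' and 'bad' map to RARE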

Batch data generation function

# Generate batch data randomly (N words behind, target, N words ahead)
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # Select a random sentence to start
        rand_sentence_ix = int(np.random.choice(len(sentences), size=1))
        rand_sentence = sentences[rand_sentence_ix]
        # Generate consecutive windows to look at
        window_sequences = [rand_sentence[max((ix - window_size), 0):(ix + window_size + 1)]
                            for ix, x in enumerate(rand_sentence)]
        # Denote which element of each window is the center word of interest
        label_indices = [ix if ix < window_size else window_size
                         for ix, x in enumerate(window_sequences)]

        # Pull out the center word of interest for each window and create a tuple per window
        if method == 'skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y + 1):])
                                for x, y in zip(window_sequences, label_indices)]
            # Make it into a big list of (target word, surrounding word) tuples
            tuple_data = [(x, y_) for x, y in batch_and_labels for y_ in y]
            batch, labels = [list(x) for x in zip(*tuple_data)]
        elif method == 'cbow':
            batch_and_labels = [(x[:y] + x[(y + 1):], x[y])
                                for x, y in zip(window_sequences, label_indices)]
            # Only keep windows with a consistent size of 2 * window_size
            batch_and_labels = [(x, y) for x, y in batch_and_labels if len(x) == 2 * window_size]
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
        elif method == 'doc2vec':
            # For doc2vec we keep the LHS window only, to predict the target word
            batch_and_labels = [(rand_sentence[i:i + window_size], rand_sentence[i + window_size])
                                for i in range(0, len(rand_sentence) - window_size)]
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
            # Add the document index to each batch row; remember that the
            # doc index must be extracted from the last position of each row
            batch = [x + [rand_sentence_ix] for x in batch]
        else:
            raise ValueError('Method {} not implemented yet.'.format(method))

        # Extract batch and labels
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and labels at the end
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]

    # Convert to numpy arrays
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))

    return batch_data, label_data
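
A minimal sketch of drawing one skip-gram batch from the tiny example sentences (the seed is only there to make this illustration reproducible):

np.random.seed(0)
num_data = text_to_numbers(sentences, word_dict)
batch, labels = generate_batch_data(num_data, batch_size=8,
                                    window_size=2, method='skip_gram')
print(batch.shape)    # (8,)   -- one center word per example
print(labels.shape)   # (8, 1) -- one context word per example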
