1. git https://github.com/linyi0604/MachineLearning
  3. 分别使用词袋法和 NLTK 自然语言处理包进行文本特征提取
  1. from sklearn.feature_extraction.text import CountVectorizer
  2. import nltk
  3. # nltk.download("punkt")
  4. # nltk.download('averaged_perceptron_tagger')
  6. '''
  7. 分别使用词袋法和nltk自然预言处理包提供的文本特征提取
  8. '''
  10. sent1 = "The cat is walking in the bedroom."
  11. sent2 = "A dog was running across the kitchen."
  12. # 使用词袋法 将文本转化为特征向量
  13. count_vec = CountVectorizer()
  14. sentences = [sent1, sent2]
  15. # 输出转化后的特征向量
  16. # print(count_vec.fit_transform(sentences).toarray())
  17. '''
  18. [[0 1 1 0 1 1 0 0 2 1 0]
  19. [1 0 0 1 0 0 1 1 1 0 1]]
  20. '''
  21. # 输出转化后特征的含义
  22. # print(count_vec.get_feature_names())
  23. '''
  24. ['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']
  25. '''
  27. # 使用nltk对文本进行语言分析
  28. # 对句子词汇分割和正则化 把aren't 分割成 are 和 n't I'm 分割成 I和'm
  29. tokens1 = nltk.word_tokenize(sent1)
  30. tokens2 = nltk.word_tokenize(sent2)
  31. # print(tokens1)
  32. # print(tokens2)
  33. '''
  34. ['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']
  35. ['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']
  36. '''
  37. # 整理词汇表 按照ASCII的顺序排序
  38. vocab_1 = sorted(set(tokens1))
  39. vocab_2 = sorted(set(tokens2))
  40. # print(vocab_1)
  41. # print(vocab_2)
  42. '''
  43. ['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']
  44. ['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']
  45. '''
  46. # 初始化stemer 寻找每个单词最原始的词根
  47. stemmer = nltk.stem.PorterStemmer()
  48. stem_1 = [stemmer.stem(t) for t in tokens1]
  49. stem_2 = [stemmer.stem(t) for t in tokens2]
  50. # print(stem_1)
  51. # print(stem_2)
  52. '''
  53. ['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
  54. ['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']
  55. '''
  56. # 利用词性标注器 对词性进行标注
  57. pos_tag_1 = nltk.tag.pos_tag(tokens1)
  58. pos_tag_2 = nltk.tag.pos_tag(tokens2)
  59. # print(pos_tag_1)
  60. # print(pos_tag_2)
  61. '''
  62. [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
  63. [('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]
  64. '''

