
# coding: utf-8
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import itertools

# ########################## Bag-of-words model features ##########################
_STOPWORDS_PATH = '../stopwords/stopwords_english.txt'
# Maps a stop-word file path to the set of words read from it.
_stopwords_cache = {}


def _load_stopwords(path=_STOPWORDS_PATH):
    """Return the stop words listed one per line in *path*, reading the file once."""
    if path not in _stopwords_cache:
        # ``with`` closes the handle; caching avoids re-reading the file for
        # every review passed through clean_text().
        with open(path) as handle:
            _stopwords_cache[path] = {line.rstrip() for line in handle}
    return _stopwords_cache[path]


def clean_text(text):
    """Normalize a raw review into a space-separated string of words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased and split on
    whitespace, and words found in the stop-word file are dropped.

    :param text: raw review text, possibly containing HTML markup
    :return: the remaining words joined by single spaces
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    eng_stopwords = _load_stopwords()
    return ' '.join(w for w in text.lower().split() if w not in eng_stopwords)


# Confusion matrix
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=None):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    :param cm: 2-D array of counts; rows are drawn on the "True label" axis
        and columns on the "Predicted label" axis
    :param classes: tick labels, used for both axes
    :param title: text drawn above the plot
    :param cmap: colormap forwarded to ``plt.imshow``; ``None`` selects
        ``plt.cm.Blues``.  NOTE(review): the tail of the original signature
        was missing from this file, so this default is a reconstruction --
        confirm against the intended look.
    """
    if cmap is None:
        cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    # The title argument was accepted but never drawn.
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the count
    # stays readable against the background.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def recall_and_accuracy(cnf_matrix):
    """Return ``(recall, accuracy)`` computed from a 2x2 confusion matrix.

    Recall is ``cm[1, 1] / (cm[1, 0] + cm[1, 1])``; accuracy is the sum of the
    diagonal divided by the sum of all four cells.

    :param cnf_matrix: object indexable as ``cnf_matrix[row, column]``
    :return: tuple ``(recall, accuracy)``
    """
    recall = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    accuracy = (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
        cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1])
    return recall, accuracy


if __name__ == '__main__':
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
    df['clean_review'] = df.review.apply(clean_text)

    # Extract bag-of-words features (using sklearn's CountVectorizer).
    vectorizer = CountVectorizer(max_features=5000)
    train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
    print(train_data_features)

    # Split the data; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.sentiment, test_size=0.2, random_state=0)

    # Train the classifier.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    recall, accuracy = recall_and_accuracy(cnf_matrix)
    print("Recall metric in the testing dataset: ", recall)
    print("accuracy metric in the testing dataset: ", accuracy)

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    # Without show() the figure is built but never displayed when this file
    # is run as a script.
    plt.show()


import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
import warnings
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import itertools
warnings.filterwarnings("ignore")

# Filled on first use by _english_stopwords().
_eng_stopwords = None


def _english_stopwords():
    """Return NLTK's English stop-word list as a set, building it only once."""
    global _eng_stopwords
    if _eng_stopwords is None:
        _eng_stopwords = frozenset(stopwords.words('english'))
    return _eng_stopwords


def clean_text(text, remove_stopwords=False):
    """Tokenize a raw review (or sentence) into lower-case words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, and the lower-cased text is split on
    whitespace.

    :param text: raw text, possibly containing HTML markup
    :param remove_stopwords: when true, drop NLTK's English stop words
    :return: list of words
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        # Only touch the stop-word corpus when it is actually needed.
        eng_stopwords = _english_stopwords()
        words = [w for w in words if w not in eng_stopwords]
    return words


def split_sentences(review):
    """Split a raw review into sentences and tokenize each one.

    Relies on the module-level ``tokenizer`` created in the ``__main__``
    section.  NOTE(review): the line defining ``raw_sentences`` was missing
    from this file; it is reconstructed here as a call to that tokenizer --
    confirm.

    :param review: raw review text
    :return: list of word lists, one per non-empty sentence
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    return [clean_text(s) for s in raw_sentences if s]


def average_word_vectors(words, keyed_vectors, vector_size):
    """Return the mean of the vectors of the words found in *keyed_vectors*.

    :param words: iterable of words
    :param keyed_vectors: mapping-like object supporting ``word in kv`` and
        ``kv[word]``
    :param vector_size: length of each word vector
    :return: 1-D float array of length *vector_size*; all zeros when none of
        the words is known
    """
    total = np.zeros(vector_size)
    known = 0
    for word in words:
        if word in keyed_vectors:
            total += keyed_vectors[word]
            known += 1
    # Divide by the number of known words.  The previous code called
    # mean(axis=0) on a single-row array, which left the plain sum unchanged
    # and made the feature scale grow with review length.
    return total / known if known else total


def to_review_vector(review):
    """Represent a raw review as the average word2vec vector of its words.

    Stop words are removed first.  Relies on the module-level ``model``
    trained in the ``__main__`` section.

    :param review: raw review text
    :return: ``pandas.Series`` holding one value per vector dimension
    """
    words = clean_text(review, remove_stopwords=True)
    vectors = model.wv
    return pd.Series(average_word_vectors(words, vectors, vectors.vector_size))


def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=None):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    :param cm: 2-D array of counts; rows are drawn on the "True label" axis
        and columns on the "Predicted label" axis
    :param classes: tick labels, used for both axes
    :param title: text drawn above the plot
    :param cmap: colormap forwarded to ``plt.imshow``; ``None`` selects
        ``plt.cm.Blues``.  NOTE(review): the tail of the original signature
        was missing from this file, so this default is a reconstruction --
        confirm against the intended look.
    """
    if cmap is None:
        cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    # The title argument was accepted but never drawn.
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the count
    # stays readable against the background.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def recall_and_accuracy(cnf_matrix):
    """Return ``(recall, accuracy)`` computed from a 2x2 confusion matrix.

    Recall is ``cm[1, 1] / (cm[1, 0] + cm[1, 1])``; accuracy is the sum of the
    diagonal divided by the sum of all four cells.

    :param cnf_matrix: object indexable as ``cnf_matrix[row, column]``
    :return: tuple ``(recall, accuracy)``
    """
    recall = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    accuracy = (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
        cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1])
    return recall, accuracy


if __name__ == '__main__':
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')

    # split_sentences() reads this module-level tokenizer.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Sentence splitting needs the raw text: clean_text() removes the
    # punctuation the tokenizer relies on and returns a list, not a string.
    # Each sentence is already a list of words, so the flattened result is
    # fed to Word2Vec as-is; chain() avoids the quadratic sum(..., []).
    sentences_list = list(
        itertools.chain.from_iterable(df.review.apply(split_sentences)))

    # word2vec
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    # to_review_vector() reads this module-level model.
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context)
    model.save('word2vec.models')

    train_data_features = df.review.apply(to_review_vector)
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.sentiment, test_size=0.2, random_state=0)

    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    recall, accuracy = recall_and_accuracy(cnf_matrix)
    print("Recall metric in the testing dataset: ", recall)
    print("accuracy metric in the testing dataset: ", accuracy)

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    # Without show() the figure is built but never displayed when this file
    # is run as a script.
    plt.show()


