NLP采用Bert进行简单文本情感分类
参照当Bert遇上Kerashttps://spaces.ac.cn/archives/6736此示例准确率达到95.5%+
https://github.com/CyberZHG/keras-bert/blob/master/README.zh-CN.md
示例实现
# ! -*- coding:utf-8 -*-
import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs
maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
class OurTokenizer(Tokenizer):
def __init__(self, token_dict):
super(OurTokenizer, self).__init__(token_dict)
def _tokenize(self, text):
R = []
for c in text:
if c in self._token_dict:
R.append(c)
elif self._is_space(c):
R.append('[unused1]') # space类用未经训练的[unused1]表示
else:
R.append('[UNK]') # 剩余的字符是[UNK]
return R
tokenizer = OurTokenizer(token_dict)
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)
data = []
for d in neg[0]:
data.append((d, 0))
for d in pos[0]:
data.append((d, 1))
# 按照9:1的比例划分训练集和验证集
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
def seq_padding(X, padding=0):
L = [len(x) for x in X]
ML = max(L)
return np.array([
np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
])
class data_generator:
def __init__(self, data, batch_size=32):
self.data = data
self.batch_size = batch_size
self.steps = len(self.data) // self.batch_size
if len(self.data) % self.batch_size != 0:
self.steps += 1
def __len__(self):
return self.steps
def __iter__(self):
while True:
idxs = list(range(len(self.data)))
np.random.shuffle(idxs)
X1, X2, Y = [], [], []
for i in idxs:
d = self.data[i]
text = d[0][:maxlen]
x1, x2 = tokenizer.encode(first=text)
y = d[1]
X1.append(x1)
X2.append(x2)
Y.append([y])
if len(X1) == self.batch_size or i == idxs[-1]:
X1 = seq_padding(X1)
X2 = seq_padding(X2)
Y = seq_padding(Y)
yield [X1, X2], Y
[X1, X2, Y] = [], [], []
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
l.trainable = False
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)
model = Model([x1_in, x2_in], p)
model.compile(
loss='binary_crossentropy',
optimizer=Adam(1e-5), # 用足够小的学习率
metrics=['accuracy']
)
model.summary()
train_D = data_generator(train_data)
valid_D = data_generator(valid_data)
test = [train_data[0]]
test_D = data_generator(test)
model.fit_generator(
train_D.__iter__(),
steps_per_epoch=len(train_D),
epochs=1,
validation_data=valid_D.__iter__(),
validation_steps=len(valid_D)
)
#保存模型权重值
model.save('model.h5')
原示例存在的问题
模型在保持完之后再进行加载时提示存在自定义层和激活方法的问题,暂没找到解决办法,如有知道办法的小伙伴请留言私信
问题解决
# ! -*- coding:utf-8 -*-
import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_custom_objects
import re, os
import codecs
from keras.models import load_model
maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
class OurTokenizer(Tokenizer):
def __init__(self, token_dict):
super(OurTokenizer, self).__init__(token_dict)
def _tokenize(self, text):
R = []
for c in text:
if c in self._token_dict:
R.append(c)
elif self._is_space(c):
R.append('[unused1]') # space类用未经训练的[unused1]表示
else:
R.append('[UNK]') # 剩余的字符是[UNK]
return R
tokenizer = OurTokenizer(token_dict)
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)
data = []
for d in neg[0]:
data.append((d, 0))
for d in pos[0]:
data.append((d, 1))
# 按照9:1的比例划分训练集和验证集
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
def seq_padding(X, padding=0):
L = [len(x) for x in X]
ML = max(L)
return np.array([
np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
])
class data_generator:
def __init__(self, data, batch_size=32):
self.data = data
self.batch_size = batch_size
self.steps = len(self.data) // self.batch_size
if len(self.data) % self.batch_size != 0:
self.steps += 1
def __len__(self):
return self.steps
def __iter__(self):
while True:
idxs = list(range(len(self.data)))
np.random.shuffle(idxs)
X1, X2, Y = [], [], []
for i in idxs:
d = self.data[i]
text = d[0][:maxlen]
x1, x2 = tokenizer.encode(first=text)
y = d[1]
X1.append(x1)
X2.append(x2)
Y.append([y])
if len(X1) == self.batch_size or i == idxs[-1]:
X1 = seq_padding(X1)
X2 = seq_padding(X2)
Y = seq_padding(Y)
yield [X1, X2], Y
[X1, X2, Y] = [], [], []
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
l.trainable = False
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x = bert_model([x1_in, x2_in])
print(bert_model.layers)
x = Lambda(lambda x: x[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)
model = Model([x1_in, x2_in], p)
model.compile(
loss='binary_crossentropy',
optimizer=Adam(1e-5), # 用足够小的学习率
metrics=['accuracy']
)
model.summary()
train_D = data_generator(train_data)
valid_D = data_generator(valid_data)
'''
model.fit_generator(
train_D.__iter__(),
steps_per_epoch=len(train_D),
epochs=5,
validation_data=valid_D.__iter__(),
validation_steps=len(valid_D)
)
model.save('save_path.h5')
'''
# 定义生成器将数据集解析为
class data_token_generator:
def __init__(self, data, batch_size=32):
self.data = data
self.batch_size = batch_size
self.steps = len(self.data) # self.batch_size
if len(self.data) % self.batch_size != 0:
self.steps += 1
def __len__(self):
return self.steps
def get_data(self):
idxs = list(range(len(self.data)))
np.random.shuffle(idxs)
X1, X2, Y = [], [], []
for i in idxs:
d = self.data[i]
text = d[0][:maxlen]
print(text)
x1, x2 = tokenizer.encode(first=text)
y = d[1]
X1.append(x1)
X2.append(x2)
Y.append([y])
X1 = seq_padding(X1)
X2 = seq_padding(X2)
Y = seq_padding(Y)
return X1, X2, Y
new_model = load_model('save_path.h5', custom_objects=get_custom_objects())
test_T = data_token_generator(valid_data[0:10])
X_test1, X_test2, Y_test = test_T.get_data()
print(Y_test)
print(new_model.predict([X_test1, X_test2]))
我的实现
# ! -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths
import codecs
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
# 评价文本最大长度
maxlen = 100
dict_path = 'model/vocab.txt'
token_dict = {}
EPOCHS = 30
BATCH_SIZE = 128
# 初始化令牌字典
with codecs.open(dict_path, 'r', 'utf8') as reader:
for line in reader:
token = line.strip()
# print(token, len(token_dict))
token_dict[token] = len(token_dict)
# 定义令牌解析器
class OurTokenizer(Tokenizer):
def _tokenize(self, text):
R = []
for c in text:
if c in self._token_dict:
R.append(c)
elif self._is_space(c):
R.append('[unused1]') # space类用未经训练的[unused1]表示
else:
R.append('[UNK]') # 剩余的字符是[UNK]
return R
# 初始化令牌解析器
tokenizer = OurTokenizer(token_dict)
# 读取数据集
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)
data = []
for d in neg[0]:
data.append((d, 0))
for d in pos[0]:
data.append((d, 1))
# 按照9:1的比例划分训练集和验证集
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
# 令牌序列长度补全
def seq_padding(X, padding=0):
L = [len(x) for x in X]
ML = max(L)
t = [
np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
]
return t
# 定义生成器将数据集解析为
class data_token_generator:
def __init__(self, data, batch_size=32, print_text=False):
self.data = data
self.batch_size = batch_size
self.steps = len(self.data) # self.batch_size
self.print_text = print_text
if len(self.data) % self.batch_size != 0:
self.steps += 1
# bert中文模型路径
paths = get_checkpoint_paths('model')
# bert中文模型加载
self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=None)
for l in self.bert_model.layers:
l.trainable = True
def __len__(self):
return self.steps
def get_data(self):
data_x = []
data_y = []
idxs = list(range(len(self.data)))
# 随机
np.random.shuffle(idxs)
indices, segments, Y = [], [], []
for i in idxs:
d = self.data[i]
# 截取数据
text = d[0][:maxlen]
if self.print_text:
print(text)
# 生成指标及段
indice, segment = tokenizer.encode(first=text)
y = d[1]
# 数据放入数组中
indices.append(indice)
segments.append(segment)
Y.append([y])
# 转化成批次
if len(indices) == self.batch_size or i == idxs[-1]:
indices = seq_padding(indices)
segments = seq_padding(segments)
Y = seq_padding(Y)
# 产生词向量
x = self.bert_model.predict([np.array(indices), np.array(segments)])
j_idxs = list(range(len(x)))
for j in j_idxs:
data_x.append(x[j])
data_y.append(Y[j])
print(len(data_y))
[indices, segments, Y] = [], [], []
return np.array(data_x), np.array(data_y)
# 定义二分类网络
x_in = Input(shape=(None, 768))
x = Lambda(lambda x: x[:, 0])(x_in)
p = Dense(1, activation='sigmoid')(x)
model = Model(x_in, p)
model.compile(
loss='binary_crossentropy',
optimizer=Adam(1e-5), # 用足够小的学习率
metrics=['accuracy']
)
# 打印模型结构
model.summary()
# 开始训练
print('Training -----------')
train_T = data_token_generator(train_data)
train_x, train_y = train_T.get_data()
valid_T = data_token_generator(valid_data)
validation_data = valid_T.get_data()
model.fit(
train_x,
train_y,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
validation_data=validation_data
)
model.save('new_model.h5')
# 加载模型验证
import keras
test_T = data_token_generator(valid_data[0:10], print_text=True)
X_test, Y_test = test_T.get_data()
print(Y_test)
new_model = keras.models.load_model('new_model.h5')
y = new_model.predict(X_test)
print(y)
采用哈工大版权重,准确率在80%左右
相关依赖
- keras_bert https://github.com/CyberZHG/keras-bert
中文版权重
NLP采用Bert进行简单文本情感分类的更多相关文章
- NLP之基于TextCNN的文本情感分类
TextCNN @ 目录 TextCNN 1.理论 1.1 基础概念 最大汇聚(池化)层: 1.2 textCNN模型结构 2.实验 2.1 实验步骤 2.2 算法模型 1.理论 1.1 基础概念 在 ...
- 基于Bert的文本情感分类
详细代码已上传到github: click me Abstract: Sentiment classification is the process of analyzing and reaso ...
- NLP文本情感分类传统模型+深度学习(demo)
文本情感分类: 文本情感分类(一):传统模型 摘自:http://spaces.ac.cn/index.php/archives/3360/ 测试句子:工信处女干事每月经过下属科室都要亲口交代24口交 ...
- NLP之基于Bi-LSTM和注意力机制的文本情感分类
Bi-LSTM(Attention) @ 目录 Bi-LSTM(Attention) 1.理论 1.1 文本分类和预测(翻译) 1.2 注意力模型 1.2.1 Attention模型 1.2.2 Bi ...
- 文本情感分类:分词 OR 不分词(3)
为什么要用深度学习模型?除了它更高精度等原因之外,还有一个重要原因,那就是它是目前唯一的能够实现“端到端”的模型.所谓“端到端”,就是能够直接将原始数据和标签输入,然后让模型自己完成一切过程——包括特 ...
- pytorch 文本情感分类和命名实体识别NER中LSTM输出的区别
文本情感分类: 文本情感分类采用LSTM的最后一层输出 比如双层的LSTM,使用正向的最后一层和反向的最后一层进行拼接 def forward(self,input): ''' :param inpu ...
- kaggle之电影评论文本情感分类
电影文本情感分类 Github地址 Kaggle地址 这个任务主要是对电影评论文本进行情感分类,主要分为正面评论和负面评论,所以是一个二分类问题,二分类模型我们可以选取一些常见的模型比如贝叶斯.逻辑回 ...
- NLP(二十二)利用ALBERT实现文本二分类
在文章NLP(二十)利用BERT实现文本二分类中,笔者介绍了如何使用BERT来实现文本二分类功能,以判别是否属于出访类事件为例子.但是呢,利用BERT在做模型预测的时候存在预测时间较长的问题.因此 ...
- 基于 Spark 的文本情感分析
转载自:https://www.ibm.com/developerworks/cn/cognitive/library/cc-1606-spark-seniment-analysis/index.ht ...
随机推荐
- 网络通信框架之volley
介绍 我们平时在开发Android应用的时候不可避免地都需要用到网络技术,而多数情况下应用程序都会使用HTTP协议来发送和接收网络数据.Android系统中主要提供了两种方式来进行HTTP通信,Htt ...
- 阶段3 3.SpringMVC·_03.SpringMVC常用注解_6 CookieValue注解
演示 访问服务器会有session.它是一cookie的形式返回给客户端的 拿到的值
- image-webpack-loader在mac或ubuntu报错
解决办法安装libpng库,在github issue https://github.com/tcoopman/image-webpack-loader/issues/49可查看 mac: brew ...
- RESTful 介绍
什么是RESTful?一种软件架构风格.设计风格,而不是标准,只是提供了一组设计原则和约束条件.它主要用于客户端和服务端交互类的软件.基于这个风格设计的软件可以更简洁,更有层次,更易于实现缓存等机制. ...
- EncryptHelper加密对象-工具类
using System; using System.IO; using System.Security.Cryptography; using System.Text; using System.W ...
- JavaScript 基础入门11 - 运动框架的封装
目录 JavaScript 运动原理 运动基础 简单运动的封装 淡入淡出 不同属性的设置 多属性值同时运动 运动回调,链式运动 缓冲运动 加入缓冲的运动框架 案例1 多图片展开收缩 运动的留言本 Ja ...
- react navigation goBack()返回到任意页面(不集成redux) 一
方案一: 一.适用场景:在app端开发的时候,相反回到某一个页面的时候保持跳转页面的所有状态不更新,也就是说不触发新的生命周期. 例如:A——>B——>C——>D 要想从D页面直接返 ...
- 阿里云Zabbix安装实践过程
1.配置阿里云zabbix yum源 [root@VM_0_8_centos ~]# rpm -ivh https://mirrors.aliyun.com/zabbix/zabbix/3.0/rhe ...
- Tensorflow实战第十一课(RNN Regression 回归例子 )
本节我们会使用RNN来进行回归训练(Regression),会继续使用自己创建的sin曲线预测一条cos曲线. 首先我们需要先确定RNN的各种参数: import tensorflow as tf i ...
- TCP/IP 物理层卷二 -- 交换技术
一.概念 交换技术是指各台主机之间.各通信设备之间或者主机和通信设备之间(简单理解:你的PC和我的PC之间.你的PC和我的路由器.路由器之间)为交换信息所采用的的数据格式和交换装置的方式. 二.交换技 ...