参照《当Bert遇上Keras》(https://spaces.ac.cn/archives/6736),此示例准确率达到 95.5% 以上。

https://github.com/CyberZHG/keras-bert/blob/master/README.zh-CN.md

示例实现

# ! -*- coding:utf-8 -*-

import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs

# Maximum number of characters kept from each review text.
maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'

# Vocabulary: every stripped line of the vocab file is mapped to the size of
# the dictionary at the moment the line is read.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)


class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def __init__(self, token_dict):
        super(OurTokenizer, self).__init__(token_dict)

    def _tokenize(self, text):
        """Return the list of tokens for ``text``, one per character."""

        def to_token(ch):
            if ch in self._token_dict:
                return ch
            if self._is_space(ch):
                # Whitespace-like characters use the untrained [unused1] token.
                return '[unused1]'
            # Every remaining character becomes [UNK].
            return '[UNK]'

        return [to_token(ch) for ch in text]


tokenizer = OurTokenizer(token_dict)

neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

# Column 0 of each sheet holds the text; negatives get label 0, positives 1.
data = [(text, 0) for text in neg[0]] + [(text, 1) for text in pos[0]]

# Split into training and validation sets at a 9:1 ratio: every tenth
# position of the shuffled order goes to validation.
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data, valid_data = [], []
for position, sample_idx in enumerate(random_order):
    bucket = valid_data if position % 10 == 0 else train_data
    bucket.append(data[sample_idx])


def seq_padding(X, padding=0):
    """Right-pad every sequence in ``X`` to the longest length and stack them.

    Sequences already at the longest length are used as they are; shorter
    ones are extended with ``padding``. The result is wrapped in ``np.array``.
    """
    longest = max(len(seq) for seq in X)
    rows = []
    for seq in X:
        missing = longest - len(seq)
        if missing > 0:
            rows.append(np.concatenate([seq, [padding] * missing]))
        else:
            rows.append(seq)
    return np.array(rows)


class data_generator:
    """Endless, reshuffling batch iterator over ``(text, label)`` pairs."""

    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        # Batches per pass over the data, counting a final partial batch.
        full, leftover = divmod(len(self.data), self.batch_size)
        self.steps = (full + 1) if leftover else full

    def __len__(self):
        return self.steps

    def __iter__(self):
        """Yield ``([X1, X2], Y)`` batches forever, reshuffling on every pass."""
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            first_inputs, second_inputs, labels = [], [], []
            for i in idxs:
                sample = self.data[i]
                # Truncate the text before encoding it with the tokenizer.
                encoded_a, encoded_b = tokenizer.encode(first=sample[0][:maxlen])
                first_inputs.append(encoded_a)
                second_inputs.append(encoded_b)
                labels.append([sample[1]])
                # Emit when the batch is full or the last sample was consumed.
                if len(first_inputs) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(first_inputs)
                    X2 = seq_padding(second_inputs)
                    Y = seq_padding(labels)
                    yield [X1, X2], Y
                    first_inputs, second_inputs, labels = [], [], []


from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# Freeze all BERT layers; only the layers added below are trained.
for layer in bert_model.layers:
    layer.trainable = False

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
# Keep only the vector at sequence position 0
# (presumably the [CLS] position - confirm against keras_bert's encode()).
x = Lambda(lambda seq: seq[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy'],
)
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

# Single-sample generator; built here but not used below.
test = [train_data[0]]
test_D = data_generator(test)

model.fit_generator(
    iter(train_D),
    steps_per_epoch=len(train_D),
    epochs=1,
    validation_data=iter(valid_D),
    validation_steps=len(valid_D),
)

# Save the trained model to disk.
model.save('model.h5')

原示例存在的问题

模型在保存完之后再进行加载时,提示存在自定义层和激活方法的问题,暂没找到解决办法,如有知道办法的小伙伴请留言私信。

问题解决

# ! -*- coding:utf-8 -*-

import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_custom_objects
import re, os
import codecs
from keras.models import load_model

# Maximum number of characters kept from each review text.
maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'

# Vocabulary: every stripped line of the vocab file is mapped to the size of
# the dictionary at the moment the line is read.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)


class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def __init__(self, token_dict):
        super(OurTokenizer, self).__init__(token_dict)

    def _tokenize(self, text):
        """Return the list of tokens for ``text``, one per character."""

        def to_token(ch):
            if ch in self._token_dict:
                return ch
            if self._is_space(ch):
                # Whitespace-like characters use the untrained [unused1] token.
                return '[unused1]'
            # Every remaining character becomes [UNK].
            return '[UNK]'

        return [to_token(ch) for ch in text]


tokenizer = OurTokenizer(token_dict)

neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

# Column 0 of each sheet holds the text; negatives get label 0, positives 1.
data = [(text, 0) for text in neg[0]] + [(text, 1) for text in pos[0]]

# Split into training and validation sets at a 9:1 ratio: every tenth
# position of the shuffled order goes to validation.
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data, valid_data = [], []
for position, sample_idx in enumerate(random_order):
    bucket = valid_data if position % 10 == 0 else train_data
    bucket.append(data[sample_idx])


def seq_padding(X, padding=0):
    """Right-pad every sequence in ``X`` to the longest length and stack them.

    Sequences already at the longest length are used as they are; shorter
    ones are extended with ``padding``. The result is wrapped in ``np.array``.
    """
    longest = max(len(seq) for seq in X)
    rows = []
    for seq in X:
        missing = longest - len(seq)
        if missing > 0:
            rows.append(np.concatenate([seq, [padding] * missing]))
        else:
            rows.append(seq)
    return np.array(rows)


class data_generator:
    """Endless, reshuffling batch iterator over ``(text, label)`` pairs."""

    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        # Batches per pass over the data, counting a final partial batch.
        full, leftover = divmod(len(self.data), self.batch_size)
        self.steps = (full + 1) if leftover else full

    def __len__(self):
        return self.steps

    def __iter__(self):
        """Yield ``([X1, X2], Y)`` batches forever, reshuffling on every pass."""
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            first_inputs, second_inputs, labels = [], [], []
            for i in idxs:
                sample = self.data[i]
                # Truncate the text before encoding it with the tokenizer.
                encoded_a, encoded_b = tokenizer.encode(first=sample[0][:maxlen])
                first_inputs.append(encoded_a)
                second_inputs.append(encoded_b)
                labels.append([sample[1]])
                # Emit when the batch is full or the last sample was consumed.
                if len(first_inputs) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(first_inputs)
                    X2 = seq_padding(second_inputs)
                    Y = seq_padding(labels)
                    yield [X1, X2], Y
                    first_inputs, second_inputs, labels = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# Freeze all BERT layers; only the layers added below are trained.
for layer in bert_model.layers:
    layer.trainable = False

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])

print(bert_model.layers)

# Keep only the vector at sequence position 0
# (presumably the [CLS] position - confirm against keras_bert's encode()).
x = Lambda(lambda seq: seq[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy'],
)
model.summary()
train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

# Training is disabled by wrapping it in a string literal. It writes
# 'save_path.h5', the file that the loading code later in this script reads.
'''
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=5,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)

model.save('save_path.h5')
'''

# Helper that tokenizes a whole dataset into padded model inputs
class data_token_generator:
    """Tokenize a list of ``(text, label)`` pairs into padded inputs in one pass."""

    def __init__(self, data, batch_size=32):
        """Store the samples and derive the number of batches they span.

        data: sequence of ``(text, label)`` pairs.
        batch_size: batch size used only to compute ``steps``.
        """
        self.data = data
        self.batch_size = batch_size
        # Batches needed to cover the data, rounded up. The floor division had
        # been reduced to a trailing comment, which left ``steps`` equal to the
        # sample count instead of the batch count.
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def get_data(self):
        """Return ``(X1, X2, Y)`` for all samples, in shuffled order.

        X1 and X2 are the two padded outputs of ``tokenizer.encode``; Y holds
        one single-element label row per sample.
        """
        idxs = list(range(len(self.data)))
        np.random.shuffle(idxs)
        X1, X2, Y = [], [], []
        for i in idxs:
            d = self.data[i]
            # Truncate the text before encoding it.
            text = d[0][:maxlen]
            print(text)
            x1, x2 = tokenizer.encode(first=text)
            y = d[1]
            X1.append(x1)
            X2.append(x2)
            Y.append([y])
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        Y = seq_padding(Y)
        return X1, X2, Y


# Passing keras_bert's custom objects lets load_model rebuild the saved model;
# this is the fix for the loading error seen with the plain load.
new_model = load_model('save_path.h5', custom_objects=get_custom_objects())
test_T = data_token_generator(valid_data[0:10])
X_test1, X_test2, Y_test = test_T.get_data()
print(Y_test)
print(new_model.predict([X_test1, X_test2]))

我的实现

# ! -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths
import codecs
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam # 评价文本最大长度
# Maximum number of characters kept from each review text.
maxlen = 100
dict_path = 'model/vocab.txt'
token_dict = {}
EPOCHS = 30
BATCH_SIZE = 128

# Initialise the token dictionary: every stripped line of the vocab file is
# mapped to the size of the dictionary at the moment the line is read.
with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)

# Define the tokenizer
class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Return the list of tokens for ``text``, one per character."""

        def to_token(ch):
            if ch in self._token_dict:
                return ch
            if self._is_space(ch):
                # Whitespace-like characters use the untrained [unused1] token.
                return '[unused1]'
            # Every remaining character becomes [UNK].
            return '[UNK]'

        return [to_token(ch) for ch in text]


# Instantiate the tokenizer
tokenizer = OurTokenizer(token_dict)

# Load the dataset
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

# Column 0 of each sheet holds the text; negatives get label 0, positives 1.
data = [(text, 0) for text in neg[0]] + [(text, 1) for text in pos[0]]

# Split into training and validation sets at a 9:1 ratio: every tenth
# position of the shuffled order goes to validation.
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data, valid_data = [], []
for position, sample_idx in enumerate(random_order):
    bucket = valid_data if position % 10 == 0 else train_data
    bucket.append(data[sample_idx])

# Pad token sequences to a common length
def seq_padding(X, padding=0):
    """Right-pad every sequence in ``X`` to the length of the longest one.

    Returns a plain list: sequences already at the longest length are passed
    through unchanged, shorter ones become numpy arrays extended with
    ``padding``.
    """
    longest = max(len(seq) for seq in X)
    padded = []
    for seq in X:
        missing = longest - len(seq)
        if missing > 0:
            padded.append(np.concatenate([seq, [padding] * missing]))
        else:
            padded.append(seq)
    return padded


# Helper that turns the dataset into model-ready arrays
class data_token_generator:
    """Turn ``(text, label)`` pairs into BERT output features plus labels.

    Every instance loads its own BERT model from the ``model`` directory and
    uses it only through ``predict`` in ``get_data``.
    """

    def __init__(self, data, batch_size=32, print_text=False):
        """Store the samples and load the BERT model.

        data: sequence of ``(text, label)`` pairs.
        batch_size: number of samples sent through BERT per ``predict`` call.
        print_text: when true, print each truncated text as it is processed.
        """
        self.data = data
        self.batch_size = batch_size
        # Batches needed to cover the data, rounded up. The floor division had
        # been reduced to a trailing comment, which left ``steps`` equal to the
        # sample count instead of the batch count.
        self.steps = self._count_steps(len(self.data), self.batch_size)
        self.print_text = print_text
        # Locate the Chinese BERT model files
        paths = get_checkpoint_paths('model')
        # Load the Chinese BERT model
        self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=None)
        for layer in self.bert_model.layers:
            layer.trainable = True

    @staticmethod
    def _count_steps(n_samples, batch_size):
        """Return how many batches of ``batch_size`` are needed for ``n_samples``."""
        full, leftover = divmod(n_samples, batch_size)
        return full + 1 if leftover else full

    def __len__(self):
        return self.steps

    def get_data(self):
        """Run every sample through BERT and return ``(features, labels)`` arrays."""
        data_x = []
        data_y = []
        idxs = list(range(len(self.data)))
        # Process the samples in random order
        np.random.shuffle(idxs)
        indices, segments, Y = [], [], []
        for i in idxs:
            d = self.data[i]
            # Truncate the text
            text = d[0][:maxlen]
            if self.print_text:
                print(text)
            # Encode the text into token indices and segment ids
            indice, segment = tokenizer.encode(first=text)
            y = d[1]
            indices.append(indice)
            segments.append(segment)
            Y.append([y])
            # Flush when the batch is full or the last sample was consumed
            if len(indices) == self.batch_size or i == idxs[-1]:
                indices = seq_padding(indices)
                segments = seq_padding(segments)
                Y = seq_padding(Y)
                # Compute the BERT output for this batch
                x = self.bert_model.predict([np.array(indices), np.array(segments)])
                # NOTE(review): each batch is padded only to its own longest
                # sequence, so rows from different batches may differ in
                # length and np.array(data_x) may not form a regular array.
                # Confirm, and consider padding to a fixed length.
                data_x.extend(x)
                data_y.extend(Y)
                # Progress: number of samples processed so far
                print(len(data_y))
                indices, segments, Y = [], [], []
        return np.array(data_x), np.array(data_y)


# Define the binary classification network
# The classifier takes sequences of 768-dimensional vectors as input and
# reads only the vector at sequence position 0.
x_in = Input(shape=(None, 768))
x = Lambda(lambda seq: seq[:, 0])(x_in)
p = Dense(1, activation='sigmoid')(x)

model = Model(x_in, p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy'],
)
# Print the model structure
model.summary()

# Start training
print('Training -----------')

train_T = data_token_generator(train_data)
train_x, train_y = train_T.get_data()
valid_T = data_token_generator(valid_data)
validation_data = valid_T.get_data()
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=validation_data,
)

model.save('new_model.h5')

# Reload the saved model and check it on ten validation samples
import keras

test_T = data_token_generator(valid_data[0:10], print_text=True)
X_test, Y_test = test_T.get_data()
print(Y_test)
new_model = keras.models.load_model('new_model.h5')
y = new_model.predict(X_test)
print(y)

采用哈工大版权重,准确率在80%左右

相关依赖

中文版权重

NLP采用Bert进行简单文本情感分类的更多相关文章

  1. NLP之基于TextCNN的文本情感分类

    TextCNN @ 目录 TextCNN 1.理论 1.1 基础概念 最大汇聚(池化)层: 1.2 textCNN模型结构 2.实验 2.1 实验步骤 2.2 算法模型 1.理论 1.1 基础概念 在 ...

  2. 基于Bert的文本情感分类

    详细代码已上传到github: click me Abstract:    Sentiment classification is the process of analyzing and reaso ...

  3. NLP文本情感分类传统模型+深度学习(demo)

    文本情感分类: 文本情感分类(一):传统模型 摘自:http://spaces.ac.cn/index.php/archives/3360/ 测试句子:工信处女干事每月经过下属科室都要亲口交代24口交 ...

  4. NLP之基于Bi-LSTM和注意力机制的文本情感分类

    Bi-LSTM(Attention) @ 目录 Bi-LSTM(Attention) 1.理论 1.1 文本分类和预测(翻译) 1.2 注意力模型 1.2.1 Attention模型 1.2.2 Bi ...

  5. 文本情感分类:分词 OR 不分词(3)

    为什么要用深度学习模型?除了它更高精度等原因之外,还有一个重要原因,那就是它是目前唯一的能够实现“端到端”的模型.所谓“端到端”,就是能够直接将原始数据和标签输入,然后让模型自己完成一切过程——包括特 ...

  6. pytorch 文本情感分类和命名实体识别NER中LSTM输出的区别

    文本情感分类: 文本情感分类采用LSTM的最后一层输出 比如双层的LSTM,使用正向的最后一层和反向的最后一层进行拼接 def forward(self,input): ''' :param inpu ...

  7. kaggle之电影评论文本情感分类

    电影文本情感分类 Github地址 Kaggle地址 这个任务主要是对电影评论文本进行情感分类,主要分为正面评论和负面评论,所以是一个二分类问题,二分类模型我们可以选取一些常见的模型比如贝叶斯.逻辑回 ...

  8. NLP(二十二)利用ALBERT实现文本二分类

      在文章NLP(二十)利用BERT实现文本二分类中,笔者介绍了如何使用BERT来实现文本二分类功能,以判别是否属于出访类事件为例子.但是呢,利用BERT在做模型预测的时候存在预测时间较长的问题.因此 ...

  9. 基于 Spark 的文本情感分析

    转载自:https://www.ibm.com/developerworks/cn/cognitive/library/cc-1606-spark-seniment-analysis/index.ht ...

随机推荐

  1. oracle中删除某个用户下的所有表

    一般的方法:先使用sql查询: SELECT 'DELETE FROM '|| table_name || ';' FROM USER_TABLES ORDER BY TABLE_NAME; 将查询结 ...

  2. python汉字编解码问题

    http://www.cnblogs.com/rollenholt/archive/2011/08/01/2123889.html

  3. kafka Windows安装

    1:安装JDK. 2:安装Zookeeper 下载地址:https://zookeeper.apache.org/releases.html 下载后,解压放在目录D:\bigdata(本文所用的目录) ...

  4. 联想 ThinkPad 笔记本 Fn 键 关闭与启用方法

    联想 ThinkPad 笔记本 Fn 键 关闭与启用方法 [最快捷的方式] 按 Fn + Esc 键,进行切换启用或者关闭 Fn 功能键 So easy!!! ^_^

  5. OAUTH协议介绍

    OAUTH协议为用户资源的授权提供了一个安全的.开放而又简易的标准.与以往的授权方式不同之处是OAUTH的授权不会使第三方触及到用户的帐号信息(如用户名与密码),即第三方无需使用用户的用户名与密码就可 ...

  6. POJO是什么,javabean是什么,以及POJO与javabean的区别

    POJO(Plain Ordinary Java Object)简单的Java对象,实际就是普通JavaBeans,是为了避免和EJB混淆所创造的简称.使用POJO名称是为了避免和EJB混淆起来, 而 ...

  7. 【Linux】【三】linux 复制文件到指定目录

    将  application/file/test/logs/ 下的文件 logs.log , logs.tar 复制到  application/file/test/tools/ 下,并新建文件夹[l ...

  8. 在Debian下利用URLOS快速安装SqlServer2017

    SqlServer能在Debian上安装吗?答案是可以!网络上也能找到很多Linux系统下安装SqlServer的相关文章,也许经过一些折腾,你也能成功在Debian中安装sqlserver,但是其中 ...

  9. 前端数据Mock

    参考链接:https://www.clloz.com/programming/front-end/js/2019/05/10/data-mock/?utm_medium=hao.caibaojian. ...

  10. Laravel验证问题记录

    1.当购物车提交时,POST传来一个对象{address:2,item:{ {ksu_id:2,count:2},{ksu_id:2,count:2}, } 验证方法: public function ...