# cut_sentence.py
"""
Sentence tokenization.

Modes:
1. character-level tokenization (runs of ASCII lowercase letters, e.g.
   "python", are kept together as one token)
2. word-level tokenization with jieba (custom user dict, optional
   stopword filtering, optional POS tags)
"""
import logging
import string

import jieba
import jieba.posseg as psg

stopwords_path = "../corpus/stopwords.txt"
# BUGFIX: close the stopwords file (the original leaked the handle)
with open(stopwords_path, encoding="utf-8") as f:
    stopwords = [i.strip() for i in f.readlines()]

# silence jieba's start-up logging
jieba.setLogLevel(logging.INFO)
# load the domain keyword dictionary
jieba.load_userdict("../corpus/keywords.txt")

# characters that are merged into a single token during char-level cutting
continue_words = string.ascii_lowercase


def _cut_sentence_by_word(sentence):
    """
    Cut a sentence into single characters, keeping runs of ASCII
    lowercase letters together, e.g. "python 可 以 做 人 工 智 能 么 ?".

    :param sentence: str
    :return: [str, str, ...]
    """
    temp = ""
    result = []
    for word in sentence:
        if word in continue_words:
            temp += word
        else:
            if len(temp) > 0:
                result.append(temp)
                temp = ""
            result.append(word)
    # flush a trailing letter run, if any
    if len(temp) > 0:
        result.append(temp)
    return result


def _cut_sentence(sentence, use_stopwords, use_seg):
    """
    Cut a sentence into words with jieba.

    :param sentence: str
    :param use_stopwords: bool, drop stopwords from the result
    :param use_seg: bool, return (word, POS-flag) tuples instead of words
    :return: [str, ...] or [(str, str), ...]
    """
    if not use_seg:
        result = jieba.lcut(sentence)
    else:
        result = [(i.word, i.flag) for i in psg.cut(sentence)]
    if use_stopwords:
        if not use_seg:
            result = [i for i in result if i not in stopwords]
        else:
            # result items are (word, flag); filter on the word
            result = [i for i in result if i[0] not in stopwords]
    return result


def cut(sentence, by_word=False, use_stopwords=False, use_seg=False):
    """
    Public tokenization entry point wrapping the helpers above.

    :param sentence: str
    :param by_word: bool, cut into single characters
    :param use_stopwords: bool, drop stopwords (word mode only)
    :param use_seg: bool, also return POS tags (word mode only)
    :return: [str, ...] or [(str, flag), ...]
    """
    sentence = sentence.lower()
    if by_word:
        return _cut_sentence_by_word(sentence)
    else:
        return _cut_sentence(sentence, use_stopwords, use_seg)

word_sequence.py

"""
文本序列化
""" class WordSequence:
UNK_TAG = "<UNK>" #表示未知字符
PAD_TAG = "<PAD>" #填充符
SOS_TAG = "<SOS>"
EOS_TAG = "<EOS>"
PAD = 0
UNK = 1
SOS = 2
EOS = 3 def __init__(self):
self.dict = { #保存词语和对应的数字
self.UNK_TAG:self.UNK,
self.PAD_TAG:self.PAD,
self.SOS_TAG:self.SOS,
self.EOS_TAG:self.EOS
}
self.count = {} #统计词频的 def fit(self,sentence):
"""
接受句子,统计词频
:param sentence:[str,str,str]
:return:None
"""
for word in sentence:
self.count[word] = self.count.get(word,0) + 1 #所有的句子fit之后,self.count就有了所有词语的词频 def build_vocab(self,min_count=5,max_count=None,max_features=None):
"""
根据条件构造 词典
:param min_count:最小词频
:param max_count: 最大词频
:param max_features: 最大词语数
:return:
"""
if min_count is not None:
self.count = {word:count for word,count in self.count.items() if count >= min_count}
if max_count is not None:
self.count = {word:count for word,count in self.count.items() if count <= max_count}
if max_features is not None:
#[(k,v),(k,v)....] --->{k:v,k:v}
self.count = dict(sorted(self.count.items(),lambda x:x[-1],reverse=True)[:max_features]) for word in self.count:
self.dict[word] = len(self.dict) #每次word对应一个数字 #把dict进行翻转
self.inverse_dict = dict(zip(self.dict.values(),self.dict.keys())) def transform(self,sentence,max_len=None,add_eos=False):
"""
把句子转化为数字序列
:param sentence:[str,str,str]
:return: [int,int,int]
"""
if add_eos and max_len is not None:
max_len = max_len-1 if len(sentence) > max_len:
sentence = sentence[:max_len]
else:
sentence = sentence + [self.PAD_TAG] *(max_len- len(sentence)) #填充PAD if add_eos:
if self.PAD_TAG in sentence:
index = sentence.index(self.PAD_TAG)
sentence.insert(index,self.EOS_TAG)
else:
sentence += [self.EOS_TAG] return [self.dict.get(i,1) for i in sentence] def inverse_transform(self,incides):
"""
把数字序列转化为字符
:param incides: [int,int,int]
:return: [str,str,str]
"""
result = []
for i in incides:
temp = self.inverse_dict.get(i, "<UNK>")
if temp != self.EOS_TAG:
result.append(temp)
else:
break
return "".join(result) def __len__(self):
return len(self.dict) if __name__ == '__main__':
sentences = [["今天","天气","很","好"],
["今天","去","吃","什么"]]
ws = WordSequence()
for sentence in sentences:
ws.fit(sentence)
ws.build_vocab(min_count=1)
print(ws.dict)
ret = ws.transform(["好","好","好","好","好","好","好","热","呀"],max_len=3)
print(ret)
ret = ws.inverse_transform(ret)
print(ret)
pass

  

# dataset.py
"""
Dataset preparation for the chatbot: corpus split, Dataset, DataLoader.
"""
import random

import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import config


# 1. split the corpus into train / test sets
def chatbot_data_split():
    """Randomly split input/target line pairs ~80/20 into train/test files."""
    # BUGFIX: all six files are now closed via context managers
    # (the original never closed any of them)
    with open("../corpus/chatbot/input.txt", encoding="utf-8") as f:
        input_lines = f.readlines()
    with open("../corpus/chatbot/target.txt", encoding="utf-8") as f:
        target_lines = f.readlines()
    # NOTE(review): "a" (append) mode means re-running this duplicates data;
    # confirm whether "w" is intended before changing it.
    with open("../corpus/chatbot/train_input.txt", "a", encoding="utf-8") as f_train_input, \
         open("../corpus/chatbot/train_target.txt", "a", encoding="utf-8") as f_train_target, \
         open("../corpus/chatbot/test_input.txt", "a", encoding="utf-8") as f_test_input, \
         open("../corpus/chatbot/test_target.txt", "a", encoding="utf-8") as f_test_target:
        # renamed loop vars: the original shadowed the builtins input/target
        for inp, tgt in tqdm(zip(input_lines, target_lines), total=len(input_lines)):
            if random.random() > 0.8:
                # ~20% goes to the test set
                f_test_input.write(inp)
                f_test_target.write(tgt)
            else:
                f_train_input.write(inp)
                f_train_target.write(tgt)


# 2. dataset
class ChatDataset(Dataset):
    def __init__(self, train=True):
        input_path = "../corpus/chatbot/train_input.txt" if train else "../corpus/chatbot/test_input.txt"
        target_path = "../corpus/chatbot/train_target.txt" if train else "../corpus/chatbot/test_target.txt"
        with open(input_path, encoding="utf-8") as f:
            self.input_data = f.readlines()
        with open(target_path, encoding="utf-8") as f:
            self.target_data = f.readlines()
        assert len(self.input_data) == len(self.target_data), "input target长度不一致!!!"

    def __getitem__(self, idx):
        inp = self.input_data[idx].strip().split()
        tgt = self.target_data[idx].strip().split()
        # true (un-padded) lengths, capped at the configured maxima
        input_len = len(inp) if len(inp) < config.chatbot_input_max_len else config.chatbot_input_max_len
        target_len = len(tgt) if len(tgt) < config.chatbot_target_max_len else config.chatbot_target_max_len
        inp = config.input_ws.transform(inp, max_len=config.chatbot_input_max_len)
        tgt = config.target_ws.transform(tgt, max_len=config.chatbot_target_max_len, add_eos=True)
        return inp, tgt, input_len, target_len

    def __len__(self):
        return len(self.input_data)


# 3. dataloader
def collate_fn(batch):
    """
    Collate one batch of __getitem__ results into LongTensors.

    :param batch: [(input, target, input_len, target_len), ...]
    :return: (input, target, input_len, target_len) as LongTensors
    """
    # sort by input length, descending — pack_padded_sequence needs this order
    batch = sorted(batch, key=lambda x: x[-2], reverse=True)
    inputs, targets, input_len, target_len = zip(*batch)
    inputs = torch.LongTensor(inputs)
    targets = torch.LongTensor(targets)
    input_len = torch.LongTensor(input_len)
    target_len = torch.LongTensor(target_len)
    return inputs, targets, input_len, target_len


def get_dataloader(train=True):
    """Build a shuffled DataLoader over the train or test split."""
    batch_size = config.chatbot_train_batch_size if train else config.chatbot_test_batch_size
    return DataLoader(ChatDataset(train), batch_size=batch_size, collate_fn=collate_fn, shuffle=True)


if __name__ == '__main__':
    loader = get_dataloader()
    for idx, (inputs, targets, input_len, target_len) in enumerate(loader):
        print(idx)
        print(inputs)
        print(targets)
        print(input_len)
        print(target_len)
        break

# config.py
"""
Project configuration.
"""
import pickle

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = ("cpu")

# ############### classify related ###############
predict_ratio = 0.98  # prediction-probability threshold

# ############### chatbot related ################
chatbot_train_batch_size = 400
chatbot_test_batch_size = 500

# pre-fitted WordSequence vocabularies for encoder input / decoder target
# BUGFIX: the pickle files are now closed (original leaked both handles)
with open("../chatbot/models/ws_input.pkl", "rb") as _f:
    input_ws = pickle.load(_f)
with open("../chatbot/models/ws_target.pkl", "rb") as _f:
    target_ws = pickle.load(_f)

chatbot_input_max_len = 20
chatbot_target_max_len = 30

chatbot_encoder_embedding_dim = 300
chatbot_encoder_hidden_size = 128
chatbot_encoder_number_layer = 2
chatbot_encoder_bidirectional = True
chatbot_encoder_dropout = 0.3

chatbot_decoder_embedding_dim = 300
# encoder is bidirectional with hidden_size 128, so its concatenated
# hidden state is 128*2 — the decoder must match it
chatbot_decoder_hidden_size = 128 * 2
chatbot_decoder_number_layer = 1
chatbot_decoder_dropout = 0

# encoder.py
"""
Encoder for the seq2seq chatbot.
"""
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import config


class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=len(config.input_ws),
            embedding_dim=config.chatbot_encoder_embedding_dim,
            padding_idx=config.input_ws.PAD,
        )
        # 2 bidirectional layers, hidden_size 128 each
        self.gru = nn.GRU(
            input_size=config.chatbot_encoder_embedding_dim,
            hidden_size=config.chatbot_encoder_hidden_size,
            num_layers=config.chatbot_encoder_number_layer,
            batch_first=True,
            bidirectional=config.chatbot_encoder_bidirectional,
            dropout=config.chatbot_encoder_dropout,
        )

    def forward(self, input, input_len):
        """
        Encode a padded batch of token-id sequences.

        :param input: [batch_size, max_len] LongTensor of token ids
        :param input_len: true sequence lengths (batch sorted descending
            by length — see collate_fn in dataset.py)
        :return: (padded GRU outputs, sentence encoding [1, batch_size, 128*2])
        """
        embedded = self.embedding(input)
        # pack so the GRU skips the padded positions
        packed = pack_padded_sequence(embedded, input_len, batch_first=True)
        output, hidden = self.gru(packed)
        # unpack back into a padded tensor
        output_paded, seq_len = pad_packed_sequence(
            output, batch_first=True, padding_value=config.input_ws.PAD
        )
        # concatenate the top layer's forward and backward final hidden
        # states as a single sentence representation
        encoder_hidden = torch.cat([hidden[-2], hidden[-1]], dim=-1).unsqueeze(0)  # [1, batch, 128*2]
        return output_paded, encoder_hidden

  decoder.py

"""
实现解码器
"""
import torch.nn as nn
import config
import torch
import torch.nn.functional as F
import numpy as np
import random class Decoder(nn.Module):
def __init__(self):
super(Decoder,self).__init__() self.embedding = nn.Embedding(num_embeddings=len(config.target_ws),
embedding_dim=config.chatbot_decoder_embedding_dim,
padding_idx=config.target_ws.PAD) #需要的hidden_state形状:[1,batch_size,64]
self.gru = nn.GRU(input_size=config.chatbot_decoder_embedding_dim,
hidden_size=config.chatbot_decoder_hidden_size,
num_layers=config.chatbot_decoder_number_layer,
bidirectional=False,
batch_first=True,
dropout=config.chatbot_decoder_dropout) #假如encoder的hidden_size=64,num_layer=1 encoder_hidden :[2,batch_sizee,64] self.fc = nn.Linear(config.chatbot_decoder_hidden_size,len(config.target_ws)) def forward(self, encoder_hidden,target):
# print("target size:",target.size())
#第一个时间步的输入的hidden_state
decoder_hidden = encoder_hidden #[1,batch_size,128*2]
#第一个时间步的输入的input
batch_size = encoder_hidden.size(1)
decoder_input = torch.LongTensor([[config.target_ws.SOS]]*batch_size).to(config.device) #[batch_size,1]
# print("decoder_input:",decoder_input.size()) #使用全为0的数组保存数据,[batch_size,max_len,vocab_size]
decoder_outputs = torch.zeros([batch_size,config.chatbot_target_max_len,len(config.target_ws)]).to(config.device) if random.random()>0.5: #teacher_forcing机制 for t in range(config.chatbot_target_max_len):
decoder_output_t,decoder_hidden = self.forward_step(decoder_input,decoder_hidden)
decoder_outputs[:,t,:] = decoder_output_t #获取当前时间步的预测值
value,index = decoder_output_t.max(dim=-1)
decoder_input = index.unsqueeze(-1) #[batch_size,1]
# print("decoder_input:",decoder_input.size())
else:
for t in range(config.chatbot_target_max_len):
decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
decoder_outputs[:, t, :] = decoder_output_t
#把真实值作为下一步的输入
decoder_input = target[:,t].unsqueeze(-1)
# print("decoder_input size:",decoder_input.size())
return decoder_outputs,decoder_hidden def forward_step(self,decoder_input,decoder_hidden):
'''
计算一个时间步的结果
:param decoder_input: [batch_size,1]
:param decoder_hidden: [1,batch_size,128*2]
:return:
''' decoder_input_embeded = self.embedding(decoder_input)
# print("decoder_input_embeded:",decoder_input_embeded.size()) #out:[batch_size,1,128*2]
#decoder_hidden :[1,bathc_size,128*2]
out,decoder_hidden = self.gru(decoder_input_embeded,decoder_hidden)
# print("decoder_hidden size:",decoder_hidden.size())
#out :【batch_size,1,hidden_size】 out_squeezed = out.squeeze(dim=1) #去掉为1的维度
out_fc = F.log_softmax(self.fc(out_squeezed),dim=-1) #[bathc_size,vocab_size]
# out_fc.unsqueeze_(dim=1) #[bathc_size,1,vocab_size]
# print("out_fc:",out_fc.size())
return out_fc,decoder_hidden def evaluate(self,encoder_hidden): # 第一个时间步的输入的hidden_state
decoder_hidden = encoder_hidden # [1,batch_size,128*2]
# 第一个时间步的输入的input
batch_size = encoder_hidden.size(1)
decoder_input = torch.LongTensor([[config.target_ws.SOS]] * batch_size).to(config.device) # [batch_size,1]
# print("decoder_input:",decoder_input.size()) # 使用全为0的数组保存数据,[batch_size,max_len,vocab_size]
decoder_outputs = torch.zeros([batch_size, config.chatbot_target_max_len, len(config.target_ws)]).to(
config.device) predict_result = []
for t in range(config.chatbot_target_max_len):
decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
decoder_outputs[:, t, :] = decoder_output_t # 获取当前时间步的预测值
value, index = decoder_output_t.max(dim=-1)
predict_result.append(index.cpu().detach().numpy()) #[[batch],[batch]...]
decoder_input = index.unsqueeze(-1) # [batch_size,1]
# print("decoder_input:",decoder_input.size())
# predict_result.append(decoder_input)
#把结果转化为ndarray,每一行是一条预测结果
predict_result = np.array(predict_result).transpose()
return decoder_outputs, predict_result

  seq2seq.py

"""
完成seq2seq模型
"""
import torch.nn as nn
from chatbot.encoder import Encoder
from chatbot.decoder import Decoder class Seq2Seq(nn.Module):
def __init__(self):
super(Seq2Seq,self).__init__()
self.encoder = Encoder()
self.decoder = Decoder() def forward(self, input,input_len,target):
encoder_outputs,encoder_hidden = self.encoder(input,input_len)
decoder_outputs,decoder_hidden = self.decoder(encoder_hidden,target)
return decoder_outputs def evaluate(self,input,input_len):
encoder_outputs, encoder_hidden = self.encoder(input, input_len)
decoder_outputs, predict_result = self.decoder.evaluate(encoder_hidden)
return decoder_outputs,predict_result

  train.py

"""
进行模型的训练
"""
import torch
import torch.nn.functional as F
from chatbot.seq2seq import Seq2Seq
from torch.optim import Adam
from chatbot.dataset import get_dataloader
from tqdm import tqdm
import config
import numpy as np
import pickle
from matplotlib import pyplot as plt
# from eval import eval model = Seq2Seq().to(config.device) optimizer = Adam(model.parameters()) loss_list = [] def train(epoch):
data_loader = get_dataloader(train=True)
bar = tqdm(data_loader,total=len(data_loader)) for idx,(input,target,input_len,target_len) in enumerate(bar):
input = input.to(config.device)
target = target.to(config.device)
input_len = input_len.to(config.device)
optimizer.zero_grad()
decoder_outputs = model(input,input_len,target) #[batch_Size,max_len,vocab_size]
loss = F.nll_loss(decoder_outputs.view(-1,len(config.target_ws)),target.view(-1),ignore_index=config.input_ws.PAD)
loss.backward()
optimizer.step()
loss_list.append(loss.item())
bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch,idx,np.mean(loss_list))) if idx%100 == 0:
torch.save(model.state_dict(),"../chatbot/models/model.pkl")
torch.save(optimizer.state_dict(),"../chatbot/models/optimizer.pkl")
pickle.dump(loss_list,open("../chatbot/models/loss_list.pkl","wb")) if __name__ == '__main__':
for i in range(5):
train(i)
# eval() # plt.figure(figsize=(50,8))
# plt.plot(range(len(loss_list)),loss_list)
# plt.show()

  eval.py

"""
进行模型的评估
""" import torch
import torch.nn.functional as F
from chatbot.dataset import get_dataloader
from tqdm import tqdm
import config
import numpy as np
import pickle
from chatbot.seq2seq import Seq2Seq def eval():
model = Seq2Seq().to(config.device)
model.eval()
model.load_state_dict(torch.load("./models/model.pkl")) loss_list = []
data_loader = get_dataloader(train=False)
bar = tqdm(data_loader,total=len(data_loader),desc="当前进行评估")
with torch.no_grad():
for idx,(input,target,input_len,target_len) in enumerate(bar):
input = input.to(config.device)
target = target.to(config.device)
input_len = input_len.to(config.device) decoder_outputs,predict_result = model.evaluate(input,input_len) #[batch_Size,max_len,vocab_size]
loss = F.nll_loss(decoder_outputs.view(-1,len(config.target_ws)),target.view(-1),ignore_index=config.input_ws.PAD)
loss_list.append(loss.item())
bar.set_description("idx:{} loss:{:.6f}".format(idx,np.mean(loss_list)))
print("当前的平均损失为:",np.mean(loss_list)) def interface():
from chatbot.cut_sentence import cut
import config
#加载模型
model = Seq2Seq().to(config.device)
model.eval()
model.load_state_dict(torch.load("./models/model.pkl")) #准备待预测的数据
while True:
origin_input =input("me>>:")
# if "你是谁" in origin_input or "你叫什么" in origin_input:
# result = "我是小智。"
# elif "你好" in origin_input or "hello" in origin_input:
# result = "Hello"
# else:
_input = cut(origin_input, by_word=True)
input_len = torch.LongTensor([len(_input)]).to(config.device)
_input = torch.LongTensor([config.input_ws.transform(_input,max_len=config.chatbot_input_max_len)]).to(config.device) outputs,predict = model.evaluate(_input,input_len)
result = config.target_ws.inverse_transform(predict[0])
print("chatbot>>:",result) if __name__ == '__main__':
interface()

  

pytorch seq2seq闲聊机器人的更多相关文章

  1. pytorch seq2seq闲聊机器人beam search返回结果

    decoder.py """ 实现解码器 """ import heapq import torch.nn as nn import con ...

  2. pytorch seq2seq闲聊机器人加入attention机制

    attention.py """ 实现attention """ import torch import torch.nn as nn im ...

  3. pytorch seq2seq模型示例

    以下代码可以让你更加熟悉seq2seq模型机制 """ test """ import numpy as np import torch i ...

  4. pytorch seq2seq模型中加入teacher_forcing机制

    在循环内加的teacher forcing机制,这种为目标确定的时候,可以这样加. 目标不确定,需要在循环外加. decoder.py 中的修改 """ 实现解码器 &q ...

  5. pytorch seq2seq模型训练测试

    num_sequence.py """ 数字序列化方法 """ class NumSequence: """ ...

  6. [实现] 利用 Seq2Seq 预测句子后续字词 (Pytorch)2

    最近有个任务:利用 RNN 进行句子补全,即给定一个不完整的句子,预测其后续的字词.本文使用了 Seq2Seq 模型,输入为 5 个中文字词,输出为 1 个中文字词.目录 关于RNN 语料预处理 搭建 ...

  7. [实现] 利用 Seq2Seq 预测句子后续字词 (Pytorch)

    最近有个任务:利用 RNN 进行句子补全,即给定一个不完整的句子,预测其后续的字词.本文使用了 Seq2Seq 模型,输入为5个中文字词,输出为一个中文字词. 目录 关于RNN 语料预处理 搭建数据集 ...

  8. Pytorch学习记录-torchtext和Pytorch的实例( 使用神经网络训练Seq2Seq代码)

    Pytorch学习记录-torchtext和Pytorch的实例1 0. PyTorch Seq2Seq项目介绍 1. 使用神经网络训练Seq2Seq 1.1 简介,对论文中公式的解读 1.2 数据预 ...

  9. seq2seq里的数学

    seq2seq模型详解 原创 2017年12月25日 09:41:04 标签: seq2seq / 自然语言 / 机器人   在李纪为博士的毕业论文中提到,基于生成的闲聊机器人中,seq2seq是一种 ...

随机推荐

  1. 树莓派扩展usb wifi-EPU-N8508GS

    树莓派zero 扩展USB WIFI EPU-N8508GS 指令: sudo lsusb 终端显示如下,其中显示RTL8188CUS信息,说明系统已经成功识别到wifi模块 Bus 001 Devi ...

  2. B【USACO 2015 Jan Gold】牧草鉴赏家

    时间限制 : 10000 MS   空间限制 : 65536 KB 问题描述 约翰有n块草场,编号1到n,这些草场由若干条单行道相连.奶牛贝西是美味牧草的鉴赏家,她想到达尽可能多的草场去品尝牧草. 贝 ...

  3. 1+X Web前端开发(中级)理论考试样题(附答案)

    传送门 教育部:职业教育将启动"1+X"证书制度改革 职业教育改革1+X证书制度试点启动 1+X成绩/证书查询入口 一.单选题(每小题2分,共30小题,共 60 分) 1.在Boo ...

  4. jenkins集成sonarQube实现代码质量检查

    1.sonarQube的简介 SonarQube是一款自动化代码审查工具,用于检测代码中的错误.漏洞和代码异味.它可以与你现有的工作流集成,以支持跨项目分支和拉取请求的连续代码检查. 其工作流程如下: ...

  5. SSM项目依赖,spring5.1.9

    SSM项目依赖,spring5.1.9 简单的SSM项目,未加AOP等等操作 Spring三包bean core content Mybatis 五个mybatis mybatis-spring jd ...

  6. 《Three.js 入门指南》3.1.1 - 基本几何形状 - 球体(SphereGeometry)

    3.1 基本几何形状 球体(SphereGeometry) 构造函数: THREE.SphereGeometry(radius, segmentsWidth, segmentsHeight, phiS ...

  7. vue引入echart Error in mounted hook: "ReferenceError: myChart is not defined" found in

    解决办法: // 实例化echarts对象 var/let myChart = echarts.init(this.$refs.myChart)

  8. Java学习笔记--Comparable & Comparator

    一.    Comparable<T>: Comparable是类内部的比较器,用于创建类的时候实现此接口,同时实现比较方法:对于不能修改源码的类则无法应用此方式进行比较排序等. 源码为: ...

  9. Y分形的平面微带天线生成过程

    Y分形的平面微带天线生成过程 本文介绍了使用Altium Designer脚本程序生成Y型天线的过程,在窗体中线宽.迭代次数,边框长度可以直接设置. Y分形天线用户界面由一个窗体.1个TImage控件 ...

  10. Golang入门(1):安装与配置环境变量的意义

    摘要 在几年前学习Java的时候,环境的配置就会劝退一部分的初学者.而对于Golang来说,也需要从环境的配置开始学起.这一篇文章将从如何安装Golang开始讲起,随后将会提到Golang中的环境变量 ...