BERT模型源码解析

modeling.py

目录

属性

class BertConfig(object)   BERT模型配置参数类

class BertModel(object)   BERT模型类

函数

def gelu(x)  格鲁激活函数

def get_activation(activation_string) 通过名称获取激活函数

def get_assignment_map_from_checkpoint 读取检查点函数

def dropout(input_tensor, dropout_prob) 丢弃函数,按一定比例丢弃权重数据

def layer_norm(input_tensor, name=None) 数据标准化

def layer_norm_and_dropout 先标准化,再丢弃

def create_initializer(initializer_range=0.02) 数据初始化

def embedding_lookup 嵌入查找函数

def embedding_postprocessor 嵌入处理函数

def create_attention_mask_from_input_mask 创建注意力掩码

def attention_layer 注意力层 处理函数

def transformer_model    transformer模型

def get_shape_list 获取张量的形状参数列表

def reshape_to_matrix(input_tensor) 将张量转换为二维矩阵

def reshape_from_matrix(output_tensor, orig_shape_list) 将二维张量转换为指定维数

def assert_rank(tensor, expected_rank, name=None) 断言 张量的维数

源码

许可信息

# coding=utf-8 编码使用utf-8

# Copyright 2018 The Google AI Language Team Authors. 版权归谷歌 AI 语言团队的作者所有

#

# Licensed under the Apache License, Version 2.0 (the "License");根据Apache许可证进行许可

# you may not use this file except in compliance with the License.

如不符合许可证的规定,则不可使用本文件

# You may obtain a copy of the License at 可以通过下面的网址获取许可证副本

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

"""The main BERT model and related functions."""

导入依赖

from __future__ import absolute_import

from __future__ import division

from __future__ import print_function

import collections

import copy

import json

import math

import re

import numpy as np

import six

import tensorflow as tf

模型配置

构造函数

参数说明

class BertConfig(object):

"""Configuration for `BertModel`."""对BERT模型进行参数配置

def __init__(self,

vocab_size,

hidden_size=768,

num_hidden_layers=12,

num_attention_heads=12,

intermediate_size=3072,

hidden_act="gelu",

hidden_dropout_prob=0.1,

attention_probs_dropout_prob=0.1,

max_position_embeddings=512,

type_vocab_size=16,

initializer_range=0.02):

"""Constructs BertConfig.构造函数

Args:参数说明

vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.

词表大小,即 `input_ids` 中 id 的取值个数

hidden_size: Size of the encoder layers and the pooler layer.

编码层和池化层的大小

num_hidden_layers: Number of hidden layers in the Transformer encoder.

Transformer 编码器中隐藏层个数

num_attention_heads: Number of attention heads for each attention layer in

the Transformer encoder.

Transformer 编码器中每个注意层的头数

intermediate_size: The size of the "intermediate" (i.e., feed-forward)

layer in the Transformer encoder.

Transformer 编码器中中间层(前馈层)的大小

hidden_act: The non-linear activation function (function or string) in the

encoder and pooler.

编码器和池化器的激活函数

hidden_dropout_prob: The dropout probability for all fully connected

layers in the embeddings, encoder, and pooler.

丢弃概率(嵌入层、编码层、池化层)

attention_probs_dropout_prob: The dropout ratio for the attention

probabilities.

注意力概率的丢弃比例

max_position_embeddings: The maximum sequence length that this model might

ever be used with. Typically set this to something large just in case

(e.g., 512 or 1024 or 2048).

最大序列长度,一般设置大一些以防万一,例如可以设置为512,1024,2048

type_vocab_size: The vocabulary size of the `token_type_ids` passed into

`BertModel`.

token_type_ids的词汇量

initializer_range: The stdev of the truncated_normal_initializer for

initializing all weight matrices.

初始化权重参数的标准差

"""

self.vocab_size = vocab_size

self.hidden_size = hidden_size

self.num_hidden_layers = num_hidden_layers

self.num_attention_heads = num_attention_heads

self.hidden_act = hidden_act

self.intermediate_size = intermediate_size

self.hidden_dropout_prob = hidden_dropout_prob

self.attention_probs_dropout_prob = attention_probs_dropout_prob

self.max_position_embeddings = max_position_embeddings

self.type_vocab_size = type_vocab_size

self.initializer_range = initializer_range

@classmethod 类方法

def from_dict(cls, json_object):

"""Constructs a `BertConfig` from a Python dictionary of parameters."""

从一个参数字典构造配置参数

config = BertConfig(vocab_size=None)

for (key, value) in six.iteritems(json_object):

config.__dict__[key] = value

return config

@classmethod

def from_json_file(cls, json_file): 从JSON文件构造BertConfig对象

"""Constructs a `BertConfig` from a json file of parameters."""

从一个JSON文件构造配置参数

with tf.gfile.GFile(json_file, "r") as reader:

text = reader.read()

return cls.from_dict(json.loads(text))

def to_dict(self): BertConfig对象转换为字典

"""Serializes this instance to a Python dictionary."""

output = copy.deepcopy(self.__dict__)

return output

def to_json_string(self):  BertConfig对象转换为JSON格式字符串

"""Serializes this instance to a JSON string."""

return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
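补充一个最小示意(非源码内容,假设 modeling.py 已在可导入路径下,参数值仅为示例),演示 BertConfig 在字典与 JSON 之间的互转:

```python
# 示意:构造 BertConfig,并在字典与 JSON 之间互相转换(参数值仅为示例)
import modeling

config = modeling.BertConfig(vocab_size=32000, hidden_size=256,
                             num_hidden_layers=4, num_attention_heads=4,
                             intermediate_size=1024)
config_dict = config.to_dict()                            # 序列化为 Python 字典
same_config = modeling.BertConfig.from_dict(config_dict)  # 从字典还原
print(same_config.to_json_string())                       # 序列化为 JSON 字符串
```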

BERT模型类

BertModel

class BertModel(object):

  """BERT model ("Bidirectional Encoder Representations from Transformers").

来自 Transformer 的双向编码器表示

Example usage:示例

```python

已经转换为 词片 id形式

# Already been converted into WordPiece token ids

 tf.constant用于创建常量张量

input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])

input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])

token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

创建配置参数对象config

config = modeling.BertConfig(vocab_size=32000, hidden_size=512,

num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

创建模型对象model

model = modeling.BertModel(config=config, is_training=True,

input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

嵌入层标签、池化输出

label_embeddings = tf.get_variable(...)

pooled_output = model.get_pooled_output()

tf.matmul=matrix multiply矩阵相乘

logits = tf.matmul(pooled_output, label_embeddings)

...

```

"""

def __init__(self,

config,

is_training,

input_ids,

input_mask=None,

token_type_ids=None,

use_one_hot_embeddings=False,

scope=None):

"""Constructor for BertModel.

Args:

config: `BertConfig` instance.配置参数对象

is_training: bool. true for training model, false for eval model. Controls

whether dropout will be applied.是否进行训练

input_ids: int32 Tensor of shape [batch_size, seq_length].输入维度

input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].输入掩码

token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].令牌类型

use_one_hot_embeddings: (optional) bool. Whether to use one-hot word

embeddings or tf.embedding_lookup() for the word embeddings.

嵌入层:是否使用one-hot词嵌入

scope: (optional) variable scope. Defaults to "bert".

可选变量,默认值"bert".

Raises:

ValueError: The config is invalid or one of the input tensor shapes

is invalid.

异常:值错误,配置参数无效或者张量形状无效

"""

config = copy.deepcopy(config) 配置参数对象深度克隆

if not is_training: 如果不是训练模式

config.hidden_dropout_prob = 0.0  丢弃比例设置为0,表示不丢弃任何参数

config.attention_probs_dropout_prob = 0.0 丢弃比例设置为0,表示不丢弃任何参数

获取输入形状

input_shape = get_shape_list(input_ids, expected_rank=2)

batch_size = input_shape[0]  批处理量,每一批处理的数据条数

seq_length = input_shape[1]  序列长度

if input_mask is None:  如果没有输入掩码,则将掩码全部设置为1

input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

if token_type_ids is None:  如果没有令牌,则将令牌全部设置为0

token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

variable_scope:变量作用域,也是一个上下文管理器,用于变量共享;

在该作用域内创建的变量都会带上相应的前缀,并由它统一管理

with tf.variable_scope(scope, default_name="bert"):

with tf.variable_scope("embeddings"):

# Perform embedding lookup on the word ids.对单词id执行嵌入查找。

(self.embedding_output, self.embedding_table) = embedding_lookup(

input_ids=input_ids,

vocab_size=config.vocab_size,

embedding_size=config.hidden_size,

initializer_range=config.initializer_range,

word_embedding_name="word_embeddings",

use_one_hot_embeddings=use_one_hot_embeddings)

添加位置嵌入、令牌嵌入,然后标准化并执行丢弃

# Add positional embeddings and token type embeddings, then layer

# normalize and perform dropout.

embedding_postprocessor对单词嵌入张量执行各种后处理。

self.embedding_output = embedding_postprocessor(

input_tensor=self.embedding_output,

use_token_type=True,

token_type_ids=token_type_ids,

token_type_vocab_size=config.type_vocab_size,

token_type_embedding_name="token_type_embeddings",

use_position_embeddings=True,

position_embedding_name="position_embeddings",

initializer_range=config.initializer_range,

max_position_embeddings=config.max_position_embeddings,

dropout_prob=config.hidden_dropout_prob)

with tf.variable_scope("encoder"):

将2维掩码转换成3维,用于注意力评分

# This converts a 2D mask of shape [batch_size, seq_length] to a 3D

# mask of shape [batch_size, seq_length, seq_length] which is used

# for the attention scores.

attention_mask = create_attention_mask_from_input_mask(

input_ids, input_mask)

# Run the stacked transformer.  运行堆叠的transformer模型

# `sequence_output` shape = [batch_size, seq_length, hidden_size].

 创建transformer_model对象

self.all_encoder_layers = transformer_model(

input_tensor=self.embedding_output,

attention_mask=attention_mask,

hidden_size=config.hidden_size,

num_hidden_layers=config.num_hidden_layers,

num_attention_heads=config.num_attention_heads,

intermediate_size=config.intermediate_size,

intermediate_act_fn=get_activation(config.hidden_act),

hidden_dropout_prob=config.hidden_dropout_prob,

attention_probs_dropout_prob=config.attention_probs_dropout_prob,

initializer_range=config.initializer_range,

do_return_all_layers=True)

 [-1]表示倒数第一项

self.sequence_output = self.all_encoder_layers[-1]

# The "pooler" converts the encoded sequence tensor of shape

# [batch_size, seq_length, hidden_size] to a tensor of shape

# [batch_size, hidden_size].

pooler 将编码张量的形状从 3 维 [batch_size, seq_length, hidden_size] 变成 2 维 [batch_size, hidden_size]

# This is necessary for segment-level

# (or segment-pair-level) classification tasks where we need a fixed

# dimensional representation of the segment.

在句子级(或句子对级)分类任务中,这种转换是必要的,因为我们需要一个固定维度的句子表示

with tf.variable_scope("pooler"):

# We "pool" the model by simply taking the hidden state corresponding to the first token.

通过获取和第一个令牌一致的隐藏状态,我们池化了模型

# We assume that this has been pre-trained

假定模型已经预训练好了

 tf.squeeze从张量的形状中去除大小为1的维数

squeeze英 [skwiːz]  美 [skwiːz]v. 挤压,捏;

first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)

self.pooled_output = tf.layers.dense(

first_token_tensor, 符号张量输入到密集层

config.hidden_size, 隐藏层的大小

activation=tf.tanh, 激活函数:双曲正切 tanh

kernel_initializer=create_initializer(config.initializer_range))

#构造函数结束

def get_pooled_output(self):  获取池化输出

return self.pooled_output

def get_sequence_output(self):   获取序列输出

"""Gets final hidden layer of encoder.  获取编码后的隐藏层

Returns: 返回一个浮点张量,对应 transformer 编码器最后一个隐藏层

float Tensor of shape [batch_size, seq_length, hidden_size] corresponding

to the final hidden of the transformer encoder.

"""

return self.sequence_output

def get_all_encoder_layers(self):  获取所有编码层

return self.all_encoder_layers

def get_embedding_output(self):  获取嵌入层的输出

"""Gets output of the embedding lookup (i.e., input to the transformer).

获取嵌入查找的结果(即 transformer 的输入)

Returns: 返回一个浮点型张量,和嵌入层一致的

将位置嵌入和类型嵌入数据统统相加求和,然后再标准化

这就是transformer的输入

float Tensor of shape [batch_size, seq_length, hidden_size] corresponding

to the output of the embedding layer, after summing the word

embeddings with the positional embeddings and the token type embeddings,

then performing layer normalization. This is the input to the transformer.

"""

return self.embedding_output

def get_embedding_table(self):  获取嵌入表

return self.embedding_table

格鲁激活

■格鲁激活函数

def gelu(x):

"""Gaussian Error Linear Unit.  高斯误差线性单元

This is a smoother version of the RELU.   gelu是relu的平滑版

Original paper: https://arxiv.org/abs/1606.08415

Args:  x是将被激活的张量

x: float Tensor to perform activation.

Returns: 返回值是激活后的张量

`x` with the GELU activation applied.

"""    tf.tanh 反正切函数

cdf = 0.5 * (1.0 + tf.tanh(

(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))

return x * cdf
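下面是一个最小示意(非源码内容,假设在 TensorFlow 1.x 环境下、gelu 可从 modeling 导入),验证 gelu 在几个点上的取值:

```python
# 示意:对一个小张量应用 gelu 激活
import tensorflow as tf
from modeling import gelu   # 假设 modeling.py 可导入

x = tf.constant([-1.0, 0.0, 1.0])
y = gelu(x)
with tf.Session() as sess:
    print(sess.run(y))   # 约为 [-0.159, 0.0, 0.841]
```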

获取

激活函数

■通过字符串(函数名称)获取激活函数

def get_activation(activation_string):

"""Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

创建一个字符串到激活函数的映射关系

Args:   输入参数:激活函数名

activation_string: String name of the activation function.

返回值:相应的激活函数。

如果输入的字符串为None、 empty或者"linear",就会返回None。

如果输入参数不是字符串类型,就会返回 `activation_string`

Returns:

A Python function corresponding to the activation function. If

`activation_string` is None, empty, or "linear", this will return None.

If `activation_string` is not a string, it will return `activation_string`.

Raises: 异常:如果字符串无法匹配任何一个激活函数

ValueError: The `activation_string` does not correspond to a known

activation.

"""

如果入参不是字符串,就直接返回去

# We assume that anything that"s not a string is already an activation

# function, so we just return it.

if not isinstance(activation_string, six.string_types):

return activation_string

if not activation_string: 如果字符串为None或者empty,则返回None

return None

将入参字符串转换为小写

act = activation_string.lower()

if act == "linear":

return None

elif act == "relu":   热卤激活函数

return tf.nn.relu

elif act == "gelu":   格鲁激活函数

return gelu

elif act == "tanh":   反正切激活函数

return tf.tanh

else:             触发异常

raise ValueError("Unsupported activation: %s" % act)
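一个简单的使用示意(非源码内容,假设 get_activation 可从 modeling 导入):

```python
# 示意:按名称获取激活函数
from modeling import get_activation   # 假设 modeling.py 可导入

print(get_activation("gelu"))    # 返回 gelu 函数
print(get_activation("relu"))    # 返回 tf.nn.relu
print(get_activation("linear"))  # 返回 None
```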

读取

检查点

■从检查点获取任务映射

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):

"""Compute the union of the current variables and checkpoint variables."""

获取当前变量和检查点变量

assignment_map = {}

initialized_variable_names = {}

 OrderedDict的 Key 会按照插入的顺序排列,不是Key本身排序

name_to_variable = collections.OrderedDict()

for var in tvars:

name = var.name

m = re.match("^(.*):\\d+$", name)

if m is not None:

name = m.group(1)

name_to_variable[name] = var

init_vars = tf.train.list_variables(init_checkpoint)

assignment_map = collections.OrderedDict()

for x in init_vars:

(name, var) = (x[0], x[1])

if name not in name_to_variable:

continue

assignment_map[name] = name

initialized_variable_names[name] = 1

initialized_variable_names[name + ":0"] = 1

return (assignment_map, initialized_variable_names)

丢弃

标准化

初始化

■丢弃函数

def dropout(input_tensor, dropout_prob):

"""Perform dropout. 进行丢弃

Args:  参数

input_tensor: float Tensor. 输入的张量

dropout_prob: Python float. The probability of dropping out a value (NOT of

*keeping* a dimension as in `tf.nn.dropout`).  丢弃某个值的概率

Returns: 返回值:丢弃部分数据后的张量

A version of `input_tensor` with dropout applied.

"""  如果丢弃概率为None或者为0,则原封不动的返回

if dropout_prob is None or dropout_prob == 0.0:

return input_tensor

1、tf.nn.dropout 中参数 keep_prob :每一个元素被保留下来的概率。

2、tf.layers.dropout 中参数 rate :每一个元素被丢弃的概率,keep_prob = 1 - rate。

tf.nn.dropout 的签名:def dropout(x, keep_prob, noise_shape=None, seed=None, name=None)

要么保留,要么丢弃,所以 keep_prob + dropout_prob = 1

output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)

return output
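下面用一个最小示意(非源码内容,假设 TensorFlow 1.x 环境、dropout 可从 modeling 导入)说明 dropout_prob 与 keep_prob 的关系:

```python
# 示意:dropout(x, 0.1) 等价于 tf.nn.dropout(x, keep_prob=0.9)
import tensorflow as tf
from modeling import dropout   # 假设 modeling.py 可导入

x = tf.ones([2, 4])
out = dropout(x, dropout_prob=0.1)
with tf.Session() as sess:
    print(sess.run(out))   # 约 90% 的元素被保留,且保留值被放大为 1/0.9
```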

■数据标准化

def layer_norm(input_tensor, name=None):

"""Run layer normalization on the last dimension of the tensor."""

return tf.contrib.layers.layer_norm(

inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

■(2合1函数)先标准化,再丢弃,然后返回

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):

"""Runs layer normalization followed by dropout."""

output_tensor = layer_norm(input_tensor, name)

output_tensor = dropout(output_tensor, dropout_prob)

return output_tensor

■初始化权重参数

def create_initializer(initializer_range=0.02):  initializer_range初始化范围,就是标准差stddev

"""Creates a `truncated_normal_initializer` with the given range."""

截断正态分布初始化;这是神经网络权重和过滤器的推荐初始化方式。

return tf.truncated_normal_initializer(stddev=initializer_range)

tf.truncated_normal_initializer的意思是:从截断的正态分布中输出随机值。

生成的值服从具有指定平均值和标准偏差的正态分布,

如果生成的值大于平均值2个标准偏差的值则丢弃重新选择。

嵌入查找

■通过词查找对应的嵌入张量

def embedding_lookup(input_ids,

vocab_size,

embedding_size=128,

initializer_range=0.02,

word_embedding_name="word_embeddings",

use_one_hot_embeddings=False):

"""Looks up words embeddings for id tensor.

Args: 入参

input_ids: int32 Tensor of shape [batch_size, seq_length] containing word

ids.包含词的id的整型张量

vocab_size: int. Size of the embedding vocabulary.嵌入词典的大小

embedding_size: int. Width of the word embeddings. 词嵌入的大小

initializer_range: float. Embedding initialization range.权重参数初始化的标准差

word_embedding_name: string. Name of the embedding table.词嵌入名称

use_one_hot_embeddings: bool. If True, use one-hot method for word

embeddings. If False, use `tf.gather()`. 是否使用onehot码

Returns: 返回一个张量

float Tensor of shape [batch_size, seq_length, embedding_size].

""" 假定输入数据形状为 [batch_size, seq_length, num_inputs]

# This function assumes that the input is of shape [batch_size, seq_length,

# num_inputs].

#  如果输入是2D张量,则必须变形为3维张量,增加第三维,并且第三维的大小为1

# If the input is a 2D tensor of shape [batch_size, seq_length], we

# reshape to [batch_size, seq_length, 1].

if input_ids.shape.ndims == 2: 如果输入是2维,则扩张维度tf.expand_dims

input_ids = tf.expand_dims(input_ids, axis=[-1])

嵌入表格

embedding_table = tf.get_variable(

name=word_embedding_name,

shape=[vocab_size, embedding_size],

initializer=create_initializer(initializer_range))

平坦化,降维成1维

哪一维使用了-1,那这一维度就不定义大小,而是根据你的数据情况进行匹配。

即先不管-1的那一个维度,先看其他维度,然后用原矩阵的总元素个数除以确定的维度,就能得到-1维度的值。

不过要注意:但列表中只能存在一个-1。

flat_input_ids = tf.reshape(input_ids, [-1])

if use_one_hot_embeddings:

tf.one_hot()函数是将input转化为one-hot类型数据输出

one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)

one-hot 矩阵与嵌入表相乘,等价于按 id 查表,得到对应的词向量

output = tf.matmul(one_hot_input_ids, embedding_table)

else:

output = tf.gather(embedding_table, flat_input_ids)

input_shape = get_shape_list(input_ids)

张量变形

output = tf.reshape(output,

input_shape[0:-1] + [input_shape[-1] * embedding_size])

return (output, embedding_table)
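补充一个最小示意(非源码内容,假设 TensorFlow 1.x 环境),说明 one-hot 乘嵌入表与 tf.gather 查表是等价的:

```python
# 示意:one-hot 矩阵乘法 与 tf.gather 得到相同的词向量
import tensorflow as tf

embedding_table = tf.constant([[0.1, 0.2],
                               [0.3, 0.4],
                               [0.5, 0.6]])      # vocab_size=3, embedding_size=2
ids = tf.constant([2, 0, 1])

one_hot_ids = tf.one_hot(ids, depth=3)            # [3, 3]
by_matmul = tf.matmul(one_hot_ids, embedding_table)
by_gather = tf.gather(embedding_table, ids)

with tf.Session() as sess:
    print(sess.run(by_matmul))   # 与下一行输出相同
    print(sess.run(by_gather))
```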

嵌入

后处理

■嵌入 后处理

def embedding_postprocessor(input_tensor,

use_token_type=False,

token_type_ids=None,

token_type_vocab_size=16,

token_type_embedding_name="token_type_embeddings",

use_position_embeddings=True,

position_embedding_name="position_embeddings",

initializer_range=0.02,

max_position_embeddings=512,

dropout_prob=0.1):

"""Performs various post-processing on a word embedding tensor.

对词嵌入张量进行各种处理

Args: 入参 输入的张量

input_tensor: float Tensor of shape [batch_size, seq_length,

embedding_size].

是否使用类型令牌

use_token_type: bool. Whether to add embeddings for `token_type_ids`.

类型令牌的id,如果要使用类型令牌,那么该参数必须指定

token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

Must be specified if `use_token_type` is True.

类型令牌 大小

token_type_vocab_size: int. The vocabulary size of `token_type_ids`.

类型令牌的名称

token_type_embedding_name: string. The name of the embedding table variable

for token type ids.

是否使用位置嵌入

use_position_embeddings: bool. Whether to add position embeddings for the

position of each token in the sequence.

位置嵌入表的名称

position_embedding_name: string. The name of the embedding table variable

for positional embeddings.

标准差stdev,也就是参数的范围,用于权重参数的初始化

initializer_range: float. Range of the weight initialization.

位置嵌入的最大长度,可以大于输入序列的长度,但是不能小于

max_position_embeddings: int. Maximum sequence length that might ever be

used with this model. This can be longer than the sequence length of

input_tensor, but cannot be shorter.

丢弃率=1-保留率

dropout_prob: float. Dropout probability applied to the final output tensor.

Returns: 返回值:和输入张量形状相同的另一个张量

float tensor with same shape as `input_tensor`.

Raises: 异常:张量形状或者输入值无效

ValueError: One of the tensor shapes or input values is invalid.

"""

input_shape = get_shape_list(input_tensor, expected_rank=3) 获取形状列表

batch_size = input_shape[0]

seq_length = input_shape[1]

width = input_shape[2]

output = input_tensor

类型嵌入■

if use_token_type:

if token_type_ids is None: 如果没有token_type_ids 就触发异常

raise ValueError("`token_type_ids` must be specified if"

"`use_token_type` is True.")

类型表

token_type_table = tf.get_variable(

name=token_type_embedding_name,

shape=[token_type_vocab_size, width],

initializer=create_initializer(initializer_range))

# This vocab will be small so we always do one-hot here, since it is always

# faster for a small vocabulary.

这个词典比较小,所以使用 one-hot,因为更快

flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 平坦化,变成一维的

转换成one_hot格式的id

one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)

one_hot格式乘以一个类型表,则转换为词向量

token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)

token_type_embeddings = tf.reshape(token_type_embeddings,

[batch_size, seq_length, width])

output += token_type_embeddings 将类型数据加进去

位置嵌入■

if use_position_embeddings: 如果使用位置嵌入

断言条件:seq_length <= max_position_embeddings,不满足则报错

assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)

tf.control_dependencies是tensorflow中的一个flow顺序控制机制,作用有二:

插入依赖(dependencies)和清空依赖(依赖是op或tensor)。

with tf.control_dependencies([assert_op]):

get_variable函数的作用是创建新的tensorflow变量,常见的initializer有:常量初始化器tf.constant_initializer、正态分布初始化器tf.random_normal_initializer、截断正态分布初始化器tf.truncated_normal_initializer、均匀分布初始化器tf.random_uniform_initializer。

full_position_embeddings = tf.get_variable(

name=position_embedding_name,

shape=[max_position_embeddings, width],

initializer=create_initializer(initializer_range))

因为位置嵌入表是一个需要学习的变量,所以按(较长的)最大长度 max_position_embeddings 来创建;

实际的序列长度可能小于这个长度,这样对于没有长序列的任务可以加快训练;

# Since the position embedding table is a learned variable, we create it

# using a (long) sequence length `max_position_embeddings`. The actual

# sequence length might be shorter than this, for faster training of

# tasks that do not have long sequences.

所以 full_position_embeddings 实际上是位置 [0, 1, ..., max_position_embeddings-1] 的嵌入表,而当前序列的位置只有 [0, 1, ..., seq_length-1],因此只需做一次切片即可

# So `full_position_embeddings` is effectively an embedding table

# for position [0, 1, 2, ..., max_position_embeddings-1], and the current

# sequence has positions [0, 1, 2, ... seq_length-1], so we can just

# perform a slice.

函数:tf.slice(inputs, begin, size, name)

作用:从列表、数组、张量等对象中抽取一部分数据

position_embeddings = tf.slice(full_position_embeddings, [0, 0],

[seq_length, -1])

num_dims = len(output.shape.as_list()) 维度个数

只有最后两个维度是有意义的,所以我们在第一个维度广播,通常这个维度是 批处理量

# Only the last two dimensions are relevant (`seq_length` and `width`), so

# we broadcast among the first dimensions, which is typically just

# the batch size.

position_broadcast_shape = [] 广播形状

for _ in range(num_dims - 2):

position_broadcast_shape.append(1)

position_broadcast_shape.extend([seq_length, width]) 扩张

position_embeddings = tf.reshape(position_embeddings,

position_broadcast_shape) 变形

output += position_embeddings  将位置数据加进去

output = layer_norm_and_dropout(output, dropout_prob) 标准化和丢弃

return output
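调用方式的最小示意(非源码内容,张量形状与参数均为假设的小值,假设 TensorFlow 1.x 且 embedding_postprocessor 可导入):

```python
# 示意:给词嵌入加上 token type 嵌入和位置嵌入
import tensorflow as tf
from modeling import embedding_postprocessor   # 假设 modeling.py 可导入

embeddings = tf.random_normal([2, 4, 8])                    # [batch, seq, width]
token_type_ids = tf.constant([[0, 0, 1, 1], [0, 0, 0, 0]])  # 句子a记0,句子b记1
out = embedding_postprocessor(input_tensor=embeddings,
                              use_token_type=True,
                              token_type_ids=token_type_ids,
                              token_type_vocab_size=2,
                              max_position_embeddings=8)
print(out.shape)   # (2, 4, 8),与输入形状一致
```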

创建掩码

■从输入掩码创建注意力掩码

def create_attention_mask_from_input_mask(from_tensor, to_mask):

"""Create 3D attention mask from a 2D tensor mask.

 2D掩码创建3D掩码

Args: 入参:输入张量,转换成掩码的张量

from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].

to_mask: int32 Tensor of shape [batch_size, to_seq_length].

Returns:  返回值 浮点值的张量

float Tensor of shape [batch_size, from_seq_length, to_seq_length].

""" 获取入参形状参数

from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])

batch_size = from_shape[0]

from_seq_length = from_shape[1]

获取转换张量的形状

to_shape = get_shape_list(to_mask, expected_rank=2)

to_seq_length = to_shape[1]

先变形,然后转换成float32浮点数

to_mask = tf.cast(

tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

from_tensor不一定是掩码(虽然它可能是)

  我们不太关心(from里面的填充符号),所以创建一个全是1的张量;

# We don't assume that `from_tensor` is a mask (although it could be). We

# don't actually care if we attend *from* padding tokens (only *to* padding)

# tokens so we create a tensor of all ones.

#

# `broadcast_ones` = [batch_size, from_seq_length, 1]

创建全1张量

broadcast_ones = tf.ones(

shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

我们在两个维度上进行广播,从而创建掩码

# Here we broadcast along two dimensions to create the mask.

mask = broadcast_ones * to_mask

return mask
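最小示意(非源码内容,数据为假设值,假设 TensorFlow 1.x),展示 2D 掩码如何广播成 3D 注意力掩码:

```python
# 示意:2D 输入掩码 -> 3D 注意力掩码
import tensorflow as tf
from modeling import create_attention_mask_from_input_mask  # 假设可导入

input_ids  = tf.constant([[7, 8, 9], [5, 6, 0]])   # [batch=2, seq=3]
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])   # 第二个样本最后一个位置是填充

mask = create_attention_mask_from_input_mask(input_ids, input_mask)
with tf.Session() as sess:
    print(sess.run(mask))   # 形状 [2, 3, 3];第二个样本中指向填充位置的列全为 0
```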

注意力层

■注意力 层

def attention_layer(from_tensor,

to_tensor,

attention_mask=None,

num_attention_heads=1,

size_per_head=512,

query_act=None,

key_act=None,

value_act=None,

attention_probs_dropout_prob=0.0,

initializer_range=0.02,

do_return_2d_tensor=False,

batch_size=None,

from_seq_length=None,

to_seq_length=None):

"""Performs multi-headed attention from `from_tensor` to `to_tensor`.

多头的注意力

This is an implementation of multi-headed attention

based on "Attention is all you Need".

这是一个多头注意力的实现,注意的才是需要的

如果 from_tensor 和 to_tensor 相同,那么这就是自注意力(self-attention):from_tensor 中的每个时间步都会去注意 to_tensor 中对应的序列,并返回一个固定宽度的向量。

If `from_tensor` and `to_tensor` are the same, then

this is self-attention. Each timestep in `from_tensor` attends to the

corresponding sequence in `to_tensor`, and returns a fixed-with vector.

先将from_tensor投射成query张量,并且将to_tensor投射成key和value张量。

这将产生一系列张量,张量个数=头数,

其中每个张量的形状都是[批处理量,序列长度,头的大小]

This function first projects `from_tensor` into a "query" tensor and

`to_tensor` into "key" and "value" tensors. These are (effectively) a list

of tensors of length `num_attention_heads`, where each tensor is of shape

[batch_size, seq_length, size_per_head].

query 张量和 key 张量先做点积,再按比例缩放(除以 sqrt(每头的大小))。

通过 softmax 运算得到注意力概率。

value 张量再按这些概率加权(插值),最后拼接回一个张量并返回。

Then, the query and key tensors are dot-producted and scaled. These are

softmaxed to obtain attention probabilities. The value tensors are then

interpolated by these probabilities, then concatenated back to a single

tensor and returned.

实际实现中,多头注意力是通过转置和变形来完成的,而不是把每个头拆成独立的张量来运算。

In practice, the multi-headed attention are done with transposes and

reshapes rather than actual separate tensors.

Args: 入参,输入张量,输出张量

from_tensor: float Tensor of shape [batch_size, from_seq_length,

from_width].

to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].

注意力掩码

attention_mask: (optional) int32 Tensor of shape [batch_size,

from_seq_length, to_seq_length]. The values should be 1 or 0. The

attention scores will effectively be set to -infinity for any positions in

the mask that are 0, and will be unchanged for positions that are 1.

注意力头数

num_attention_heads: int. Number of attention heads.

每个头的大小

size_per_head: int. Size of each attention head.

query变形的激活函数

key 变形的激活函数

value 变形的激活函数

query_act: (optional) Activation function for the query transform.

key_act: (optional) Activation function for the key transform.

value_act: (optional) Activation function for the value transform.

注意力数据的 丢弃比例

attention_probs_dropout_prob: (optional) float. Dropout probability of the

attention probabilities.

标准差,数据初始化的范围(截断的正态分布)

initializer_range: float. Range of the weight initializer.

是否返回2d张量

do_return_2d_tensor: bool. If True, the output will be of shape [batch_size

* from_seq_length, num_attention_heads * size_per_head]. If False, the

output will be of shape [batch_size, from_seq_length, num_attention_heads

* size_per_head].

批处理量,输入序列长度,输出序列长度

batch_size: (Optional) int. If the input is 2D, this might be the batch size

of the 3D version of the `from_tensor` and `to_tensor`.

from_seq_length: (Optional) If the input is 2D, this might be the seq length

of the 3D version of the `from_tensor`.

to_seq_length: (Optional) If the input is 2D, this might be the seq length

of the 3D version of the `to_tensor`.

Returns: 返回值 浮点值的张量

float Tensor of shape [batch_size, from_seq_length,

num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is

true, this will be of shape [batch_size * from_seq_length,

num_attention_heads * size_per_head]).

Raises: 异常:参数无效,或者张量形状无效

ValueError: Any of the arguments or tensor shapes are invalid.

"""

■变形+转置→为了获取得分

def transpose_for_scores(input_tensor, batch_size, num_attention_heads,

seq_length, width):

output_tensor = tf.reshape(

input_tensor, [batch_size, seq_length, num_attention_heads, width])

tf.transpose 的 perm 参数指定各维度的新顺序;

这里 perm=[0, 2, 1, 3] 表示交换第 1 维和第 2 维,即把 [batch, seq, heads, width] 转置为 [batch, heads, seq, width]。

output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])

return output_tensor

获取形状

from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])

to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

if len(from_shape) != len(to_shape):

raise ValueError(

"The rank of `from_tensor` must match the rank of `to_tensor`.")

if len(from_shape) == 3: 三维张量

batch_size = from_shape[0]

from_seq_length = from_shape[1]

to_seq_length = to_shape[1]

elif len(from_shape) == 2: 二维张量

if (batch_size is None or from_seq_length is None or to_seq_length is None):

raise ValueError(

"When passing in rank 2 tensors to attention_layer, the values "

"for `batch_size`, `from_seq_length`, and `to_seq_length` "

"must all be specified.")

引用的 维度

# Scalar dimensions referenced here: 标量维度

#   B = batch size (number of sequences)  B批处理量

#   F = `from_tensor` sequence length  F输入张量的序列长度

#   T = `to_tensor` sequence length   T输出张量的序列长度

#   N = `num_attention_heads`     N注意力头数

#   H = `size_per_head`    H每个头的大小

from_tensor_2d = reshape_to_matrix(from_tensor) 转换为二维矩阵

to_tensor_2d = reshape_to_matrix(to_tensor) 转换为二维矩阵

# `query_layer` = [B*F, N*H]  询问层=[批处理量*输入长度,头数*每头的大小]

query_layer = tf.layers.dense(  创建一个全连接层,密集层

from_tensor_2d,

num_attention_heads * size_per_head,

activation=query_act,

name="query",

kernel_initializer=create_initializer(initializer_range))

# `key_layer` = [B*T, N*H]  关键层=[批处理量*输出长度,头数*每头的大小]

key_layer = tf.layers.dense( 创建一个全连接层,密集层

to_tensor_2d,

num_attention_heads * size_per_head,

activation=key_act,

name="key",

kernel_initializer=create_initializer(initializer_range))

# `value_layer` = [B*T, N*H]  数值层=[批处理量*输出长度,头数*每头的大小]

value_layer = tf.layers.dense( 创建一个全连接层,密集层

to_tensor_2d,

num_attention_heads * size_per_head,

activation=value_act,

name="value",

kernel_initializer=create_initializer(initializer_range))

# `query_layer` = [B, N, F, H] 变形+转置→为了获取得分

query_layer = transpose_for_scores(query_layer, batch_size,

num_attention_heads, from_seq_length,

size_per_head)

# `key_layer` = [B, N, T, H]  变形+转置→为了获取得分

key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,

to_seq_length, size_per_head)

# Take the dot product between "query" and "key" to get the raw

# attention scores.

query 和 key 进行点乘,得到原始注意力得分

# `attention_scores` = [B, N, F, T]

attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)

attention_scores = tf.multiply(attention_scores,

1.0 / math.sqrt(float(size_per_head)))

if attention_mask is not None: 如果注意力掩码非空

# `attention_mask` = [B, 1, F, T]

attention_mask = tf.expand_dims(attention_mask, axis=[1])  扩张

对于关心的位置掩码为1,其他位置掩码为0,

这将创建一个新的矩阵,关心的位置为0,掩码位置为-10000

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for

# masked positions, this operation will create a tensor which is 0.0 for

# positions we want to attend and -10000.0 for masked positions.

adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

# Since we are adding it to the raw scores before the softmax, this is

# effectively the same as removing these entirely.

attention_scores += adder

在进行softmax计算前,将其加到得分里面,相当于完全删除这些内容

将注意力分数值标准化,然后就变成了概率值(归一化)

# Normalize the attention scores to probabilities.

# `attention_probs` = [B, N, F, T]  probs=probabilities概率值

attention_probs = tf.nn.softmax(attention_scores)

这里实际上会把一些被注意的 token 整体丢弃,看起来有点不寻常,

但这是沿用原始 Transformer 论文的做法

# This is actually dropping out entire tokens to attend to, which might

# seem a bit unusual, but is taken from the original Transformer paper.

attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

进行部分数据的丢弃,防止过拟合

# `value_layer` = [B, T, N, H]

value_layer = tf.reshape(

value_layer,

[batch_size, to_seq_length, num_attention_heads, size_per_head])

# `value_layer` = [B, N, T, H]  第二维和第三维进行转置

value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

# `context_layer` = [B, N, F, H]

context_layer = tf.matmul(attention_probs, value_layer)

注意力矩阵*值矩阵=上下文矩阵

# `context_layer` = [B, F, N, H] 上下文矩阵转置

context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

if do_return_2d_tensor: 如果要求返回二维矩阵,就将上下文矩阵变形成二维

# `context_layer` = [B*F, N*H]

context_layer = tf.reshape(

context_layer,

[batch_size * from_seq_length, num_attention_heads * size_per_head])

else: 如果不要求返回二维矩阵,就变形为一个三维矩阵

# `context_layer` = [B, F, N*H]

context_layer = tf.reshape(

context_layer,

[batch_size, from_seq_length, num_attention_heads * size_per_head])

return context_layer
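自注意力调用的最小示意(非源码内容,形状为假设值,假设 TensorFlow 1.x 且 attention_layer 可导入):

```python
# 示意:from_tensor 与 to_tensor 相同,即自注意力
import tensorflow as tf
from modeling import attention_layer   # 假设 modeling.py 可导入

from_tensor = tf.random_normal([2, 3, 8])          # [batch, seq, width]
context = attention_layer(from_tensor=from_tensor,
                          to_tensor=from_tensor,   # 自己注意自己
                          num_attention_heads=2,
                          size_per_head=4)
print(context.shape)   # (2, 3, 8) = [batch, from_seq, 头数*每头大小]
```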

transformer模型

transformer模型 函数

def transformer_model(input_tensor,

attention_mask=None,

hidden_size=768,

num_hidden_layers=12,

num_attention_heads=12,

intermediate_size=3072,

intermediate_act_fn=gelu,

hidden_dropout_prob=0.1,

attention_probs_dropout_prob=0.1,

initializer_range=0.02,

do_return_all_layers=False):

"""Multi-headed, multi-layer Transformer from "Attention is All You Need".

多头的,多层的Transformer 模型,基于一个理念“注意的才是需要的”

This is almost an exact implementation of the original Transformer encoder.

这几乎是对原始 Transformer 编码器的精确实现

See the original paper: 论文参照下面的链接

https://arxiv.org/abs/1706.03762

Also see: 也可以参照GitHub

https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

Args: 入参说明:输入张量,隐藏层大小,隐藏层个数,注意力头数

input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].

attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,

seq_length], with 1 for positions that can be attended to and 0 in

positions that should not be.

hidden_size: int. Hidden size of the Transformer.

num_hidden_layers: int. Number of layers (blocks) in the Transformer.

num_attention_heads: int. Number of attention heads in the Transformer.

中间层大小,中间层的激活函数,隐藏层的丢弃比例,注意力概率层的丢弃比例

intermediate_size: int. The size of the "intermediate" (a.k.a., feed

forward) layer.

intermediate_act_fn: function. The non-linear activation function to apply

to the output of the intermediate/feed-forward layer.

hidden_dropout_prob: float. Dropout probability for the hidden layers.

attention_probs_dropout_prob: float. Dropout probability of the attention

probabilities.

截断正态分布的标准差,

也就是权重参数初始化的数值范围(超出该范围的会被截断)

initializer_range: float. Range of the initializer (stddev of truncated

normal).

是否要求返回所有的层,还是返回最后一层

do_return_all_layers: Whether to also return all layers or just the final

layer.

Returns: 返回值,一个张量,Transformer模型中的最后一个隐藏层

float Tensor of shape [batch_size, seq_length, hidden_size], the final

hidden layer of the Transformer.

Raises: 异常 无效的形状或参数值

ValueError: A Tensor shape or parameter is invalid.

"""

if hidden_size % num_attention_heads != 0:

如果隐藏层大小不是注意力头数的整数倍,就触发异常

raise ValueError(

"The hidden size (%d) is not a multiple of the number of attention "

"heads (%d)" % (hidden_size, num_attention_heads))

attention_head_size = int(hidden_size / num_attention_heads)

input_shape = get_shape_list(input_tensor, expected_rank=3)

batch_size = input_shape[0]

seq_length = input_shape[1]

input_width = input_shape[2]

Transformer 会在所有层上做残差相加,所以输入的宽度必须与隐藏层大小相同

# The Transformer performs sum residuals on all layers so the input needs

# to be the same as the hidden size.

if input_width != hidden_size: 参数尺寸不一致,就报错

raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %

(input_width, hidden_size))

我们始终使用2D张量,避免来回的变形;

矩阵变形对于GPU和CPU是很简单的,但是对于TPU就有点麻烦了,

所以要尽量减少这种不必要的变形,从而帮助优化、提高模型效率;

# We keep the representation as a 2D tensor to avoid re-shaping it back and

# forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on

# the GPU/CPU but may not be free on the TPU, so we want to minimize them to

# help the optimizer.

将输入的张量转换为2D矩阵

prev_output = reshape_to_matrix(input_tensor)

all_layer_outputs = [] 定义所有的输出层

variable_scope()是变量作用域,和tf.get_variable()搭配使用;

variable_scope也是一个上下文管理器:

上下文管理器的意思是,在这个管理器下创建的变量都会受它管理(带上作用域前缀);

使用variable_scope主要是为了满足变量共享的需求。

for layer_idx in range(num_hidden_layers): 遍历所有的层

with tf.variable_scope("layer_%d" % layer_idx):

layer_input = prev_output 本层的输入就是上一层的输出

with tf.variable_scope("attention"):

attention_heads = [] 定义注意力头的集合

with tf.variable_scope("self"):

attention_head = attention_layer(  每个注意力头就是一个注意力层

from_tensor=layer_input,   源矩阵和目标矩阵相同,也就是自己注意自己

to_tensor=layer_input,

attention_mask=attention_mask, 注意力掩码

num_attention_heads=num_attention_heads,  头数

size_per_head=attention_head_size,  每头的大小

attention_probs_dropout_prob=attention_probs_dropout_prob, 注意力数据丢弃比例

initializer_range=initializer_range, 数据初始化范围,也就是标准差

do_return_2d_tensor=True,  是否要求返回2D张量

batch_size=batch_size,  批处理量

from_seq_length=seq_length, 源序列长度

to_seq_length=seq_length)  目标序列长度

attention_heads.append(attention_head)  将生成的头【矩阵】添加到集合中

attention_output = None

if len(attention_heads) == 1: 如果只有一头,则输出就是这一头

attention_output = attention_heads[0]

else:  如果有好多头

多头的情况下,我们把它们拼接起来,然后再做投影;

# In the case where we have other sequences, we just concatenate

# them to the self-attention head before the projection.

attention_output = tf.concat(attention_heads, axis=-1)

tf.concat(),tensorflow中用来拼接张量的函数tf.concat(),用法:

axis=0 代表在第0个维度拼接; axis=1 代表在第1个维度拼接

axis=-1表示倒数第一个维度,对于三维矩阵拼接来说,axis=-1等价于axis=2。

对于一个二维矩阵,第0个维度代表最外层方括号所框下的子集,第1个维度代表内部方括号所框下的子集。维度越高,括号越小。

# Run a linear projection of `hidden_size` then add a residual

# with `layer_input`.

对隐藏层尺寸进行线性投影,然后再加上一个残差

with tf.variable_scope("output"):

attention_output = tf.layers.dense(  创建一个全连接层/密集层

attention_output,

hidden_size,

kernel_initializer=create_initializer(initializer_range))

attention_output = dropout(attention_output, hidden_dropout_prob) 丢弃

attention_output = layer_norm(attention_output + layer_input) 标准化

激活函数仅用于中间层

# The activation is only applied to the "intermediate" hidden layer.

with tf.variable_scope("intermediate"):

intermediate_output = tf.layers.dense(  创建一个全连接层/密集层

attention_output,  将上一层的输出,作为本层的输入

intermediate_size, 中间层大小

activation=intermediate_act_fn,

kernel_initializer=create_initializer(initializer_range))

向下投射到隐藏层大小,然后再和残差相加

# Down-project back to `hidden_size` then add the residual.

with tf.variable_scope("output"):

layer_output = tf.layers.dense( 创建密集层,进行矩阵投影运算

intermediate_output,

hidden_size,

kernel_initializer=create_initializer(initializer_range))

layer_output = dropout(layer_output, hidden_dropout_prob) 丢弃

layer_output = layer_norm(layer_output + attention_output) 标准化

prev_output = layer_output

all_layer_outputs.append(layer_output)  将本层的输出添加到列表中

if do_return_all_layers: 如果要求返回所有的层

final_outputs = [] 最终返回值

for layer_output in all_layer_outputs: 遍历所有层

final_output = reshape_from_matrix(layer_output, input_shape) 每个层都进行变形

final_outputs.append(final_output) 添加到返回值中

return final_outputs

else: 如果不要求返回所有层

final_output = reshape_from_matrix(prev_output, input_shape) 变形

return final_output
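完整编码器调用的最小示意(非源码内容,超参数为假设的小值,假设 TensorFlow 1.x 且 transformer_model 可导入):

```python
# 示意:运行一个 2 层、2 头的小型 Transformer 编码器
import tensorflow as tf
from modeling import transformer_model   # 假设 modeling.py 可导入

input_tensor = tf.random_normal([2, 5, 16])   # [batch, seq_length, hidden_size]
output = transformer_model(input_tensor=input_tensor,
                           hidden_size=16,            # 必须能被头数整除
                           num_hidden_layers=2,
                           num_attention_heads=2,
                           intermediate_size=32)
print(output.shape)   # (2, 5, 16),与输入形状一致
```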

相关

辅助函数

def get_shape_list(tensor, expected_rank=None, name=None):

"""Returns a list of the shape of tensor, preferring static dimensions.

返回张量各维度大小组成的列表,优先返回静态维度

Args:  入参:张量,想要得到的秩,名称

tensor: A tf.Tensor object to find the shape of.

expected_rank: (optional) int. The expected rank of `tensor`. If this is

specified and the `tensor` has a different rank, and exception will be

thrown. 如果指定参数与张量的秩不同,则报错

name: Optional name of the tensor for the error message.

Returns: 返回值:张量在各个维度上的大小,构成的一个列表

A list of dimensions of the shape of tensor. All static dimensions will

be returned as python integers, and dynamic dimensions will be returned

as tf.Tensor scalars如果是动态维度,将返回一个标量

"""

if name is None: 如果没有指定名称,就用张量的名称

name = tensor.name

if expected_rank is not None:  如果指定了期望的秩,就检查张量的秩是否符合

assert_rank(tensor, expected_rank, name)

shape = tensor.shape.as_list() 将尺寸参数转换为列表

non_static_indexes = []

for (index, dim) in enumerate(shape):

if dim is None: 维度是None,表示该维度为动态维度dynamic dimension

non_static_indexes.append(index)

if not non_static_indexes: 如果没有 非静态维度(全是静态维度),就直接返回

return shape

dyn_shape = tf.shape(tensor) 包含动态维度的形状

for index in non_static_indexes: 获取所有动态维度

shape[index] = dyn_shape[index]

return shape
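静态维度与动态维度的区别,用一个最小示意说明(非源码内容,假设 TensorFlow 1.x 且 get_shape_list 可导入):

```python
# 示意:静态维度返回 Python 整数,动态维度返回 tf.Tensor 标量
import tensorflow as tf
from modeling import get_shape_list   # 假设 modeling.py 可导入

x = tf.placeholder(tf.float32, shape=[None, 7])   # 第 0 维动态,第 1 维静态
shape = get_shape_list(x, expected_rank=2)
print(shape)   # [<tf.Tensor ...>, 7]
```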

■多维变2维

def reshape_to_matrix(input_tensor): 将张量转换为二维矩阵

"""Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""

ndims = input_tensor.shape.ndims

if ndims < 2: 待转换张量维度小于2,就报错

raise ValueError("Input tensor must have at least rank 2. Shape = %s" %

(input_tensor.shape))

if ndims == 2: 维度恰好2,直接返回

return input_tensor

width = input_tensor.shape[-1] 获取最后一维(倒数第一维)的大小

output_tensor = tf.reshape(input_tensor, [-1, width]) 最后一维不变,前面的其他维度自适应(相乘)

return output_tensor

■二维变多维

def reshape_from_matrix(output_tensor, orig_shape_list):

"""Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""

if len(orig_shape_list) == 2:

return output_tensor

output_shape = get_shape_list(output_tensor)

orig_dims = orig_shape_list[0:-1] 将原始形状 去除最后一维

width = output_shape[-1] 宽度为最后一维的大小

return tf.reshape(output_tensor, orig_dims + [width])
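两个变形函数配合使用的最小示意(非源码内容,形状为假设值,假设 TensorFlow 1.x):

```python
# 示意:三维张量压成二维矩阵,再还原回原来的形状
import tensorflow as tf
from modeling import get_shape_list, reshape_to_matrix, reshape_from_matrix  # 假设可导入

x = tf.random_normal([2, 3, 4])
orig_shape = get_shape_list(x)              # [2, 3, 4]
mat = reshape_to_matrix(x)                  # [6, 4]
restored = reshape_from_matrix(mat, orig_shape)
print(mat.shape, restored.shape)            # (6, 4) (2, 3, 4)
```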

■秩的断言

def assert_rank(tensor, expected_rank, name=None):

"""Raises an exception if the tensor rank is not of the expected rank.

如果对不上,就报错

Args:参数:张量,期望的秩,名称(用于打印报错信息)

tensor: A tf.Tensor to check the rank of.

expected_rank: Python integer or list of integers, expected rank.

name: Optional name of the tensor for the error message.

Raises:

ValueError: If the expected shape doesn't match the actual shape.

"""

if name is None: 如果没有指定名称,则取张量的变量名称

name = tensor.name

expected_rank_dict = {}

if isinstance(expected_rank, six.integer_types): 如果指定的秩是个整数

expected_rank_dict[expected_rank] = True

else:

for x in expected_rank:如果指定的秩是一个列表(多个秩)

expected_rank_dict[x] = True

actual_rank = tensor.shape.ndims 实际的秩,就是张量的维数

if actual_rank not in expected_rank_dict:

scope_name = tf.get_variable_scope().name

raise ValueError(

"For the tensor `%s` in scope `%s`, the actual rank "

"`%d` (shape = %s) is not equal to the expected rank `%s`" %

(name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))

run_classifier.py

目录

属性

flags、FLAGS 用于接收输入参数

class InputExample(object) 例子类,每一条数据处理为一个例子

class PaddingInputExample(object) 填充例子,用于凑数的空类

class InputFeatures(object) 输入特征类,用于存放输入数据

class DataProcessor(object) 数据处理基类

class XnliProcessor(DataProcessor) 数据处理类

class MnliProcessor(DataProcessor) 数据处理类

class MrpcProcessor(DataProcessor) 数据处理类

class ColaProcessor(DataProcessor) 数据处理类

函数

def convert_single_example 单个例子转换为特征数据

def file_based_convert_examples_to_features  将文件中所有数据转换为特征数据

def file_based_input_fn_builder 输入处理函数

def _truncate_seq_pair 序列截断函数

def create_model 创建模型

def model_fn_builder 构建模型函数

def input_fn_builder 构建输入处理函数

def convert_examples_to_features 将例子列表转换为特征数据

def main(_) 主函数/入口函数

源码

版权许可

# coding=utf-8

# Copyright 2018 The Google AI Language Team Authors.

#

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

"""BERT finetuning runner."""

导入包

from __future__ import absolute_import

from __future__ import division

from __future__ import print_function

import collections

import csv

import os

import modeling

import optimization

import tokenization

import tensorflow as tf

参数处理

flags = tf.flags  参数

FLAGS = flags.FLAGS

## Required parameters  必须的参数/必不可少的

flags.DEFINE_string( 数据存放路径

"data_dir", None,

"The input data dir. Should contain the .tsv files (or other data files) "

"for the task.")

flags.DEFINE_string(  配置文件路径

"bert_config_file", None,

"The config json file corresponding to the pre-trained BERT model. "

"This specifies the model architecture.")

必须指定任务名称,因为对于不同的名称,有不同的数据处理函数

flags.DEFINE_string("task_name", None, "The name of the task to train.")

flags.DEFINE_string("vocab_file", None,  词典文件路径

"The vocabulary file that the BERT model was trained on.")

flags.DEFINE_string(  训练参数输出路径-用于保存训练好的权重参数、预测结果等

"output_dir", None,

"The output directory where the model checkpoints will be written.")

## Other parameters 其他参数--非必要/可选

flags.DEFINE_string(  初始化检查点,以上一次训练(通常是预训练好的BERT模型)为起点接续训练,类似于断点续传

"init_checkpoint", None,

"Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_bool(  是否转换为小写

"do_lower_case", True,

"Whether to lower case the input text. Should be True for uncased "

"models and False for cased models.")

flags.DEFINE_integer( 最大序列长度,默认128

"max_seq_length", 128,

"The maximum total input sequence length after WordPiece tokenization. "

"Sequences longer than this will be truncated, and sequences shorter "

"than this will be padded.")

flags.DEFINE_bool("do_train", False, "Whether to run training.") 是否进行训练,默认FALSE

flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 是否进行验证,默认FALSE

flags.DEFINE_bool( 是否进行预测,默认FALSE

"do_predict", False,

"Whether to run the model in inference mode on the test set.")

训练批大小,默认32

flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")

评估/校验批大小,默认8

flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")

预测批大小,默认8

flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")

学习率,默认值5*10^-5

flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")

flags.DEFINE_float("num_train_epochs", 3.0, 训练周期,默认3个周期

"Total number of training epochs to perform.")

flags.DEFINE_float(  warmup(热身)比例,默认10%;热身阶段学习率线性增大到设定值

"warmup_proportion", 0.1,

"Proportion of training to perform linear learning rate warmup for. "

"E.g., 0.1 = 10% of training.")

flags.DEFINE_integer("save_checkpoints_steps", 1000, 检查点保存的步距,默认 每1000步保存一次

"How often to save the model checkpoint.")

flags.DEFINE_integer("iterations_per_loop", 1000,  每个循环的迭代次数,默认1000次

"How many steps to make in each estimator call.")

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 是否使用TPU

tf.flags.DEFINE_string(  TPU的名称,如果选择了使用TPU,则必须指定其名称

"tpu_name", None,

"The Cloud TPU to use for training. This should be either the name "

"used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "

"url.")

tf.flags.DEFINE_string(  TPU所在地区

"tpu_zone", None,

"[Optional] GCE zone where the Cloud TPU is located in. If not "

"specified, we will attempt to automatically detect the GCE project from "

"metadata.")

tf.flags.DEFINE_string(  启用Cloud TPU的GCP项目名称(可选)

"gcp_project", None,

"[Optional] Project name for the Cloud TPU-enabled project. If not "

"specified, we will attempt to automatically detect the GCE project from "

"metadata.")

TensorFlow master 的 URL(主节点地址,可选)

tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")

flags.DEFINE_integer(  TPU是几核的,默认8核

"num_tpu_cores", 8,

"Only used if `use_tpu` is True. Total number of TPU cores to use.")

例子类

假例子类

■输入例子类

Example包含四个属性:编号guid,句子a   text_a,句子b  text_b,标签label

class InputExample(object):

"""A single training/test example for simple sequence classification."""

对于序列的分类任务,训练例子或者测试例子

def __init__(self, guid, text_a, text_b=None, label=None):

"""Constructs a InputExample. 构造函数

Args: 参数:每个例子的唯一id,句子a,句子b(可选),标签(可选)

guid: Unique id for the example.

text_a: string. The untokenized text of the first sequence. For single

sequence tasks, only this sequence must be specified.

text_b: (Optional) string. The untokenized text of the second sequence.

Only must be specified for sequence pair tasks.

label: (Optional) string. The label of the example. This should be

specified for train and dev examples, but not for test examples.

"""

self.guid = guid

self.text_a = text_a

self.text_b = text_b

self.label = label
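构造一个输入例子的最小示意(非源码内容,文本与标签均为假设值,假设在 run_classifier.py 同一模块中):

```python
# 示意:构造一个句子对分类的输入例子
example = InputExample(guid="train-1",
                       text_a="The cat sat on the mat.",
                       text_b="A cat is on a mat.",
                       label="1")
print(example.guid, example.label)
```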

■假例子类,用于将不满整批的凑数成一整批

class PaddingInputExample(object): 一个空类,不包含任何属性和方法

"""Fake example so the num input examples is a multiple of the batch size.

假的例子,数量必须是批处理量的整数倍

在验证/预测时,需要将例子数量增加,直到例子数是批处理量的整数倍,

因为TPU需要整批整批的处理,需要一个固定的批处理量 batch size

也可以采用另一种方法,就是将最后那些不够一整批的例子直接丢弃,

因为这些非整批的例子,会导致无法输出结果数据;

When running eval/predict on the TPU, we need to pad the number of examples

to be a multiple of the batch size, because the TPU requires a fixed batch

size. The alternative is to drop the last batch, which is bad because it means

the entire output data won't be generated.

为什么使用假例子而不直接使用None凑数呢,因为使用None会导致一些无法发现的错误

We use this class instead of `None` because treating `None` as padding

battches could cause silent errors.

"""

特征数据类

■输入特征类

class InputFeatures(object):

"""A single set of features of data.""" 一个简单的特征数据集

只用于存储一些特征数据,而没有任何功能和方法

def __init__(self,

input_ids,

input_mask,

segment_ids,

label_id,

is_real_example=True):

self.input_ids = input_ids

self.input_mask = input_mask

self.segment_ids = segment_ids

self.label_id = label_id

self.is_real_example = is_real_example

数据处理基类

■数据处理类,负责读取数据

class DataProcessor(object):

"""Base class for data converters for sequence classification data sets."""

这是一个基类,用于将数据转换为序列分类数据集

def get_train_examples(self, data_dir): 读取输入例子集合--训练集

"""Gets a collection of `InputExample`s for the train set."""

raise NotImplementedError()

def get_dev_examples(self, data_dir):  读取输入例子集合--验证集

"""Gets a collection of `InputExample`s for the dev set."""

raise NotImplementedError()

def get_test_examples(self, data_dir):  读取输入例子集合--测试集

"""Gets a collection of `InputExample`s for prediction."""

raise NotImplementedError()

def get_labels(self):  读取标签数据

"""Gets the list of labels for this data set."""

raise NotImplementedError()

@classmethod 读取tsv文本文件,返回值是行的集合

def _read_tsv(cls, input_file, quotechar=None):

"""Reads a tab separated value file."""

with tf.gfile.Open(input_file, "r") as f:

reader = csv.reader(f, delimiter="\t", quotechar=quotechar)

lines = []

for line in reader:

lines.append(line)

return lines

数据处理类

■数据处理类(继承DataProcessor)-XNLI数据集

class XnliProcessor(DataProcessor):

"""Processor for the XNLI data set."""

def __init__(self):

self.language = "zh"

def get_train_examples(self, data_dir):

"""See base class.""" 到基类那里看看

lines = self._read_tsv(

os.path.join(data_dir, "multinli",

"multinli.train.%s.tsv" % self.language))

examples = []

for (i, line) in enumerate(lines): 遍历所有行

if i == 0: 跳过标题行

continue

guid = "train-%d" % (i) 标识符

text_a = tokenization.convert_to_unicode(line[0])

text_b = tokenization.convert_to_unicode(line[1])

label = tokenization.convert_to_unicode(line[2])

if label == tokenization.convert_to_unicode("contradictory"): 对立的

label = tokenization.convert_to_unicode("contradiction")  对立

examples.append( 创建一个例子,并添加到列表中

InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

return examples

def get_dev_examples(self, data_dir):

"""See base class."""

lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))

examples = []

for (i, line) in enumerate(lines):

if i == 0:

continue

guid = "dev-%d" % (i)

language = tokenization.convert_to_unicode(line[0])

if language != tokenization.convert_to_unicode(self.language):

continue

text_a = tokenization.convert_to_unicode(line[6])

text_b = tokenization.convert_to_unicode(line[7])

label = tokenization.convert_to_unicode(line[1])

examples.append(

InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

return examples

def get_labels(self):

"""See base class."""

return ["contradiction", "entailment", "neutral"]

■数据处理类(继承DataProcessor)-MNLI数据集

class MnliProcessor(DataProcessor):

"""Processor for the MultiNLI data set (GLUE version)."""

def get_train_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),

"dev_matched")

def get_test_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")

def get_labels(self):

"""See base class."""

return ["contradiction", "entailment", "neutral"]

def _create_examples(self, lines, set_type):

"""Creates examples for the training and dev sets."""

examples = []

for (i, line) in enumerate(lines):

if i == 0:

continue

guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))

text_a = tokenization.convert_to_unicode(line[8])

text_b = tokenization.convert_to_unicode(line[9])

if set_type == "test":

label = "contradiction"

else:

label = tokenization.convert_to_unicode(line[-1])

examples.append(

InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

return examples

■数据处理类(继承DataProcessor)-MRPC数据集

class MrpcProcessor(DataProcessor):

"""Processor for the MRPC data set (GLUE version)."""

def get_train_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_test_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

def get_labels(self):

"""See base class."""

return ["0", "1"]

def _create_examples(self, lines, set_type):

"""Creates examples for the training and dev sets."""

examples = []

for (i, line) in enumerate(lines):

if i == 0:

continue

guid = "%s-%s" % (set_type, i)

text_a = tokenization.convert_to_unicode(line[3])

text_b = tokenization.convert_to_unicode(line[4])

if set_type == "test":

label = "0"

else:

label = tokenization.convert_to_unicode(line[0])

examples.append(

InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

return examples

■数据处理类(继承DataProcessor)-CoLA数据集

class ColaProcessor(DataProcessor):

"""Processor for the CoLA data set (GLUE version)."""

def get_train_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_test_examples(self, data_dir):

"""See base class."""

return self._create_examples(

self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

def get_labels(self):

"""See base class."""

return ["0", "1"]

def _create_examples(self, lines, set_type):

"""Creates examples for the training and dev sets."""

examples = []

for (i, line) in enumerate(lines):

# Only the test set has a header

if set_type == "test" and i == 0:

continue

guid = "%s-%s" % (set_type, i)

if set_type == "test":

text_a = tokenization.convert_to_unicode(line[1])

label = "0"

else:

text_a = tokenization.convert_to_unicode(line[3])

label = tokenization.convert_to_unicode(line[1])

examples.append(

InputExample(guid=guid, text_a=text_a, text_b=None, label=label))

return examples

单个例子的特征提取

■将例子数据转换为特征数据

def convert_single_example(ex_index, example, label_list, max_seq_length,

tokenizer):

"""Converts a single `InputExample` into a single `InputFeatures`."""

if isinstance(example, PaddingInputExample): 如果是一个假例子

return InputFeatures(

input_ids=[0] * max_seq_length,

input_mask=[0] * max_seq_length,

segment_ids=[0] * max_seq_length,

label_id=0,

is_real_example=False)

如果不是假例子

label_map = {}

for (i, label) in enumerate(label_list):

label_map[label] = i

tokens_a = tokenizer.tokenize(example.text_a) 将文本转换为符号

tokens_b = None    b默认设置为空,适用于一句话的情况

if example.text_b:  如果句子b非空,就把句子b转换为符号

tokens_b = tokenizer.tokenize(example.text_b)

if tokens_b: 句子对的情况,如果有句子b(非空)

修改 符号序列a和符号序列b,使得其长度小于某个指定值;

# Modifies `tokens_a` and `tokens_b` in place so that the total

# length is less than the specified length.

# Account for [CLS], [SEP], [SEP] with "- 3"

序列最大允许长度是max_seq_length,其中包含3个分隔符[CLS] [SEP][SEP]

所以,这两个句子的最大允许长度就是max_seq_length-3

_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 截断序列对

else: 单个句子的情况

# Account for [CLS] and [SEP] with "- 2"

序列最大允许长度是max_seq_length,其中包含2个分隔符[CLS] [SEP]

那么符号序列a的最大允许长度就是max_seq_length-2了

if len(tokens_a) > max_seq_length - 2:

tokens_a = tokens_a[0:(max_seq_length - 2)] 截断符号序列a

# The convention in BERT is:  在BERT模型中,转换情况如下所示

# (a) For sequence pairs: 句子对的情况

#  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]

#  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1

# (b) For single sequences: 单个句子的情况

#  tokens:   [CLS] the dog is hairy . [SEP]

#  type_ids: 0     0   0   0  0     0 0

type_ids用来标明第一句和第二句

对于2种类型的嵌入向量,在预训练中已经学习到了数据,

并且添加到了 词片嵌入向量中(还有位置向量)

这并不是严格必要的,因为[SEP] 符号已经将句子切分开了,

但是,这有利于模型理解句子的意思。

# Where "type_ids" are used to indicate whether this is the first

# sequence or the second sequence. The embedding vectors for `type=0` and

# `type=1` were learned during pre-training and are added to the wordpiece

# embedding vector (and position vector). This is not *strictly* necessary

# since the [SEP] token unambiguously separates the sequences, but it makes

# it easier for the model to learn the concept of sequences.

  对于分类任务,第一个向量是一个句子向量,

请注意,只有在对整个模型进行微调时,这样做才有意义;

# For classification tasks, the first vector (corresponding to [CLS]) is

# used as the "sentence vector". Note that this only makes sense because

# the entire model is fine-tuned.

tokens = [] 符号序列集

segment_ids = [] 句子分割记号,0表示第一句,1表示第二句

tokens.append("[CLS]") 符号序列始终以[CLS]符号作为开头

segment_ids.append(0)   [CLS]符号属于第一个句子,所以分割符号为0

for token in tokens_a: 遍历a句子的符号序列

tokens.append(token) 将a序列的符号添加至新的符号序列

segment_ids.append(0)  a序列的所有符号,都属于句子a,因此都标记为0

tokens.append("[SEP]")  a序列添加完成后,再添加一个[SEP]分隔符

segment_ids.append(0)  a序列后面的[SEP]符号属于句子a

if tokens_b: 如果有序号序列b(not None  and  not Empty)

for token in tokens_b: 开始遍历b符号序列

tokens.append(token) 添加序列b中的每个符号

segment_ids.append(1)  句子b的分割信息都标记为1

tokens.append("[SEP]")  b序列添加完成后,再添加一个[SEP]分隔符

segment_ids.append(1)  b序列后面的[SEP]符号,属于句子b

将符号序列转化为矩阵

input_ids = tokenizer.convert_tokens_to_ids(tokens)

对于真实的符号,掩码为1;对于填充的符号,掩码为0。

注意力只会关注真实的符号,填充符号会被忽略。

# The mask has 1 for real tokens and 0 for padding tokens. Only real

# tokens are attended to.

input_mask = [1] * len(input_ids) 创建输入掩码[1,1,1,....,1] 全1向量,长度为 len(input_ids) 

用0填充序列,填充到指定长度为止。

# Zero-pad up to the sequence length.

while len(input_ids) < max_seq_length:

input_ids.append(0)

input_mask.append(0)

segment_ids.append(0)

断言:如果断言条件不成立,立马触发异常(报错)

假定,长度都是最大长度了

assert len(input_ids) == max_seq_length

assert len(input_mask) == max_seq_length

assert len(segment_ids) == max_seq_length

label_id = label_map[example.label] 将标签映射为id值

if ex_index < 5:  只对前5个例子打印详细信息

tf.logging.info("*** Example ***")

tf.logging.info("guid: %s" % (example.guid))

tf.logging.info("tokens: %s" % " ".join(

[tokenization.printable_text(x) for x in tokens]))

tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))

tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))

tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

构造InputFeatures数据类型 并返回

feature = InputFeatures(

input_ids=input_ids,

input_mask=input_mask,

segment_ids=segment_ids,

label_id=label_id,

is_real_example=True)

return feature
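下面用一个极简的示意走一遍上述拼接与填充过程(分词结果和词表id都是假设的),假定 max_seq_length=10,句子对为 tokens_a=["how", "are", "you"]、tokens_b=["fine"]:

max_seq_length = 10
tokens_a = ["how", "are", "you"]    # 假设的分词结果
tokens_b = ["fine"]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

fake_vocab = {"[CLS]": 101, "[SEP]": 102, "how": 11, "are": 12, "you": 13, "fine": 14}  # id为虚构
input_ids = [fake_vocab[t] for t in tokens]
input_mask = [1] * len(input_ids)
while len(input_ids) < max_seq_length:   # 用0填充到固定长度
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

print(tokens)       # ['[CLS]', 'how', 'are', 'you', '[SEP]', 'fine', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]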

从文件到特征提取

■从文件读取数据,然后再提取特征

def file_based_convert_examples_to_features(

examples, label_list, max_seq_length, tokenizer, output_file):

"""Convert a set of `InputExample`s to a TFRecord file."""

将例子集合转换为TFRecord 格式的文件

writer = tf.python_io.TFRecordWriter(output_file)

for (ex_index, example) in enumerate(examples): 遍历所有例子

if ex_index % 10000 == 0: 打印一些信息

tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

将单个例子转换为特征数据

feature = convert_single_example(ex_index, example, label_list,

max_seq_length, tokenizer)

def create_int_feature(values): 转换为整数类型的数据

f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

return f

features = collections.OrderedDict() 按照添加顺序排序的字典

features["input_ids"] = create_int_feature(feature.input_ids)

features["input_mask"] = create_int_feature(feature.input_mask)

features["segment_ids"] = create_int_feature(feature.segment_ids)

features["label_ids"] = create_int_feature([feature.label_id])

features["is_real_example"] = create_int_feature(

[int(feature.is_real_example)])

tf_example = tf.train.Example(features=tf.train.Features(feature=features))

writer.write(tf_example.SerializeToString()) 将数据转换成字符串并写入文件

writer.close()
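写完 TFRecord 之后,可以用下面的小片段做一次快速校验(文件路径仅为示意),读取第一条记录并打印其中的字段:

import tensorflow as tf

record_iter = tf.python_io.tf_record_iterator("train.tf_record")  # 路径仅为示意
serialized = next(record_iter)                                    # 取第一条序列化记录
example = tf.train.Example.FromString(serialized)                 # 反序列化为 tf.train.Example
print(example.features.feature["input_ids"].int64_list.value[:10])
print(example.features.feature["label_ids"].int64_list.value)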

从文件到输入处理函数

def file_based_input_fn_builder(input_file, seq_length, is_training,

drop_remainder):

"""Creates an `input_fn` closure to be passed to TPUEstimator."""

创建输入函数,从而将数据传送到TPU进行评价

name_to_features = {

"input_ids": tf.FixedLenFeature([seq_length], tf.int64),

"input_mask": tf.FixedLenFeature([seq_length], tf.int64),

"segment_ids": tf.FixedLenFeature([seq_length], tf.int64),

"label_ids": tf.FixedLenFeature([], tf.int64),

"is_real_example": tf.FixedLenFeature([], tf.int64),

}

def _decode_record(record, name_to_features):

"""Decodes a record to a TensorFlow example."""将数据记录解码为一个TensorFlow格式的例子

example = tf.parse_single_example(record, name_to_features)

tf.Example 只支持64位整数(tf.int64),而TPU只支持32位整数(tf.int32),

因此要把所有的int64转换为int32,才能送入TPU处理

# tf.Example only supports tf.int64, but the TPU only supports tf.int32.

# So cast all int64 to int32.

for name in list(example.keys()):

t = example[name]

if t.dtype == tf.int64:

t = tf.to_int32(t)

example[name] = t

return example

def input_fn(params):

"""The actual input function.""" 实际的输入处理函数

batch_size = params["batch_size"]

对于训练,我们希望大量并行读取并打乱数据;

对于评估,则不需要打乱,并行读取也无关紧要。

# For training, we want a lot of parallel reading and shuffling.

# For eval, we want no shuffling and parallel reading doesn't matter.

d = tf.data.TFRecordDataset(input_file)

if is_training: 如果是要训练

d = d.repeat()

d = d.shuffle(buffer_size=100)

无论是训练还是评估,下面都要对每条记录解码并组成批次

d = d.apply(

tf.contrib.data.map_and_batch(

lambda record: _decode_record(record, name_to_features),

batch_size=batch_size,

drop_remainder=drop_remainder))

return d

return input_fn 返回构造好的输入函数(闭包),供TPUEstimator调用
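返回的 input_fn 由 Estimator 在内部调用,调用时会传入包含 batch_size 的 params 字典。下面是一个手动调用的示意(文件路径与参数均为假设):

input_fn = file_based_input_fn_builder(
    input_file="train.tf_record",   # 假设已由上面的函数生成
    seq_length=128,
    is_training=True,
    drop_remainder=True)

dataset = input_fn({"batch_size": 32})   # 得到 tf.data.Dataset,每个元素是特征名到张量的字典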

截断函数

■截断句子对(谁更长,就截谁)

def _truncate_seq_pair(tokens_a, tokens_b, max_length):

"""Truncates a sequence pair in place to the maximum length."""

将句子对截断到最大长度

每次只把更长的那个句子截掉一个符号,这比按相同比例截断两个句子更合理,原因如下:

序列越短,每个符号承载的信息比重就越大,

截掉同样数量的符号,短序列损失的信息就相对更多。

# This is a simple heuristic which will always truncate the longer sequence

# one token at a time. This makes more sense than truncating an equal percent

# of tokens from each, since if one sequence is very short then each token

# that's truncated likely contains more information than a longer sequence.

while True:

total_length = len(tokens_a) + len(tokens_b) 总长度

if total_length <= max_length: 总长度不超过允许长度了,就终止循环

break

if len(tokens_a) > len(tokens_b): 如果a比b长,就截断a

tokens_a.pop()

else:

tokens_b.pop()
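一个小例子可以说明这种“谁长截谁”的策略(符号内容是随意编的):

tokens_a = ["a1", "a2", "a3", "a4", "a5", "a6"]
tokens_b = ["b1", "b2", "b3"]
_truncate_seq_pair(tokens_a, tokens_b, max_length=7)
print(tokens_a)  # ['a1', 'a2', 'a3', 'a4'],每次只从更长的一侧弹出一个符号
print(tokens_b)  # ['b1', 'b2', 'b3'],较短的一侧保持不变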

模型函数

■创建模型

def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,

labels, num_labels, use_one_hot_embeddings):

"""Creates a classification model."""

model = modeling.BertModel(

config=bert_config,

is_training=is_training,

input_ids=input_ids,

input_mask=input_mask,

token_type_ids=segment_ids,

use_one_hot_embeddings=use_one_hot_embeddings)

在这个demo中,我们针对整个句段(segment)做一个简单的分类任务

# In the demo, we are doing a simple classification task on the entire

# segment.

# 如果你想要符号级别(token-level)的输出,就使用 model.get_sequence_output() 函数

# If you want to use the token-level output, use model.get_sequence_output()

# instead.

output_layer = model.get_pooled_output() 获取池化输出

hidden_size = output_layer.shape[-1].value

output_weights = tf.get_variable( 变量初始化--权重值w

"output_weights", [num_labels, hidden_size],

initializer=tf.truncated_normal_initializer(stddev=0.02)) 初始化为截断正态分布(标准差0.02)

output_bias = tf.get_variable( 变量初始化--偏移值b

"output_bias", [num_labels], initializer=tf.zeros_initializer()) 初始化为全0向量

with tf.variable_scope("loss"):

if is_training:

# I.e., 0.1 dropout  丢弃层:保留率90%,也就是丢弃率10%

output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

logits = tf.matmul(output_layer, output_weights, transpose_b=True) 矩阵相乘

logits = tf.nn.bias_add(logits, output_bias) 添加偏置项

probabilities = tf.nn.softmax(logits, axis=-1) 归一化为各类别的概率

log_probs = tf.nn.log_softmax(logits, axis=-1)  对数概率,用于计算交叉熵损失

one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 将标签转换为独热码

per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

tf.reduce_sum 按指定维度对元素求和,axis 指定求和的维度,默认对所有元素求和;

tf.reduce_mean 沿指定维度计算张量的平均值,常用于降维或求整体均值;

这里先对每个样本求交叉熵(per_example_loss),再对整个批次取平均得到 loss。

loss = tf.reduce_mean(per_example_loss)

return (loss, per_example_loss, logits, probabilities)
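create_model 在 BERT 的池化输出之上只加了一个线性分类头:logits 等于池化向量乘以 W 的转置再加 b,经 softmax 得到概率,用交叉熵作为损失。下面用 numpy 复现这一小段数学(数值均为随意假设):

import numpy as np

pooled = np.array([[0.2, -0.1, 0.4]])        # 相当于 get_pooled_output(),[batch=1, hidden=3]
W = np.array([[0.1, 0.2, -0.3],              # [num_labels=2, hidden=3]
              [-0.2, 0.5, 0.1]])
b = np.array([0.0, 0.1])

logits = pooled.dot(W.T) + b                 # 对应 tf.matmul(..., transpose_b=True) + bias_add
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)   # softmax
log_probs = np.log(probs)                    # 对应 log_softmax
one_hot = np.array([[1.0, 0.0]])             # 假设真实标签为第0类
per_example_loss = -(one_hot * log_probs).sum(axis=-1)
loss = per_example_loss.mean()
print(logits, probs, loss)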

■模型函数建造器

def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,

num_train_steps, num_warmup_steps, use_tpu,

use_one_hot_embeddings):

"""Returns `model_fn` closure for TPUEstimator."""

返回一个函数,用于TPU的估计运算

def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument

"""The `model_fn` for TPUEstimator."""

定义模型函数,用于TPU 估计

tf.logging.info("*** Features ***")

for name in sorted(features.keys()):

tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

input_ids = features["input_ids"]

input_mask = features["input_mask"]

segment_ids = features["segment_ids"]

label_ids = features["label_ids"]

is_real_example = None

if "is_real_example" in features:

is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) 转换为32位浮点数

else:

is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) 全1.0的张量

is_training = (mode == tf.estimator.ModeKeys.TRAIN)

(total_loss, per_example_loss, logits, probabilities) = create_model(

bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,

num_labels, use_one_hot_embeddings)

tvars表示可训练变量 trainable_variables

tvars = tf.trainable_variables()

initialized_variable_names = {}

scaffold_fn = None

if init_checkpoint: 如果有检查点,就从检查点获取数据

(assignment_map, initialized_variable_names

) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)

if use_tpu: 如果要使用TPU

def tpu_scaffold():

tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

return tf.train.Scaffold()

scaffold_fn = tpu_scaffold

else: 如果不使用TPU,就直接从检查点初始化变量

tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

打印一些信息

tf.logging.info("**** Trainable Variables ****")

for var in tvars: 遍历所有可训练参数trainable variables

init_string = ""

if var.name in initialized_variable_names:

init_string = ", *INIT_FROM_CKPT*"

tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,

init_string)

output_spec = None 训练模式

if mode == tf.estimator.ModeKeys.TRAIN:

训练的优化器

train_op = optimization.create_optimizer(

total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

output_spec = tf.contrib.tpu.TPUEstimatorSpec(

mode=mode,

loss=total_loss,

train_op=train_op,

scaffold_fn=scaffold_fn)

elif mode == tf.estimator.ModeKeys.EVAL: 评估模式

定义度量函数

def metric_fn(per_example_loss, label_ids, logits, is_real_example):

predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 预测值

accuracy = tf.metrics.accuracy( 精确度/准确率

labels=label_ids, predictions=predictions, weights=is_real_example)

loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)

return {

"eval_accuracy": accuracy,

"eval_loss": loss,

}

评估度量

eval_metrics = (metric_fn,

[per_example_loss, label_ids, logits, is_real_example])

output_spec = tf.contrib.tpu.TPUEstimatorSpec(

mode=mode,

loss=total_loss,

eval_metrics=eval_metrics,

scaffold_fn=scaffold_fn)

else: 其他模式(不是训练,也不是评估)

output_spec = tf.contrib.tpu.TPUEstimatorSpec(

mode=mode,

predictions={"probabilities": probabilities},

scaffold_fn=scaffold_fn)

return output_spec

return model_fn
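上面评估模式里的 metric_fn 用 is_real_example 作为样本权重,这样为凑整批而填充的假样本不会影响指标。下面用 numpy 示意这种加权精度的计算方式(数值为假设):

import numpy as np

predictions = np.array([1, 0, 1, 0])
label_ids   = np.array([1, 0, 0, 0])
is_real     = np.array([1.0, 1.0, 1.0, 0.0])   # 最后一条是凑批用的假样本,权重为0

correct = (predictions == label_ids).astype(np.float32)
accuracy = (correct * is_real).sum() / is_real.sum()
print(accuracy)   # 约 0.667,假样本被完全忽略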

# This function is not used by this file but is still used by the Colab and

# people who depend on it.

输入处理函数

■输入函数建造器:本文件并未使用该函数,但Colab示例以及其他依赖它的代码仍在使用

def input_fn_builder(features, seq_length, is_training, drop_remainder):

"""Creates an `input_fn` closure to be passed to TPUEstimator."""

all_input_ids = []

all_input_mask = []

all_segment_ids = []

all_label_ids = []

for feature in features:

all_input_ids.append(feature.input_ids)

all_input_mask.append(feature.input_mask)

all_segment_ids.append(feature.segment_ids)

all_label_ids.append(feature.label_id)

def input_fn(params): 输入函数

"""The actual input function."""

batch_size = params["batch_size"] 批处理量

num_examples = len(features) 例子个数

这个函数仅用于演示,无法扩展到大型数据集;不使用 Dataset.from_generator() 是因为它依赖 tf.py_func,与TPU不兼容;正确的做法是用 TFRecordReader 加载数据。

# This is for demo purposes and does NOT scale to large data sets. We do

# not use Dataset.from_generator() because that uses tf.py_func which is

# not TPU compatible. The right way to load data is with TFRecordReader.

d = tf.data.Dataset.from_tensor_slices({

"input_ids":

tf.constant(

all_input_ids, shape=[num_examples, seq_length],

dtype=tf.int32),

"input_mask":

tf.constant(

all_input_mask,

shape=[num_examples, seq_length],

dtype=tf.int32),

"segment_ids":

tf.constant(

all_segment_ids,

shape=[num_examples, seq_length],

dtype=tf.int32),

"label_ids":

tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),

})

if is_training:

d = d.repeat()

d = d.shuffle(buffer_size=100)

d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)

return d

return input_fn

特征提取

■本文件不使用该函数

# This function is not used by this file but is still used by the Colab and

# people who depend on it.

def convert_examples_to_features(examples, label_list, max_seq_length,

tokenizer):

"""Convert a set of `InputExample`s to a list of `InputFeatures`."""

features = []

for (ex_index, example) in enumerate(examples):

if ex_index % 10000 == 0:

tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

feature = convert_single_example(ex_index, example, label_list,

max_seq_length, tokenizer)

features.append(feature)

return features

主函数

■主函数,tf函数入口

def main(_):

tf.logging.set_verbosity(tf.logging.INFO)

processors = { 数据处理器的映射,用于根据任务名称选取对应的处理类

"cola": ColaProcessor,

"mnli": MnliProcessor,

"mrpc": MrpcProcessor,

"xnli": XnliProcessor,

}

tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,

FLAGS.init_checkpoint)

验证输入参数有效性

if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:

raise ValueError(

"At least one of `do_train`, `do_eval` or `do_predict' must be True.")

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

if FLAGS.max_seq_length > bert_config.max_position_embeddings:

raise ValueError(

"Cannot use sequence length %d because the BERT model "

"was only trained up to sequence length %d" %

(FLAGS.max_seq_length, bert_config.max_position_embeddings))

tf.gfile.MakeDirs(FLAGS.output_dir) 创建输出目录文件夹

task_name = FLAGS.task_name.lower() 将任务名称转换为小写

if task_name not in processors: 任务名是否存在于处理器函数映射表中

raise ValueError("Task not found: %s" % (task_name))

processor = processors[task_name]() 通过名称获取处理函数类,并构造一个类对象

label_list = processor.get_labels() 通过数据处理类获取标签

tokenizer = tokenization.FullTokenizer( 构造分词器

vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

tpu_cluster_resolver = None  TPU集群解析器,默认为空

if FLAGS.use_tpu and FLAGS.tpu_name:   如果要使用TPU并指定了TPU名称

tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(

FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2  TPU设置

run_config = tf.contrib.tpu.RunConfig(    TPU运行配置

cluster=tpu_cluster_resolver,

master=FLAGS.master,

model_dir=FLAGS.output_dir,

save_checkpoints_steps=FLAGS.save_checkpoints_steps,

tpu_config=tf.contrib.tpu.TPUConfig(

iterations_per_loop=FLAGS.iterations_per_loop,

num_shards=FLAGS.num_tpu_cores,

per_host_input_for_training=is_per_host))

train_examples = None 训练例子

num_train_steps = None  训练步数

num_warmup_steps = None 热身步数

if FLAGS.do_train: 训练模式

train_examples = processor.get_train_examples(FLAGS.data_dir) 读取训练例子

num_train_steps = int( 计算训练步数

len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)

num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 计算热身步数
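训练步数与热身步数的换算可以用一个小算例说明(其中的样本数等数字都是假设的):

num_examples = 8500          # 假设的训练样本数
train_batch_size = 32
num_train_epochs = 3.0
warmup_proportion = 0.1

num_train_steps = int(num_examples / train_batch_size * num_train_epochs)   # 796
num_warmup_steps = int(num_train_steps * warmup_proportion)                 # 79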

model_fn = model_fn_builder( 通过构造器 构造一个模型函数

bert_config=bert_config,

num_labels=len(label_list),

init_checkpoint=FLAGS.init_checkpoint,

learning_rate=FLAGS.learning_rate,

num_train_steps=num_train_steps,

num_warmup_steps=num_warmup_steps,

use_tpu=FLAGS.use_tpu,

use_one_hot_embeddings=FLAGS.use_tpu)

TPU不可用时,会自动退回到在CPU或GPU上使用普通的Estimator

# If TPU is not available, this will fall back to normal Estimator on CPU or GPU.

estimator = tf.contrib.tpu.TPUEstimator(    TPU评估器

use_tpu=FLAGS.use_tpu,

model_fn=model_fn,

config=run_config,

train_batch_size=FLAGS.train_batch_size,

eval_batch_size=FLAGS.eval_batch_size,

predict_batch_size=FLAGS.predict_batch_size)

if FLAGS.do_train:  训练模式

train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 训练文件路径

file_based_convert_examples_to_features(  从文件到特征值的处理

train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)

tf.logging.info("***** Running training *****") 打印

tf.logging.info("  Num examples = %d", len(train_examples))

tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)

tf.logging.info("  Num steps = %d", num_train_steps)

train_input_fn = file_based_input_fn_builder(  构造输入处理函数

input_file=train_file,

seq_length=FLAGS.max_seq_length,

is_training=True,

drop_remainder=True)

estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)用评估器进行训练

if FLAGS.do_eval: 评估模式/验证模式

eval_examples = processor.get_dev_examples(FLAGS.data_dir) 读取评估数据

num_actual_eval_examples = len(eval_examples) 评估数据个数

if FLAGS.use_tpu: 如果使用TPU

TPU要求每一批的大小固定,因此样本总数必须是批处理量的整数倍;

如果不够整批,就填充一些假例子凑数。

这些假例子随后会被忽略:所有 tf.metrics 都支持按样本加权,假例子的权重是0.0。

 

# TPU requires a fixed batch size for all batches, therefore the number

# of examples must be a multiple of the batch size, or else examples

# will get dropped. So we pad with fake examples which are ignored

# later on. These do NOT count towards the metric (all tf.metrics

# support a per-instance weight, and these get a weight of 0.0).

while len(eval_examples) % FLAGS.eval_batch_size != 0:

eval_examples.append(PaddingInputExample()) 添加假例子进行凑整

eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 评估文件路径

file_based_convert_examples_to_features( 直接从文件读取数据,再转换为特征值

eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

tf.logging.info("***** Running evaluation *****")  打印

tf.logging.info("  Num examples = %d (%d actual, %d padding)",

len(eval_examples), num_actual_eval_examples,

len(eval_examples) - num_actual_eval_examples)

tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

# This tells the estimator to run through the entire set.

eval_steps = None  设为None时,评估器会遍历整个数据集

如果是使用TPU,就必须明确指出具体的步数

# However, if running eval on the TPU, you will need to specify the

# number of steps.

if FLAGS.use_tpu:

assert len(eval_examples) % FLAGS.eval_batch_size == 0 断言:样本总数必须是批处理量的整数倍

eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) 例子数整除批处理量,就是评估步数

如果使用TPU,就丢弃最后不足一整批的数据(drop_remainder=True),否则保留

eval_drop_remainder = True if FLAGS.use_tpu else False

eval_input_fn = file_based_input_fn_builder( 构造一个函数,从文件开始的输入处理函数

input_file=eval_file,

seq_length=FLAGS.max_seq_length,

is_training=False,

drop_remainder=eval_drop_remainder)

result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 开始评估运算

output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 获取输出文件路径

with tf.gfile.GFile(output_eval_file, "w") as writer:  写入评估文件

tf.logging.info("***** Eval results *****")

for key in sorted(result.keys()):

tf.logging.info("  %s = %s", key, str(result[key]))

writer.write("%s = %s\n" % (key, str(result[key])))

if FLAGS.do_predict: 预测模式

predict_examples = processor.get_test_examples(FLAGS.data_dir) 读取预测数据

num_actual_predict_examples = len(predict_examples) 预测数据总数

if FLAGS.use_tpu: 如果使用TPU,同样要用假例子进行补齐操作

# TPU requires a fixed batch size for all batches, therefore the number

# of examples must be a multiple of the batch size, or else examples

# will get dropped. So we pad with fake examples which are ignored

# later on.

while len(predict_examples) % FLAGS.predict_batch_size != 0:  不是整批,就用假例子补齐为整批

predict_examples.append(PaddingInputExample())

predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 预测文件路径

file_based_convert_examples_to_features(predict_examples, label_list, 从预测文件到特征值

FLAGS.max_seq_length, tokenizer,

predict_file)

tf.logging.info("***** Running prediction*****") 打印

tf.logging.info("  Num examples = %d (%d actual, %d padding)",

len(predict_examples), num_actual_predict_examples,

len(predict_examples) - num_actual_predict_examples)

tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

predict_drop_remainder = True if FLAGS.use_tpu else False 是否丢弃最后不足一整批的数据

predict_input_fn = file_based_input_fn_builder( 输入数据处理函数

input_file=predict_file,

seq_length=FLAGS.max_seq_length,

is_training=False,

drop_remainder=predict_drop_remainder)

result = estimator.predict(input_fn=predict_input_fn) 开始预测

output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") 预测结果输出路径

with tf.gfile.GFile(output_predict_file, "w") as writer: 写入文件

num_written_lines = 0

tf.logging.info("***** Predict results *****")

for (i, prediction) in enumerate(result):

probabilities = prediction["probabilities"]

if i >= num_actual_predict_examples:

break

output_line = "\t".join(

str(class_probability)

for class_probability in probabilities) + "\n"

writer.write(output_line)

num_written_lines += 1

assert num_written_lines == num_actual_predict_examples 断言:预测数==写入的行数,否则报错

if __name__ == "__main__": 启动当前文件时

flags.mark_flag_as_required("data_dir") 将参数标记为必须参数

flags.mark_flag_as_required("task_name")将参数标记为必须参数

flags.mark_flag_as_required("vocab_file")将参数标记为必须参数

flags.mark_flag_as_required("bert_config_file")将参数标记为必须参数

flags.mark_flag_as_required("output_dir")将参数标记为必须参数

tf.app.run() 开始运行上面的main函数
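最后,这个脚本通常按类似下面的方式从命令行启动(各路径均为占位示意,真实取值以 BERT 仓库 README 和文件开头的 flags 定义为准):

python run_classifier.py \
  --task_name=cola \
  --do_train=true \
  --do_eval=true \
  --data_dir=/path/to/CoLA \
  --vocab_file=/path/to/bert/vocab.txt \
  --bert_config_file=/path/to/bert/bert_config.json \
  --init_checkpoint=/path/to/bert/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=/tmp/cola_output/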
