






  1. CPU上定义变量
  2. GPU上分别定义model和gradients操作,得到每个GPU中的梯度
  3. 又回到CPU中计算平均梯度,并进行参数更新

Talk is cheap, show me the code!!




  1. 读入数据
  2. 在cpu中定义变量
  3. 搭建Inference
  4. 定义loss
  5. 定义训练过程



def read_and_decode(filename_queue):
    """Read one serialized example from a TFRecord queue and decode it.

    Args:
        filename_queue: Queue of TFRecord file names, e.g. the output of
            tf.train.string_input_producer.

    Returns:
        A tuple (image, label). `image` is a flat float32 tensor whose
        uint8 pixel values have been rescaled from [0, 255] to [-0.5, 0.5];
        `label` is an int32 scalar tensor.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    image = tf.decode_raw(features['image_raw'], tf.uint8)
    # decode_raw yields a tensor of unknown length, but the batching queue
    # needs a fully defined static shape. 28 * 28 matches the
    # [-1, 28, 28, 1] reshape performed in inference().
    image.set_shape([28 * 28])

    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
    label = tf.cast(features['label'], tf.int32)
    return image, label


def inputs(train, batch_size, num_epochs):
    """Build the queue-based input pipeline and return shuffled batches.

    Args:
        train: If truthy, read TRAIN_FILE; otherwise read VALIDATION_FILE.
            Both are looked up inside FLAGS.data_dir.
        batch_size: Number of examples per returned batch.
        num_epochs: Number of passes over the file. Any falsy value
            (0, None) is normalized to None before being handed to
            tf.train.string_input_producer.

    Returns:
        A tuple (images, sparse_labels) of batched tensors produced by
        tf.train.shuffle_batch from the decoded (image, label) pairs.
    """
    if not num_epochs:
        num_epochs = None
    filename = os.path.join(
        FLAGS.data_dir, TRAIN_FILE if train else VALIDATION_FILE)

    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=num_epochs)

        image, label = read_and_decode(filename_queue)

        # Shuffle the examples and collect them into batch_size batches.
        # min_after_dequeue keeps at least 1000 elements buffered so the
        # shuffling has enough examples to mix.
        images, sparse_labels = tf.train.shuffle_batch(
            [image, label], batch_size=batch_size, num_threads=2,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)

        return images, sparse_labels




def _variable_on_cpu(name, shape, initializer):
    """Create (or fetch, under variable reuse) a float32 Variable pinned to the CPU.

    Args:
        name: Name of the variable.
        shape: List of ints.
        initializer: Initializer for the Variable.

    Returns:
        Variable Tensor.
    """
    # Pinning to /cpu:0 keeps a single copy of each parameter that every
    # GPU tower can share through tf.get_variable.
    with tf.device('/cpu:0'):
        return tf.get_variable(
            name, shape, initializer=initializer, dtype=tf.float32)



def inference(images):
    """Build the MNIST model.

    The network is conv1 -> pool1 -> conv2 -> pool2 -> local3 -> local4 ->
    softmax_linear. All parameters are created through _variable_on_cpu.

    Args:
        images: Images returned from inputs(); reshaped here to
            [-1, 28, 28, 1].

    Returns:
        Unscaled logits with 10 values per example.
    """
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_on_cpu(
            'weights', shape=[5, 5, 1, 32],
            initializer=tf.truncated_normal_initializer(stddev=5e-2))
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0))
        # 'SAME' padding keeps the 28x28 spatial size, so the two 2x2 pools
        # below reduce it to the 7x7 that the local3 reshape expects.
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_on_cpu(
            'weights', shape=[5, 5, 32, 64],
            initializer=tf.truncated_normal_initializer(stddev=5e-2))
        conv = tf.nn.conv2d(pool1, kernel, strides=[1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)

    # pool2
    pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_on_cpu(
            'weights', shape=[dim, 1024],
            initializer=tf.truncated_normal_initializer(stddev=0.04))
        # NOTE(review): the initializer argument was missing from the
        # original call; 0.1 matches the other hidden-layer biases - confirm.
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)

    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_on_cpu(
            'weight', shape=[1024, 10],
            initializer=tf.truncated_normal_initializer(stddev=0.04))
        biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_on_cpu(
            'weight', [10, 10],
            initializer=tf.truncated_normal_initializer(stddev=1 / 192.0))
        # NOTE(review): the initializer argument was missing from the
        # original call; zero-initialized output biases assumed - confirm.
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)

    return softmax_linear



def loss(logits, labels):
    """Add the mean cross-entropy loss to the 'losses' collection and total it.

    Args:
        logits: Logits from inference().
        labels: Labels from inputs(). 1-D tensor of shape [batch_size].

    Returns:
        Loss tensor of type float: the sum of everything currently in the
        'losses' collection, including the cross-entropy term added here.
    """
    # Calculate the average cross entropy loss across the batch.
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    # The total loss is the sum of every term registered in the 'losses'
    # collection (this function itself only registers the cross entropy).
    return tf.add_n(tf.get_collection('losses'), name='total_loss')


def tower_loss(scope):
    """Calculate the total loss on a single tower running the MNIST model.

    Args:
        scope: Unique prefix string identifying the MNIST tower,
            e.g. 'tower_0'.

    Returns:
        Tensor containing the total loss for a batch of data.
    """
    # Input images and labels.
    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                            num_epochs=FLAGS.num_epochs)

    # Build inference Graph.
    logits = inference(images)

    # Build the portion of the Graph calculating the losses. The return
    # value is discarded because the total is re-assembled below from the
    # losses that belong to this tower only.
    _ = loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss.
    if FLAGS.tb_logging:
        for loss_tensor in losses + [total_loss]:
            # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
            # training session. This helps the clarity of presentation on
            # tensorboard.
            loss_name = re.sub(
                '%s_[0-9]*/' % TOWER_NAME, '', loss_tensor.op.name)
            tf.summary.scalar(loss_name, loss_tensor)

    return total_loss


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all
    towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over towers; each inner list holds the gradient
            calculation for that tower.

    Returns:
        List of pairs of (gradient, variable) where the gradient has been
        averaged across all towers. Empty if `tower_grads` is empty.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # Add a leading 'tower' dimension to every gradient so they can be
        # concatenated and averaged along it.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]

        # Average over the 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are
        # shared across towers. So .. we will just return the first tower's
        # pointer to the Variable.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads



def train():
    """Train the MNIST model on FLAGS.num_gpus GPUs with synchronous updates.

    Variables and the optimizer live on the CPU. Each GPU builds one tower
    that computes its own loss and gradients; the per-tower gradients are
    averaged on the CPU and applied once per step. Training runs until the
    input queues raise OutOfRangeError or the coordinator asks to stop.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Counter that apply_gradients increments once per training step.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        opt = tf.train.MomentumOptimizer(
            INITIAL_LEARNING_RATE, 0.9, use_nesterov=True, use_locking=True)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST
                        # model. This constructs the entire model but
                        # shares the variables across all towers.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on
                        # this MNIST tower.
                        grads = opt.compute_gradients(loss, gate_gradients=0)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        train_op = opt.apply_gradients(grads, global_step=global_step)

        # The op for initializing the variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one step of the model. The train_op result is
                # discarded; `loss` is the loss of the last tower built.
                _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' % (
                FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()





