'''
Created on May 25, 2017 @author: p0079482
'''
# Distributed training pattern for deep-learning models:
# train a model in parallel on the multiple GPUs of a single machine.
from datetime import datetime
import os
import time

import tensorflow as tf

# Forward propagation defined in Chapter 5; a minimal sketch of this module
# is given after the listing.
import mnist_inference

# Configuration used while training the network.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.001
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 1000
MOVING_AVERAGE_DECAY = 0.99
N_GPU = 4

# Paths for the logs and the saved model.
MODEL_SAVE_PATH = "/path/to/logs_and_models/"
MODEL_NAME = "model.ckpt"

# Path of the training data. Each GPU needs its own training data, so feeding it
# through placeholders would mean preparing several copies of the data by hand.
# To keep data loading simple, the input-queue approach from Chapter 7 is used to
# read the data from a TFRecord file, so DATA_PATH points to the MNIST training
# data after it has been converted to TFRecord format. The conversion is described
# in detail in Chapter 7 and not repeated here; a minimal sketch is also given
# after the listing.
DATA_PATH = "/path/to/output.tfrecords"


# Build the input queue that produces training batches; see Chapter 7 for details.
def get_input():
    filename_queue = tf.train.string_input_producer([DATA_PATH])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Parsing schema of one serialized example.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    # Decode the image and the label.
    decoded_image = tf.decode_raw(features['image_raw'], tf.uint8)
    reshaped_image = tf.reshape(decoded_image, [784])
    retyped_image = tf.cast(reshaped_image, tf.float32)
    label = tf.cast(features['label'], tf.int32)

    # Define the input queue and return a shuffled batch.
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * BATCH_SIZE
    return tf.train.shuffle_batch([retyped_image, label],
                                  batch_size=BATCH_SIZE,
                                  capacity=capacity,
                                  min_after_dequeue=min_after_dequeue)


# Loss function: for a given batch, regularizer and name scope, compute the total
# loss inside that scope. The scope is needed because every GPU adds its
# regularization losses to a collection named 'losses'; without restricting the
# collection to the current scope, the losses computed on the other GPUs would be
# included as well.
def get_loss(x, y_, regularizer, scope, reuse_variables=None):
    # Reuse the forward-propagation function defined in Section 5.5.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        y = mnist_inference.inference(x, regularizer)
    # Cross-entropy loss.
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # Regularization loss computed on the current GPU.
    regularization_loss = tf.add_n(tf.get_collection('losses', scope))
    # Total loss.
    loss = cross_entropy + regularization_loss
    return loss


# Average, variable by variable, the gradients computed on the different GPUs.
def average_gradients(tower_grads):
    average_grads = []
    # Enumerate every variable together with the gradients computed for it
    # on the different GPUs.
    for grad_and_vars in zip(*tower_grads):
        # Average this variable's gradient over all GPUs.
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        v = grad_and_vars[0][1]
        # Pair the variable with its averaged gradient.
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    # Return the averaged gradients; they are used to update the variables.
    return average_grads


# Main training procedure.
def main(argv=None):
    # Keep the simple operations on the CPU; only the training itself runs on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get a training batch.
        x, y_ = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)

        # Step counter and exponentially decaying learning rate.
        global_step = tf.get_variable('global_step',
                                      [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                                   global_step,
                                                   60000 / BATCH_SIZE,
                                                   LEARNING_RATE_DECAY)

        # Optimizer.
        opt = tf.train.GradientDescentOptimizer(learning_rate)

        tower_grads = []
        reuse_variables = False
        # Run the optimization on the different GPUs.
        for i in range(N_GPU):
            # Pin this tower's computation to one GPU.
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                    # After the variables have been created once, set reuse to True
                    # so that all GPUs update the same set of parameters. Note that
                    # tf.name_scope does not affect the namespace seen by
                    # tf.get_variable.
                    reuse_variables = True
                    # Compute the gradients of all variables on the current GPU.
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)

        # Average the gradients and write them to the TensorBoard log.
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)

        # Update the parameters with the averaged gradients.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Maintain moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variable_averages_op = variable_averages.apply(variables_to_average)

        # Every training step updates the variables and their moving averages.
        train_op = tf.group(apply_gradient_op, variable_averages_op)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        # Training loop.
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=True)) as sess:
            # Initialize all variables and start the input queue.
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(MODEL_SAVE_PATH, sess.graph)

            for step in range(TRAINING_STEPS):
                # Run one training step and measure how long it takes.
                start_time = time.time()
                _, loss_value = sess.run([train_op, cur_loss])
                duration = time.time() - start_time

                # Report progress and training speed every few steps.
                if step != 0 and step % 10 == 0 and duration != 0:
                    # Number of training examples processed in this step: every GPU
                    # consumes one batch per step, so the total is
                    # BATCH_SIZE * N_GPU.
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    # duration is the wall time of this step, so the throughput is
                    # num_examples_per_step / duration examples per second.
                    examples_per_sec = num_examples_per_step / duration
                    # Each GPU processes one batch during the step, so the time
                    # spent per batch is duration / N_GPU.
                    sec_per_batch = duration / N_GPU

                    # Print the training statistics.
                    format_str = ('step %d, loss = %.2f (% .1f examples/ sec; %.3f sec/batch)')
                    print(format_str % (step, loss_value, examples_per_sec, sec_per_batch))

                    # Write the summaries for TensorBoard.
                    summary = sess.run(summary_op)
                    summary_writer.add_summary(summary, step)

                # Save the current model from time to time.
                if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                    checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)


if __name__ == '__main__':
    tf.app.run()
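
The script imports mnist_inference, which the post does not include. Based on how it is used above -- inference(x, regularizer) must return the logits, and the regularization losses it creates must be added to the 'losses' collection that get_loss() reads back -- a minimal sketch could look like the following; it follows the two-layer fully connected network from Chapter 5, and the hidden-layer size is an assumption.

# mnist_inference.py (sketch): a fully connected network with one hidden layer.
import tensorflow as tf

INPUT_NODE = 784      # matches the 784-pixel images produced by get_input()
OUTPUT_NODE = 10
LAYER1_NODE = 500     # hidden-layer size, an assumption


def get_weight_variable(shape, regularizer):
    weights = tf.get_variable(
        "weights", shape,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # The regularization loss goes into the 'losses' collection that
    # get_loss() later filters by name scope.
    if regularizer is not None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights


def inference(input_tensor, regularizer):
    with tf.variable_scope('layer1'):
        weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
    with tf.variable_scope('layer2'):
        weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weights) + biases
    return layer2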

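The TFRecord file at DATA_PATH also has to exist before the script can run. The conversion is covered in Chapter 7; a minimal sketch that writes the three features get_input() expects (image_raw, pixels, label) is shown below. The MNIST source directory "/path/to/mnist_data" is a placeholder.

# convert_mnist_to_tfrecords.py (sketch)
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


# Read MNIST as raw uint8 pixels and integer labels (placeholder path).
mnist = input_data.read_data_sets("/path/to/mnist_data", dtype=tf.uint8, one_hot=False)
images = mnist.train.images
labels = mnist.train.labels
pixels = images.shape[1]   # 784 pixels per image

# Write one tf.train.Example per training image.
writer = tf.python_io.TFRecordWriter("/path/to/output.tfrecords")
for index in range(mnist.train.num_examples):
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_raw': _bytes_feature(images[index].tostring()),
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(int(labels[index])),
    }))
    writer.write(example.SerializeToString())
writer.close()
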
Below is the output from a training run:

step 20, loss = 29.53 ( 10362.6 examples/ sec; 0.010 sec/batch)
step 30, loss = 9.62 ( 12022.4 examples/ sec; 0.008 sec/batch)
step 40, loss = 16.63 ( 10689.3 examples/ sec; 0.009 sec/batch)
step 50, loss = 10.68 ( 11293.4 examples/ sec; 0.009 sec/batch)
step 60, loss = 14.73 ( 10895.0 examples/ sec; 0.009 sec/batch)
step 70, loss = 17.17 ( 11192.9 examples/ sec; 0.009 sec/batch)
step 80, loss = 12.43 ( 11236.8 examples/ sec; 0.009 sec/batch)
step 90, loss = 5.16 ( 11398.3 examples/ sec; 0.009 sec/batch)
step 100, loss = 8.06 ( 12466.7 examples/ sec; 0.008 sec/batch)
step 110, loss = 13.57 ( 11081.5 examples/ sec; 0.009 sec/batch)
step 120, loss = 9.43 ( 11396.2 examples/ sec; 0.009 sec/batch)
step 130, loss = 12.21 ( 13296.7 examples/ sec; 0.008 sec/batch)
step 140, loss = 6.15 ( 11868.9 examples/ sec; 0.008 sec/batch)
step 150, loss = 9.93 ( 12089.1 examples/ sec; 0.008 sec/batch)
step 160, loss = 10.42 ( 11733.5 examples/ sec; 0.009 sec/batch)
step 170, loss = 23.47 ( 11859.4 examples/ sec; 0.008 sec/batch)
step 180, loss = 2.97 ( 11358.0 examples/ sec; 0.009 sec/batch)
step 190, loss = 5.44 ( 11085.0 examples/ sec; 0.009 sec/batch)
step 200, loss = 3.98 ( 13347.3 examples/ sec; 0.007 sec/batch)
step 210, loss = 11.98 ( 10551.4 examples/ sec; 0.009 sec/batch)
step 220, loss = 9.17 ( 11115.3 examples/ sec; 0.009 sec/batch)
step 230, loss = 15.31 ( 12450.5 examples/ sec; 0.008 sec/batch)
step 240, loss = 5.92 ( 11729.5 examples/ sec; 0.009 sec/batch)
step 250, loss = 9.94 ( 10497.2 examples/ sec; 0.010 sec/batch)
step 260, loss = 2.94 ( 11398.1 examples/ sec; 0.009 sec/batch)
step 270, loss = 7.30 ( 10497.4 examples/ sec; 0.010 sec/batch)
step 280, loss = 3.98 ( 11946.0 examples/ sec; 0.008 sec/batch)
step 290, loss = 7.66 ( 11307.2 examples/ sec; 0.009 sec/batch)
step 300, loss = 2.03 ( 11968.7 examples/ sec; 0.008 sec/batch)
step 310, loss = 2.39 ( 8672.0 examples/ sec; 0.012 sec/batch)
step 320, loss = 2.07 ( 3835.6 examples/ sec; 0.026 sec/batch)
step 330, loss = 2.71 ( 12087.7 examples/ sec; 0.008 sec/batch)
step 340, loss = 2.70 ( 11907.3 examples/ sec; 0.008 sec/batch)
step 350, loss = 7.17 ( 7671.2 examples/ sec; 0.013 sec/batch)
step 360, loss = 8.36 ( 11863.6 examples/ sec; 0.008 sec/batch)
step 370, loss = 2.48 ( 11782.7 examples/ sec; 0.008 sec/batch)
step 380, loss = 2.27 ( 11081.5 examples/ sec; 0.009 sec/batch)
step 390, loss = 2.85 ( 11562.4 examples/ sec; 0.009 sec/batch)
step 400, loss = 2.99 ( 12088.9 examples/ sec; 0.008 sec/batch)
step 410, loss = 5.08 ( 12465.6 examples/ sec; 0.008 sec/batch)
step 420, loss = 2.12 ( 12869.1 examples/ sec; 0.008 sec/batch)
step 430, loss = 2.83 ( 13756.3 examples/ sec; 0.007 sec/batch)
step 440, loss = 7.56 ( 13297.8 examples/ sec; 0.008 sec/batch)
step 450, loss = 3.51 ( 12634.6 examples/ sec; 0.008 sec/batch)
step 460, loss = 2.23 ( 13297.8 examples/ sec; 0.008 sec/batch)
step 470, loss = 1.80 ( 12869.2 examples/ sec; 0.008 sec/batch)
step 480, loss = 5.92 ( 9730.3 examples/ sec; 0.010 sec/batch)
step 490, loss = 4.01 ( 12647.0 examples/ sec; 0.008 sec/batch)
step 500, loss = 2.29 ( 12466.9 examples/ sec; 0.008 sec/batch)
step 510, loss = 2.20 ( 13078.4 examples/ sec; 0.008 sec/batch)
step 520, loss = 3.70 ( 13296.5 examples/ sec; 0.008 sec/batch)
step 530, loss = 2.11 ( 13298.3 examples/ sec; 0.008 sec/batch)
step 540, loss = 1.73 ( 13296.6 examples/ sec; 0.008 sec/batch)
step 550, loss = 1.20 ( 12868.9 examples/ sec; 0.008 sec/batch)
step 560, loss = 3.44 ( 13078.6 examples/ sec; 0.008 sec/batch)
step 570, loss = 1.35 ( 11562.0 examples/ sec; 0.009 sec/batch)
step 580, loss = 3.51 ( 13205.2 examples/ sec; 0.008 sec/batch)
step 590, loss = 3.11 ( 12868.8 examples/ sec; 0.008 sec/batch)
step 600, loss = 3.40 ( 12869.1 examples/ sec; 0.008 sec/batch)
step 610, loss = 2.49 ( 13297.7 examples/ sec; 0.008 sec/batch)
step 620, loss = 2.68 ( 12620.3 examples/ sec; 0.008 sec/batch)
step 630, loss = 2.09 ( 11907.3 examples/ sec; 0.008 sec/batch)
step 640, loss = 3.82 ( 8487.3 examples/ sec; 0.012 sec/batch)
step 650, loss = 2.77 ( 11081.5 examples/ sec; 0.009 sec/batch)
step 660, loss = 2.55 ( 12089.1 examples/ sec; 0.008 sec/batch)
step 670, loss = 2.53 ( 10228.3 examples/ sec; 0.010 sec/batch)
step 680, loss = 5.17 ( 9498.5 examples/ sec; 0.011 sec/batch)
step 690, loss = 2.02 ( 10498.4 examples/ sec; 0.010 sec/batch)
step 700, loss = 0.21 ( 12088.9 examples/ sec; 0.008 sec/batch)
step 710, loss = 1.95 ( 12868.7 examples/ sec; 0.008 sec/batch)
step 720, loss = 3.90 ( 13296.2 examples/ sec; 0.008 sec/batch)
step 730, loss = 2.17 ( 9277.6 examples/ sec; 0.011 sec/batch)
step 740, loss = 1.09 ( 9730.1 examples/ sec; 0.010 sec/batch)
step 750, loss = 1.33 ( 12466.8 examples/ sec; 0.008 sec/batch)
step 760, loss = 3.17 ( 9797.9 examples/ sec; 0.010 sec/batch)
step 770, loss = 3.20 ( 13297.9 examples/ sec; 0.008 sec/batch)
step 780, loss = 4.28 ( 13756.4 examples/ sec; 0.007 sec/batch)
step 790, loss = 1.23 ( 12465.4 examples/ sec; 0.008 sec/batch)
step 800, loss = 1.78 ( 12868.8 examples/ sec; 0.008 sec/batch)
step 810, loss = 1.12 ( 12924.2 examples/ sec; 0.008 sec/batch)
step 820, loss = 2.09 ( 13297.1 examples/ sec; 0.008 sec/batch)
step 830, loss = 0.71 ( 11967.1 examples/ sec; 0.008 sec/batch)
step 840, loss = 3.03 ( 12088.8 examples/ sec; 0.008 sec/batch)
step 850, loss = 2.76 ( 12868.8 examples/ sec; 0.008 sec/batch)
step 860, loss = 1.64 ( 12087.1 examples/ sec; 0.008 sec/batch)
step 870, loss = 2.43 ( 9066.8 examples/ sec; 0.011 sec/batch)
step 880, loss = 1.73 ( 11398.2 examples/ sec; 0.009 sec/batch)
step 890, loss = 0.61 ( 12980.4 examples/ sec; 0.008 sec/batch)
step 900, loss = 3.44 ( 12868.8 examples/ sec; 0.008 sec/batch)
step 910, loss = 0.96 ( 11445.9 examples/ sec; 0.009 sec/batch)
step 920, loss = 2.95 ( 13756.3 examples/ sec; 0.007 sec/batch)
step 930, loss = 2.99 ( 12868.5 examples/ sec; 0.008 sec/batch)
step 940, loss = 0.34 ( 13752.5 examples/ sec; 0.007 sec/batch)
step 950, loss = 1.05 ( 13297.8 examples/ sec; 0.008 sec/batch)
step 960, loss = 2.34 ( 13295.7 examples/ sec; 0.008 sec/batch)
step 970, loss = 1.32 ( 13297.6 examples/ sec; 0.008 sec/batch)
step 980, loss = 2.46 ( 12466.6 examples/ sec; 0.008 sec/batch)
step 990, loss = 1.02 ( 13297.7 examples/ sec; 0.008 sec/batch)
