90、Tensorflow实现分布式学习,多台电脑,多个GPU 异步试学习
'''
Created on 2017年5月28日 @author: weizhen
'''
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data import mnist_inference BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.01
TRAINING_STEPS = 1000
LEARNING_RATE_DECAY = 0.99
REGULARAZTION_RATE = 0.0001
# 模型保存路径
MODEL_SAVE_PATH = "/path/to/model"
# MNIST数据路径
DATA_PATH = "/path/to/data" # 通过flags指定运行的参数。对于不同的任务task给出了不同的程序
# 但这不是一种可扩展的方式,在这一小节中将使用运行程序是给出的参数来配置在不同任务中运行的程序
FLAGS = tf.app.flags.FLAGS
# 指定当前运行的是参数服务器还是计算服务器。参数服务器只负责Tensorflow中变量的维护和管理
# 计算服务器则负责每一轮迭代时运行反向传播过程
tf.app.flags.DEFINE_string('job_name', 'worker', '"ps" or "worker" ') # 指定集群中的参数服务器地址
tf.app.flags.DEFINE_string(
'ps_hosts', 'tf-ps0:2222,tf-ps1:1111',
'Comma-separated list of hostname:port for the parameter server jobs. e.g. "tf-ps0:2222,tf-ps1:1111" ')
# 指定集群中的计算服务器地址
tf.app.flags.DEFINE_string(
'worker_hosts', 'tf-worker0:2222,tf-worker1:1111',
'Comma-separated list of hostname:port for the worker jobs.'
'e.g. "tf-worker0:2222,tf-worker1:1111" ') # 指定当前程序的任务ID. Tensorflow 会自动根据参数服务器/计算服务器列表中的端口号
# 来启动服务。注意参数服务器和计算服务器的编号都是从0开始的
tf.app.flags.DEFINE_integer(
'task_id', 0, 'Task ID of the worker/replica running the training.'
) # 定义Tensorflow的计算图,并返回每一轮迭代时需要 运行的操作。
# 为了是处理分布式计算的部分更加突出,本校节将此过程整理为一个函数
def build_model(x, y_, is_chief):
regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
# 计算神经网络前向传播的结果
y = mnist_inference.inference(x, regularizer)
global_step = tf.Variable(0, trainable=False) # 计算损失函数并定义反向传播过程
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
cross_entropy_mean = tf.reduce_mean(cross_entropy)
loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY) # 定义每一轮迭代需要运行的操作
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
return global_step, loss, train_op # 训练分布式深度学习模型的主过程
def main(argv=None):
# 解析flags并通过tf.train.ClusterSpec配置TensorFlow集群
ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
cluster = tf.train.ClusterSpec({"ps":ps_hosts, "worker":worker_hosts})
# 通过ClusterSpec以及当前任务创建Server
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id) # 参数服务器只需要管理TensorFlow中的变量,不需要执行训练的过程。server.join()会一直停在这条语句上
if FLAGS.job_name == 'ps':
server.join() # 定义计算服务器需要运行的操作。在所有的计算服务器中有一个是主计算服务器。它除了负责计算反向传播的结果,它还负责输出日志和保存模型
is_chief = (FLAGS.task_id == 0)
mnist = input_data.read_data_sets(DATA_PATH, one_hot=True) # 通过tf.train.replica_device_setter函数来指定执行每一个运算的设备
# tf.train.replica_device_setter函数会自动将所有的参数分配到参数服务器上,而
# 计算分配到当前的计算服务器上
with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d " % FLAGS.task_id, cluster=cluster)):
x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')
# 定义训练模型需要运行的操作
global_step, loss, train_op = build_model(x, y_, is_chief)
# 定义用于保存模型的saver
saver = tf.train.Saver()
# 定义日志输出操作
summary_op = tf.summary.merge_all()
# 定义病了初始化操作
init_op = tf.global_variables_initializer()
# 通过tf.train.Supervisor管理训练深度学习模型的通用功能
# tf.train.Supervisor能统一管理队列操作、模型保存、日志输出以及会话的生成
sv = tf.train.Supervisor(
is_chief=is_chief, # 定义当前计算服务器是否为主计算服务器,只用主计算服务器会保存模型以及输出日志
logdir=MODEL_SAVE_PATH, # 指定保存模型和输出日志的地址
init_op=init_op, # 指定初始化操作
summary_op=summary_op, # 指定日志生成操作
saver=saver, # 指定用于保存模型的saver
global_step=global_step, # 指定当前迭代的轮数,这个会用于生成保存模型文件的文件名
save_model_secs=60, # 指定保存模型的时间间隔
save_summaries_secs=60 # 指定日志输出的时间间隔
)
sess_config = tf.ConfigProto(allow_soft_placement=True,
log_device_placement=False)
# 通过tf.train.Supervisor生成会话
sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
step = 0
start_time = time.time()
# 执行迭代过程。在迭代过程中tf.train.Supervisor会帮助输出日志并保存模型
# 所以不需要直接调用这些过程
while not sv.should_stop():
xs, ys = mnist.train.next_batch(BATCH_SIZE)
_, loss_value, global_step_value = sess.run(
[train_op, loss, global_step], feed_dict={x:xs, y_:ys})
if global_step_value >= TRAINING_STEPS:break # 每隔一段时间输出训练信息
if step > 0 and step % 100 == 0:
duration = time.time() - start_time
# 不同的计算服务器都会更新全局的训练轮数,所以这里使用
# global_step_value可以直接得到在训练中使用过的batch的总数
sec_per_batch = duration / global_step_value format_str = ("After %d training steps (%d global steps), loss on training batch is %g. (%.3f sec/batch)")
print(format_str % (step, global_step_value, loss_value, sec_per_batch))
step += 1
sv.stop() if __name__ == "__main__":
tf.app.run()
下面是训练的结果,需要等到所有的机器都开起来之后才能进行训练
2017-05-28 22:38:45.122523: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.122960: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123285: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123847: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.124201: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125153: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125514: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.126016: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:47.211250: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:
name: GeForce 940MX
major: 5 minor: 0 memoryClockRate (GHz) 1.189
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
2017-05-28 22:38:47.211668: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0
2017-05-28 22:38:47.211848: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0: Y
2017-05-28 22:38:47.212045: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce 940MX, pci bus id: 0000:01:00.0)
2017-05-28 22:38:47.375428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job ps -> {0 -> tf-ps0:2222, 1 -> tf-ps1:1111}
2017-05-28 22:38:47.376363: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222, 1 -> tf-worker1:1111}
2017-05-28 22:38:47.380830: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_server_lib.cc:240] Started server with target: grpc://localhost:2222
Extracting /path/to/data\train-images-idx3-ubyte.gz
Extracting /path/to/data\train-labels-idx1-ubyte.gz
Extracting /path/to/data\t10k-images-idx3-ubyte.gz
Extracting /path/to/data\t10k-labels-idx1-ubyte.gz
2017-05-28 22:38:58.243494: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:38:58.244680: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:38:58.247390: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:08.248725: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:08.249804: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:08.251307: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:18.253692: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:18.254576: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:18.255448: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:28.257660: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:28.258782: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:28.260428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
90、Tensorflow实现分布式学习,多台电脑,多个GPU 异步试学习的更多相关文章
- 『TensorFlow』分布式训练_其二_单机多GPU并行&GPU模式设定
建议比对『MXNet』第七弹_多GPU并行程序设计 一.tensorflow GPU设置 GPU指定占用 gpu_options = tf.GPUOptions(per_process_gpu_mem ...
- WIN7实现多人远程一台电脑
今天查了查网,发现有人说,WIN7可以实现多人远程一台电脑,于是乎我就试了试, 在工作办公室里的局域网里试了试,嘿,成功了,愿与大家分享一下,呵呵! 方法一: 多用户早就能破解了 方法如下:用UE打开 ...
- mxnet:结合R与GPU加速深度学习
转载于统计之都,http://cos.name/tag/dmlc/,作者陈天奇 ------------------------------------------------------------ ...
- 学习笔记TF061:分布式TensorFlow,分布式原理、最佳实践
分布式TensorFlow由高性能gRPC库底层技术支持.Martin Abadi.Ashish Agarwal.Paul Barham论文<TensorFlow:Large-Scale Mac ...
- Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
- 【转载】Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
- jmeter分布式压测(多台电脑一起压测)
(1)在Windows下运行 操作步骤: 1) 有多台电脑,每台电脑上都有jmeter,而且这几台电脑都互相能ping通. 2) 在我的电脑的jmeter的配置文件bin目录下的jme ...
- git学习笔记:一台电脑上配置两个git账户
如何在一台电脑上配置两个git账户,现在云端仓库很多,有开源中国的 gitee.com 微软的 github.com 还有 gitlab.com 和 bitbucket.org 等等,下面是具体步骤 ...
- Git学习笔记——从一台电脑上传文件到Github上
目标:从一台电脑上传文件到Github上 前提: 1.这里假定已在Github上创建了仓库,建立了仓库 2.已在这台电脑上安装了Git客户端 实验环境: 1.Windows 10 64位,已安装了Gi ...
随机推荐
- IDEA默认快捷键
idea常用快捷键大全 Idea常用快捷键大全,拿小本本记下来,忘记了可以方便查找.编写代码Ctrl+Shift + Enter,语句完成“!”,否定完成,输入表达式时按 “!”键Ctrl+E,最 ...
- rm -rf无法删除文件解决方法
# 列出 file.sh 文件的属性 lsattr file.sh # 列出当前目录下所有文件以及文件夹的属性 lsattr # 为 file.sh 文件增加 i 标识 chattr +i file. ...
- shell zip和unzip压缩和解压,压缩效率
1.把/home目录下面的mydata目录压缩为mydata.zip zip -r mydata.zip mydata #压缩mydata目录zip -r mydata.zip ./*txt #压缩当 ...
- 拾遗:vim 配置(个人适用,仅供参考)
~/.vimrc "===================通用配置====================== set encoding=utf- set statusline=%F%=[L ...
- 一些关于SEO优化的笔记
高级搜索指令: 双引号:“xxx” 代表完全匹配的搜索 减号:-(减号前面必须是空格,后面必须没有空格)代表搜索不包含减号后面的词的页面 filetype:用于搜索特定文件格式(百度支持的文件类型:P ...
- [CF1202B] You Are Given a Decimal String(最短路)
Description 今天突然想来发一篇博客防死 [Portal][https://vjudge.net/problem/2650668/origin] 定义被x-y生成器生成的序列为, 一开始有一 ...
- 爬虫-ajax请求遇到Unicode编码问题
2018-08--4爬取金色财经网页 网址:https://www.jinse.com/search/EOS 第一步:我观察了网页:发现了网页是一个发送ajax请求的网页,发现如下: 然后 我就先爬取 ...
- javascript中数组的应用总结
最近在总结javascript的相关应用,今天对js中的数组部分进行归纳总结,以便在以后的工作中有所参考. 1.在js中数组的定义方式有两种: var a = [1,2,3,4]; var b = n ...
- c# 自定义控件之 ComboBox
winform 自带的 combobox 无法支持根据输入文本匹配列表中的项目,需要使用自定义控件. public class MyCombobox : ComboBox { //初始化数据项 pri ...
- Centos安装IDEA
1.官网下载tar包 到https://www.jetbrains.com/idea 下载对应版本的文件 将其解压 tar zvxf idea下载文件.tar 进入到解压后文件夹的bin目录下执行 . ...