90、TensorFlow实现分布式学习:多台电脑、多个GPU的异步式学习
'''
Created on 2017年5月28日 @author: weizhen
'''
import time

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import mnist_inference

# Training hyper-parameters.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.01
TRAINING_STEPS = 1000
LEARNING_RATE_DECAY = 0.99
REGULARAZTION_RATE = 0.0001
# Path where the model is saved.
MODEL_SAVE_PATH = "/path/to/model"
# Path of the MNIST data.
DATA_PATH = "/path/to/data"

# Run-time parameters are supplied through flags. Writing a different program
# for every task is not a scalable approach, so in this section the arguments
# given when the program is launched configure what each task runs.
FLAGS = tf.app.flags.FLAGS
# Whether this process runs as a parameter server or as a worker. A parameter
# server only maintains and manages the TensorFlow variables; a worker runs
# the back-propagation pass of every iteration.
tf.app.flags.DEFINE_string('job_name', 'worker', '"ps" or "worker" ')
# Addresses of the parameter servers in the cluster.
tf.app.flags.DEFINE_string(
    'ps_hosts', 'tf-ps0:2222,tf-ps1:1111',
    'Comma-separated list of hostname:port for the parameter server jobs. e.g. "tf-ps0:2222,tf-ps1:1111" ')
# Addresses of the workers in the cluster.
tf.app.flags.DEFINE_string(
    'worker_hosts', 'tf-worker0:2222,tf-worker1:1111',
    # The two literals are concatenated, so the first one must end with a
    # space; otherwise the help text reads "jobs.e.g.".
    'Comma-separated list of hostname:port for the worker jobs. '
    'e.g. "tf-worker0:2222,tf-worker1:1111" ')
# Task ID of the current program. TensorFlow starts the service on the port
# listed for this ID in the parameter-server / worker host list. Note that
# both parameter servers and workers are numbered starting from 0.
tf.app.flags.DEFINE_integer(
    'task_id', 0, 'Task ID of the worker/replica running the training.'
)
# Define the TensorFlow graph and return the operations that have to be run
# in every iteration.
# To keep the distributed-computation logic in focus, graph construction is wrapped in a single function.
def build_model(x, y_, is_chief):
    """Build the training graph and return the ops needed for one iteration.

    Args:
        x: Input tensor handed to ``mnist_inference.inference``.
        y_: Label tensor; it is reduced to class indices with
            ``tf.argmax(y_, 1)`` (presumably one-hot encoded -- confirm
            against the caller).
        is_chief: Accepted for interface compatibility; not used here.

    Returns:
        A ``(global_step, loss, train_op)`` tuple: the non-trainable step
        counter, the total loss tensor, and the gradient-descent update op
        that also increments ``global_step``.
    """
    l2_regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    # Forward pass of the network.
    logits = mnist_inference.inference(x, l2_regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Loss: mean cross entropy plus everything stored in the 'losses'
    # collection (presumably the regularization terms added by
    # mnist_inference -- not visible from here).
    label_ids = tf.argmax(y_, 1)
    per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    mean_xent = tf.reduce_mean(per_example_xent)
    collected_losses = tf.get_collection('losses')
    loss = mean_xent + tf.add_n(collected_losses)

    # Exponentially decayed learning rate, decaying once every
    # 60000 / BATCH_SIZE global steps.
    decay_steps = 60000 / BATCH_SIZE
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, decay_steps, LEARNING_RATE_DECAY)

    # Operation to run in every training iteration.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)
    return global_step, loss, train_op


# Main procedure for training the distributed deep learning model.
def main(argv=None):
    """Run one task (parameter server or worker) of the distributed training job.

    The cluster layout and the role of this process come from FLAGS
    (``ps_hosts``, ``worker_hosts``, ``job_name``, ``task_id``). A "ps" task
    blocks in ``server.join()``. Any other task builds the model, obtains a
    session from a ``tf.train.Supervisor`` and trains until the global step
    reaches ``TRAINING_STEPS`` or the supervisor asks to stop.

    Args:
        argv: Not used by this function.
    """
    # Parse the flags and describe the cluster with tf.train.ClusterSpec.
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create the server for the current task from the ClusterSpec.
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    # A parameter server only manages the TensorFlow variables and does not
    # run the training procedure; server.join() blocks on this statement.
    if FLAGS.job_name == 'ps':
        server.join()
        # Makes explicit that a parameter server never builds the training graph.
        return

    # Operations run by a worker. One worker is the chief: besides computing
    # back-propagation results it also writes logs and saves the model.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)

    # tf.train.replica_device_setter chooses the device of every operation:
    # parameters are assigned to the parameter servers and the computation to
    # the current worker. The device string must not carry trailing
    # whitespace (the original had "/job:worker/task:%d ").
    worker_device = "/job:worker/task:%d" % FLAGS.task_id
    with tf.device(tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)):
        x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')
        # Operations needed to train the model.
        global_step, loss, train_op = build_model(x, y_, is_chief)
        # Saver used to store the model.
        saver = tf.train.Saver()
        # Summary (log output) operation.
        summary_op = tf.summary.merge_all()
        # Variable initialization operation.
        init_op = tf.global_variables_initializer()
        # tf.train.Supervisor manages the common parts of training a model:
        # queue handling, model saving, log output and session creation.
        sv = tf.train.Supervisor(
            is_chief=is_chief,  # only the chief worker saves the model and writes logs
            logdir=MODEL_SAVE_PATH,  # where the model and the logs are written
            init_op=init_op,  # initialization operation
            summary_op=summary_op,  # log generation operation
            saver=saver,  # saver used to store the model
            global_step=global_step,  # current iteration; used in the checkpoint file name
            save_model_secs=60,  # interval between model saves
            save_summaries_secs=60  # interval between log outputs
        )

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=False)
    # Create the session through the Supervisor.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

    step = 0
    start_time = time.time()
    # Training loop. The Supervisor takes care of writing logs and saving the
    # model, so neither is called directly here. The finally clause makes
    # sure the Supervisor is stopped even when a training step raises.
    try:
        while not sv.should_stop():
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, global_step_value = sess.run(
                [train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            if global_step_value >= TRAINING_STEPS:
                break
            # Print training information every 100 local steps.
            if step > 0 and step % 100 == 0:
                duration = time.time() - start_time
                # Every worker updates the global step, so global_step_value
                # is the total number of batches used in training so far.
                sec_per_batch = duration / global_step_value
                format_str = ("After %d training steps (%d global steps), loss on training batch is %g. (%.3f sec/batch)")
                print(format_str % (step, global_step_value, loss_value, sec_per_batch))
            step += 1
    finally:
        sv.stop()


if __name__ == "__main__":
    tf.app.run()
下面是训练时的输出。需要等到集群中所有的机器(参数服务器和计算服务器)都启动之后,训练才会真正开始;在此之前日志会不断提示 "CreateSession still waiting for response from worker":
2017-05-28 22:38:45.122523: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.122960: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123285: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.123847: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.124201: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125153: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.125514: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:45.126016: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-05-28 22:38:47.211250: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:
name: GeForce 940MX
major: 5 minor: 0 memoryClockRate (GHz) 1.189
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
2017-05-28 22:38:47.211668: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0
2017-05-28 22:38:47.211848: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0: Y
2017-05-28 22:38:47.212045: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce 940MX, pci bus id: 0000:01:00.0)
2017-05-28 22:38:47.375428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job ps -> {0 -> tf-ps0:2222, 1 -> tf-ps1:1111}
2017-05-28 22:38:47.376363: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222, 1 -> tf-worker1:1111}
2017-05-28 22:38:47.380830: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_server_lib.cc:240] Started server with target: grpc://localhost:2222
Extracting /path/to/data\train-images-idx3-ubyte.gz
Extracting /path/to/data\train-labels-idx1-ubyte.gz
Extracting /path/to/data\t10k-images-idx3-ubyte.gz
Extracting /path/to/data\t10k-labels-idx1-ubyte.gz
2017-05-28 22:38:58.243494: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:38:58.244680: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:38:58.247390: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:08.248725: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:08.249804: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:08.251307: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:18.253692: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:18.254576: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:18.255448: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
2017-05-28 22:39:28.257660: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2017-05-28 22:39:28.258782: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1
2017-05-28 22:39:28.260428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1
90、Tensorflow实现分布式学习,多台电脑,多个GPU 异步试学习的更多相关文章
- 『TensorFlow』分布式训练_其二_单机多GPU并行&GPU模式设定
建议比对『MXNet』第七弹_多GPU并行程序设计 一.tensorflow GPU设置 GPU指定占用 gpu_options = tf.GPUOptions(per_process_gpu_mem ...
- WIN7实现多人远程一台电脑
今天查了查网,发现有人说,WIN7可以实现多人远程一台电脑,于是乎我就试了试, 在工作办公室里的局域网里试了试,嘿,成功了,愿与大家分享一下,呵呵! 方法一: 多用户早就能破解了 方法如下:用UE打开 ...
- mxnet:结合R与GPU加速深度学习
转载于统计之都,http://cos.name/tag/dmlc/,作者陈天奇 ------------------------------------------------------------ ...
- 学习笔记TF061:分布式TensorFlow,分布式原理、最佳实践
分布式TensorFlow由高性能gRPC库底层技术支持.Martin Abadi.Ashish Agarwal.Paul Barham论文<TensorFlow:Large-Scale Mac ...
- Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
- 【转载】Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
- jmeter分布式压测(多台电脑一起压测)
(1)在Windows下运行 操作步骤: 1) 有多台电脑,每台电脑上都有jmeter,而且这几台电脑都互相能ping通. 2) 在我的电脑的jmeter的配置文件bin目录下的jme ...
- git学习笔记:一台电脑上配置两个git账户
如何在一台电脑上配置两个git账户,现在云端仓库很多,有开源中国的 gitee.com 微软的 github.com 还有 gitlab.com 和 bitbucket.org 等等,下面是具体步骤 ...
- Git学习笔记——从一台电脑上传文件到Github上
目标:从一台电脑上传文件到Github上 前提: 1.这里假定已在Github上创建了仓库,建立了仓库 2.已在这台电脑上安装了Git客户端 实验环境: 1.Windows 10 64位,已安装了Gi ...
随机推荐
- 3.tensorflow——NN
import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data numClasses=10 inp ...
- appium常见问题09_MAC打开uiautimatorviewer闪退怎么办?
问题: 下载安装Android SDK后,并且已在.bash_profile文件中配置环境变量.但是在tools中打开定位工具uiautomatorviewer出现闪退. 解决: 首先检查环境变量配置 ...
- MybatisPlus自动填充公共字段的策略
背景:数据库中多个表有时间字段,并且字段名一致 需求:该时间字段由MybatisPlus自动插入和更新,业务代码无需处理 方法: 一.创建基础实体[BaseEntity],定义需要处理的公共字段(创建 ...
- cf:c题
题目: 代码: #include<iostream> #include<algorithm> #include<vector> #include<string ...
- 好用的打包工具webpack
<什么是webpack> webpack是一个模块打包器,任何静态资源(js.css.图片等)都可以视作模块,然后模块之间也可以相互依赖,通过webpack对模块进行处理后,可以打包成我们 ...
- 模仿JQuery封装ajax功能
需求分析 因为有时候想提高性能,只需要一个ajax函数,不想引入较大的jq文件,尝试过axios,可是get方法不支持多层嵌套的json,post方式后台接收方式似乎要变..也许是我不太会用吧..其实 ...
- Mac 电脑如何卸载 重装node
由于在日常开发中,部分node版本不支持,因此,我们需要对已安装的node进行卸载重装,步骤如下: 一.在终端依次输入以下命令 sudo npm uninstall npm -g sudo r ...
- 波兰语 polish
There are several systems for encoding the Polish alphabet for computers. All letters of the Polish ...
- PNG文件格式
PNG文件的组成 一个PNG文件可以看作是由多个数据块(chunk)部分组成,如同积木一样,一个数据块就是一个小积木,不同类型的积木组合搭建成了我们的PNG图像. PNG图像至少由文件署名域和三个关键 ...
- 解读dbcp自动重连那些事(转)
转自:http://agapple.iteye.com/blog/791943 Hi all : 最近在做 offerdetail 优化时,替换了数据库驱动,从 c3p0 0.9.1 -> db ...