90、Tensorflow实现分布式学习，多台电脑，多个GPU 异步式学习

'''

Created on 2017年5月28日

@author: weizhen

'''

import time

import tensorflow as tf

from tensorflow.examples.tutorials.mnist import input_data

import mnist_inference

# Training hyperparameters.
BATCH_SIZE = 100  # number of examples fetched per training step
LEARNING_RATE_BASE = 0.01  # initial learning rate before exponential decay
TRAINING_STEPS = 1000  # training stops once the shared global step reaches this value
LEARNING_RATE_DECAY = 0.99  # decay rate passed to tf.train.exponential_decay
# NOTE: the identifier is misspelled ("REGULARAZTION"); it is kept as-is because
# other code in this file refers to it by this exact name.
REGULARAZTION_RATE = 0.0001

# Directory where the Supervisor writes checkpoints and logs.
MODEL_SAVE_PATH = "/path/to/model"

# Directory the MNIST data is read from.
DATA_PATH = "/path/to/data"

# Runtime parameters are supplied through flags. Writing a separate program for
# every task does not scale, so the same script is launched on every machine and
# the command-line flags decide which role the process plays.
FLAGS = tf.app.flags.FLAGS

# Whether this process is a parameter server ("ps") or a compute server
# ("worker"). Parameter servers only maintain and manage TensorFlow variables;
# workers run the back-propagation step of every iteration.
tf.app.flags.DEFINE_string('job_name', 'worker', '"ps" or "worker" ')

# Addresses of the parameter servers in the cluster.
tf.app.flags.DEFINE_string(
    'ps_hosts', 'tf-ps0:2222,tf-ps1:1111',
    'Comma-separated list of hostname:port for the parameter server jobs. e.g. "tf-ps0:2222,tf-ps1:1111" ')

# Addresses of the compute servers (workers) in the cluster.
# Fix: the two adjacent literals used to be joined without a space, producing
# "...worker jobs.e.g. ..." in the help output.
tf.app.flags.DEFINE_string(
    'worker_hosts', 'tf-worker0:2222,tf-worker1:1111',
    'Comma-separated list of hostname:port for the worker jobs. '
    'e.g. "tf-worker0:2222,tf-worker1:1111" ')

# Index of this task inside its job's host list. The server is started on the
# matching hostname:port entry. Indices of both parameter servers and workers
# start at 0.
tf.app.flags.DEFINE_integer(
    'task_id', 0, 'Task ID of the worker/replica running the training.'
    )

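# Define the TensorFlow computation graph and return the operations that have to be run in every training iteration.

# To keep the distributed-computation part of the script prominent, graph construction is wrapped in a single function.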

def build_model(x, y_, is_chief):
    """Build the training graph and return the handles the training loop needs.

    Args:
        x: Input tensor forwarded to ``mnist_inference.inference``.
        y_: Label tensor; ``tf.argmax(y_, 1)`` is used as the class index, so it
            is presumably one-hot encoded (the caller loads MNIST with
            ``one_hot=True``).
        is_chief: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(global_step, loss, train_op)``.
    """
    l2_penalty = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)

    # Forward pass of the network.
    logits = mnist_inference.inference(x, l2_penalty)

    # Non-trainable counter that the optimizer increments on every update.
    global_step = tf.Variable(0, trainable=False)

    # Loss = mean cross-entropy + everything stored in the 'losses' collection
    # (presumably the regularization terms added by mnist_inference -- confirm).
    class_ids = tf.argmax(y_, 1)
    per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=class_ids, logits=logits)
    mean_xent = tf.reduce_mean(per_example_xent)
    collected_losses = tf.add_n(tf.get_collection('losses'))
    loss = mean_xent + collected_losses

    # 60000 is presumably the MNIST training-set size, making one decay period
    # roughly one epoch -- confirm.
    decay_steps = 60000 / BATCH_SIZE
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, decay_steps, LEARNING_RATE_DECAY)

    # The operation to run in each training iteration.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)

    return global_step, loss, train_op

# 训练分布式深度学习模型的主过程

def main(argv=None):
    """Run one task of the asynchronous distributed MNIST training job.

    The role of the process is taken from the flags: a ``ps`` task starts its
    server and blocks in ``server.join()``; a ``worker`` task builds the model,
    obtains a session through ``tf.train.Supervisor`` and runs training steps
    until the shared global step reaches ``TRAINING_STEPS`` or the Supervisor
    asks it to stop.

    Args:
        argv: Unused; accepted because ``tf.app.run()`` passes the remaining
            command-line arguments to ``main``.
    """
    # Parse the flags and describe the cluster with tf.train.ClusterSpec.
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create the in-process server for this task from the cluster spec.
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    # Parameter servers only host variables and never run the training loop.
    # server.join() blocks here; the explicit return guarantees that a
    # parameter server can never fall through into the worker code below.
    if FLAGS.job_name == 'ps':
        server.join()
        return

    # Worker task 0 is the chief: besides computing gradients it is the one
    # that writes logs and saves the model.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)

    # replica_device_setter places variables on the parameter servers and the
    # remaining operations on this worker.
    # Fix: the device string previously carried a stray trailing space
    # ("/job:worker/task:%d ").
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_id, cluster=cluster)):
        x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')

        # Operations needed to train the model.
        global_step, loss, train_op = build_model(x, y_, is_chief)

        # Saver used for checkpoints.
        saver = tf.train.Saver()

        # Merged summary operation for log output.
        summary_op = tf.summary.merge_all()

        # Variable initialization operation.
        init_op = tf.global_variables_initializer()

        # tf.train.Supervisor bundles the common chores of training: queue
        # handling, checkpointing, summary output and session creation.
        sv = tf.train.Supervisor(
            is_chief=is_chief,  # only the chief saves the model and writes logs
            logdir=MODEL_SAVE_PATH,  # where checkpoints and logs go
            init_op=init_op,  # initialization operation
            summary_op=summary_op,  # summary operation
            saver=saver,  # saver used for checkpoints
            global_step=global_step,  # current step, used in checkpoint file names
            save_model_secs=60,  # checkpoint interval in seconds
            save_summaries_secs=60  # summary interval in seconds
            )

        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)

        # Let the Supervisor create the session (or wait for one to be ready).
        sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        step = 0
        start_time = time.time()

        # Training loop. The Supervisor takes care of logging and saving, so
        # neither is invoked explicitly here. try/finally makes sure the
        # Supervisor is stopped even when a training step raises.
        try:
            while not sv.should_stop():
                xs, ys = mnist.train.next_batch(BATCH_SIZE)
                _, loss_value, global_step_value = sess.run(
                    [train_op, loss, global_step], feed_dict={x: xs, y_: ys})

                if global_step_value >= TRAINING_STEPS:
                    break

                # Report progress every 100 local steps.
                if step > 0 and step % 100 == 0:
                    duration = time.time() - start_time
                    # Every worker increments the shared global step, so
                    # global_step_value is the total number of batches used by
                    # the whole cluster so far.
                    sec_per_batch = duration / global_step_value
                    format_str = ("After %d training steps (%d global steps), loss on training batch is %g. (%.3f sec/batch)")
                    print(format_str % (step, global_step_value, loss_value, sec_per_batch))

                step += 1
        finally:
            sv.stop()

# Script entry point. tf.app.run() is the TensorFlow 1.x launcher; per its
# documentation it parses the command-line flags and then calls main().
if __name__ == "__main__":

    tf.app.run()

下面是训练的结果，需要等到所有的机器都开起来之后才能进行训练

2017-05-28 22:38:45.122523: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.122960: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.123285: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.123847: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.124201: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.125153: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.125514: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:45.126016: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.

2017-05-28 22:38:47.211250: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:

name: GeForce 940MX

major: 5 minor: 0 memoryClockRate (GHz) 1.189

pciBusID 0000:01:00.0

Total memory: 2.00GiB

Free memory: 1.66GiB

2017-05-28 22:38:47.211668: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0

2017-05-28 22:38:47.211848: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0:   Y

2017-05-28 22:38:47.212045: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce 940MX, pci bus id: 0000:01:00.0)

2017-05-28 22:38:47.375428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job ps -> {0 -> tf-ps0:2222, 1 -> tf-ps1:1111}

2017-05-28 22:38:47.376363: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_channel.cc:200] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222, 1 -> tf-worker1:1111}

2017-05-28 22:38:47.380830: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\rpc\grpc_server_lib.cc:240] Started server with target: grpc://localhost:2222

Extracting /path/to/data\train-images-idx3-ubyte.gz

Extracting /path/to/data\train-labels-idx1-ubyte.gz

Extracting /path/to/data\t10k-images-idx3-ubyte.gz

Extracting /path/to/data\t10k-labels-idx1-ubyte.gz

2017-05-28 22:38:58.243494: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0

2017-05-28 22:38:58.244680: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1

2017-05-28 22:38:58.247390: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1

2017-05-28 22:39:08.248725: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0

2017-05-28 22:39:08.249804: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1

2017-05-28 22:39:08.251307: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1

2017-05-28 22:39:18.253692: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0

2017-05-28 22:39:18.254576: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1

2017-05-28 22:39:18.255448: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1

2017-05-28 22:39:28.257660: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0

2017-05-28 22:39:28.258782: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:ps/replica:0/task:1

2017-05-28 22:39:28.260428: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\distributed_runtime\master.cc:201] CreateSession still waiting for response from worker: /job:worker/replica:0/task:1

90、Tensorflow实现分布式学习，多台电脑，多个GPU 异步试学习的更多相关文章

『TensorFlow』分布式训练_其二_单机多GPU并行&GPU模式设定
建议比对『MXNet』第七弹_多GPU并行程序设计一.tensorflow GPU设置 GPU指定占用 gpu_options = tf.GPUOptions(per_process_gpu_mem ...
WIN7实现多人远程一台电脑
今天查了查网,发现有人说,WIN7可以实现多人远程一台电脑,于是乎我就试了试, 在工作办公室里的局域网里试了试,嘿,成功了,愿与大家分享一下,呵呵! 方法一: 多用户早就能破解了方法如下:用UE打开 ...
mxnet：结合R与GPU加速深度学习
转载于统计之都,http://cos.name/tag/dmlc/,作者陈天奇 ------------------------------------------------------------ ...
学习笔记TF061:分布式TensorFlow，分布式原理、最佳实践
分布式TensorFlow由高性能gRPC库底层技术支持.Martin Abadi.Ashish Agarwal.Paul Barham论文<TensorFlow:Large-Scale Mac ...
Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
【转载】Jmeter分布式部署测试-----远程连接多台电脑做压力性能测试
在使用Jmeter进行接口的性能测试时,由于Jmeter 是JAVA应用,对于CPU和内存的消耗比较大,所以,当需要模拟数以万计的并发用户时,使用单台机器模拟所有的并发用户就有些力不从心,甚至会引起J ...
jmeter分布式压测（多台电脑一起压测）
(1)在Windows下运行操作步骤: 1) 有多台电脑,每台电脑上都有jmeter,而且这几台电脑都互相能ping通. 2) 在我的电脑的jmeter的配置文件bin目录下的jme ...
git学习笔记：一台电脑上配置两个git账户
如何在一台电脑上配置两个git账户,现在云端仓库很多,有开源中国的 gitee.com 微软的 github.com 还有 gitlab.com 和 bitbucket.org 等等,下面是具体步骤 ...
Git学习笔记——从一台电脑上传文件到Github上
目标:从一台电脑上传文件到Github上前提: 1.这里假定已在Github上创建了仓库,建立了仓库 2.已在这台电脑上安装了Git客户端实验环境: 1.Windows 10 64位,已安装了Gi ...

随机推荐

linux 修改系统字符集，查看字符
修改系统字符集 # cd /etc/sysconfig # vi i18n -------------------文件内容------------------ LANG="zh_CN.GB1 ...
Jmeter 5.1 从excel读取数据执行接口自动化
思路:数据在excel文件中进行维护,然后转换成csv格式,jme中读取数据执行: 1.将接口各数据在excel文件中进行维护,然后存为csv格式,我的数据如下: 2.jmeter脚本,配置csv文件 ...
面试题30：包含min函数的栈
思路: 1.首先将栈的基本结构写出 #初始化栈的写法 def __init__(self): self.stack = [] #栈的压入 (加self是实例化,如果前面加入静态装饰器啥的,就不需要 ...
BZOJ 2724蒲公英 (分块) 【内有块大小证明】
题面 luogu传送门分析先分块,设块大小为x(之后我们会证明块大小取何值会更优) 步骤1 把所有的数离散化,然后对每个值开一个vector pos[i],pos[i]存储数i出现的位置我们设查 ...
Pycharm2019版官方版本激活码,无需破解
AHD9079DKZ-eyJsaWNlbnNlSWQiOiJBSEQ5MDc5REtaIiwibGljZW5zZWVOYW1lIjoiSmV0IEdyb3VwcyIsImFzc2lnbmVlTmFtZ ...
Linux Kernel中所應用的數據結構及演算法
Linux Kernel中所應用的數據結構及演算法 Basic Data Structures and Algorithms in the Linux kernel Links are to the ...
HTTPS 301错误码以及 SSL错误
301 redirect: 301 代表永久性转移(Permanently Moved) 解决方法:修改请求 http 改为 https PHP通过cURL访问https时出现SSL certific ...
casperjs-options
The Casper class The easiest way to get a casper instance is to use the module's create() method: 最简 ...
模仿JQuery封装ajax功能
需求分析因为有时候想提高性能,只需要一个ajax函数,不想引入较大的jq文件,尝试过axios,可是get方法不支持多层嵌套的json,post方式后台接收方式似乎要变..也许是我不太会用吧..其实 ...
三、bootstrap-treeview
一.bootstrap-treeview 修饰标签为徽章参考 https://www.cnblogs.com/bin521/p/8403588.html

90、Tensorflow实现分布式学习，多台电脑，多个GPU 异步试学习

90、Tensorflow实现分布式学习，多台电脑，多个GPU 异步试学习的更多相关文章

随机推荐

热门专题