Poor TensorFlow multi-GPU performance

Problem description:

We tried to implement the tower method from https://github.com/tensorflow/models/tree/master/inception but found that performance got worse.

  • Hardware:

    • Intel Core i7

    • GTX-1060 x 2

  • Source code (the `Splitting` flag below selects the version):

    • Splitting = False: default single-GPU version

    • Splitting = True: tower (multi-GPU) version

    from tensorflow.python.ops import tensor_array_ops
    from tensorflow.python.client import device_lib
    import tensorflow as tf
    import tflib as lib
    import numpy as np
    import time

    BATCH = 64
    DIM = 1000
    GPUs = 2

    Splitting = True  # False: default single-GPU version; True: multi-GPU tower version

    def init_matrix(shape):
        return tf.random_normal(shape, stddev=0.1)

    def Block(param, x, name, reuse):
        # One fully connected sigmoid layer with a DIM x DIM weight matrix.
        W = tf.get_variable('%sweight' % name, [DIM, DIM])
        b = tf.get_variable('%sbias' % name, [DIM])
        if not reuse:
            param.extend([W, b])

        x_ = tf.reshape(x, [-1, DIM])
        output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
        return tf.reshape(output, [-1, DIM, DIM])

    def _tower_loss(param, inputs, reuse=None):
        # Six stacked Blocks; variables are shared across towers via `reuse`.
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            output = Block(param, inputs, 'Layer.0.', reuse)
            output = Block(param, output, 'Layer.1.', reuse)
            output = Block(param, output, 'Layer.2.', reuse)
            output = Block(param, output, 'Layer.3.', reuse)
            output = Block(param, output, 'Layer.4.', reuse)
            output = Block(param, output, 'Layer.5.', reuse)
            output = tf.reshape(output, [-1, DIM * DIM])
            return tf.reduce_mean(output)

    def _all_gradients(tower_grads):
        # Sum the corresponding gradients from each tower, variable by variable.
        all_grads = []
        for i in range(len(tower_grads[0])):
            grads = []
            for grad in tower_grads:
                expanded_g = tf.expand_dims(grad[i], 0)
                grads.append(expanded_g)
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_sum(grad, 0)
            all_grads.append(grad)
        return all_grads

    if not Splitting:
        # Default version: one graph, no explicit device, so TensorFlow places the ops on GPU:0.
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])

        param = []
        loss = _tower_loss(param, inputs, None)
        grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
        apply_gradient_op = opt.apply_gradients(zip(grad, param))
        merged = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)

            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))

    else:
        # Tower version: variables are created on the CPU, each GPU is meant to build a tower
        # on its share of the batch, and the summed gradients are applied on the CPU.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)

            inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
            inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)

            param = []
            tower_grads = []
            reuse = None
            for i in range(GPUs):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('Tower_%d' % i) as scope:
                        with tf.device('/cpu:0'):
                            loss = _tower_loss(param, inputs_splits[i], reuse)
                            reuse = True
                            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                            tower_grads.append(grad)
            grads = _all_gradients(tower_grads)
            apply_gradient_op = opt.apply_gradients(zip(grads, param))
            merged = tf.summary.merge_all()

            with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
                session.run(tf.global_variables_initializer())
                writer = tf.summary.FileWriter(".", session.graph)

                for i in range(100):
                    start = time.time()
                    session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                    print('Iter' + str(i) + ': time=' + str(time.time() - start))

    1. Performance:

      • Default version - uses only GPU:0

        time = 0.867873907089

      • Tower version - tries to use multiple GPUs

        time = 4.88468384743

    2. Our questions:

      1. The tower method turns out to be about 5x slower. Is there anything wrong with our implementation?

      2. Following the tutorial, we keep the model on the CPU and split the work across the GPUs. However, our GPUs are connected through PCIe rather than NVLink, so frequent data transfers are expensive. Is there any other approach that helps a PCIe-based multi-GPU setup? (One variant is sketched right after these questions.)

      Thank you.
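    For context on question 2: in the tutorial's tower pattern the variables live on the CPU, so every step copies the six DIM x DIM weight matrices to each GPU over PCIe and the gradients back. A minimal sketch of one commonly used variant, not from this thread, is to pin the variables on one GPU instead of the CPU via a device function; the helper name `assign_to_device` is illustrative, and whether it actually helps depends on the PCIe topology.

        def assign_to_device(compute_device, variable_device='/gpu:0'):
            # Device function: keep variable ops on `variable_device`,
            # place every other op on `compute_device`.
            variable_op_types = ('Variable', 'VariableV2', 'VarHandleOp')
            def _assign(op):
                return variable_device if op.type in variable_op_types else compute_device
            return _assign

        # Tower loop using the device function instead of a blanket /cpu:0 scope
        # (the rest of the tower script above is assumed unchanged).
        for i in range(GPUs):
            with tf.device(assign_to_device('/gpu:%d' % i, '/gpu:0')):
                with tf.name_scope('Tower_%d' % i):
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)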

    Start here =>

    for i in range(GPUs):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('Tower_%d' % i) as scope:
                with tf.device('/cpu:0'):  ### this line may cause all ops to be allocated on the CPU; try removing it
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
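    For reference, a minimal sketch of the same loop with that inner pin removed (the rest of the script unchanged): without the nested `/cpu:0` device, the ops built inside `_tower_loss` inherit `/gpu:%d` from the outer scope, so each tower actually runs on its own GPU.

        for i in range(GPUs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('Tower_%d' % i) as scope:
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)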
    

    Removing `with tf.device('/cpu:0')` really did help, thank you! But it still takes about 0.908358812332 per iteration, similar to the single-GPU result. Any idea why?


    Try using tfprof to profile your program.
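    A minimal sketch, not from the thread, of one way to do that in TF 1.x: collect `tf.RunMetadata` for a step and feed it to tfprof (`tf.profiler`) to get per-op time and memory, which makes CPU-placed ops and host-to-device copies stand out.

        # Profile a single training step (assumes the session and ops from the script above).
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        session.run(apply_gradient_op,
                    feed_dict={inputs: np.zeros([BATCH, DIM, DIM])},
                    options=run_options,
                    run_metadata=run_metadata)

        # Report per-op time and memory, grouped by op type.
        tf.profiler.profile(session.graph,
                            run_meta=run_metadata,
                            cmd='op',
                            options=tf.profiler.ProfileOptionBuilder.time_and_memory())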