Poor TensorFlow multi-GPU performance

Problem description:

We tried to implement the tower method from https://github.com/tensorflow/models/tree/master/inception but found that performance got worse.

  • Hardware:

    • Intel Core i7

    • GTX-1060 x 2

  • Source code (the `Splitting` flag below selects the version):

    • Splitting = False: default single-GPU version

    • Splitting = True: tower (multi-GPU) version

    from tensorflow.python.ops import tensor_array_ops
    from tensorflow.python.client import device_lib
    import tensorflow as tf
    import tflib as lib
    import numpy as np
    import time

    BATCH = 64
    DIM = 1000
    GPUs = 2

    Splitting = True  # False: default single-GPU version; True: multi-GPU tower version

    def init_matrix(shape):
        return tf.random_normal(shape, stddev=0.1)

    def Block(param, x, name, reuse):
        # One fully connected sigmoid layer with a DIM x DIM weight matrix.
        W = tf.get_variable('%sweight' % name, [DIM, DIM])
        b = tf.get_variable('%sbias' % name, [DIM])
        if not reuse:
            param.extend([W, b])

        x_ = tf.reshape(x, [-1, DIM])
        output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
        return tf.reshape(output, [-1, DIM, DIM])

    def _tower_loss(param, inputs, reuse=None):
        # Six stacked Blocks; variables are shared across towers via `reuse`.
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            output = Block(param, inputs, 'Layer.0.', reuse)
            output = Block(param, output, 'Layer.1.', reuse)
            output = Block(param, output, 'Layer.2.', reuse)
            output = Block(param, output, 'Layer.3.', reuse)
            output = Block(param, output, 'Layer.4.', reuse)
            output = Block(param, output, 'Layer.5.', reuse)
            output = tf.reshape(output, [-1, DIM * DIM])
            return tf.reduce_mean(output)

    def _all_gradients(tower_grads):
        # Sum the corresponding gradients from each tower, variable by variable.
        all_grads = []
        for i in range(len(tower_grads[0])):
            grads = []
            for grad in tower_grads:
                expanded_g = tf.expand_dims(grad[i], 0)
                grads.append(expanded_g)
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_sum(grad, 0)
            all_grads.append(grad)
        return all_grads

    if not Splitting:
        # Default version: one graph, no explicit device, so TensorFlow places the ops on GPU:0.
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])

        param = []
        loss = _tower_loss(param, inputs, None)
        grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
        apply_gradient_op = opt.apply_gradients(zip(grad, param))
        merged = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)

            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))

    else:
        # Tower version: variables are created on the CPU, each GPU is meant to build a tower
        # on its share of the batch, and the summed gradients are applied on the CPU.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)

            inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
            inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)

            param = []
            tower_grads = []
            reuse = None
            for i in range(GPUs):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('Tower_%d' % i) as scope:
                        with tf.device('/cpu:0'):
                            loss = _tower_loss(param, inputs_splits[i], reuse)
                            reuse = True
                            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                            tower_grads.append(grad)
            grads = _all_gradients(tower_grads)
            apply_gradient_op = opt.apply_gradients(zip(grads, param))
            merged = tf.summary.merge_all()

            with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
                session.run(tf.global_variables_initializer())
                writer = tf.summary.FileWriter(".", session.graph)

                for i in range(100):
                    start = time.time()
                    session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                    print('Iter' + str(i) + ': time=' + str(time.time() - start))

    1. Performance:

      • Default version - uses only GPU:0

        time = 0.867873907089

      • Tower version - tries to use multiple GPUs

        time = 4.88468384743

    2. Our questions:

      1. The tower method turns out to be about 5x slower. Is there anything wrong with our implementation?

      2. Following the tutorial, we keep the model on the CPU and split the work across the GPUs. However, our GPUs are connected through PCIe rather than NVLink, so frequent data transfers are expensive. Is there any other approach that helps a PCIe-based multi-GPU setup? (One variant is sketched right after these questions.)

      Thank you.
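    For context on question 2: in the tutorial's tower pattern the variables live on the CPU, so every step copies the six DIM x DIM weight matrices to each GPU over PCIe and the gradients back. A minimal sketch of one commonly used variant, not from this thread, is to pin the variables on one GPU instead of the CPU via a device function; the helper name `assign_to_device` is illustrative, and whether it actually helps depends on the PCIe topology.

        def assign_to_device(compute_device, variable_device='/gpu:0'):
            # Device function: keep variable ops on `variable_device`,
            # place every other op on `compute_device`.
            variable_op_types = ('Variable', 'VariableV2', 'VarHandleOp')
            def _assign(op):
                return variable_device if op.type in variable_op_types else compute_device
            return _assign

        # Tower loop using the device function instead of a blanket /cpu:0 scope
        # (the rest of the tower script above is assumed unchanged).
        for i in range(GPUs):
            with tf.device(assign_to_device('/gpu:%d' % i, '/gpu:0')):
                with tf.name_scope('Tower_%d' % i):
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)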

    Start here =>

    for i in range(GPUs):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('Tower_%d' % i) as scope:
                with tf.device('/cpu:0'):  ### this line may cause all ops to be allocated on the CPU; try removing it
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
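    For reference, a minimal sketch of the same loop with that inner pin removed (the rest of the script unchanged): without the nested `/cpu:0` device, the ops built inside `_tower_loss` inherit `/gpu:%d` from the outer scope, so each tower actually runs on its own GPU.

        for i in range(GPUs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('Tower_%d' % i) as scope:
                    loss = _tower_loss(param, inputs_splits[i], reuse)
                    reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)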
    

    Removing `with tf.device('/cpu:0')` really did help, thank you! But it still takes about 0.908358812332 per iteration, similar to the single-GPU result. Any idea why?


    Try using tfprof to profile your program.
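    A minimal sketch, not from the thread, of one way to do that in TF 1.x: collect `tf.RunMetadata` for a step and feed it to tfprof (`tf.profiler`) to get per-op time and memory, which makes CPU-placed ops and host-to-device copies stand out.

        # Profile a single training step (assumes the session and ops from the script above).
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        session.run(apply_gradient_op,
                    feed_dict={inputs: np.zeros([BATCH, DIM, DIM])},
                    options=run_options,
                    run_metadata=run_metadata)

        # Report per-op time and memory, grouped by op type.
        tf.profiler.profile(session.graph,
                            run_meta=run_metadata,
                            cmd='op',
                            options=tf.profiler.ProfileOptionBuilder.time_and_memory())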