ResourceExhaustedError：OOM分配与形状[]

问题描述：

#coding:utf-8 
import codecs 
import tensorflow as tf 
from guppy import hpy 
import numpy as np 
from scipy.sparse import csc_matrix 
import collections 
def _read_words(filename): 
    with open(filename,'r') as f: 
     return f.read().replace('\n', ' ').split() 

def _build_vocab(filename): 

    data = _read_words(filename) 
    counter = collections.Counter(data)  
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 

    words, _ = list(zip(*count_pairs))  

    word_to_id = dict(zip(words, range(len(words)))) 

    return word_to_id 

def _file_to_word_ids(filename, word_to_id):  
    data = _read_words(filename) 
    file_id=[] 
    for word in data: 
     if word in word_to_id: 
      file_id.append(word_to_id[word]) 
     else: 
      file_id.append('unk') 
    file_id=np.asarray(file_id) 
    file_id=file_id.reshape(-1,n_steps) 
    return file_id 

def RNN(X, weights, biases): 
    # hidden layer for input to cell 
    ######################################## 

    # transpose the inputs shape from 
    # X ==> (128 batch * 28 steps, 28 inputs) 
    X = tf.reshape(X, [-1, n_inputs]) 

    # into hidden 
    # X_in = (128 batch * 28 steps, 128 hidden) 
    X_in = tf.matmul(X, weights['in']) + biases['in'] 
    # X_in ==> (128 batch, 28 steps, 128 hidden) 
    X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units]) 

    # cell 
    ########################################## 

    # basic LSTM Cell. 
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units, forget_bias=0.0, state_is_tuple=True) 
    # lstm cell is divided into two parts (c_state, h_state) 
    init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32) 

    # You have 2 options for following step. 
    # 1: tf.nn.rnn(cell, inputs); 
    # 2: tf.nn.dynamic_rnn(cell, inputs). 
    # If use option 1, you have to modified the shape of X_in, go and check out this: 
    # https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py 
    # In here, we go for option 2. 
    # dynamic_rnn receive Tensor (batch, steps, inputs) or (steps, batch, inputs) as X_in. 
    # Make sure the time_major is changed accordingly. 
    outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, X_in, initial_state=init_state, time_major=False) 

    # hidden layer for output as the final results 
    ############################################# 
    # results = tf.matmul(final_state[1], weights['out']) + biases['out'] 

    # # or 
    # unstak to list [(batch, outputs)..] * steps 
    outputs = tf.unpack(tf.transpose(outputs, [1, 0, 2])) # states is the last outputs 
    results = tf.matmul(outputs[-1], weights['out']) + biases['out'] 
    del outputs,final_state,lstm_cell,init_state,X,X_in 
    return results 



def sparse_label_matrix(f1): 
    col=[] 
    for i in f1.readlines(): 
     if i.strip() == '': 
      continue 
     col.append(int(i.strip())) 

    return np.asarray(col) 
    # print(sparse_matrix.shape) 

# set random seed for comparing the two result calculations 
tf.set_random_seed(1) 

# hyperparameters 
lr = 0.001 
batch_size = 32 
n_epoch=3 
n_inputs = 300 # MNIST data input (img shape: 28*28) 
n_steps = 303 # time steps 
n_hidden_units = 512 # neurons in hidden layer 
n_classes = 15857 # MNIST classes (0-9 digits) 

# tf Graph input 
x = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) 
y = tf.placeholder(tf.float32, [None, n_classes]) 
sess=tf.Session() 

# Define weights 
weights = { 
    # (28, 128) 
    'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units]),name='wi'), 
    # (128, 10) 
    'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]),name='wo') 
} 
biases = { 
    # (128,) 
    'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ]),name='bi'), 
    # (10,) 
    'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]),name='bo') 
} 

pred = RNN(x, weights, biases) 
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) 
train_op = tf.train.AdamOptimizer(lr).minimize(cost) 

correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)) 
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 


f1 = codecs.open('./data/labels.txt', 'r', 'utf-8') 
label_matrix = sparse_label_matrix(f1) 
f1.close() 


f2 = codecs.open('./New Glove/glove.6B.300d.txt', 'r', 'utf-8') 
word_to_id=_build_vocab('./data/train_data.txt') 
file_id = _file_to_word_ids('./data/train_data.txt',word_to_id)  
f2.close() 

f3=codecs.open('./data/embedding_matrix.txt','r','utf-8') 
embedding_matrix = np.zeros((len(word_to_id), n_inputs)) 
r=0 
for i in f3.readlines(): 
    embedding_matrix[r]=np.asarray(i.strip().split(),dtype='float') 
    r+=1 
f3.close() 
hp=hpy() 
def while_loop(s,e,step): 
    while s+batch_size<ran: 
     batch_id=file_id[s:e] 
     batch_col=label_matrix[s:e]            

     batch_label = csc_matrix((data, (batch_row, batch_col)), shape=(batch_size, n_classes)) 
     batch_label = batch_label.toarray() 
     batch_xs1=tf.nn.embedding_lookup(embedding_matrix,batch_id) 
     batch_xs=sess.run(batch_xs1) 
     del batch_xs1 
     sess.run([train_op], feed_dict={x: batch_xs, 
             y: batch_label}) 

     print(step,':',sess.run(accuracy, feed_dict={x: batch_xs,y: batch_label}),sess.run(cost,feed_dict={x: batch_xs,y: batch_label})) 
     if step!=0 and step % 20 == 0: 
      save_path = saver.save(sess, './model/lstm_classification.ckpt',write_meta_graph=False) 
      print('Save to path', save_path) 

     step += 1 
     s+=batch_size 
     e+=batch_size 
     del batch_label,batch_xs,batch_id,batch_col 
     print(hp.heap()) 
     print(hp.heap().more) 


epoch=0 
ran = file_id.shape[0] 
init = tf.initialize_all_variables() 


with tf.Session() as sess: 
    saver=tf.train.Saver({'wi':weights['in'],'wo':weights['out'],'bi':biases['in'],'bo':biases['out']}) 
    sess.run(init) 
    print(hp.heap().more) 
    indice = np.arange(ran) 
    np.random.shuffle(indice) 
    file_id = file_id[indice] 
    label_matrix = label_matrix[indice] 
    s=0 
    e=s+batch_size 
    step=0 
    batch_row=np.linspace(0,batch_size-1,batch_size)       
    data=np.linspace(1,1,batch_size)           
    while_loop(s,e,step)

张时这是我code.It不断去这个错误： “ResourceExhaustedError与形状分配时，张OOM”我用guppy.Then得到这个。 result of guppy ResourceExhaustedError：OOM分配与形状[]

我几乎疯了，为什么张量流变量需要这么多空间。我怎样才能解决这个问题？你只需要阅读RNN和while_loop方法。

见[此建议]（http://*.com/documentation/tensorflow/3883/how-to-debug-a-memory-leak-in-tensorflow#t=201702280511203392708），用于处理TensorFlow中的内存泄漏。特别是，在Python循环内调用'tf.nn.embedding_lookup（embedding_matrix，...）'表明'embedding_matrix'正被转换为TensorFlow常量并存储在图中多次，这可能是你的内存泄漏。 – mrry

你建议使用tf.graph.finalize（）。但在此之后，我不能使用tf.nn.embedding_lookup（embedding_matrix，...）。那么我做了什么？ –

您可以在while循环之外定义'batch_xs1 = tf.nn.embedding_lookup（embedding_matrix，batch_id_placeholder）'（其中'batch_id_placeholder'为相应类型和形状的'tf.placeholder（）'），然后计算'batch_xs' as'batch_xs = sess.run（batch_xs1，feed_dict = {batch_id_placeholder：batch_id}）'。 – mrry

答

问题是由这条线在训练循环造成的：

while s + batch_size < ran: 
    # ... 
    batch_xs1 = tf.nn.embedding_lookup(embedding_matrix, batch_id)

调用tf.nn.embedding_lookup()功能添加节点的TensorFlow图，—因为这些都是从来没有垃圾收集—在一个循环中这样做会导致一个内存泄漏。

内存泄漏的实际原因可能是参数tf.nn.embedding_lookup()的参数中的embedding_matrix NumPy数组。 TensorFlow会尝试提供帮助，并将参数中的所有NumPy数组转换为函数中的tf.constant()节点。但是，在一个循环中，最终会将多个单独的副本复制到TensorFlow中，然后复制到稀少的GPU内存中。

最简单的解决方案是将tf.nn.embedding_lookup()呼叫移出训练循环。例如：

def while_loop(s,e,step): 
    batch_id_placeholder = tf.placeholder(tf.int32) 
    batch_xs1 = tf.nn.embedding_lookup(embedding_matrix, batch_id_placeholder) 

    while s+batch_size<ran: 
    batch_id=file_id[s:e] 
    batch_col=label_matrix[s:e]            

    batch_label = csc_matrix((data, (batch_row, batch_col)), shape=(batch_size, n_classes)) 
    batch_label = batch_label.toarray() 

    batch_xs=sess.run(batch_xs1, feed_dict={batch_id_placeholder: batch_id})

ResourceExhaustedError：OOM分配与形状[]

相关推荐