ResourceExhaustedError:OOM分配与形状[]
问题描述:
#coding:utf-8
import codecs
import tensorflow as tf
from guppy import hpy
import numpy as np
from scipy.sparse import csc_matrix
import collections
def _read_words(filename):
with open(filename,'r') as f:
return f.read().replace('\n', ' ').split()
def _build_vocab(filename):
data = _read_words(filename)
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
word_to_id = dict(zip(words, range(len(words))))
return word_to_id
def _file_to_word_ids(filename, word_to_id):
data = _read_words(filename)
file_id=[]
for word in data:
if word in word_to_id:
file_id.append(word_to_id[word])
else:
file_id.append('unk')
file_id=np.asarray(file_id)
file_id=file_id.reshape(-1,n_steps)
return file_id
def RNN(X, weights, biases):
# hidden layer for input to cell
########################################
# transpose the inputs shape from
# X ==> (128 batch * 28 steps, 28 inputs)
X = tf.reshape(X, [-1, n_inputs])
# into hidden
# X_in = (128 batch * 28 steps, 128 hidden)
X_in = tf.matmul(X, weights['in']) + biases['in']
# X_in ==> (128 batch, 28 steps, 128 hidden)
X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units])
# cell
##########################################
# basic LSTM Cell.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units, forget_bias=0.0, state_is_tuple=True)
# lstm cell is divided into two parts (c_state, h_state)
init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
# You have 2 options for following step.
# 1: tf.nn.rnn(cell, inputs);
# 2: tf.nn.dynamic_rnn(cell, inputs).
# If use option 1, you have to modified the shape of X_in, go and check out this:
# https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
# In here, we go for option 2.
# dynamic_rnn receive Tensor (batch, steps, inputs) or (steps, batch, inputs) as X_in.
# Make sure the time_major is changed accordingly.
outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, X_in, initial_state=init_state, time_major=False)
# hidden layer for output as the final results
#############################################
# results = tf.matmul(final_state[1], weights['out']) + biases['out']
# # or
# unstak to list [(batch, outputs)..] * steps
outputs = tf.unpack(tf.transpose(outputs, [1, 0, 2])) # states is the last outputs
results = tf.matmul(outputs[-1], weights['out']) + biases['out']
del outputs,final_state,lstm_cell,init_state,X,X_in
return results
def sparse_label_matrix(f1):
col=[]
for i in f1.readlines():
if i.strip() == '':
continue
col.append(int(i.strip()))
return np.asarray(col)
# print(sparse_matrix.shape)
# set random seed for comparing the two result calculations
tf.set_random_seed(1)
# hyperparameters
lr = 0.001
batch_size = 32
n_epoch=3
n_inputs = 300 # MNIST data input (img shape: 28*28)
n_steps = 303 # time steps
n_hidden_units = 512 # neurons in hidden layer
n_classes = 15857 # MNIST classes (0-9 digits)
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_classes])
sess=tf.Session()
# Define weights
weights = {
# (28, 128)
'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units]),name='wi'),
# (128, 10)
'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]),name='wo')
}
biases = {
# (128,)
'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ]),name='bi'),
# (10,)
'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]),name='bo')
}
pred = RNN(x, weights, biases)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
train_op = tf.train.AdamOptimizer(lr).minimize(cost)
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
f1 = codecs.open('./data/labels.txt', 'r', 'utf-8')
label_matrix = sparse_label_matrix(f1)
f1.close()
f2 = codecs.open('./New Glove/glove.6B.300d.txt', 'r', 'utf-8')
word_to_id=_build_vocab('./data/train_data.txt')
file_id = _file_to_word_ids('./data/train_data.txt',word_to_id)
f2.close()
f3=codecs.open('./data/embedding_matrix.txt','r','utf-8')
embedding_matrix = np.zeros((len(word_to_id), n_inputs))
r=0
for i in f3.readlines():
embedding_matrix[r]=np.asarray(i.strip().split(),dtype='float')
r+=1
f3.close()
hp=hpy()
def while_loop(s,e,step):
while s+batch_size<ran:
batch_id=file_id[s:e]
batch_col=label_matrix[s:e]
batch_label = csc_matrix((data, (batch_row, batch_col)), shape=(batch_size, n_classes))
batch_label = batch_label.toarray()
batch_xs1=tf.nn.embedding_lookup(embedding_matrix,batch_id)
batch_xs=sess.run(batch_xs1)
del batch_xs1
sess.run([train_op], feed_dict={x: batch_xs,
y: batch_label})
print(step,':',sess.run(accuracy, feed_dict={x: batch_xs,y: batch_label}),sess.run(cost,feed_dict={x: batch_xs,y: batch_label}))
if step!=0 and step % 20 == 0:
save_path = saver.save(sess, './model/lstm_classification.ckpt',write_meta_graph=False)
print('Save to path', save_path)
step += 1
s+=batch_size
e+=batch_size
del batch_label,batch_xs,batch_id,batch_col
print(hp.heap())
print(hp.heap().more)
epoch=0
ran = file_id.shape[0]
init = tf.initialize_all_variables()
with tf.Session() as sess:
saver=tf.train.Saver({'wi':weights['in'],'wo':weights['out'],'bi':biases['in'],'bo':biases['out']})
sess.run(init)
print(hp.heap().more)
indice = np.arange(ran)
np.random.shuffle(indice)
file_id = file_id[indice]
label_matrix = label_matrix[indice]
s=0
e=s+batch_size
step=0
batch_row=np.linspace(0,batch_size-1,batch_size)
data=np.linspace(1,1,batch_size)
while_loop(s,e,step)
张时这是我code.It不断去这个错误: “ResourceExhaustedError与形状分配时,张OOM”我用guppy.Then得到这个。 result of guppyResourceExhaustedError:OOM分配与形状[]
我几乎疯了,为什么张量流变量需要这么多空间。我怎样才能解决这个问题?你只需要阅读RNN和while_loop方法。
答
问题是由这条线在训练循环造成的:
while s + batch_size < ran:
# ...
batch_xs1 = tf.nn.embedding_lookup(embedding_matrix, batch_id)
调用tf.nn.embedding_lookup()
功能添加节点的TensorFlow图,—因为这些都是从来没有垃圾收集—在一个循环中这样做会导致一个内存泄漏。
内存泄漏的实际原因可能是参数tf.nn.embedding_lookup()
的参数中的embedding_matrix
NumPy数组。 TensorFlow会尝试提供帮助,并将参数中的所有NumPy数组转换为函数中的tf.constant()
节点。但是,在一个循环中,最终会将多个单独的副本复制到TensorFlow中,然后复制到稀少的GPU内存中。
最简单的解决方案是将tf.nn.embedding_lookup()
呼叫移出训练循环。例如:
def while_loop(s,e,step):
batch_id_placeholder = tf.placeholder(tf.int32)
batch_xs1 = tf.nn.embedding_lookup(embedding_matrix, batch_id_placeholder)
while s+batch_size<ran:
batch_id=file_id[s:e]
batch_col=label_matrix[s:e]
batch_label = csc_matrix((data, (batch_row, batch_col)), shape=(batch_size, n_classes))
batch_label = batch_label.toarray()
batch_xs=sess.run(batch_xs1, feed_dict={batch_id_placeholder: batch_id})
见[此建议](http://*.com/documentation/tensorflow/3883/how-to-debug-a-memory-leak-in-tensorflow#t=201702280511203392708),用于处理TensorFlow中的内存泄漏。特别是,在Python循环内调用'tf.nn.embedding_lookup(embedding_matrix,...)'表明'embedding_matrix'正被转换为TensorFlow常量并存储在图中多次,这可能是你的内存泄漏。 – mrry
你建议使用tf.graph.finalize()。但在此之后,我不能使用tf.nn.embedding_lookup(embedding_matrix,...)。那么我做了什么? –
您可以在while循环之外定义'batch_xs1 = tf.nn.embedding_lookup(embedding_matrix,batch_id_placeholder)'(其中'batch_id_placeholder'为相应类型和形状的'tf.placeholder()'),然后计算'batch_xs' as'batch_xs = sess.run(batch_xs1,feed_dict = {batch_id_placeholder:batch_id})'。 – mrry