tensorflow word2vec demo详解












  1. from __future__ import absolute_import
  2. from __future__ import division
  3. from __future__ import print_function
  4. import collections
  5. import math
  6. import os
  7. import sys
  8. import argparse
  9. import random
  10. from tempfile import gettempdir
  11. import zipfile
  12. import numpy as np
  13. from six.moves import urllib
  14. from six.moves import xrange # pylint: disable=redefined-builtin
  15. import tensorflow as tf
  16. from tensorflow.contrib.tensorboard.plugins import projector


  1. current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
  2. parser = argparse.ArgumentParser()
  3. parser.add_argument(
  4. '--log_dir',
  5. type=str,
  6. default=os.path.join(current_path, 'log'),
  7. help='The log directory for TensorBoard summaries.')
  8. FLAGS, unparsed = parser.parse_known_args()
  9. # Create the directory for TensorBoard variables if there is not.
  10. if not os.path.exists(FLAGS.log_dir):
  11. os.makedirs(FLAGS.log_dir)




parser.parse_known_args() 用来解析不定长的命令行参数,其返回值是一个二元组:第一个元素是包含已定义参数的命名空间(Namespace),第二个元素是由未定义(无法识别)的参数组成的列表。


  1. import argparse
  2. import os
  3. import sys
  4. current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
  5. parser = argparse.ArgumentParser()
  6. parser.add_argument(
  7. '--log_dir',
  8. type=str,
  9. default=os.path.join(current_path, 'log'),
  10. help='The log directory for TensorBoard summaries.')
  11. FLAGS, unparsed = parser.parse_known_args()
  12. print(FLAGS)
  13. print(unparsed)

  1. def maybe_download(filename, expected_bytes):
  2. """Download a file if not present, and make sure it's the right size."""
  3. if not os.path.exists(filename):
  4. filename, _ = urllib.request.urlretrieve(url + filename, filename)
  5. # 获取文件相关属性
  6. statinfo = os.stat(filename)
  7. # 比对文件的大小是否正确
  8. if statinfo.st_size == expected_bytes:
  9. print('Found and verified', filename)
  10. else:
  11. print(statinfo.st_size)
  12. raise Exception(
  13. 'Failed to verify ' + filename + '. Can you get to it with a browser?')
  14. return filename
  15. filename = maybe_download('text8.zip', 31344016)



  1. # Read the data into a list of strings.
  2. def read_data(filename):
  3. """Extract the first file enclosed in a zip file as a list of words."""
  4. with zipfile.ZipFile(filename) as f:
  5. data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  6. return data
  7. vocabulary = read_data(filename)
  8. print('Data size', len(vocabulary))



  1. vocabulary_size = 50000
  2. def build_dataset(words, n_words):
  3. """Process raw inputs into a dataset."""
  4. count = [['UNK', -1]]
  5. count.extend(collections.Counter(words).most_common(n_words - 1))
  6. dictionary = dict()
  7. for word, _ in count:
  8. dictionary[word] = len(dictionary)
  9. data = list()
  10. unk_count = 0
  11. for word in words:
  12. index = dictionary.get(word, 0)
  13. if index == 0: # dictionary['UNK']
  14. unk_count += 1
  15. data.append(index)
  16. count[0][1] = unk_count
  17. reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  18. return data, count, dictionary, reversed_dictionary
  19. data, count, dictionary, reverse_dictionary = build_dataset(
  20. vocabulary, vocabulary_size)
  21. del vocabulary # Hint to reduce memory.
  22. print('Most common words (+UNK)', count[:5])
  23. print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])



count.extend(collections.Counter(words).most_common(n_words - 1))



i    love   tensorflow  very  much .........

2    23      UNK           3       45    .........

count 记录的是每个单词及其词频,例如:[['UNK', -1], ('a', 200), ('i', 150), ...](其中 UNK 的 -1 只是占位,之后会被 count[0][1] = unk_count 替换为实际的未登录词个数)

dictionary是一个字典:记录的是单词对应编号 即key:单词、value:编号(编号越小,词频越高,但第一个永远是UNK)

reversed_dictionary是一个字典:编号对应的单词  即key:编号、value:单词(编号越小,词频越高,但第一个永远是UNK)



  1. data_index = 0
  2. def generate_batch(batch_size, num_skips, skip_window):
  3. global data_index
  4. assert batch_size % num_skips == 0
  5. assert num_skips <= 2 * skip_window
  6. batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  7. labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  8. span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  9. buffer = collections.deque(maxlen=span) # pylint: disable=redefined-builtin
  10. if data_index + span > len(data):
  11. data_index = 0
  12. buffer.extend(data[data_index:data_index + span])
  13. data_index += span
  14. for i in range(batch_size // num_skips):
  15. context_words = [w for w in range(span) if w != skip_window]
  16. words_to_use = random.sample(context_words, num_skips)
  17. for j, context_word in enumerate(words_to_use):
  18. batch[i * num_skips + j] = buffer[skip_window]
  19. labels[i * num_skips + j, 0] = buffer[context_word]
  20. if data_index == len(data):
  21. buffer.extend(data[0:span])
  22. data_index = span
  23. else:
  24. buffer.append(data[data_index])
  25. data_index += 1
  26. # Backtrack a little bit to avoid skipping words in the end of a batch
  27. data_index = (data_index + len(data) - span) % len(data)
  28. return batch, labels
  29. batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
  30. for i in range(8):
  31. print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
  32. reverse_dictionary[labels[i, 0]])


 num_skips:就是重复用一个单词的次数,比如 num_skips=2时,对于一句话:i    love   tensorflow  very  much ..........


                         tensorflow---》 love        tensorflow---》 very



 span:其实在分批次的过程中可以看做是一个固定大小的框框(比较流行的说法叫滑动窗口)在不断移动,而这个框框的大小就是 span,可以看到 span = 2 * skip_window + 1

 buffer = collections.deque(maxlen=span):就是申请了一个 buffer(其实就是固定大小的窗口,这里是 3),即这个 buffer 队列中每次最多能容纳 span 个单词


所以过程应该是这样的:比如batch_size=6, num_skips=2,skip_window=1,data:

batch_size // num_skips=3,循环3次

(   I      am      looking     for     the     missing     glass-shoes     who     has     picked   it      up .............)

    2      23        56            3       45         84               123              45        23           12     1     14 ...............

i=0时:2、23、56 首先进入 buffer(context_words = [w for w in range(span) if w != skip_window] 的意思就是取窗口中除目标词以外的位置,即上下文),然后 batch[i * num_skips + j] = buffer[skip_window](skip_window=1,所以每次取窗口正中间的词作为目标词),即 batch=23;labels[i * num_skips + j, 0] = buffer[context_word] 就是取其上下文作为 labels,即 2 和 56

             所以此时 batch=[23,23],labels=[2,56](当然也可能是 [56,2],因为上下文词的顺序由 random.sample 随机决定,可能先取右边、后取左边),同时 data_index=3,即单词 for 的位置

i=1时:data[data_index]进队列,即 buffer为 23,56,3 赋值后为:batch=[23,23,56,56] labels=[2,56,23,3](也可能是换一下顺序)


i=2时:data[data_index]进队列,即 buffer为 56,3,45 赋值后为:batch=[23,23,56,56,3,3]  labels=[2,56,23,3,56,45](也可能是换一              下顺序) 同时data_index=5即单词missing


                                                batch=[23,23,56,56,3,3]                                labels=[2,56,23,3,56,45]

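 i=2 结束时 data[5](missing)同样会进队列,所以循环结束后 data_index=6。然后执行 data_index = (data_index + len(data) - span) % len(data),即 data_index 回溯 span=3 个单位,回到 3(单词 for)。由于 data_index 是全局变量(global data_index),下一次调用 generate_batch 时窗口会从 for、the、missing 开始,中心词为 the,正好接在本批最后一个中心词 for 之后,不会跳过任何单词。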



  1. batch_size = 128
  2. embedding_size = 128 # Dimension of the embedding vector.
  3. skip_window = 1 # How many words to consider left and right.
  4. num_skips = 2 # How many times to reuse an input to generate a label.
  5. num_sampled = 64 # Number of negative examples to sample.
  6. graph = tf.Graph()



  1. with graph.as_default():
  2. # Input data.
  3. with tf.name_scope('inputs'):
  4. train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  5. train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  6. valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  7. # Ops and variables pinned to the CPU because of missing GPU implementation
  8. with tf.device('/cpu:0'):
  9. # Look up embeddings for inputs.
  10. with tf.name_scope('embeddings'):
  11. embeddings = tf.Variable(
  12. tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  13. embed = tf.nn.embedding_lookup(embeddings, train_inputs)
  14. # Construct the variables for the NCE loss
  15. with tf.name_scope('weights'):
  16. nce_weights = tf.Variable(
  17. tf.truncated_normal(
  18. [vocabulary_size, embedding_size],
  19. stddev=1.0 / math.sqrt(embedding_size)))
  20. with tf.name_scope('biases'):
  21. nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
  22. # Compute the average NCE loss for the batch.
  23. # tf.nce_loss automatically draws a new sample of the negative labels each
  24. # time we evaluate the loss.
  25. # Explanation of the meaning of NCE loss:
  26. # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  27. with tf.name_scope('loss'):
  28. loss = tf.reduce_mean(
  29. tf.nn.nce_loss(
  30. weights=nce_weights,
  31. biases=nce_biases,
  32. labels=train_labels,
  33. inputs=embed,
  34. num_sampled=num_sampled,
  35. num_classes=vocabulary_size))
  36. # Add the loss value as a scalar to summary.
  37. tf.summary.scalar('loss', loss)
  38. # Construct the SGD optimizer using a learning rate of 1.0.
  39. with tf.name_scope('optimizer'):
  40. optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
  41. # Compute the cosine similarity between minibatch examples and all embeddings.
  42. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
  43. normalized_embeddings = embeddings / norm
  44. valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
  45. valid_dataset)
  46. similarity = tf.matmul(
  47. valid_embeddings, normalized_embeddings, transpose_b=True)
  48. # Merge all summaries.
  49. merged = tf.summary.merge_all()
  50. # Add variable initializer.
  51. init = tf.global_variables_initializer()
  52. # Create a saver.
  53. saver = tf.train.Saver()




  1. def embedding_lookup(
  2. params,
  3. ids,
  4. partition_strategy="mod",
  5. name=None,
  6. validate_indices=True, # pylint: disable=unused-argument
  7. max_norm=None):
  8. """Looks up `ids` in a list of embedding tensors.
  9. This function is used to perform parallel lookups on the list of
  10. tensors in `params`. It is a generalization of
  11. @{tf.gather}, where `params` is
  12. interpreted as a partitioning of a large embedding tensor. `params` may be
  13. a `PartitionedVariable` as returned by using `tf.get_variable()` with a
  14. partitioner.
  15. If `len(params) > 1`, each element `id` of `ids` is partitioned between
  16. the elements of `params` according to the `partition_strategy`.
  17. In all strategies, if the id space does not evenly divide the number of
  18. partitions, each of the first `(max_id + 1) % len(params)` partitions will
  19. be assigned one more id.
  20. If `partition_strategy` is `"mod"`, we assign each id to partition
  21. `p = id % len(params)`. For instance,
  22. 13 ids are split across 5 partitions as:
  23. `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
  24. If `partition_strategy` is `"div"`, we assign ids to partitions in a
  25. contiguous manner. In this case, 13 ids are split across 5 partitions as:
  26. `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
  27. The results of the lookup are concatenated into a dense
  28. tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.

看到 The results of the lookup are concatenated into a dense tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`。即假如 params 的形状是 100*28,ids 是 [2,56,3],那么返回的便是 3*28 的张量,分别对应 params 的第 3、57、4 行(id 从 0 开始计数,所以 id 2 对应第 3 行)

其实往下看会发现其主要调用的是 _embedding_lookup_and_transform函数





  1. def nce_loss(weights,
  2. biases,
  3. labels,
  4. inputs,
  5. num_sampled,
  6. num_classes,
  7. num_true=1,
  8. sampled_values=None,
  9. remove_accidental_hits=False,
  10. partition_strategy="mod",
  11. name="nce_loss"):
  12. """Computes and returns the noise-contrastive estimation training loss.
  13. See [Noise-contrastive estimation: A new estimation principle for
  14. unnormalized statistical
  15. models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
  16. Also see our [Candidate Sampling Algorithms
  17. Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)
  18. A common use case is to use this method for training, and calculate the full
  19. sigmoid loss for evaluation or inference. In this case, you must set
  20. `partition_strategy="div"` for the two losses to be consistent, as in the
  21. following example:
  22. ```python
  23. if mode == "train":
  24. loss = tf.nn.nce_loss(
  25. weights=weights,
  26. biases=biases,
  27. labels=labels,
  28. inputs=inputs,
  29. ...,
  30. partition_strategy="div")
  31. elif mode == "eval":
  32. logits = tf.matmul(inputs, tf.transpose(weights))
  33. logits = tf.nn.bias_add(logits, biases)
  34. labels_one_hot = tf.one_hot(labels, n_classes)
  35. loss = tf.nn.sigmoid_cross_entropy_with_logits(
  36. labels=labels_one_hot,
  37. logits=logits)
  38. loss = tf.reduce_sum(loss, axis=1)
  39. ```
  40. Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  41. so your labels must be sorted in order of decreasing frequency to achieve
  42. good results. For more details, see
  43. @{tf.nn.log_uniform_candidate_sampler}.
  44. Note: In the case where `num_true` > 1, we assign to each target class
  45. the target probability 1 / `num_true` so that the target probabilities
  46. sum to 1 per-example.
  47. Note: It would be useful to allow a variable number of target classes per
  48. example. We hope to provide this functionality in a future release.
  49. For now, if you have a variable number of target classes, you can pad them
  50. out to a constant number by either repeating them or by padding
  51. with an otherwise unused class.
  52. Args:
  53. weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
  54. objects whose concatenation along dimension 0 has shape
  55. [num_classes, dim]. The (possibly-partitioned) class embeddings.
  56. biases: A `Tensor` of shape `[num_classes]`. The class biases.
  57. labels: A `Tensor` of type `int64` and shape `[batch_size,
  58. num_true]`. The target classes.
  59. inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
  60. activations of the input network.
  61. num_sampled: An `int`. The number of classes to randomly sample per batch.
  62. num_classes: An `int`. The number of possible classes.
  63. num_true: An `int`. The number of target classes per training example.
  64. sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
  65. `sampled_expected_count`) returned by a `*_candidate_sampler` function.
  66. (if None, we default to `log_uniform_candidate_sampler`)
  67. remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
  68. where a sampled class equals one of the target classes. If set to
  69. `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
  70. learning to generate log-odds instead of log probabilities. See
  71. our [Candidate Sampling Algorithms Reference]
  72. (https://www.tensorflow.org/extras/candidate_sampling.pdf).
  73. Default is False.
  74. partition_strategy: A string specifying the partitioning strategy, relevant
  75. if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
  76. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
  77. name: A name for the operation (optional).
  78. Returns:
  79. A `batch_size` 1-D tensor of per-example NCE losses.
  80. """
  81. logits, labels = _compute_sampled_logits(
  82. weights=weights,
  83. biases=biases,
  84. labels=labels,
  85. inputs=inputs,
  86. num_sampled=num_sampled,
  87. num_classes=num_classes,
  88. num_true=num_true,
  89. sampled_values=sampled_values,
  90. subtract_log_q=True,
  91. remove_accidental_hits=remove_accidental_hits,
  92. partition_strategy=partition_strategy,
  93. name=name)
  94. sampled_losses = sigmoid_cross_entropy_with_logits(
  95. labels=labels, logits=logits, name="sampled_losses")
  96. # sampled_losses is batch_size x {true_loss, sampled_losses...}
  97. # We sum out true and sampled losses.
  98. return _sum_rows(sampled_losses)


  1. def nce_loss(weights,
  2.              biases,
  3.              labels,
  4.              inputs,
  5.              num_sampled,
  6.              num_classes,
  7.              num_true=1,
  8.              sampled_values=None,
  9.              remove_accidental_hits=False,
  10.              partition_strategy="mod",
  11.              name="nce_loss"):




weights  :   M * N(M 为类别数 num_classes,本 demo 中即词表大小;N 为词向量维度 dim)

biases   :    M

labels    :   batch_size, num_true(num_true代表正样本的数量,本demo中为1)

inputs    :   batch_size *N

num_sampled: 采样的负样本个数(本demo中为64)

num_classes : M

sampled_values:可选的自定义采样结果,即由某个 `*_candidate_sampler` 函数返回的 tuple(`sampled_candidates`, `true_expected_count`, `sampled_expected_count`);为 None 时默认使用 `log_uniform_candidate_sampler`





返回值:一个 batch 内每一个例子(样本)的 NCE loss,即长度为 batch_size 的一维张量



sigmoid_cross_entropy_with_logits---------------------------logistic regression



  1. def _compute_sampled_logits(weights,
  2. biases,
  3. labels,
  4. inputs,
  5. num_sampled,
  6. num_classes,
  7. num_true=1,
  8. sampled_values=None,
  9. subtract_log_q=True,
  10. remove_accidental_hits=False,
  11. partition_strategy="mod",
  12. name=None,
  13. seed=None):
  14. """Helper function for nce_loss and sampled_softmax_loss functions.
  15. Computes sampled output training logits and labels suitable for implementing
  16. e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  17. sampled_softmax_loss).
  18. Note: In the case where num_true > 1, we assign to each target class
  19. the target probability 1 / num_true so that the target probabilities
  20. sum to 1 per-example.
  21. Args:
  22. weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
  23. objects whose concatenation along dimension 0 has shape
  24. `[num_classes, dim]`. The (possibly-partitioned) class embeddings.
  25. biases: A `Tensor` of shape `[num_classes]`. The (possibly-partitioned)
  26. class biases.
  27. labels: A `Tensor` of type `int64` and shape `[batch_size,
  28. num_true]`. The target classes. Note that this format differs from
  29. the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
  30. inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
  31. activations of the input network.
  32. num_sampled: An `int`. The number of classes to randomly sample per batch.
  33. num_classes: An `int`. The number of possible classes.
  34. num_true: An `int`. The number of target classes per training example.
  35. sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
  36. `sampled_expected_count`) returned by a `*_candidate_sampler` function.
  37. (if None, we default to `log_uniform_candidate_sampler`)
  38. subtract_log_q: A `bool`. whether to subtract the log expected count of
  39. the labels in the sample to get the logits of the true labels.
  40. Default is True. Turn off for Negative Sampling.
  41. remove_accidental_hits: A `bool`. whether to remove "accidental hits"
  42. where a sampled class equals one of the target classes. Default is
  43. False.
  44. partition_strategy: A string specifying the partitioning strategy, relevant
  45. if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
  46. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
  47. name: A name for the operation (optional).
  48. seed: random seed for candidate sampling. Default to None, which doesn't set
  49. the op-level random seed for candidate sampling.
  50. Returns:
  51. out_logits: `Tensor` object with shape
  52. `[batch_size, num_true + num_sampled]`, for passing to either
  53. `nn.sigmoid_cross_entropy_with_logits` (NCE) or
  54. `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
  55. out_labels: A Tensor object with the same shape as `out_logits`.
  56. """
  57. if isinstance(weights, variables.PartitionedVariable):
  58. weights = list(weights)
  59. if not isinstance(weights, list):
  60. weights = [weights]
  61. with ops.name_scope(name, "compute_sampled_logits",
  62. weights + [biases, inputs, labels]):
  63. if labels.dtype != dtypes.int64:
  64. labels = math_ops.cast(labels, dtypes.int64)
  65. labels_flat = array_ops.reshape(labels, [-1])
  66. # Sample the negative labels.
  67. # sampled shape: [num_sampled] tensor
  68. # true_expected_count shape = [batch_size, 1] tensor
  69. # sampled_expected_count shape = [num_sampled] tensor
  70. if sampled_values is None:
  71. sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
  72. true_classes=labels,
  73. num_true=num_true,
  74. num_sampled=num_sampled,
  75. unique=True,
  76. range_max=num_classes,
  77. seed=seed)
  78. # NOTE: pylint cannot tell that 'sampled_values' is a sequence
  79. # pylint: disable=unpacking-non-sequence
  80. sampled, true_expected_count, sampled_expected_count = (
  81. array_ops.stop_gradient(s) for s in sampled_values)
  82. # pylint: enable=unpacking-non-sequence
  83. sampled = math_ops.cast(sampled, dtypes.int64)
  84. # labels_flat is a [batch_size * num_true] tensor
  85. # sampled is a [num_sampled] int tensor
  86. all_ids = array_ops.concat([labels_flat, sampled], 0)
  87. # Retrieve the true weights and the logits of the sampled weights.
  88. # weights shape is [num_classes, dim]
  89. all_w = embedding_ops.embedding_lookup(
  90. weights, all_ids, partition_strategy=partition_strategy)
  91. # true_w shape is [batch_size * num_true, dim]
  92. true_w = array_ops.slice(all_w, [0, 0],
  93. array_ops.stack(
  94. [array_ops.shape(labels_flat)[0], -1]))
  95. sampled_w = array_ops.slice(
  96. all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
  97. # inputs has shape [batch_size, dim]
  98. # sampled_w has shape [num_sampled, dim]
  99. # Apply X*W', which yields [batch_size, num_sampled]
  100. sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)
  101. # Retrieve the true and sampled biases, compute the true logits, and
  102. # add the biases to the true and sampled logits.
  103. all_b = embedding_ops.embedding_lookup(
  104. biases, all_ids, partition_strategy=partition_strategy)
  105. # true_b is a [batch_size * num_true] tensor
  106. # sampled_b is a [num_sampled] float tensor
  107. true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
  108. sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])
  109. # inputs shape is [batch_size, dim]
  110. # true_w shape is [batch_size * num_true, dim]
  111. # row_wise_dots is [batch_size, num_true, dim]
  112. dim = array_ops.shape(true_w)[1:2]
  113. new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
  114. row_wise_dots = math_ops.multiply(
  115. array_ops.expand_dims(inputs, 1),
  116. array_ops.reshape(true_w, new_true_w_shape))
  117. # We want the row-wise dot plus biases which yields a
  118. # [batch_size, num_true] tensor of true_logits.
  119. dots_as_matrix = array_ops.reshape(row_wise_dots,
  120. array_ops.concat([[-1], dim], 0))
  121. true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
  122. true_b = array_ops.reshape(true_b, [-1, num_true])
  123. true_logits += true_b
  124. sampled_logits += sampled_b
  125. if remove_accidental_hits:
  126. acc_hits = candidate_sampling_ops.compute_accidental_hits(
  127. labels, sampled, num_true=num_true)
  128. acc_indices, acc_ids, acc_weights = acc_hits
  129. # This is how SparseToDense expects the indices.
  130. acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
  131. acc_ids_2d_int32 = array_ops.reshape(
  132. math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
  133. sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
  134. "sparse_indices")
  135. # Create sampled_logits_shape = [batch_size, num_sampled]
  136. sampled_logits_shape = array_ops.concat(
  137. [array_ops.shape(labels)[:1],
  138. array_ops.expand_dims(num_sampled, 0)], 0)
  139. if sampled_logits.dtype != acc_weights.dtype:
  140. acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
  141. sampled_logits += sparse_ops.sparse_to_dense(
  142. sparse_indices,
  143. sampled_logits_shape,
  144. acc_weights,
  145. default_value=0.0,
  146. validate_indices=False)
  147. if subtract_log_q:
  148. # Subtract log of Q(l), prior probability that l appears in sampled.
  149. true_logits -= math_ops.log(true_expected_count)
  150. sampled_logits -= math_ops.log(sampled_expected_count)
  151. # Construct output logits and labels. The true labels/logits start at col 0.
  152. out_logits = array_ops.concat([true_logits, sampled_logits], 1)
  153. # true_logits is a float tensor, ones_like(true_logits) is a float
  154. # tensor of ones. We then divide by num_true to ensure the per-example
  155. # labels sum to 1.0, i.e. form a proper probability distribution.
  156. out_labels = array_ops.concat([
  157. array_ops.ones_like(true_logits) / num_true,
  158. array_ops.zeros_like(sampled_logits)
  159. ], 1)
  160. return out_logits, out_labels


  1. Returns:
  2. out_logits: `Tensor` object with shape
  3. `[batch_size, num_true + num_sampled]`, for passing to either
  4. `nn.sigmoid_cross_entropy_with_logits` (NCE) or
  5. `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
  6. out_labels: A Tensor object with the same shape as `out_logits`.

即 返回的out_logits和 out_labels的维度都是[batch_size, num_true + num_sampled],其中 num_true + num_sampled代表的就是正样本数+负样本数


  1. out_labels = array_ops.concat([
  2. array_ops.ones_like(true_logits) / num_true,
  3. array_ops.zeros_like(sampled_logits)
  4. ], 1)



  1. @tf_export("ones_like")
  2. def ones_like(tensor, dtype=None, name=None, optimize=True):
  3. """Creates a tensor with all elements set to 1.
  4. Given a single tensor (`tensor`), this operation returns a tensor of the same
  5. type and shape as `tensor` with all elements set to 1. Optionally, you can
  6. specify a new type (`dtype`) for the returned tensor.
  7. For example:
  8. ```python
  9. tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
  10. tf.ones_like(tensor) # [[1, 1, 1], [1, 1, 1]]
  11. ```
  12. Args:
  13. tensor: A `Tensor`.
  14. dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
  15. `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
  16. `complex64`, `complex128` or `bool`.
  17. name: A name for the operation (optional).
  18. optimize: if true, attempt to statically determine the shape of 'tensor'
  19. and encode it as a constant.
  20. Returns:
  21. A `Tensor` with all elements set to 1.
  22. """
  23. with ops.name_scope(name, "ones_like", [tensor]) as name:
  24. tensor = ops.convert_to_tensor(tensor, name="tensor")
  25. ones_shape = shape_internal(tensor, optimize=optimize)
  26. if dtype is None:
  27. dtype = tensor.dtype
  28. ret = ones(ones_shape, dtype=dtype, name=name)
  29. if not context.executing_eagerly():
  30. ret.set_shape(tensor.get_shape())
  31. return ret





二者的维度都是[batch_size, num_true + num_sampled]


  1. if sampled_values is None:
  2. sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
  3. true_classes=labels,
  4. num_true=num_true,
  5. num_sampled=num_sampled,
  6. unique=True,
  7. range_max=num_classes,
  8. seed=seed)



  1. def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
  2. range_max, seed=None, name=None):
  3. """Samples a set of classes using a log-uniform (Zipfian) base distribution.
  4. This operation randomly samples a tensor of sampled classes
  5. (`sampled_candidates`) from the range of integers `[0, range_max)`.
  6. The elements of `sampled_candidates` are drawn without replacement
  7. (if `unique=True`) or with replacement (if `unique=False`) from
  8. the base distribution.
  9. The base distribution for this operation is an approximately log-uniform
  10. or Zipfian distribution:
  11. `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
  12. This sampler is useful when the target classes approximately follow such
  13. a distribution - for example, if the classes represent words in a lexicon
  14. sorted in decreasing order of frequency. If your classes are not ordered by
  15. decreasing frequency, do not use this op.
  16. In addition, this operation returns tensors `true_expected_count`
  17. and `sampled_expected_count` representing the number of times each
  18. of the target classes (`true_classes`) and the sampled
  19. classes (`sampled_candidates`) is expected to occur in an average
  20. tensor of sampled classes. These values correspond to `Q(y|x)`
  21. defined in [this
  22. document](http://www.tensorflow.org/extras/candidate_sampling.pdf).
  23. If `unique=True`, then these are post-rejection probabilities and we
  24. compute them approximately.
  25. Args:
  26. true_classes: A `Tensor` of type `int64` and shape `[batch_size,
  27. num_true]`. The target classes.
  28. num_true: An `int`. The number of target classes per training example.
  29. num_sampled: An `int`. The number of classes to randomly sample.
  30. unique: A `bool`. Determines whether all sampled classes in a batch are
  31. unique.
  32. range_max: An `int`. The number of possible classes.
  33. seed: An `int`. An operation-specific seed. Default is 0.
  34. name: A name for the operation (optional).
  35. Returns:
  36. sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
  37. The sampled classes.
  38. true_expected_count: A tensor of type `float`. Same shape as
  39. `true_classes`. The expected counts under the sampling distribution
  40. of each of `true_classes`.
  41. sampled_expected_count: A tensor of type `float`. Same shape as
  42. `sampled_candidates`. The expected counts under the sampling distribution
  43. of each of `sampled_candidates`.
  44. """
  45. seed1, seed2 = random_seed.get_seed(seed)
  46. return gen_candidate_sampling_ops.log_uniform_candidate_sampler(
  47. true_classes, num_true, num_sampled, unique, range_max, seed=seed1,
  48. seed2=seed2, name=name)

可以看到其对负样本是基于以下概率采样的。之所以不直接使用词频作为采样概率,是因为那样采到的负样本几乎都会是那些高频词汇,例如:and、of、i 等等,显然并不好。另一个极端是使用词频的倒数,但这对英文也没有代表性。根据 Mikolov 等人的论文,实验得出的经验做法是按词频的 3/4 次方进行采样:$P(w_i) = \dfrac{f(w_i)^{3/4}}{\sum_{j} f(w_j)^{3/4}}$,其中 $f(w)$ 是单词 $w$ 的词频。

这里并没有使用上面的公式,但同样使采样概率处于两个极端之间:可以看出 P(class) 是递减函数,即 class 越小,P(class) 越大。class 在本例中代表的是单词的编号,由(五)可以知道,词频越大,编号越小(UNK 除外),所以词频高的单词还是更容易被采样为负样本的!

P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)


  def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
          _sentinel=None,
          labels=None,
          logits=None,
          name=None):
      """Return the element-wise sigmoid cross entropy of `logits` vs `labels`.

      This measures the probability error in discrete classification tasks
      where each class is independent and not mutually exclusive, e.g.
      multilabel classification in which one picture may contain both an
      elephant and a dog.

      Writing `x = logits` and `z = labels`, the logistic loss is

            z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
          = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
          = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
          = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
          = (1 - z) * x + log(1 + exp(-x))
          = x - x * z + log(1 + exp(-x))

      For x < 0, exp(-x) may overflow, so the expression is rewritten as

            x - x * z + log(1 + exp(-x))
          = log(exp(x)) - x * z + log(1 + exp(-x))
          = - x * z + log(1 + exp(x))

      Both cases are covered by the single formula the code evaluates:

          max(x, 0) - x * z + log(1 + exp(-abs(x)))

      `logits` and `labels` must have the same type and shape.

      Args:
        _sentinel: Used to prevent positional parameters. Internal, do not use.
        labels: A `Tensor` of the same type and shape as `logits`.
        logits: A `Tensor` of type `float32` or `float64`.
        name: A name for the operation (optional).

      Returns:
        A `Tensor` of the same shape as `logits` with the componentwise
        logistic losses.

      Raises:
        ValueError: If `logits` and `labels` do not have the same shape.
      """
      # pylint: disable=protected-access
      nn_ops._ensure_xent_args(
          "sigmoid_cross_entropy_with_logits", _sentinel, labels, logits)
      # pylint: enable=protected-access

      with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
          logits = ops.convert_to_tensor(logits, name="logits")
          labels = ops.convert_to_tensor(labels, name="labels")
          try:
              labels.get_shape().merge_with(logits.get_shape())
          except ValueError:
              raise ValueError(
                  "logits and labels must have the same shape (%s vs %s)" %
                  (logits.get_shape(), labels.get_shape()))

          # max(x, 0) and -abs(x) are spelled out with `where` on a shared
          # mask instead of the built-in max/abs ops so that gradients can be
          # computed at zero.
          zero = array_ops.zeros_like(logits, dtype=logits.dtype)
          is_non_negative = (logits >= zero)
          positive_part = array_ops.where(is_non_negative, logits, zero)
          minus_magnitude = array_ops.where(is_non_negative, -logits, logits)

          # max(x, 0) - x * z
          linear_term = positive_part - logits * labels
          # log(1 + exp(-abs(x)))
          softplus_term = math_ops.log1p(math_ops.exp(minus_magnitude))
          return math_ops.add(linear_term, softplus_term, name=name)


z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))

其实z * -log(x) + (1 - z) * -log(1 - x)就是交叉熵,对的,没看错这个函数其实就是将输入先sigmoid再计算交叉熵

如上所示最后化简结果为:x - x * z + log(1 + exp(-x))

这里考虑到当x<0时exp(-x)有可能溢出,所以当x<0时有- x * z + log(1 + exp(x))


 max(x, 0) - x * z + log(1 + exp(-abs(x)))




  def _sum_rows(x):
      """Return a vector containing the sum of every row of the matrix `x`."""
      # For a matrix this yields the same values as math_ops.reduce_sum(x, 1).
      # The original implementation notes that the gradient of this matmul
      # formulation was cheaper than reduce_sum's at the time it was written,
      # which matters because nce_loss() is mostly used for training.
      num_cols = array_ops.shape(x)[1]
      # A (num_cols, 1) column of ones: multiplying by it adds up each row.
      column_of_ones = array_ops.ones(array_ops.stack([num_cols, 1]), x.dtype)
      row_totals = math_ops.matmul(x, column_of_ones)
      # Flatten the resulting single-column matrix into a 1-D vector.
      return array_ops.reshape(row_totals, [-1])

这个相当简单了就是将矩阵的每一行都加起来,即根据上面的[batch_size, num_true + num_sampled],其实就是true loss与 sampled loss之和,即求batch_size中每一个example的总loss




从代码可以看出这里选择的 optimizer 是 GradientDescentOptimizer,然后通过 normalized_embeddings = embeddings / norm 对词向量进行归一化,再用归一化后的向量计算验证词与词表中所有词之间的相似度:



  # Fetch the rows of `normalized_embeddings` that belong to the validation
  # word ids.
  valid_embeddings = tf.nn.embedding_lookup(
      params=normalized_embeddings, ids=valid_dataset)
  # One row per validation word, one column per vocabulary word: the dot
  # product of each validation vector with every row of
  # `normalized_embeddings` (the second operand is transposed by matmul).
  similarity = tf.matmul(
      a=valid_embeddings, b=normalized_embeddings, transpose_b=True)


  # Validation words used to eyeball embedding quality during training.
  valid_size = 16  # Random set of words to evaluate similarity on.
  valid_window = 100  # Only pick dev samples in the head of the distribution.
  # Draw `valid_size` distinct ids (no replacement) from [0, valid_window).
  valid_examples = np.random.choice(
      a=valid_window, size=valid_size, replace=False)


  num_steps = 100001

  with tf.Session(graph=graph) as session:
      # Summaries (and the graph itself) are written to the TensorBoard log dir.
      writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

      # Every variable has to be initialised before it is first used.
      init.run()
      print('Initialized')

      average_loss = 0
      for step in xrange(num_steps):
          batch_inputs, batch_labels = generate_batch(
              batch_size, num_skips, skip_window)
          feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

          # Fresh metadata holder, handed to session.run() so the run can be
          # visualised in TensorBoard.
          run_metadata = tf.RunMetadata()

          # A single session.run() call performs the update step (by evaluating
          # the optimizer op), collects all summaries through the merged op and
          # returns the current loss value.
          _, summary, loss_val = session.run(
              [optimizer, merged, loss],
              feed_dict=feed_dict,
              run_metadata=run_metadata)
          average_loss += loss_val

          # Record this step's summaries.
          writer.add_summary(summary, step)
          # Only the metadata of the very last run is stored.
          if step == num_steps - 1:
              writer.add_run_metadata(run_metadata, 'step%d' % step)

          if step % 2000 == 0:
              if step > 0:
                  average_loss /= 2000
              # The average loss is an estimate of the loss over the last 2000
              # batches.
              print('Average loss at step ', step, ': ', average_loss)
              average_loss = 0

          # Note that this is expensive (~20% slowdown if computed every 500
          # steps).
          if step % 10000 == 0:
              sim = similarity.eval()
              top_k = 8  # number of nearest neighbors
              for i in xrange(valid_size):
                  valid_word = reverse_dictionary[valid_examples[i]]
                  # Sort by descending similarity; position 0 is skipped
                  # (presumably the query word itself).
                  nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                  neighbours = [
                      reverse_dictionary[nearest[k]] for k in xrange(top_k)]
                  log_str = ('Nearest to %s:' % valid_word) + ''.join(
                      ' %s,' % word for word in neighbours)
                  print(log_str)
      final_embeddings = normalized_embeddings.eval()

      # Write one label per line, in id order, for the embedding projector.
      with open(FLAGS.log_dir + '/metadata.tsv', 'w') as metadata_file:
          metadata_file.writelines(
              reverse_dictionary[i] + '\n' for i in xrange(vocabulary_size))

      # Save the model for checkpoints.
      saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

      # Tell TensorBoard's projector which tensor holds the embeddings and
      # where the matching labels were written.
      config = projector.ProjectorConfig()
      embedding_conf = config.embeddings.add()
      embedding_conf.tensor_name = embeddings.name
      embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
      projector.visualize_embeddings(writer, config)

  writer.close()









  def plot_with_labels(low_dim_embs, labels, filename):
      """Scatter-plot embedding points, annotate each one and save the figure.

      Relies on a module-level `plt` being available when called (the script
      imports `matplotlib.pyplot as plt` before invoking this function).

      Args:
        low_dim_embs: Array whose rows each unpack into an (x, y) pair; it must
          have at least as many rows as there are labels.
        labels: Sequence of annotation texts; `labels[i]` is drawn next to row
          `i` of `low_dim_embs`.
        filename: Destination passed to `plt.savefig`.

      Raises:
        AssertionError: If there are more labels than embedding rows.
      """
      assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
      plt.figure(figsize=(18, 18))  # in inches
      # Shared placement of every label relative to the point it annotates.
      annotation_style = dict(
          xytext=(5, 2),
          textcoords='offset points',
          ha='right',
          va='bottom')
      for row, text in enumerate(labels):
          x, y = low_dim_embs[row, :]
          plt.scatter(x, y)
          plt.annotate(text, xy=(x, y), **annotation_style)
      plt.savefig(filename)
  # Project the first `plot_only` embeddings to 2-D with t-SNE and save a
  # labelled scatter plot. The plotting dependencies are optional: if they
  # are missing, a hint is printed instead of failing.
  try:
      # pylint: disable=g-import-not-at-top
      from sklearn.manifold import TSNE
      import matplotlib.pyplot as plt

      tsne = TSNE(
          perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
      plot_only = 500
      low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
      labels = [reverse_dictionary[i] for i in xrange(plot_only)]
      # Save the image directly inside the log directory. The previous path,
      # os.path.join(FLAGS.log_dir, 'tsne.png', 'tsne.png'), pointed into a
      # 'tsne.png' sub-directory that is never created, so saving failed.
      plot_with_labels(low_dim_embs, labels,
                       os.path.join(FLAGS.log_dir, 'tsne.png'))

  except ImportError as ex:
      print('Please install sklearn, matplotlib, and scipy to show embeddings.')
      print(ex)





n_components 降为几维,默认2

init 可选,嵌入的初始化方式,默认值:“random”。这里选取 pca 是因为其通常比随机初始化更全局稳定。但需要注意的是,pca 初始化不能用于预先计算的距离

n_iter 优化的最大迭代次数

method  梯度计算算法。默认使用在 O(NlogN) 时间内运行的 Barnes-Hut 近似算法;method='exact' 则使用较慢但精确的 O(N^2) 算法。当最近邻的误差需要好于 3% 时,应该使用精确算法,但精确算法无法扩展到数百万个样本。(Barnes-Hut 近似优化方法为 0.17 版新增功能。)




tensorflow word2vec demo详解


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]


Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


3081 originated -> 5234 anarchism
3081 originated -> 12 as
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a



tensorflow word2vec demo详解

可以看到最和from接近的词汇有into, in, at, through, near, upanija, wct, polynomial

和five接近的词汇有 four, three, seven, six, eight, two, zero, nine等等


tensorflow word2vec demo详解

tensorflow word2vec demo详解



tensorflow word2vec demo详解

tensorflow word2vec demo详解





I      am      looking     for     the     missing     glass-shoes     who     has     picked   it      up .............

batch:[ ' i ' , ' looking '] , [ ' am ' , ' for '] , [ ' looking ' , ' the '] , [ ' for ' , ' missing '] .................

labels: [ ' am '  ,  ' looking ' ,  ' for '  ,  ' the ' ] 


  def generate_batch(batch_size, cbow_window):
      """Build one CBOW training batch from the module-level `data` sequence.

      A window of `2 * cbow_window + 1` consecutive entries of `data` slides
      forward one position per example. The centre entry becomes the label and
      the remaining entries (in their original order) become the context row.
      Reading starts at the module-level `data_index`, wraps around at the end
      of `data`, and `data_index` is advanced as a side effect.

      Args:
        batch_size: Number of (context, label) examples to produce.
        cbow_window: Number of context words taken on each side of the centre
          word. Any value >= 1 is accepted (odd or even).

      Returns:
        A tuple `(batch, labels)` of int32 arrays with shapes
        `(batch_size, 2 * cbow_window)` and `(batch_size, 1)`.

      Raises:
        ValueError: If `cbow_window` is smaller than 1.
      """
      global data_index
      # The window span 2 * cbow_window + 1 is odd for every window size, so
      # only a lower bound is needed (the former parity assertion wrongly
      # rejected even window sizes).
      if cbow_window < 1:
          raise ValueError('cbow_window must be >= 1, got %r' % (cbow_window,))
      span = 2 * cbow_window + 1
      # The centre word is excluded from the context, hence span - 1 columns.
      batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
      labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
      window = collections.deque(maxlen=span)
      for _ in range(span):
          window.append(data[data_index])
          # Walk through `data` cyclically: restart from the head at the end.
          data_index = (data_index + 1) % len(data)
      for i in range(batch_size):
          current = list(window)
          # Context = everything in the window except the centre word.
          batch[i, :] = current[:cbow_window] + current[cbow_window + 1:]
          # The centre of the window is the word to predict.
          labels[i, 0] = current[cbow_window]
          # Slide the window one position; the deque drops its oldest entry.
          window.append(data[data_index])
          data_index = (data_index + 1) % len(data)
      return batch, labels
  # Demo: build a small CBOW batch and print each example as
  # "<left context> and <right context> -> <centre word>".
  batch, labels = generate_batch(batch_size=8, cbow_window=1)
  for context_ids, target_ids in zip(batch, labels):
      left_word = reverse_dictionary[context_ids[0]]
      right_word = reverse_dictionary[context_ids[1]]
      centre_word = reverse_dictionary[target_ids[0]]
      print(left_word, 'and', right_word, '->', centre_word)
  1. with graph.as_default():
  2. # Input data.
  3. with tf.name_scope('inputs'):
  4. train_dataset = tf.placeholder(tf.int32, shape=[batch_size,2 * cbow_window])
  5. train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  6. valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  7. # Ops and variables pinned to the CPU because of missing GPU implementation
  8. with tf.device('/cpu:0'):
  9. # Look up embeddings for inputs.
  10. with tf.name_scope('embeddings'):
  11. embeddings = tf.Variable(
  12. tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  13. # Construct the variables for the NCE loss
  14. with tf.name_scope('weights'):
  15. nce_weights = tf.Variable(
  16. tf.truncated_normal(
  17. [vocabulary_size, embedding_size],
  18. stddev=1.0 / math.sqrt(embedding_size)))
  19. with tf.name_scope('biases'):
  20. nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
  21. embeds = None
  22. for i in range(2 * cbow_window):
  23. embedding_i = tf.nn.embedding_lookup(embeddings, train_dataset[:,i])
  24. print('embedding %d shape: %s'%(i, embedding_i.get_shape().as_list()))
  25. emb_x,emb_y = embedding_i.get_shape().as_list()
  26. if embeds is None:
  27. embeds = tf.reshape(embedding_i, [emb_x,emb_y,1])
  28. else:
  29. embeds = tf.concat([embeds, tf.reshape(embedding_i, [emb_x, emb_y,1])], 2)
  30. print("Concat embedding size: %s"%embeds.get_shape().as_list())
  31. avg_embed = tf.reduce_mean(embeds, 2, keep_dims=False)
  32. print("Avg embedding size: %s"%avg_embed.get_shape().as_list())
  33. print('--------------------------------------------------------------------------------------------')
  34. print(avg_embed.shape)
  35. print(train_labels.shape)
  36. print('--------------------------------------------------------------------------------------------')
  37. # Compute the average NCE loss for the batch.
  38. # tf.nce_loss automatically draws a new sample of the negative labels each
  39. # time we evaluate the loss.
  40. # Explanation of the meaning of NCE loss:
  41. # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  42. with tf.name_scope('loss'):
  43. loss = tf.reduce_mean(
  44. tf.nn.nce_loss(
  45. weights=nce_weights,
  46. biases=nce_biases,
  47. labels=train_labels,