# neumf.py
  1. # Copyright (c) 2018. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # -----------------------------------------------------------------------
  16. #
  17. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  18. #
  19. # Licensed under the Apache License, Version 2.0 (the "License");
  20. # you may not use this file except in compliance with the License.
  21. # You may obtain a copy of the License at
  22. #
  23. # http://www.apache.org/licenses/LICENSE-2.0
  24. #
  25. # Unless required by applicable law or agreed to in writing, software
  26. # distributed under the License is distributed on an "AS IS" BASIS,
  27. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28. # See the License for the specific language governing permissions and
  29. # limitations under the License.
  30. import tensorflow as tf
  31. import horovod.tensorflow as hvd
  32. def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
  33. initializer=None, regularizer=None,
  34. trainable=True,
  35. *args, **kwargs):
  36. """
  37. Custom variable getter that forces trainable variables to be stored in
  38. float32 precision and then casts them to the half-precision
  39. """
  40. storage_dtype = tf.float32 if trainable else dtype
  41. variable = getter(name, shape, dtype=storage_dtype,
  42. initializer=initializer, regularizer=regularizer,
  43. trainable=trainable,
  44. *args, **kwargs)
  45. if trainable and dtype != tf.float32:
  46. variable = tf.cast(variable, dtype)
  47. return variable
  48. def neural_mf(users,
  49. items,
  50. model_dtype,
  51. nb_users,
  52. nb_items,
  53. mf_dim,
  54. mf_reg,
  55. mlp_layer_sizes,
  56. mlp_layer_regs,
  57. dropout_rate,
  58. sigmoid=False):
  59. """
  60. Constructs the model graph
  61. """
  62. # Check params
  63. if len(mlp_layer_sizes) != len(mlp_layer_regs):
  64. raise RuntimeError('u dummy, layer_sized != layer_regs')
  65. if mlp_layer_sizes[0] % 2 != 0:
  66. raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
  67. nb_mlp_layers = len(mlp_layer_sizes)
  68. # Embeddings
  69. user_embed = tf.get_variable(
  70. "user_embeddings",
  71. shape=[nb_users, mf_dim + mlp_layer_sizes[0] // 2],
  72. initializer=tf.initializers.random_normal(mean=0.0, stddev=0.01))
  73. item_embed = tf.get_variable(
  74. "item_embeddings",
  75. shape=[nb_items, mf_dim + mlp_layer_sizes[0] // 2],
  76. initializer=tf.initializers.random_normal(mean=0.0, stddev=0.01))
  77. # Matrix Factorization Embeddings
  78. xmfu = tf.nn.embedding_lookup(user_embed[:, :mf_dim], users, partition_strategy='div')
  79. xmfi = tf.nn.embedding_lookup(item_embed[:, :mf_dim], items, partition_strategy='div')
  80. # MLP Network Embeddings
  81. xmlpu = tf.nn.embedding_lookup(user_embed[:, mf_dim:], users, partition_strategy='div')
  82. xmlpi = tf.nn.embedding_lookup(item_embed[:, mf_dim:], items, partition_strategy='div')
  83. # Enforce model to use fp16 data types when manually enabling mixed precision
  84. # (Tensorfow ops will use automatically use the data type of the first input)
  85. if model_dtype == tf.float16:
  86. xmfu = tf.cast(xmfu, model_dtype)
  87. xmfi = tf.cast(xmfi, model_dtype)
  88. xmlpu = tf.cast(xmlpu, model_dtype)
  89. xmlpi = tf.cast(xmlpi, model_dtype)
  90. # Matrix Factorization
  91. xmf = tf.math.multiply(xmfu, xmfi)
  92. # MLP Layers
  93. xmlp = tf.concat((xmlpu, xmlpi), 1)
  94. for i in range(1, nb_mlp_layers):
  95. xmlp = tf.layers.Dense(
  96. mlp_layer_sizes[i],
  97. activation=tf.nn.relu,
  98. kernel_initializer=tf.glorot_uniform_initializer()
  99. ).apply(xmlp)
  100. xmlp = tf.layers.Dropout(rate=dropout_rate).apply(xmlp)
  101. # Final fully-connected layer
  102. logits = tf.concat((xmf, xmlp), 1)
  103. logits = tf.layers.Dense(
  104. 1,
  105. kernel_initializer=tf.keras.initializers.lecun_uniform()
  106. ).apply(logits)
  107. if sigmoid:
  108. logits = tf.math.sigmoid(logits)
  109. # Cast model outputs back to float32 if manually enabling mixed precision for loss calculation
  110. if model_dtype == tf.float16:
  111. logits = tf.cast(logits, tf.float32)
  112. return logits
  113. def compute_eval_metrics(logits, dup_mask, val_batch_size, K):
  114. """
  115. Constructs the graph to compute Hit Rate and NDCG
  116. """
  117. # Replace duplicate (uid, iid) pairs with -inf
  118. logits = logits * (1. - dup_mask)
  119. logits = logits + (dup_mask * logits.dtype.min)
  120. # Reshape tensors so that each row corresponds with a user
  121. logits_by_user = tf.reshape(logits, [-1, val_batch_size])
  122. dup_mask_by_user = tf.cast(tf.reshape(logits, [-1, val_batch_size]), tf.bool)
  123. # Get the topk items for each user
  124. top_item_indices = tf.math.top_k(logits_by_user, K)[1]
  125. # Check that the positive sample (last index) is in the top K
  126. is_positive = tf.cast(tf.equal(top_item_indices, val_batch_size-1), tf.int32)
  127. found_positive = tf.reduce_sum(is_positive, axis=1)
  128. # Extract the rankings of the positive samples
  129. positive_ranks = tf.reduce_sum(is_positive * tf.expand_dims(tf.range(K), 0), axis=1)
  130. dcg = tf.log(2.) / tf.log(tf.cast(positive_ranks, tf.float32) + 2)
  131. dcg *= tf.cast(found_positive, dcg.dtype)
  132. return found_positive, dcg
  133. def ncf_model_ops(users,
  134. items,
  135. labels,
  136. dup_mask,
  137. params,
  138. mode='TRAIN'):
  139. """
  140. Constructs the training and evaluation graphs
  141. """
  142. # Validation params
  143. val_batch_size = params['val_batch_size']
  144. K = params['top_k']
  145. # Training params
  146. learning_rate = params['learning_rate']
  147. beta_1 = params['beta_1']
  148. beta_2 = params['beta_2']
  149. epsilon = params['epsilon']
  150. # Model params
  151. fp16 = False
  152. nb_users = params['num_users']
  153. nb_items = params['num_items']
  154. mf_dim = params['num_factors']
  155. mf_reg = params['mf_reg']
  156. mlp_layer_sizes = params['layer_sizes']
  157. mlp_layer_regs = params['layer_regs']
  158. dropout = params['dropout']
  159. sigmoid = False #params['sigmoid']
  160. loss_scale = params['loss_scale']
  161. model_dtype = tf.float16 if fp16 else tf.float32
  162. # If manually enabling mixed precision, use the custom variable getter
  163. custom_getter = None if not fp16 else float32_variable_storage_getter
  164. # Allow soft device placement
  165. with tf.device(None), \
  166. tf.variable_scope('neumf', custom_getter=custom_getter):
  167. # Model graph
  168. logits = neural_mf(
  169. users,
  170. items,
  171. model_dtype,
  172. nb_users,
  173. nb_items,
  174. mf_dim,
  175. mf_reg,
  176. mlp_layer_sizes,
  177. mlp_layer_regs,
  178. dropout,
  179. sigmoid
  180. )
  181. logits = tf.squeeze(logits)
  182. if mode == 'INFERENCE':
  183. return logits
  184. # Evaluation Ops
  185. found_positive, dcg = compute_eval_metrics(logits, dup_mask, val_batch_size, K)
  186. # Metrics
  187. hit_rate = tf.metrics.mean(found_positive, name='hit_rate')
  188. ndcg = tf.metrics.mean(dcg, name='ndcg')
  189. eval_op = tf.group(hit_rate[1], ndcg[1])
  190. if mode == 'EVAL':
  191. return hit_rate[0], ndcg[0], eval_op, None
  192. # Labels
  193. labels = tf.reshape(labels, [-1, 1])
  194. logits = tf.reshape(logits, [-1, 1])
  195. # Use adaptive momentum optimizer
  196. optimizer = tf.train.AdamOptimizer(
  197. learning_rate=learning_rate,
  198. beta1=beta_1, beta2=beta_2,
  199. epsilon=epsilon)
  200. loss = tf.losses.sigmoid_cross_entropy(
  201. labels,
  202. logits,
  203. reduction=tf.losses.Reduction.MEAN)
  204. # Apply loss scaling if manually enabling mixed precision
  205. if fp16:
  206. if loss_scale is None:
  207. loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(2**32, 1000)
  208. else:
  209. loss_scale_manager = tf.contrib.mixed_precision.FixedLossScaleManager(loss_scale)
  210. optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager)
  211. # Horovod wrapper for distributed training
  212. optimizer = hvd.DistributedOptimizer(optimizer)
  213. # Update ops
  214. global_step = tf.train.get_global_step()
  215. train_op = optimizer.minimize(loss, global_step=global_step)
  216. return hit_rate[0], ndcg[0], eval_op, train_op