
Adding VAE-CF/TF

Przemek Strzelczyk, 6 years ago
Commit
b348f179e2

+ 0 - 0
TensorFlow/Recommendation/VAE-CF/.gitmodules


+ 22 - 0
TensorFlow/Recommendation/VAE-CF/Dockerfile

@@ -0,0 +1,22 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.11-tf1-py3
+FROM ${FROM_IMAGE_NAME}
+
+ADD requirements.txt .
+RUN pip install -r requirements.txt
+
+WORKDIR /code
+COPY . .

+ 201 - 0
TensorFlow/Recommendation/VAE-CF/LICENSE.md

@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 3 - 0
TensorFlow/Recommendation/VAE-CF/NOTICE

@@ -0,0 +1,3 @@
+VAE-CF Tensorflow
+
+This repository includes software from https://github.com/mkfilipiuk/VAE-CF developed by Albert Cieślak, Michał Filipiuk, Frederic Grabowski and Radosław Rowicki and licensed under the Apache License, Version 2.0.

+ 373 - 0
TensorFlow/Recommendation/VAE-CF/README.md

@@ -0,0 +1,373 @@
+# Variational Autoencoder for Collaborative Filtering 19.11 for TensorFlow
+
+This repository provides a script and recipe to train the Variational Autoencoder model for TensorFlow to achieve state-of-the-art accuracy on a Collaborative Filtering task and is tested and maintained by NVIDIA.
+
+## Table Of Contents
+
+- [Model overview](#model-overview)
+    * [Model architecture](#model-architecture)
+    * [Default configuration](#default-configuration)
+    * [Feature support matrix](#feature-support-matrix)
+	    * [Features](#features)
+    * [Mixed precision training](#mixed-precision-training)
+	    * [Enabling mixed precision](#enabling-mixed-precision)
+- [Setup](#setup)
+    * [Requirements](#requirements)
+- [Quick Start Guide](#quick-start-guide)
+- [Advanced](#advanced)
+    * [Scripts and sample code](#scripts-and-sample-code)
+    * [Parameters](#parameters)
+    * [Command-line options](#command-line-options)
+    * [Getting the data](#getting-the-data)
+        * [Dataset guidelines](#dataset-guidelines)
+    * [Training process](#training-process)
+    * [Inference process](#inference-process)
+
+- [Performance](#performance)
+    * [Benchmarking](#benchmarking)
+        * [Training performance benchmark](#training-performance-benchmark)
+        * [Inference performance benchmark](#inference-performance-benchmark)
+    * [Results](#results)
+        * [Training accuracy results](#training-accuracy-results)
+            * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
+            * [Training stability test](#training-stability-test)
+        * [Training performance results](#training-performance-results)
+            * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
+        * [Inference performance results](#inference-performance-results)
+            * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
+- [Release notes](#release-notes)
+    * [Changelog](#changelog)
+    * [Known issues](#known-issues)
+
+
+## Model overview
+
+The Variational Autoencoder (VAE) shown here is an optimized implementation of the architecture first described in [Variational Autoencoders for Collaborative Filtering](https://arxiv.org/abs/1802.05814) and can be used for recommendation tasks. The main differences between this model and the original are performance optimizations, such as using sparse matrices, mixed precision, larger mini-batches and multiple GPUs. These changes enabled us to achieve significantly better speed while maintaining the same accuracy. Because of our fast implementation, we’ve also been able to carry out an extensive hyperparameter search to slightly improve the accuracy metrics.
+
+When using Variational Autoencoder for Collaborative Filtering (VAE-CF), you can quickly train a recommendation model for a collaborative filtering task. The required input data consists of pairs of user-item IDs for each interaction between a user and an item. With a trained model, you can run inference to predict which items a new user is most likely to interact with.
+
+This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs, which allows researchers to get results up to 1.9x faster than training without Tensor Cores while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+This implementation has been initially developed as an educational project at the University of Warsaw by Albert Cieślak, Michał Filipiuk, Frederic Grabowski and Radosław Rowicki.
+
+### Model architecture
+
+<p align="center">
+   <img width="70%" src="images/autoencoder.png" />
+   <br>
+   Figure 1. The architecture of the VAE-CF model </p>
+
+
+The Variational Autoencoder is a neural network that provides collaborative filtering based on implicit feedback. Specifically, it provides product recommendations based on user and item interactions. The training data for this model should contain a sequence of (user ID, item ID) pairs, each indicating that the specified user has interacted with the specified item.
+
+The model consists of two parts: the encoder and the decoder. 
+The encoder transforms the vector that contains the interactions for a specific user into an n-dimensional variational distribution. We can then sample from this variational distribution to obtain a latent representation of the user.
+This latent representation is then fed into the decoder. The result is a vector of interaction probabilities over all items for that user.
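+
+As an illustration, here is a minimal sketch of such an encoder/decoder pair in TensorFlow 1.x. The layer sizes follow the `[n_items, 600, 200]` encoder dimensions used in `main.py`; the function names here are hypothetical and do not mirror the actual code in `vae/models`:
+
+```python
+import tensorflow as tf
+
+def encoder(x, latent_dim=200):
+    # x: a user's interaction vector, shape [batch, n_items]
+    h = tf.layers.dense(x, 600, activation=tf.nn.tanh)
+    mu = tf.layers.dense(h, latent_dim)       # mean of the variational distribution
+    log_var = tf.layers.dense(h, latent_dim)  # log-variance of the variational distribution
+    return mu, log_var
+
+def sample_latent(mu, log_var):
+    # reparameterization trick: z = mu + sigma * eps keeps sampling differentiable
+    eps = tf.random_normal(tf.shape(mu))
+    return mu + tf.exp(0.5 * log_var) * eps
+
+def decoder(z, n_items):
+    h = tf.layers.dense(z, 600, activation=tf.nn.tanh)
+    return tf.layers.dense(h, n_items)        # per-item interaction logits
+```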
+
+### Default configuration
+
+The following features were implemented in this model:
+- general
+	- sparse matrix support
+	- data-parallel multi-GPU training
+	- dynamic loss scaling with backoff for Tensor Core (mixed precision) training
+    
+### Feature support matrix
+
+The following features are supported by this model: 
+
+| Feature                          | VAE-CF |
+|----------------------------------|--------|
+| Horovod Multi-GPU (NCCL)         | Yes    |
+| Automatic mixed precision (AMP)  | Yes    |
+         
+#### Features
+
+**Horovod**
+
+Horovod is a distributed training framework for TensorFlow, Keras, PyTorch and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
+
+**Multi-GPU training with Horovod**
+
+Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
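+
+The integration itself is small; a minimal sketch of the standard Horovod usage pattern in TensorFlow 1.x is shown below (the exact optimizer setup in `vae/models` may differ):
+
+```python
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+hvd.init()
+
+# pin each worker process to a single GPU
+config = tf.ConfigProto()
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+# wrap the optimizer so gradients are averaged across workers over NCCL
+opt = tf.train.AdamOptimizer(learning_rate=0.004)
+opt = hvd.DistributedOptimizer(opt)
+
+# after building the graph, synchronize initial weights from rank 0
+bcast_op = hvd.broadcast_global_variables(0)
+```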
+
+### Mixed precision training
+
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+1.  Porting the model to use the FP16 data type where appropriate.    
+2.  Adding loss scaling to preserve small gradient values.
+
+The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
+
+For information about:
+-   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
+-   Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
+-   How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
+
+
+#### Enabling mixed precision
+
+To enable mixed precision in VAE-CF, run the `main.py` script with the `--use_tf_amp` flag.
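+
+Internally, `main.py` does this by setting the `TF_ENABLE_AUTO_MIXED_PRECISION` environment variable before the TensorFlow graph is built, which turns on the automatic mixed precision graph rewrite in the TF1 NGC containers:
+
+```python
+import os
+
+# '1' enables the AMP graph rewrite, '0' disables it (see main.py)
+os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0'
+```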
+
+
+## Setup
+
+The following section lists the requirements that you need to meet in order to start training the VAE-CF model.
+
+### Requirements
+
+This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+-   [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+-   TensorFlow 19.11+ NGC container
+-   [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+
+For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+-   [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+-   [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
+- [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
+  
+For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+## Quick Start Guide
+
+To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the VAE-CF model on the [MovieLens 20m dataset](https://grouplens.org/datasets/movielens/20m/). For the specifics concerning training and inference, see the [Advanced](#advanced) section.
+
+1. Clone the repository.
+
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/TensorFlow/Recommendation/VAE-CF
+```
+
+2. Build the VAE TensorFlow NGC container.
+
+```bash
+docker build . -t vae
+``` 
+
+3. Launch the VAE-CF TensorFlow Docker container.
+```bash
+docker run -it --rm --runtime=nvidia -v /data/vae-cf:/data vae /bin/bash
+``` 
+
+4. Prepare the dataset.
+```bash
+python3 prepare_dataset.py
+``` 
+
+5. Start training.
+```bash
+python3 main.py --train --use_tf_amp --checkpoint_dir ./checkpoints
+```
+6. Start validation/evaluation.
+
+The model is saved to the directory specified by `--checkpoint_dir` and can be loaded and tested using:
+
+
+```bash
+python3 main.py --test --use_tf_amp --checkpoint_dir ./checkpoints
+```
+
+
+## Advanced
+
+The following sections provide greater details of the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+The `main.py` script provides an entry point to all the provided functionalities. This includes running training, testing and inference. The behavior of the script is controlled by command-line arguments listed below in the [Parameters](#parameters) section. The `prepare_dataset.py` script can be used to download and preprocess the MovieLens 20m dataset.
+
+Most of the deep learning logic is implemented in the `vae/models` subdirectory. The `vae/load` subdirectory contains code for downloading and preprocessing the dataset. The `vae/metrics` subdirectory provides functions for computing the validation metrics such as recall and [NDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG).
+
+### Parameters
+
+To train a VAE-CF model in TensorFlow, the following parameters are supported:
+
+```
+usage: main.py [-h] [--train] [--test] [--inference] [--inference_benchmark]
+               [--use_tf_amp] [--epochs EPOCHS]
+               [--batch_size_train BATCH_SIZE_TRAIN]
+               [--batch_size_validation BATCH_SIZE_VALIDATION]
+               [--validation_step VALIDATION_STEP]
+               [--warm_up_epochs WARM_UP_EPOCHS]
+               [--total_anneal_steps TOTAL_ANNEAL_STEPS]
+               [--anneal_cap ANNEAL_CAP] [--lam LAM] [--lr LR] [--beta1 BETA1]
+               [--beta2 BETA2] [--top_results TOP_RESULTS] [--xla] [--trace]
+               [--activation ACTIVATION] [--log_path LOG_PATH] [--seed SEED]
+               [--data_dir DATA_DIR] [--checkpoint_dir CHECKPOINT_DIR]
+
+Train a Variational Autoencoder for Collaborative Filtering in TensorFlow
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --train               Run training of VAE
+  --test                Run validation of VAE
+  --inference           Run inference on a single random example. This can
+                        also be used to measure the latency for a batch size
+                        of 1
+  --inference_benchmark
+                        Benchmark the inference throughput on a very large
+                        batch size
+  --use_tf_amp          Enable Automatic Mixed Precision
+  --epochs EPOCHS       Number of epochs to train
+  --batch_size_train BATCH_SIZE_TRAIN
+                        Global batch size for training
+  --batch_size_validation BATCH_SIZE_VALIDATION
+                        Used both for validation and testing
+  --validation_step VALIDATION_STEP
+                        Number of training epochs between validation runs
+  --warm_up_epochs WARM_UP_EPOCHS
+                        Number of epochs to omit during benchmark
+  --total_anneal_steps TOTAL_ANNEAL_STEPS
+                        Number of annealing steps
+  --anneal_cap ANNEAL_CAP
+                        Annealing cap
+  --lam LAM             Regularization parameter
+  --lr LR               Learning rate
+  --beta1 BETA1         Adam beta1
+  --beta2 BETA2         Adam beta2
+  --top_results TOP_RESULTS
+                        Number of results to be recommended
+  --xla                 Enable XLA
+  --trace               Save profiling traces
+  --activation ACTIVATION
+                        Activation function
+  --log_path LOG_PATH   Path to the detailed JSON log to be created
+  --seed SEED           Random seed for TensorFlow and numpy
+  --data_dir DATA_DIR   Directory for storing the training data
+  --checkpoint_dir CHECKPOINT_DIR
+                        Path for saving a checkpoint after the training
+
+```
+
+### Command-line options
+
+To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+```bash
+python main.py --help
+```
+
+
+### Getting the data
+
+The VAE-CF model was trained on the [MovieLens 20M dataset](https://grouplens.org/datasets/movielens/20m/). The dataset can be downloaded and preprocessed simply by running `python prepare_dataset.py` in the Docker container. By default, the dataset will be stored in the `/data` directory. If you want to store the data in a different location, you can pass the desired location to the `--data_dir` argument.
+
+#### Dataset guidelines
+
+As a Collaborative Filtering model, VAE-CF only uses information about which user interacted with which item. For the MovieLens dataset, this means that a particular user has positively reviewed a particular movie. VAE-CF can be adapted to any other collaborative filtering task. The input to the model is generally a list of all interactions between users and items. One column of the CSV should contain user IDs while the other should contain item IDs. Example preprocessing for the MovieLens 20M dataset is provided in the `vae/load/preprocessing.py` file.
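+
+For a custom dataset, the preprocessing boils down to building a sparse user-item interaction matrix. A minimal sketch, assuming a CSV with hypothetical `userId` and `itemId` columns:
+
+```python
+import numpy as np
+import pandas as pd
+from scipy import sparse
+
+df = pd.read_csv('interactions.csv')  # columns: userId, itemId
+
+# map raw IDs to contiguous row/column indices
+user_index = {u: i for i, u in enumerate(df['userId'].unique())}
+item_index = {s: i for i, s in enumerate(df['itemId'].unique())}
+
+rows = df['userId'].map(user_index)
+cols = df['itemId'].map(item_index)
+matrix = sparse.csr_matrix((np.ones(len(df)), (rows, cols)),
+                           shape=(len(user_index), len(item_index)),
+                           dtype='float64')
+```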
+
+
+### Training process
+
+Training can be started by running the `main.py` script with the `--train` flag. The resulting checkpoints containing the trained model weights are then stored in the directory specified by the `--checkpoint_dir` argument (by default, no checkpoints are saved).
+
+Additionally, a command-line argument called `--results_dir` (by default None) can be used to enable saving some statistics to JSON files in a directory specified by this parameter. The statistics saved are:
+1) a complete list of command-line arguments saved as `<results_dir>/args.json` and
+2) a dictionary of validation metrics and performance metrics recorded during training
+
+The main validation metric used is [NDCG@100](https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG). Following the original VAE-CF paper, we also report numbers for Recall@20 and Recall@50.
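+
+For reference, both metrics can be computed from the predicted scores and the held-out ground truth roughly as follows. This is a simplified sketch for a single user; the actual batched implementations live in `vae/metrics`:
+
+```python
+import numpy as np
+
+def recall_at_k(scores, true_items, k):
+    # scores: predicted score per item; true_items: set of held-out item indices
+    top_k = np.argsort(-scores)[:k]
+    hits = len(set(top_k) & true_items)
+    return hits / min(k, len(true_items))
+
+def ndcg_at_k(scores, true_items, k):
+    top_k = np.argsort(-scores)[:k]
+    dcg = sum(1.0 / np.log2(rank + 2) for rank, item in enumerate(top_k)
+              if item in true_items)
+    ideal = sum(1.0 / np.log2(rank + 2) for rank in range(min(k, len(true_items))))
+    return dcg / ideal
+```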
+
+Multi-GPU training uses Horovod. You can run it with:
+```bash
+horovodrun -np 8 -H localhost:8 python3 main.py --train --use_tf_amp
+```
+
+Mixed precision support is controlled by the `--use_tf_amp` command-line flag. It enables TensorFlow’s Automatic Mixed Precision mode.
+
+
+### Inference process
+
+Inference on a trained model can be run by passing the `--inference` argument to the `main.py` script, for example:
+```
+python3 main.py --inference --use_tf_amp --checkpoint_dir /checkpoints
+```
+This will generate a user with a collection of random items that they interacted with and run inference for that user. The result is a list of K recommended items the user is likely to interact with. You can control the number of items to be recommended by setting the `--top_results` command-line argument (by default 100).
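+
+Internally this corresponds to the following call in `main.py`, where the random item indices stand in for a real user's interaction history:
+
+```python
+input_data = np.random.randint(low=0, high=10000, size=10)  # indices of items the user interacted with
+recommendations = vae.query(input_data=input_data)
+```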
+
+
+## Performance
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+#### Training performance benchmark
+
+
+To benchmark the training performance, run:
+```
+horovodrun -np 8 -H localhost:8 python3 main.py  --train --use_tf_amp
+```
+
+The training benchmark was run on 8x V100 16G GPUs.
+
+#### Inference performance benchmark
+
+To benchmark the inference performance, run:
+```
+python3 main.py  --inference_benchmark --use_tf_amp --batch_size_validation 24576
+```
+ 
+The inference benchmark was run on a single V100 16G GPU.
+
+### Results
+
+The following sections provide details on how we achieved our performance and accuracy in training and inference.
+
+#### Training accuracy results
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
+
+Our results were obtained by running the `main.py` training script in the TensorFlow 19.11 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
+
+| GPUs | Batch size / GPU | Accuracy - FP32 | Accuracy - mixed precision  | Time to train - FP32 (s) |  Time to train - mixed precision (s) | Time to train speedup (FP32 to mixed precision) |
+|---|---|---|---|---|---|---|
+| 1 | 24576 | 0.42863  | 0.42824 | 357.6| 205.9  | 1.737 |
+| 8 | 3072  | 0.42763  | 0.42766 | 59.7 | 43.2  | 1.381 |
+
+
+#### Training performance results
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16G)
+
+Our results were obtained by running:
+```
+ horovodrun -np 8 -H localhost:8 python3 main.py  --train --use_tf_amp
+```
+in the TensorFlow 19.11 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (throughput in users processed per second) were averaged over an entire training run.
+
+| GPUs   | Batch size / GPU   | Throughput - FP32    | Throughput - mixed precision    | Throughput speedup (FP32 - mixed precision)   | Strong scaling - FP32    | Strong scaling - mixed precision |
+|---|---|---|---|---|---|---|
+| 1 | 24576| 116k | 219k | 1.897 | 1.00| 1.00|
+| 8 | 3072 | 685k | 966k | 1.410 | 5.92 | 4.41 |
+
+We use users processed per second as a throughput metric for measuring training performance.
+
+#### Inference performance results
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
+
+Our results were obtained by running:
+```
+python3 main.py  --inference_benchmark --use_tf_amp --batch_size_validation 24576
+```
+in the TensorFlow 19.11 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPU.
+
+| GPUs   | Batch size / GPU   | Inference Throughput - FP32    | Inference Throughput - mixed precision    | Inference Throughput speedup (FP32 - mixed precision) |
+|---|---|---|---|---|
+| 1 | 24576| 127k | 154k | 1.215 |
+
+We use users processed per second as a throughput metric for measuring inference performance.
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+## Release notes
+
+### Changelog
+
+November 2019
+- Initial release
+
+### Known issues
+
+Multi-GPU scaling
+
+We benchmark this implementation on the ML-20m dataset so that our results are comparable to those of the original VAE-CF paper. We also use the same neural network architecture. As a consequence, the ratio of communication to computation is relatively large. This means that although using multiple GPUs speeds up the training substantially, the scaling efficiency is worse than what one would expect from a larger model and a more realistic dataset.
+

BIN
TensorFlow/Recommendation/VAE-CF/images/autoencoder.png


+ 182 - 0
TensorFlow/Recommendation/VAE-CF/main.py

@@ -0,0 +1,182 @@
+#!/usr/bin/python3
+
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from functools import partial
+import json
+import logging
+from argparse import ArgumentParser
+import tensorflow as tf
+import numpy as np
+import horovod.tensorflow as hvd
+import dllogger
+
+from vae.utils.round import round_8
+from vae.metrics.recall import recall
+from vae.metrics.ndcg import ndcg
+from vae.models.train import VAE
+from vae.load.preprocessing import load_and_parse_ML_20M
+
+def main():
+    hvd.init()
+
+    parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow")
+    parser.add_argument('--train', action='store_true',
+                        help='Run training of VAE')
+    parser.add_argument('--test', action='store_true',
+                        help='Run validation of VAE')
+    parser.add_argument('--inference', action='store_true',
+                        help='Run inference on a single random example. '
+                        'This can also be used to measure the latency for a batch size of 1')
+    parser.add_argument('--inference_benchmark', action='store_true',
+                        help='Benchmark the inference throughput on a very large batch size')
+    parser.add_argument('--use_tf_amp', action='store_true',
+                        help='Enable Automatic Mixed Precision')
+    parser.add_argument('--epochs', type=int, default=400,
+                        help='Number of epochs to train')
+    parser.add_argument('--batch_size_train', type=int, default=24576,
+                        help='Global batch size for training')
+    parser.add_argument('--batch_size_validation', type=int, default=10000,
+                        help='Used both for validation and testing')
+    parser.add_argument('--validation_step', type=int, default=50,
+                        help='Number of training epochs between validation runs')
+    parser.add_argument('--warm_up_epochs', type=int, default=5,
+                        help='Number of epochs to omit during benchmark')
+    parser.add_argument('--total_anneal_steps', type=int, default=15000,
+                        help='Number of annealing steps')
+    parser.add_argument('--anneal_cap', type=float, default=0.1,
+                        help='Annealing cap')
+    parser.add_argument('--lam', type=float, default=1.00,
+                        help='Regularization parameter')
+    parser.add_argument('--lr', type=float, default=0.004,
+                        help='Learning rate')
+    parser.add_argument('--beta1', type=float, default=0.90,
+                        help='Adam beta1')
+    parser.add_argument('--beta2', type=float, default=0.90,
+                        help='Adam beta2')
+    parser.add_argument('--top_results', type=int, default=100,
+                        help='Number of results to be recommended')
+    parser.add_argument('--xla', action='store_true', default=False,
+                        help='Enable XLA')
+    parser.add_argument('--trace', action='store_true', default=False,
+                        help='Save profiling traces')
+    parser.add_argument('--activation', type=str, default='tanh',
+                        help='Activation function')
+    parser.add_argument('--log_path', type=str, default='./vae_cf.log',
+                        help='Path to the detailed JSON log to be created')
+    parser.add_argument('--seed', type=int, default=0,
+                        help='Random seed for TensorFlow and numpy')
+    parser.add_argument('--data_dir', default='/data', type=str,
+                        help='Directory for storing the training data')
+    parser.add_argument('--checkpoint_dir', type=str,
+                        default=None,
+                        help='Path for saving a checkpoint after the training')
+    args = parser.parse_args()
+
+    if args.batch_size_train % hvd.size() != 0:
+        raise ValueError('Global batch size should be a multiple of the number of workers')
+
+    args.local_batch_size = args.batch_size_train // hvd.size()
+
+    logger = logging.getLogger("VAE")
+    if hvd.rank() == 0:
+        logger.setLevel(logging.INFO)
+        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
+                                                           filename=args.log_path),
+                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
+    else:
+        dllogger.init(backends=[])
+        logger.setLevel(logging.ERROR)
+
+    dllogger.log(data=vars(args), step='PARAMETER')
+
+    np.random.seed(args.seed)
+    tf.set_random_seed(args.seed)
+
+    # Suppress TF warnings
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+
+    # set AMP
+    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0'
+
+    # load dataset
+    (train_data,
+     validation_data_input,
+     validation_data_true,
+     test_data_input,
+     test_data_true) = load_and_parse_ML_20M(args.data_dir)
+
+    # pad the item dimension to a multiple of 8 so matrix shapes map efficiently onto Tensor Cores
+    number_of_train_users, number_of_items = train_data.shape
+    number_of_items = round_8(number_of_items)
+
+    for data in [train_data,
+                 validation_data_input,
+                 validation_data_true,
+                 test_data_input,
+                 test_data_true]:
+        number_of_users, _ = data.shape
+        data.resize(number_of_users, number_of_items)
+
+    number_of_users, number_of_items = train_data.shape
+    encoder_dims = [number_of_items, 600, 200]
+
+    vae = VAE(train_data, encoder_dims, total_anneal_steps=args.total_anneal_steps,
+              anneal_cap=args.anneal_cap, batch_size_train=args.local_batch_size,
+              batch_size_validation=args.batch_size_validation, lam=args.lam,
+              lr=args.lr, beta1=args.beta1, beta2=args.beta2, activation=args.activation,
+              xla=args.xla, checkpoint_dir=args.checkpoint_dir, trace=args.trace,
+              top_results=args.top_results)
+
+    metrics = {'ndcg@100': partial(ndcg, R=100),
+               'recall@20': partial(recall, R=20),
+               'recall@50': partial(recall, R=50)}
+
+    if args.train:
+        vae.train(n_epochs=args.epochs, validation_data_input=validation_data_input,
+                  validation_data_true=validation_data_true,  metrics=metrics,
+                  validation_step=args.validation_step)
+
+    if args.test and hvd.size() <= 1:
+        test_results = vae.test(test_data_input=test_data_input,
+                                test_data_true=test_data_true, metrics=metrics)
+
+        for k, v in test_results.items():
+            print("{}:\t{}".format(k, v))
+    elif args.test and hvd.size() > 1:
+        print("Testing is not supported with horovod multigpu yet")
+
+    if args.inference_benchmark and hvd.size() <= 1:
+        # use the train data to get accurate throughput numbers for inference
+        # the test and validation sets are too small to measure this accurately
+        # vae.inference_benchmark()
+        _ = vae.test(test_data_input=train_data,
+                     test_data_true=train_data, metrics={})
+
+    elif args.inference_benchmark and hvd.size() > 1:
+        print("Inference benchmarking is not supported with horovod multigpu yet")
+
+    if args.inference:
+        input_data = np.random.randint(low=0, high=10000, size=10)
+        recommendations = vae.query(input_data=input_data)
+        print('Recommended item indices: ', recommendations)
+
+    vae.close_session()
+    dllogger.flush()
+
+if __name__ == '__main__':
+    main()

+ 35 - 0
TensorFlow/Recommendation/VAE-CF/prepare_dataset.py

@@ -0,0 +1,35 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from argparse import ArgumentParser
+from vae.load.preprocessing import load_and_parse_ML_20M
+import numpy as np
+
+parser = ArgumentParser(description="Prepare data for VAE training")
+parser.add_argument('--data_dir', default='/data', type=str,
+                    help='Directory for storing the training data')
+parser.add_argument('--seed', default=0, type=int,
+                    help='Random seed')
+args = parser.parse_args()
+
+print('Preprocessing seed: ', args.seed)
+np.random.seed(args.seed)
+
+# load the dataset; on the first run this downloads, preprocesses, and caches it to disk
+(train_data,
+ validation_data_input,
+ validation_data_true,
+ test_data_input,
+ test_data_true) = load_and_parse_ML_20M(args.data_dir)

+ 1 - 0
TensorFlow/Recommendation/VAE-CF/requirements.txt

@@ -0,0 +1 @@
+-e git+git://github.com/NVIDIA/dllogger#egg=dllogger

+ 45 - 0
TensorFlow/Recommendation/VAE-CF/scripts/benchmark.sh

@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+python prepare_dataset.py
+
+# performance with AMP
+for i in 1 2 4 8 16; do
+  horovodrun -np $i -H localhost:$i python3 /code/main.py --train --use_tf_amp --results_dir /data/performance_amp_results/${i}gpu
+  rm -rf /tmp/checkpoints
+done
+
+# performance without AMP
+for i in 1 2 4 8 16; do
+  horovodrun -np $i -H localhost:$i python3 /code/main.py --train --results_dir /data/performance_fp32_results/${i}gpu
+  rm -rf /tmp/checkpoints
+done
+
+# AMP accuracy for multiple seeds
+for i in $(seq 20); do
+  horovodrun -np 8 -H localhost:8 python3 /code/main.py --train --use_tf_amp  --seed $i --results_dir /data/amp_accuracy_results/seed_${i}
+  rm -rf /tmp/checkpoints
+done
+
+# FP32 accuracy for multiple seeds
+for i in $(seq 20); do
+  horovodrun -np 8 -H localhost:8 python3 /code/main.py --train --seed $i --results_dir /data/fp32_accuracy_results/seed_${i}
+  rm -rf /tmp/checkpoints
+done
+

+ 21 - 0
TensorFlow/Recommendation/VAE-CF/vae/__init__.py

@@ -0,0 +1,21 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+LOG = logging.getLogger("VAE")
+
+_log_format = logging.Formatter("[%(name)s| %(levelname)s]: %(message)s")
+_log_handler = logging.StreamHandler()
+_log_handler.setFormatter(_log_format)
+LOG.addHandler(_log_handler)

+ 0 - 0
TensorFlow/Recommendation/VAE-CF/vae/load/__init__.py


+ 96 - 0
TensorFlow/Recommendation/VAE-CF/vae/load/downloaders.py

@@ -0,0 +1,96 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from os.path import basename, normpath
+import urllib.request
+import tarfile
+import zipfile
+from tqdm import tqdm
+import itertools
+
+from glob import glob
+import logging
+
+LOG = logging.getLogger("VAE")
+
+
+def download_movielens(data_dir):
+    destination_filepath = os.path.join(data_dir, 'ml-20m/download/ml-20m.zip')
+    if not glob(destination_filepath):
+        ml_20m_download_url = 'http://files.grouplens.org/datasets/movielens/ml-20m.zip'
+        download_file(ml_20m_download_url, destination_filepath)
+
+    LOG.info("Extracting")
+    extract_file(destination_filepath, to_directory=os.path.join(data_dir, 'ml-20m/extracted'))
+
+
+def download_file(url, filename):
+    if not os.path.isdir(os.path.dirname(filename)):
+        os.makedirs(os.path.dirname(filename))
+
+    u = urllib.request.urlopen(url)
+    with open(filename, 'wb') as f:
+        meta = u.info()
+        if (meta.get_all("Content-Length")):
+            file_size = int(meta.get_all("Content-Length")[0])
+            pbar = tqdm(
+                total=file_size,
+                desc=basename(normpath(filename)),
+                unit='B',
+                unit_scale=True)
+
+            file_size_dl = 0
+            block_sz = 8192
+            while True:
+                buff = u.read(block_sz)
+                if not buff:
+                    break
+                pbar.update(len(buff))
+                file_size_dl += len(buff)
+                f.write(buff)
+            pbar.close()
+        else:
+            LOG.warning("No content length information")
+            file_size_dl = 0
+            block_sz = 8192
+            for cyc in itertools.cycle('/-\\|'):
+                buff = u.read(block_sz)
+                if not buff:
+                    break
+                print(cyc, end='\r')
+                file_size_dl += len(buff)
+                f.write(buff)
+
+
+def extract_file(path, to_directory):
+    """
+    Extract a compressed archive.
+    :param path: Path to the compressed file
+    :param to_directory: Directory that is going to store the extracted files
+    """
+    if (path.endswith("tar.gz")):
+        tar = tarfile.open(path, "r:gz")
+        tar.extractall(path=to_directory)
+        tar.close()
+    elif (path.endswith("tar")):
+        tar = tarfile.open(path, "r:")
+        tar.extractall(path=to_directory)
+        tar.close()
+    elif (path.endswith("zip")):
+        with zipfile.ZipFile(path, 'r') as zip_ref:
+            zip_ref.extractall(to_directory)
+    else:
+        raise Exception(
+            "Could not extract {} as no appropriate extractor is found".format(path))

+ 316 - 0
TensorFlow/Recommendation/VAE-CF/vae/load/preprocessing.py

@@ -0,0 +1,316 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+from collections import defaultdict
+from glob import glob
+
+import pandas as pd
+from scipy import sparse
+import scipy.sparse as sp
+import numpy as np
+from scipy.sparse import load_npz, csr_matrix
+
+from vae.load.downloaders import download_movielens
+import logging
+import json
+
+LOG = logging.getLogger("VAE")
+
+def save_as_npz(m_sp, path):
+    if not os.path.isdir(os.path.dirname(path)):
+        os.makedirs(os.path.dirname(path))
+    sp.save_npz(path, m_sp)
+
+
+def get_count(tp, col):
+    playcount_groupbyid = tp[[col]].groupby(col, as_index=False)
+    count = playcount_groupbyid.size()
+    return count
+
+
+def filter_triplets(tp, min_uc=5, min_sc=0):
+    # Only keep the triplets for items which were clicked on by at least min_sc users.
+    if min_sc > 0:
+        itemcount = get_count(tp, 'movieId')
+        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
+
+    # Only keep the triplets for users who clicked on at least min_uc items
+    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
+    if min_uc > 0:
+        usercount = get_count(tp, 'userId')
+        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
+
+    # Update both usercount and itemcount after filtering
+    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
+    return tp, usercount, itemcount
+
+def save_id_mappings(cache_dir, show2id, profile2id):
+    if not os.path.isdir(cache_dir):
+        os.makedirs(cache_dir)
+
+    for d, filename in [(show2id, 'show2id.json'),
+                        (profile2id, 'profile2id.json')]:
+
+        with open(os.path.join(cache_dir, filename), 'w') as f:
+            d = {str(k): v for k, v in d.items()}
+            json.dump(d, f, indent=4)
+
+
+def load_and_parse_ML_20M(data_dir, threshold=4):
+    """
+    Original way of processing ml-20m dataset from VAE for CF paper
+	Copyright [2018] [Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara]
+	SPDX-License-Identifier: Apache-2.0
+	Modifications copyright (C) 2019 Michał Filipiuk, Albert Cieślak, Frederic Grabowski, Radosław Rowicki
+    """
+
+    cache_dir = os.path.join(data_dir, "ml-20m/preprocessed")
+
+    train_data_file = os.path.join(cache_dir, "train_data.npz")
+    vad_data_true_file = os.path.join(cache_dir, "vad_data_true.npz")
+    vad_data_test_file = os.path.join(cache_dir, "vad_data_test.npz")
+    test_data_true_file = os.path.join(cache_dir, "test_data_true.npz")
+    test_data_test_file = os.path.join(cache_dir, "test_data_test.npz")
+
+    if (os.path.isfile(train_data_file)
+            and os.path.isfile(vad_data_true_file)
+            and os.path.isfile(vad_data_test_file)
+            and os.path.isfile(test_data_true_file)
+            and os.path.isfile(test_data_test_file)):
+
+        LOG.info("Already processed, skipping.")
+        return load_npz(train_data_file), \
+            load_npz(vad_data_true_file), \
+            load_npz(vad_data_test_file), \
+            load_npz(test_data_true_file), \
+            load_npz(test_data_test_file)
+
+    LOG.info("Parsing movielens.")
+
+    source_file = os.path.join(data_dir, "ml-20m/extracted/ml-20m", "ratings.csv")
+    if not glob(source_file):
+        download_movielens(data_dir=data_dir)
+
+    raw_data = pd.read_csv(source_file)
+    raw_data.drop('timestamp', axis=1, inplace=True)
+
+    raw_data = raw_data[raw_data['rating'] >= threshold]
+    raw_data, user_activity, item_popularity = filter_triplets(raw_data)
+
+    unique_uid = user_activity.index
+    idx_perm = np.random.permutation(unique_uid.size)
+    unique_uid = unique_uid[idx_perm]
+
+    n_users = unique_uid.size
+    n_heldout_users = 10000
+
+    true_users = unique_uid[:(n_users - n_heldout_users * 2)]
+    vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
+    test_users = unique_uid[(n_users - n_heldout_users):]
+
+    train_plays = raw_data.loc[raw_data['userId'].isin(true_users)]
+
+    unique_sid = pd.unique(train_plays['movieId'])
+
+    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
+    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
+    save_id_mappings(cache_dir, show2id, profile2id)
+
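+    # Strong generalization split: validation and test users are disjoint from
+    # training users. Each held-out user's interactions are further split (80/20
+    # by default) into an input part fed to the model and a held-out part used
+    # as ground truth when computing the validation metrics.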
+    def split_train_test_proportion(data, test_prop=0.2):
+        data_grouped_by_user = data.groupby('userId')
+        true_list, test_list = list(), list()
+
+        for i, (_, group) in enumerate(data_grouped_by_user):
+            n_items_u = len(group)
+
+            if n_items_u >= 5:
+                idx = np.zeros(n_items_u, dtype='bool')
+                idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True
+
+                true_list.append(group[np.logical_not(idx)])
+                test_list.append(group[idx])
+            else:
+                true_list.append(group)
+
+        data_true = pd.concat(true_list)
+        data_test = pd.concat(test_list)
+
+        return data_true, data_test
+
+    vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
+    vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]
+
+    vad_plays_true, vad_plays_test = split_train_test_proportion(vad_plays)
+
+    test_plays = raw_data.loc[raw_data['userId'].isin(test_users)]
+    test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
+
+    test_plays_true, test_plays_test = split_train_test_proportion(test_plays)
+
+    def numerize(tp):
+        uid = tp['userId'].map(lambda x: profile2id[x])
+        sid = tp['movieId'].map(lambda x: show2id[x])
+        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])
+
+    train_data = numerize(train_plays)
+    vad_data_true = numerize(vad_plays_true)
+    vad_data_test = numerize(vad_plays_test)
+    test_data_true = numerize(test_plays_true)
+    test_data_test = numerize(test_plays_test)
+
+    n_items = len(unique_sid)
+    def load_train_data(tp):
+        n_users = tp['uid'].max() + 1
+
+        rows, cols = tp['uid'], tp['sid']
+        data = sparse.csr_matrix((np.ones_like(rows),
+                                  (rows, cols)), dtype='float64',
+                                 shape=(n_users, n_items))
+        return data
+
+    train_data = load_train_data(train_data)
+
+    def load_true_test_data(tp_true, tp_test):
+        start_idx = min(tp_true['uid'].min(), tp_test['uid'].min())
+        end_idx = max(tp_true['uid'].max(), tp_test['uid'].max())
+
+        rows_true, cols_true = tp_true['uid'] - start_idx, tp_true['sid']
+        rows_test, cols_test = tp_test['uid'] - start_idx, tp_test['sid']
+
+        data_true = sparse.csr_matrix((np.ones_like(rows_true),
+                                     (rows_true, cols_true)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
+        data_test = sparse.csr_matrix((np.ones_like(rows_test),
+                                     (rows_test, cols_test)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
+        return data_true, data_test
+
+    vad_data_true, vad_data_test = load_true_test_data(vad_data_true, vad_data_test)
+
+    test_data_true, test_data_test = load_true_test_data(test_data_true, test_data_test)
+
+    save_as_npz(train_data, train_data_file)
+    save_as_npz(vad_data_true, vad_data_true_file)
+    save_as_npz(vad_data_test, vad_data_test_file)
+    save_as_npz(test_data_true, test_data_true_file)
+    save_as_npz(test_data_test, test_data_test_file)
+
+    return train_data, vad_data_true, vad_data_test, test_data_true, test_data_test
+
+
+def filter_data(data, min_users=1, min_items=5):
+    """
+
+    :param data: input matrix
+    :param min_users: only keep items, that were clicked by at least min_users
+    :param min_items: only keep users, that clicked at least min_items
+    :return: filtered matrix
+    """
+
+    col_count = defaultdict(lambda: 0)
+    for col in data.nonzero()[1]:
+        col_count[col] += 1
+
+    filtered_col = [k for k, v in col_count.items() if v >= min_users]
+    filtered_data_c = data[:, filtered_col]
+    del data
+
+    row_count = defaultdict(lambda: 0)
+    for row in filtered_data_c.nonzero()[0]:
+        row_count[row] += 1
+
+    filtered_row = [k for k, v in row_count.items() if v >= min_items]
+    filtered_data_r = filtered_data_c[filtered_row, :]
+    del filtered_data_c
+
+    return filtered_data_r
+
+
+def split_into_train_val_test(data, val_ratio, test_ratio):
+    """
+
+    :param data: input matrix
+    :param val_ratio: Ratio of validation users to all users
+    :param test_ratio: Ratio of test users to all users
+    :return: Tuple of 3 matrices : {train_matrix, val_matrix, test_matrix}
+    """
+
+    assert val_ratio + test_ratio < 1
+    train_ratio = 1 - val_ratio - test_ratio
+    rows_count = data.shape[0]
+
+    idx = np.random.permutation(range(rows_count))
+    train_users_count = int(np.rint(rows_count * train_ratio))
+    val_users_count = int(np.rint(rows_count * val_ratio))
+    separator = train_users_count + val_users_count
+
+    train_matrix = data[idx[:train_users_count]]
+    val_matrix = data[idx[train_users_count:separator]]
+    test_matrix = data[idx[separator:]]
+
+    return train_matrix, val_matrix, test_matrix
+
+
+def split_movies_into_train_test(data, train_ratio):
+    """
+    Splits data into 2 matrices. The users stay the same, but the items are being split by train_ratio
+    :param data: input matrix
+    :param train_ratio: Ratio of input items to all items
+    :return: tuple of 2 matrices: {train_matrix, test_matrix}
+    """
+    rows_count, columns_count = data.shape
+
+    train_rows = list()
+    train_columns = list()
+    test_rows = list()
+    test_columns = list()
+
+    for i in range(rows_count):
+        user_movies = data.getrow(i).nonzero()[1]
+        np.random.shuffle(user_movies)
+
+        movies_count = len(user_movies)
+        train_count = int(np.floor(movies_count * train_ratio))
+        test_count = movies_count - train_count
+
+        train_movies = user_movies[:train_count]
+        test_movies = user_movies[train_count:]
+
+        train_rows += ([i] * train_count)
+        train_columns += list(train_movies)
+
+        test_rows += ([i] * test_count)
+        test_columns += list(test_movies)
+
+    train_matrix = csr_matrix(([1] * len(train_rows), (train_rows, train_columns)), shape=(rows_count, columns_count))
+    test_matrix = csr_matrix(([1] * len(test_rows), (test_rows, test_columns)), shape=(rows_count, columns_count))
+
+    return train_matrix, test_matrix
+
+
+def remove_items_that_doesnt_occure_in_train(train_matrix, val_matrix, test_matrix):
+    """
+    Remove items that don't occure in train matrix
+    :param train_matrix: training data
+    :param val_matrix: validation data
+    :param test_matrix: test data
+    :return: Input matrices without some items
+    """
+    item_occure = defaultdict(lambda: False)
+    for col in train_matrix.nonzero()[1]:
+        item_occure[col] = True
+
+    non_empty_items = [k for k, v in item_occure.items() if v == True]
+
+    return train_matrix[:, non_empty_items], val_matrix[:, non_empty_items], test_matrix[:, non_empty_items]

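The four helpers above form the generic preprocessing path: drop rare items and inactive users, split users into train/validation/test, split each held-out user's interactions into an input part and a part to be predicted, and discard items never seen in training. A minimal usage sketch, assuming a random toy matrix and that the helpers are imported from this module:

import numpy as np
from scipy.sparse import csr_matrix

np.random.seed(0)
toy = csr_matrix((np.random.rand(1000, 200) < 0.05).astype('float64'))

filtered = filter_data(toy, min_users=1, min_items=5)
train, val, test = split_into_train_val_test(filtered, val_ratio=0.1, test_ratio=0.1)
train, val, test = remove_items_that_doesnt_occure_in_train(train, val, test)

# held-out users keep 80% of their items as model input, 20% as targets
val_input, val_true = split_movies_into_train_test(val, train_ratio=0.8)
test_input, test_true = split_movies_into_train_test(test, train_ratio=0.8)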
+ 0 - 0
TensorFlow/Recommendation/VAE-CF/vae/metrics/__init__.py


+ 52 - 0
TensorFlow/Recommendation/VAE-CF/vae/metrics/ndcg.py

@@ -0,0 +1,52 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Discounted Cumulative Gain @ R is
+
+    DCG@R(u,ω) := Σ_{r=1}^{R} I[ω(r) ∈ I_u] − 1 / log(r + 1) / IDCG@R(u,ω)
+    IDCG@R(u,ω) := Σ_{r=1}^{|I_u|} 1 / log(r + 1)
+
+https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG
+https://arxiv.org/pdf/1802.05814.pdf, chapter 4.2
+"""
+
+import numpy as np
+from scipy.sparse import csr_matrix
+
+
+def ndcg(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
+    """ Calculate ndcg@R for each users in X_true and X_pred matrices
+
+    Args:
+        X_true: Matrix containing True values for user-item interactions
+        X_top_k: Matrix containing inidices picked by model
+        R: Number of elements taken into consideration
+
+    Returns:
+        Numpy array containing calculated ndcg@R for each user
+    """
+
+    penalties = 1. / np.log2(np.arange(2, R + 2))
+    selected = np.take_along_axis(X_true, X_top_k[:, :R], axis=-1)
+
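+    # `selected` is a dense np.matrix (scipy's fancy indexing), so `*` below is
+    # a matrix product that sums the rank-discounted hits of every user at once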
+    DCG = selected * penalties
+
+    cpenalties = np.empty(R + 1)
+    np.cumsum(penalties, out=cpenalties[1:])
+    cpenalties[0] = 0
+    maxhit = np.minimum(X_true.getnnz(axis=1), R)
+    IDCG = cpenalties[maxhit]
+
+    return DCG / IDCG

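A worked toy example, assuming a single user who clicked items 2 and 9 while the model ranked items [5, 2, 7] (best first): only rank 2 is a hit, so DCG = 1/log2(3) ≈ 0.631, IDCG = 1/log2(2) + 1/log2(3) ≈ 1.631, and ndcg@3 ≈ 0.387. A sketch, with ndcg imported from this module:

import numpy as np
from scipy.sparse import csr_matrix

X_true = csr_matrix(np.array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]]))  # items 2 and 9 clicked
X_top_k = np.array([[5, 2, 7]])  # model ranking, best first

print(ndcg(X_true, X_top_k, R=3))  # ≈ 0.387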
+ 45 - 0
TensorFlow/Recommendation/VAE-CF/vae/metrics/recall.py

@@ -0,0 +1,45 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Recall@R counts the number of relevant items among the top-R recommendations
+and normalizes it by the minimum of R and the number of items the user clicked
+
+Recall@R(u,ω) := Σ_{r=1}^{R} I[ω(r) ∈ I_u] / min(R,|I_u|)
+
+https://arxiv.org/pdf/1802.05814.pdf, chapter 4.2
+"""
+
+import numpy as np
+from scipy.sparse import csr_matrix
+
+
+def recall(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
+    """ Calculates recall@R for each users in X_true and X_top_k matrices
+
+    Args:
+        X_true: Matrix containing True values for user-item interactions
+        X_top_k: Matrix containing indices picked by model
+        R: Number of elements taken into consideration
+
+    Returns:
+        Numpy array containing calculated recall@R for each user
+    """
+
+    selected = np.take_along_axis(X_true, X_top_k[:, :R], axis=-1)
+    hit = selected.sum(axis=-1)
+
+    maxhit = np.minimum(X_true.getnnz(axis=1), R)
+
+    return np.squeeze(np.asarray(hit)) / maxhit

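For the same toy user as in the ndcg sketch (one hit among the top 3, two clicked items in total), recall@3 is 1 / min(3, 2) = 0.5:

import numpy as np
from scipy.sparse import csr_matrix

X_true = csr_matrix(np.array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]]))
X_top_k = np.array([[5, 2, 7]])
print(recall(X_true, X_top_k, R=3))  # 1 hit / min(3, 2) = 0.5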
+ 0 - 0
TensorFlow/Recommendation/VAE-CF/vae/models/__init__.py


+ 32 - 0
TensorFlow/Recommendation/VAE-CF/vae/models/layers.py

@@ -0,0 +1,32 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+
+
+class DenseFromSparse(Dense):
+    def call(self, inputs):
+        if not isinstance(inputs, tf.sparse.SparseTensor):
+            raise ValueError("input should be of type " + str(tf.sparse.SparseTensor))
+        rank = len(inputs.get_shape().as_list())
+        if rank != 2:
+            raise NotImplementedError("input should be rank 2")
+        outputs = tf.sparse.sparse_dense_matmul(inputs, self.kernel)
+        if self.use_bias:
+            outputs = tf.nn.bias_add(outputs, self.bias)
+        if self.activation is not None:
+            return self.activation(outputs)  # pylint: disable=not-callable
+        return outputs

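DenseFromSparse overrides only call(): it behaves like keras.layers.Dense but consumes a tf.sparse.SparseTensor directly through sparse_dense_matmul, so the huge user-item input is never densified. A minimal usage sketch (TF1 graph mode, shapes chosen arbitrarily):

import tensorflow as tf

layer = DenseFromSparse(4, activation=tf.nn.tanh)
x = tf.sparse.SparseTensor(indices=[[0, 1], [1, 3]],
                           values=[1.0, 1.0],
                           dense_shape=(2, 8))  # batch of 2 users, 8 items
y = layer(x)  # dense tensor of shape (2, 4)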
+ 435 - 0
TensorFlow/Recommendation/VAE-CF/vae/models/train.py

@@ -0,0 +1,435 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import horovod.tensorflow as hvd
+import scipy.sparse as sparse
+import tensorflow as tf
+import numpy as np
+import time
+import logging
+import dllogger
+
+from sklearn.preprocessing import normalize
+from collections import defaultdict
+
+from vae.models.vae import _VAEGraph, TRAINING, QUERY, VALIDATION
+from vae.utils.round import round_8
+
+LOG = logging.getLogger("VAE")
+
+
+class VAE:
+    def __init__(self,
+                 train_data,
+                 encoder_dims,
+                 decoder_dims=None,
+                 batch_size_train=500,
+                 batch_size_validation=2000,
+                 lam=3e-2,
+                 lr=1e-3,
+                 beta1=0.9,
+                 beta2=0.999,
+                 total_anneal_steps=200000,
+                 anneal_cap=0.2,
+                 xla=True,
+                 activation='tanh',
+                 checkpoint_dir=None,
+                 trace=False,
+                 top_results=100):
+
+        if decoder_dims is None:
+            decoder_dims = encoder_dims[::-1]
+        for i in encoder_dims + decoder_dims + [batch_size_train, batch_size_validation]:
+            if i != round_8(i):
+                raise ValueError("all dims and batch sizes should be divisible by 8")
+
+        self.metrics_history = None
+        self.batch_size_train = batch_size_train
+        self.batch_size_validation = batch_size_validation
+        self.lam = lam
+        self.lr = lr
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.xla = xla
+        self.total_anneal_steps = total_anneal_steps
+        self.anneal_cap = anneal_cap
+        self.activation = activation
+        self.encoder_dims = encoder_dims
+        self.decoder_dims = decoder_dims
+        self.trace = trace
+        self.top_results = top_results
+        self.checkpoint_dir = checkpoint_dir if hvd.rank() == 0 else None
+        self._create_dataset(train_data,
+                             batch_size_train,
+                             encoder_dims)
+        self._setup_model()
+
+        self.metrics_history = defaultdict(lambda: [])
+        self.time_elapsed_training_history = []
+        self.time_elapsed_validation_history = []
+        self.training_throughputs = []
+        self.inference_throughputs = []
+
+
+    def _create_dataset(self, train_data, batch_size_train, encoder_dims):
+        generator, self.n_batch_per_train = self.batch_iterator(train_data,
+                                                                None,
+                                                                batch_size_train,
+                                                                thread_idx=hvd.rank(),
+                                                                thread_num=hvd.size())
+        dataset = tf.data.Dataset \
+            .from_generator(generator, output_types=(tf.int64, tf.float32)) \
+            .map(lambda i, v: tf.SparseTensor(i, v, (batch_size_train, encoder_dims[0]))) \
+            .prefetch(10)
+        self.iter = dataset.make_initializable_iterator()
+        self.inputs_train = self.iter.get_next()
+
+    def _setup_model(self):
+        config = tf.ConfigProto()
+        config.gpu_options.allow_growth = True
+        config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
+        if self.trace:
+            hooks.append(tf.train.ProfilerHook(save_steps=1, output_dir='.'))
+
+        if self.xla:
+            LOG.info('Enabling XLA')
+            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+        else:
+            LOG.info('XLA disabled')
+
+        self._build_graph()
+        self.session = tf.train.MonitoredTrainingSession(config=config,
+                                                         checkpoint_dir=self.checkpoint_dir,
+                                                         save_checkpoint_secs=10,
+                                                         hooks=hooks)
+
+    def _build_optimizer(self, loss):
+        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=self.beta1, beta2=self.beta2)
+        return hvd.DistributedOptimizer(optimizer).minimize(
+            loss, global_step=tf.train.get_or_create_global_step())
+
+    def close_session(self):
+        if self.session is not None:
+            self.session.close()
+
+    def batch_iterator(self, data_input, data_true=None, batch_size=500, thread_idx=0, thread_num=1):
+        training = data_true is None
+
+        data_input = normalize(data_input)
+        indices = np.arange(data_input.shape[0])
+
+        global_batch_size = batch_size * hvd.size()
+
+        if training:
+            # crop the data so that each gpu has the same number of batches
+            stop = data_input.shape[0] // global_batch_size * global_batch_size
+            LOG.info('Cropping each epoch from: {} to {} samples'.format(data_input.shape[0], stop))
+        else:
+            stop = data_input.shape[0]
+
+        def generator():
+            data_in = data_input
+            epoch = 0
+            while True:
+                if training:
+                    # deterministic shuffle necessary for multigpu
+                    np.random.seed(epoch)
+                    np.random.shuffle(indices)
+                    data_in = data_in[indices]
+
+                for st_idx in range(thread_idx * batch_size, stop, thread_num * batch_size):
+                    batch = data_in[st_idx:st_idx + batch_size].copy()
+                    batch = batch.tocoo()
+                    idxs = np.stack([batch.row, batch.col], axis=1)
+                    vals = batch.data
+                    if training:
+                        np.random.seed(epoch * thread_num + thread_idx)
+                        nnz = vals.shape[0]
+
+                        # inverted dropout with keep_prob=0.5: each value is
+                        # multiplied by 0 (dropped) or 2 (kept and rescaled)
+                        vals *= (2 * np.random.randint(2, size=nnz))
+                        yield (idxs, vals)
+                    else:
+                        yield idxs, vals, data_true[st_idx:st_idx + batch_size]
+                if not training:
+                    break
+                epoch += 1
+
+        begin = thread_idx * batch_size
+        stride = thread_num * batch_size
+        return generator, int(np.ceil((stop - begin) / stride))
+
+    def _build_graph(self):
+        self.vae = _VAEGraph(self.encoder_dims, self.decoder_dims, self.activation)
+
+        self.inputs_validation = tf.sparse.placeholder(
+            dtype=tf.float32,
+            shape=np.array([self.batch_size_validation, self.vae.input_dim], dtype=np.int32))
+        self.inputs_query = tf.sparse.placeholder(
+            dtype=tf.float32,
+            shape=np.array([1, self.vae.input_dim], dtype=np.int32))
+
+        self.top_k_validation = self._gen_handlers(mode=VALIDATION)
+        self.logits_train, self.loss_train, self.optimizer = self._gen_handlers(mode=TRAINING)
+        self.top_k_query = self._gen_handlers(mode=QUERY)
+
+        global_step = tf.train.get_or_create_global_step()
+        self.increment_global_step = tf.assign(global_step, global_step + 1)
+
+    def _gen_handlers(self, mode):
+        # model input
+        if mode is TRAINING:
+            inputs = self.inputs_train
+        elif mode is VALIDATION:
+            inputs = self.inputs_validation
+        elif mode is QUERY:
+            inputs = self.inputs_query
+        else:
+            assert False
+
+        if mode is TRAINING:
+            batch_size = self.batch_size_train
+        elif mode is VALIDATION:
+            batch_size = self.batch_size_validation
+        elif mode is QUERY:
+            batch_size = 1
+        else:
+            assert False
+
+        # model output
+        logits, latent_mean, latent_log_var = self.vae(inputs, mode=mode)
+        if mode in [VALIDATION, QUERY]:
+            mask = tf.ones_like(inputs.values) * (-np.inf)
+            logits = tf.tensor_scatter_nd_update(logits, inputs.indices, mask)
+            top_k_values, top_k_indices = tf.math.top_k(logits, sorted=True, k=self.top_results)
+            return top_k_indices
+
+        log_softmax = tf.nn.log_softmax(logits)
+
+        anneal = tf.math.minimum(
+            tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
+            self.total_anneal_steps, self.anneal_cap)
+
+        # KL divergence
+        KL = tf.reduce_mean(
+            tf.reduce_sum(
+                (-latent_log_var + tf.exp(latent_log_var) + latent_mean ** 2 - 1)
+                / 2,
+                axis=1))
+
+        # per-user average negative log-likelihood part of loss
+        ll_loss = -tf.reduce_sum(tf.gather_nd(log_softmax, inputs.indices)) / batch_size
+
+        # regularization part of loss
+        reg_loss = 2 * tf.reduce_sum(
+            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+
+        loss = ll_loss + self.lam * reg_loss + anneal * KL
+
+        train_op = self._build_optimizer(loss)
+        return logits, ll_loss, train_op
+
+    def train(
+            self,
+            n_epochs: int,
+            validation_data_input: sparse.csr_matrix,
+            validation_data_true: sparse.csr_matrix,
+            metrics: dict,  # Dict[str, matrix -> matrix -> float]
+            validation_step: int = 10,
+    ):
+        """
+        Train the model
+        :param n_epochs: number of epochs
+        :param validation_data_input: validation input matrix of shape users count x items count
+        :param validation_data_true: validation ground-truth matrix of the same shape
+        :param metrics: Dictionary of metric names to metric functions
+        :param validation_step: if set to n, validation is run once every n epochs
+        """
+
+        self.total_time_start = time.time()
+        self.session.run(self.iter.initializer)
+
+        num_workers = hvd.size()
+        for epoch in range(1, n_epochs + 1):
+
+            init_time = time.time()
+
+            for _ in range(self.n_batch_per_train):
+                self.session.run(self.optimizer)
+            batches_per_epoch = self.n_batch_per_train
+
+            training_duration = time.time() - init_time
+            self.time_elapsed_training_history.append(training_duration)
+            training_throughput = num_workers * batches_per_epoch * self.batch_size_train / training_duration
+            self.training_throughputs.append(training_throughput)
+
+            dllogger.log(data={"train_epoch_time" : training_duration,
+                               "train_throughput" : training_throughput},
+                         step=(epoch,))
+
+            if (epoch % validation_step == 0 or epoch == n_epochs) and hvd.rank() == 0:
+                init_time = time.time()
+                metrics_scores = self.test(validation_data_input,
+                                           validation_data_true,
+                                           metrics,
+                                           epoch=epoch)
+
+                for name, score in metrics_scores.items():
+                    self.metrics_history[name].append(score)
+
+                validation_duration = time.time() - init_time
+                self.time_elapsed_validation_history.append(validation_duration)
+
+                dllogger.log(data={"valid_time" : validation_duration},
+                             step=(epoch,))
+
+                self.log_metrics(epoch, metrics_scores, n_epochs)
+        self.total_time = time.time() - self.total_time_start
+        if hvd.rank() == 0:
+            self.log_final_stats()
+
+    def test(
+            self,
+            test_data_input,
+            test_data_true,
+            metrics,
+            epoch=0,
+    ):
+        """
+        Test the performance of the model
+        :param metrics: Dictionary of metric names to metric functions
+        """
+        metrics_scores = defaultdict(lambda: [])
+        gen = self.batch_iterator_val(test_data_input, test_data_true)
+        for idxs, vals, X_true in gen():
+            inference_begin = time.time()
+
+            if self.trace:
+                pred_val, _ = self.session.run([self.top_k_validation, self.increment_global_step],
+                                            feed_dict={self.inputs_validation: (idxs, vals)})
+            else:
+                pred_val = self.session.run(self.top_k_validation,
+                                            feed_dict={self.inputs_validation: (idxs, vals)})
+            elapsed = time.time() - inference_begin
+            pred_val = np.copy(pred_val)
+
+            inference_throughput = self.batch_size_validation / elapsed
+            self.inference_throughputs.append(inference_throughput)
+            dllogger.log(data={"inference_throughput" : inference_throughput},
+                         step=(epoch,))
+
+            for name, metric in metrics.items():
+                metrics_scores[name].append(metric(X_true, pred_val))
+
+        # For some random seeds passed to the data preprocessing script
+        # the test set might contain samples that have no true items to be predicted.
+        # At least one such sample is present in about 7% of all possible test sets.
+        # We decided not to change the preprocessing to remain comparable to the original implementation.
+        # Therefore we're using the nan-aware mean from numpy to ignore users with no items to be predicted. 
+        return {name: np.nanmean(scores) for name, scores in metrics_scores.items()}
+
+    def query(self, input_data: np.ndarray):
+        """
+        inference for batch size 1
+
+        :param input_data:
+        :return:
+        """
+        query_start = time.time()
+        indices = np.stack([np.zeros(len(input_data)), input_data], axis=1)
+        values = np.ones(shape=(1, len(input_data)))
+        values = normalize(values)
+        values = values.reshape(-1)
+
+        sess_run_start = time.time()
+        res = self.session.run(
+            self.top_k_query,
+            feed_dict={self.inputs_query: (indices,
+                                           values)})
+        query_end_time = time.time()
+        LOG.info('query time: {}'.format(query_end_time - query_start))
+        LOG.info('sess run time: {}'.format(query_end_time - sess_run_start))
+        return res
+
+    def _increment_global_step(self):
+        res = self.session.run(self.increment_global_step)
+        print('increment global step result: ', res)
+
+    def batch_iterator_train(self, data_input):
+        """
+        :return: generator of consecutive training batches and the number of batches per epoch
+        """
+        data_input = normalize(data_input)
+
+        indices = np.arange(data_input.shape[0])
+        np.random.shuffle(indices)
+        data_input = data_input[list(indices)]
+
+        nsize, _ = data_input.shape
+        csize = nsize // self.batch_size_train * self.batch_size_train
+
+        def generator():
+            while True:
+                for st_idx in range(0, csize, self.batch_size_train):
+                    idxs, vals = self.next_batch(data_input, st_idx, self.batch_size_train)
+
+                    nnz = vals.shape[0]
+                    vals *= (2 * np.random.randint(2, size=nnz))
+                    yield (idxs, vals)
+
+        return generator, int(np.ceil(csize / self.batch_size_train))
+
+    def batch_iterator_val(self, data_input, data_true):
+        """
+        :return: generator of consecutive validation batches
+        """
+
+        data_input = normalize(data_input)
+
+        nsize, _ = data_input.shape
+        csize = nsize // self.batch_size_validation * self.batch_size_validation
+
+        def generator():
+            for st_idx in range(0, csize, self.batch_size_validation):
+                idxs, vals = self.next_batch(data_input, st_idx, self.batch_size_validation)
+                yield idxs, vals, data_true[st_idx:st_idx + self.batch_size_validation]
+
+        return generator
+
+    def next_batch(self, data_input, st_idx, batch_size):
+        batch = data_input[st_idx:st_idx + batch_size].copy()
+        batch = batch.tocoo()
+        idxs = np.stack([batch.row, batch.col], axis=1)
+        vals = batch.data
+        return idxs, vals
+
+    def log_metrics(self, epoch, metrics_scores, n_epochs):
+        dllogger.log(data=metrics_scores, step=(epoch,))
+
+    def log_final_stats(self):
+        data = {"total_train_time": np.sum(self.time_elapsed_training_history),
+                "total_valid_time": np.sum(self.time_elapsed_validation_history),
+                "average_train_epoch time": np.mean(self.time_elapsed_training_history),
+                "average_validation_time": np.mean(self.time_elapsed_validation_history),
+                "total_elapsed_time" : self.total_time,
+                "mean_training_throughput": np.mean(self.training_throughputs[10:]),
+                "mean_inference_throughput": np.mean(self.inference_throughputs),
+                "max_training_throughput": np.max(self.training_throughputs[10:]),
+                "max_inference_throughput": np.max(self.inference_throughputs)}
+
+        for metric_name, metric_values in self.metrics_history.items():
+            data["final_" + metric_name] = metric_values[-1]
+
+        dllogger.log(data=data, step=tuple())

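In _gen_handlers the training loss is ll_loss + lam * reg_loss + anneal * KL, where the KL weight is annealed linearly with the global step and capped, following the beta-annealing of Liang et al. A plain-Python sketch of the schedule, assuming the constructor defaults (total_anneal_steps=200000, anneal_cap=0.2):

def anneal(step, total_anneal_steps=200000, anneal_cap=0.2):
    # the KL weight rises linearly with the step and saturates at anneal_cap
    return min(step / total_anneal_steps, anneal_cap)

print([anneal(s) for s in (0, 20000, 40000, 200000)])  # [0.0, 0.1, 0.2, 0.2]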
+ 110 - 0
TensorFlow/Recommendation/VAE-CF/vae/models/vae.py

@@ -0,0 +1,110 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+from vae.models.layers import DenseFromSparse
+
+TRAINING = 0
+VALIDATION = 1
+QUERY = 2
+
+
+class _VAEGraph(tf.keras.Model):
+    def __init__(self, encoder_dims, decoder_dims, activation='tanh'):
+        super(_VAEGraph, self).__init__()
+        if encoder_dims[-1] != decoder_dims[0]:
+            raise ValueError("encoder/decoder dims mismatch")
+        self.input_dim = encoder_dims[0]
+        self.output_dim = decoder_dims[-1]
+        self.activation = tf.nn.tanh if activation == 'tanh' else tf.nn.relu
+        self.encoder = self.encoder_model(encoder_dims[1:])
+        self.decoder = self.decoder_model(decoder_dims[1:])
+
+    def call(self, inputs: tf.SparseTensor, mode):
+        """ Get handlers to VAE output
+        :param inputs: batch_size * items_count as sparse tensor.
+        :param mode: Either 0,1 or 2 representing type of network
+        :return: Tuple of 3 tensors:
+            1. decoder output: batch_size * items_count tensor
+            2. latent_mean: mean tensor between encoder and decoder. It has size batch_size * size_of_mean_vector
+            3. latent_log_var: tesor containing logarithms of variances. It has size batch_size * size_of_var_vector
+        """
+
+        latent_all = self.encoder(inputs, training=(mode is TRAINING))
+        latent_mean = latent_all[:, 0]
+        latent_log_var = latent_all[:, 1]
+        latent_std = tf.exp(0.5 * latent_log_var)
+
+        # reparametrization trick
+        batch = tf.shape(latent_mean)[0]
+        dim = tf.shape(latent_mean)[1]
+        epsilon = tf.random_normal(shape=(batch, dim))
+        decoder_input = latent_mean + (int(mode is TRAINING)) * latent_std * epsilon
+
+        decoder_output = self.decoder(decoder_input, training=(mode is TRAINING))
+
+        return decoder_output, latent_mean, latent_log_var
+
+
+    def encoder_model(self, dims):
+        assert dims
+        last = dims[-1]
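+        # the last encoder layer is twice as wide: it emits the latent mean and
+        # the latent log-variance, split apart by the final Reshape layer below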
+        dims[-1] = 2 * last
+        layers = tf.keras.layers
+        return tf.keras.Sequential(
+            [DenseFromSparse(
+                    dims[0],
+                    activation=self.activation,
+                    name="encoder_{}".format(dims[0]),
+                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                    bias_initializer=tf.truncated_normal_initializer(stddev=0.001),
+                    kernel_regularizer=tf.contrib.layers.l2_regularizer)
+            ] + [
+                layers.Dense(
+                    d,
+                    activation=self.activation,
+                    name="encoder_{}".format(d),
+                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                    bias_initializer=tf.truncated_normal_initializer(stddev=0.001),
+                    kernel_regularizer=tf.contrib.layers.l2_regularizer)
+                for d in dims[1:-1]
+            ] + [
+                layers.Dense(
+                    dims[-1],
+                    name="encoder_{}".format(dims[-1]),
+                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                    bias_initializer=tf.truncated_normal_initializer(stddev=0.001),
+                    kernel_regularizer=tf.contrib.layers.l2_regularizer)
+            ] + [layers.Reshape(target_shape=(2, last))])
+
+
+    def decoder_model(self, dims):
+        assert dims
+        layers = tf.keras.layers
+        return tf.keras.Sequential([
+            layers.Dense(
+                d,
+                activation=self.activation,
+                name="decoder_{}".format(d),
+                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                bias_initializer=tf.truncated_normal_initializer(stddev=0.001),
+                kernel_regularizer=tf.contrib.layers.l2_regularizer) for d in dims[:-1]
+            ] + [
+                layers.Dense(
+                dims[-1],
+                name="decoder_{}".format(dims[-1]),
+                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                bias_initializer=tf.truncated_normal_initializer(stddev=0.001),
+                kernel_regularizer=tf.contrib.layers.l2_regularizer)
+            ])

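call() uses the reparametrization trick: the sample z = mean + std * epsilon stays differentiable in the mean and std because the randomness is isolated in epsilon, and the int(mode is TRAINING) factor switches sampling off at validation and query time. The same step in plain numpy, as a sketch:

import numpy as np

def sample_latent(latent_mean, latent_log_var, training):
    latent_std = np.exp(0.5 * latent_log_var)
    epsilon = np.random.normal(size=latent_mean.shape)
    # gradients flow through mean and std; epsilon is a constant w.r.t. them
    return latent_mean + int(training) * latent_std * epsilon

z = sample_latent(np.zeros((2, 200)), np.zeros((2, 200)), training=True)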
+ 0 - 0
TensorFlow/Recommendation/VAE-CF/vae/utils/__init__.py


+ 22 - 0
TensorFlow/Recommendation/VAE-CF/vae/utils/round.py

@@ -0,0 +1,22 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from functools import partial
+
+
+def round_n(x, n=8):
+    """Round x up to the nearest multiple of n."""
+    return n * int(np.ceil(x / n))
+
+round_8 = partial(round_n, n=8)
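
round_8 pads a value up to the next multiple of 8; the VAE constructor uses it to enforce layer widths and batch sizes that keep the matmuls Tensor Core friendly. For example:

print(round_8(100))  # 13 * 8 = 104
print(round_8(64))   # 64, already a multiple of 8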