
[SIM/TF2] Release new version of SIM model with prebatching support

Jakub Tomsia, 3 years ago
parent commit
bf00fe1dbe

+ 0 - 1
TensorFlow2/Recommendation/SIM/.gitignore

@@ -15,4 +15,3 @@
 .ipynb_checkpoints/
 .idea/
 __pycache__
-results/

+ 239 - 329
TensorFlow2/Recommendation/SIM/README.md

@@ -28,6 +28,7 @@ This repository provides a script and recipe to train the SIM model to achieve s
     * [Command-line options](#command-line-options)
     * [Getting the data](#getting-the-data)
         * [Dataset guidelines](#dataset-guidelines)
+        * [Prebatching](#prebatching)
         * [BYO dataset](#byo-dataset)
             * [Channel definitions and requirements](#channel-definitions-and-requirements)
     * [Training process](#training-process)
@@ -78,7 +79,7 @@ In the author’s SIM implementation, the internals of submodels differs slightl
 List of implementation differences between original SIM code and DIN/DIEN/SIM papers
 </b></summary>
 
-- Batch normalization before NLP is not included in papers.
+- Batch normalization before MLP is not included in papers.
 - Batch normalization in code used `trainable=False` during the training phase.
 - ItemItemInteraction in DIN's attention module in the SIM implementation didn't correspond to the activation unit in the DIN paper.
   - Element-wise subtraction and multiplications are fed to MLP, skipping outer product operation.
@@ -375,7 +376,7 @@ The following section lists the requirements that you need to meet in order to s
 
 This repository contains a Dockerfile that extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 - [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [TensorFlow2 21.10-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow/tags) NGC container
+- [TensorFlow2 22.01-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow/tags) NGC container
 - Supported GPUs:
   - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
   - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
@@ -417,9 +418,6 @@ To train your model using mixed or TF32 precision with Tensor Cores or using FP3
 5. Start preprocessing.
 
     For details of the required file format and certain preprocessing parameters refer to [BYO dataset](#byo-dataset).
-    
-    
-    `${NUMBER_OF_USER_FEATURES}` defines how many user specific features are present in dataset. If using default Amazon Books dataset and `sim_preprocessing` script (as shown below), this parameter should be set to <b>1</b> (in this case, the only user specific features is <b>user_id</b>. Other features are item specific).
 
    ```bash
    python preprocessing/sim_preprocessing.py \
@@ -428,8 +426,7 @@ To train your model using mixed or TF32 precision with Tensor Cores or using FP3
 
    python preprocessing/parquet_to_tfrecord.py \
     --amazon_dataset_path ${PARQUET_PATH} \
-    --tfrecord_output_dir ${TF_RECORD_PATH} \
-    --number_of_user_features ${NUMBER_OF_USER_FEATURES}
+    --tfrecord_output_dir ${TF_RECORD_PATH}
    ```
 
 6. Start training (`${GPU}` is an arbitrary number of GPUs to be used).
@@ -496,10 +493,11 @@ The `main.py` script parameters are detailed in the following table.
 | training        | drop_remainder            | Drop remainder batch for training set (flag)                            | False                     |
 | training        | disable_cache             | Disable dataset caching after the first time it is iterated over (flag)        | False                     |
 | training        | repeat_count              | Repeat training dataset this number of times                            | 0                         |
-| training | prefetch_train_size |  Number of batches to prefetch in training. | -1 |
-| training | prefetch_test_size |  Number of batches to prefetch in evaluation. | -1 |
-| training | train_dataset_size |  Number of samples in training dataset (used to determine prefetch_train_size when --prefetch_train_size < 0) | 11796480 |
+| training | prefetch_train_size |  Number of batches to prefetch in training. | 10 |
+| training | prefetch_test_size |  Number of batches to prefetch in evaluation. | 2 |
 | training | long_seq_length | Determines the long history - short history split of history features | 90 |
| training | prebatch_train_size | Prebatch size applied to the train dataset during preprocessing. | 0 |
| training | prebatch_test_size | Prebatch size applied to the test dataset during preprocessing. | 0 |
 | results         | results_dir               | Path to the model result files storage                                  | /tmp/sim                  |
 | results         | log_filename              | Name of the file to store logger output                                 | log.json                  |
 | results         | save_checkpoint_path      | Directory to save model checkpoints                                     | ""                        |
@@ -511,8 +509,10 @@ The `main.py` script parameters are detailed in the following table.
 | run mode        | affinity                  | Type of CPU affinity                                                    | socket_unique_interleaved |
 | run mode        | inter_op_parallelism      | Number of inter op threads                                              | 0                         |
 | run mode        | intra_op_parallelism      | Number of intra op threads                                              | 0                         |
| run mode        | num_parallel_calls        | Parallelism level for the tf.data API. If None, a heuristic based on the number of CPUs and GPUs is used   |   None  |
 | reproducibility | seed                      | Random seed                                                             | -1                        |
 
+
 ### Command-line options
 
 To view the full list of available options and their descriptions, use the `--help` command-line option, for example:
@@ -534,6 +534,56 @@ The preprocessing steps applied to the raw data include:
 - Determining embedding table sizes for categorical features needed to construct a model
- Filtering users for the training split based on their number of interactions (discarding users with fewer than 20 interactions)
 
+#### Prebatching
+
+The preprocessing scripts can apply batching before the model's dataloader. This reduces the size of the produced TFRecord files and speeds up dataloading.
+To do so, specify `--prebatch_train_size` and `--prebatch_test_size` when converting data with `preprocessing/parquet_to_tfrecord.py`. Later, when running the `main.py` script, pass the applied prebatch sizes via the same parameters.
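As a back-of-the-envelope illustration of the effect (a standalone sketch; `prebatch_stats` is a hypothetical helper, not part of the preprocessing scripts): packing `prebatch_size` samples into one serialized record divides the record count accordingly, with any leftover samples going to a separate remainder file.

```python
def prebatch_stats(num_samples, prebatch_size):
    """Return (full prebatched records, leftover samples).

    The leftover samples correspond to the partial batch that the
    preprocessing writes to a separate remainder file.
    """
    return num_samples // prebatch_size, num_samples % prebatch_size

# The Amazon Books training split (11,796,480 samples) with a 4096 prebatch
# collapses into a few thousand serialized records with no remainder.
full, leftover = prebatch_stats(11_796_480, 4096)
print(full, leftover)  # 2880 0
```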
+
+Example:
+
+Start preprocessing at step 5 of the [Quick Start Guide](#quick-start-guide):
+
+```bash
+python preprocessing/sim_preprocessing.py \
+--amazon_dataset_path ${RAW_DATASET_PATH} \
+--output_path ${PARQUET_PATH}
+
+python preprocessing/parquet_to_tfrecord.py \
+--amazon_dataset_path ${PARQUET_PATH} \
+--tfrecord_output_dir ${TF_RECORD_PATH} \
+--prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+--prebatch_test_size ${PREBATCH_TEST_SIZE}
+```
+
+Then train the model (step 6):
+
+```bash
+mpiexec --allow-run-as-root --bind-to socket -np ${GPU} python main.py \
+--dataset_dir ${TF_RECORD_PATH} \
+--mode train \
+--model_type sim \
+--embedding_dim 16 \
+--drop_remainder \
+--optimizer adam \
+--lr 0.01 \
+--epochs 3 \
+--global_batch_size 131072 \
+--amp \
+--prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+--prebatch_test_size ${PREBATCH_TEST_SIZE}
+```
+
+<details>
+<summary><b>Prebatching details</b></summary>
+
+- The last batch of each split is saved to a separate file, `remainder.tfrecord`, unless there are enough samples to form a full batch.
+- The final batch size used in the main script can be a multiple of the prebatch size.
+- The final batch size used in the main script can also be a divisor of the prebatch size. In that case, during multi-GPU training, each worker can receive more than one batch at a time, which results in an error during the allgather operation. The dataset size, batch size, and prebatch size have to be chosen with this limitation in mind.
+- For the original Amazon Books dataset, PREBATCH_TRAIN_SIZE = PREBATCH_TEST_SIZE = 4096 was used for performance benchmarking purposes.
+</details>
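The multiple/divisor constraint above can be expressed as a small check. This is an illustrative helper, not part of the repository's scripts, and it assumes the global batch is split evenly across GPU workers:

```python
def prebatch_is_compatible(global_batch_size, prebatch_size, num_gpus=1):
    """Rough check of the prebatch constraints described above."""
    # Assumption: the global batch is split evenly across GPU workers.
    per_gpu_batch = global_batch_size // num_gpus
    is_multiple = per_gpu_batch % prebatch_size == 0
    is_divisor = prebatch_size % per_gpu_batch == 0
    if not (is_multiple or is_divisor):
        return False
    # When the prebatch is larger than a worker's batch, one prebatched
    # record yields several batches per worker, which breaks the
    # allgather operation in multi-GPU training.
    if num_gpus > 1 and prebatch_size > per_gpu_batch:
        return False
    return True

# 131072 global batch on 8 GPUs -> 16384 per worker, a multiple of 4096
print(prebatch_is_compatible(131072, 4096, num_gpus=8))   # True
# A 16384 prebatch would span multiple 512-sample worker batches
print(prebatch_is_compatible(4096, 16384, num_gpus=8))    # False
```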
+
+&nbsp;
+
 #### BYO dataset 
 
 This implementation supports using other datasets thanks to BYO dataset functionality. 
@@ -676,7 +726,7 @@ source_spec:
     type: tfrecord
 ```
 
-`dimensions` should contain the length of the history to which the entries will be padded.
+`dimensions` should contain the length of the sequential features.
 
 Note that corresponding features in `negative_history`, `positive_history`, and `target_item_features` need to be listed in the same order in the channel spec for each channel, since they share embedding tables in the model (for example, `item_id` needs to be first and `cat_id` second). 
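This ordering requirement can be verified mechanically. A hypothetical sketch follows: the channel names come from the README, but the helper and the example specs are illustrative only.

```python
def shared_features_aligned(channel_spec):
    """Check that the channels sharing embedding tables list their
    features in the same order."""
    shared = ("negative_history", "positive_history", "target_item_features")
    orders = [channel_spec[name] for name in shared]
    return all(order == orders[0] for order in orders)

aligned_spec = {
    "negative_history": ["item_id", "cat_id"],
    "positive_history": ["item_id", "cat_id"],
    "target_item_features": ["item_id", "cat_id"],
}
swapped_spec = {**aligned_spec, "positive_history": ["cat_id", "item_id"]}

print(shared_features_aligned(aligned_spec))  # True
print(shared_features_aligned(swapped_spec))  # False
```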
 
@@ -705,7 +755,7 @@ For performance reasons, the only supported dataset type is tfrecord.
 
 ### Training process
 
-Training can be run using `main.py` script by specifying the `--mode train` parameter. The speed of training is measured by throughput, that is, the number of samples processed per second. Evaluation is based on the [Area under ROC Curve (ROC AUC)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) metric. Model checkpoints may be stored using Checkpoint manager as specified via (...). Training and inference logs are saved to a directory specified via the `--results_dir` parameter. Mixed precision training is supported via the `--amp` flag. Multi-GPU training is performed using mpiexec and Horovod libraries.
+Training can be run using the `main.py` script by specifying the `--mode train` parameter. The speed of training is measured by throughput, that is, the number of samples processed per second. Evaluation is based on the [Area under ROC Curve (ROC AUC)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) metric. Model checkpoints may be stored using the checkpoint manager via the `--save_checkpoint_path` and `--load_checkpoint_path` parameters. Training and inference logs are saved to the directory specified via the `--results_dir` parameter. Mixed precision training is supported via the `--amp` flag. Multi-GPU training is performed using the mpiexec and Horovod libraries.
 
 ### Inference process
 
@@ -778,7 +828,9 @@ mpiexec --allow-run-as-root --bind-to socket -np ${GPU} python main.py \
   --global_batch_size 131072 \
   --drop_remainder \
   --amp \
-  --benchmark
+  --benchmark \
+  --prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+  --prebatch_test_size ${PREBATCH_TEST_SIZE}
 ```
 
 Equivalent:
@@ -787,7 +839,9 @@ scripts/run_model.sh \
   --data_path ${TF_RECORD_PATH} \
   --gpus ${GPU} \
   --amp 1 \
-  --benchmark 1 
+  --benchmark 1 \
+  --prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+  --prebatch_test_size ${PREBATCH_TEST_SIZE}
 ```
 
 #### Inference performance benchmark
@@ -801,7 +855,9 @@ mpiexec --allow-run-as-root --bind-to socket -np ${GPU} python main.py \
   --model_type sim \
   --global_batch_size 131072 \
   --amp \
-  --benchmark
+  --benchmark \
+  --prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+  --prebatch_test_size ${PREBATCH_TEST_SIZE}
 ```
 
 Equivalent:
@@ -811,7 +867,9 @@ scripts/run_model.sh \
   --gpus ${GPU} \
   --amp 1 \
   --benchmark 1 \
-  --mode inference
+  --mode inference \
+  --prebatch_train_size ${PREBATCH_TRAIN_SIZE} \
+  --prebatch_test_size ${PREBATCH_TEST_SIZE}
 ```
 
 ### Results
@@ -820,7 +877,7 @@ The following sections provide details on how we achieved our performance and ac
 
 #### Training accuracy results
 
-Our results were obtained by running the `run_model.sh` bash script in the TensorFlow2 21.10-py3 NGC container. Experiments were run on 1 and 8 GPUs, with FP32/TF32 Precision and AMP and with XLA-OFF/XLA-ON. Other parameters were set to defaults.
+Our results were obtained by running the `run_model.sh` bash script in the TensorFlow2 21.10-py3 NGC container. Experiments were run on 1 and 8 GPUs, with FP32/TF32 precision and AMP, and with XLA-OFF/XLA-ON. The dataset was prebatched with a size of 16384. Other parameters were set to defaults.
 
 There were 10 runs for each configuration. In the `Training accuracy` sections, average values are reported. In the `Training stability` sections, values from all runs are included in plots.
 
@@ -962,7 +1019,7 @@ Figure 8. ROC curve for different configurations of Ampere/Volta, 1/8 GPUs, doub
 
 #### Training performance results
 
-Our results were obtained by running the `scripts/run_model.sh` script in the TensorFlow2 21.10-py3 NGC container. 
+Our results were obtained by running the `scripts/run_model.sh` script in the TensorFlow2 21.10-py3 NGC container. The dataset was prebatched with a size of 16384.
 
 Numbers were averaged over 10 separate runs for each configuration.
 
@@ -974,12 +1031,12 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
 
-|GPUs |XLA  |Throughput - TF32 (samples/s)  |Throughput - mixed precision (samples/s) |Throughput speedup (mixed precision / TF32)  | Strong scaling - TF32 | Strong scaling - mixed precision |
-|-----|-----|--------------------|------------------------------|---------------------------------------------|-----------|-------------|
-|1    |OFF  |381211.31           |484360.65                     |1.27    | 1.00 | 1.00 |
-|1    |ON   |462012.86           |571727.91                     |1.24    | 1.00 | 1.00 |
-|8    |OFF  |2304284.08          |2475445.94                    |1.07   | 6.04 | 5.11 |
-|8    |ON   |2679300.61          |3006370.96                    |1.12   | 5.80 | 5.26 |
+|   GPUs |   XLA |   Throughput - TF32 (samples/s) |   Throughput - mixed precision (samples/s) |   Throughput speedup (mixed precision / TF32) |   Strong scaling - TF32 |   Strong scaling - mixed precision |
+|-------:|------:|--------------------------------:|-------------------------------------------:|----------------------------------------------:|------------------------:|-----------------------------------:|
+|      1 |     OFF |                       377254.65 |                                  479921.54 |                                          1.27 |                    1.00 |                               1.00 |
+|      1 |     ON |                       455724.01 |                                  565221.04 |                                          1.24 |                    1.00 |                               1.00 |
+|      8 |     OFF |                      2161681.55 |                                 2603489.60 |                                          1.20 |                    5.73 |                               5.42 |
+|      8 |     ON |                      2662368.18 |                                 2979441.80 |                                          1.12 |                    5.84 |                               5.27 |
 
 <details>
 <summary><b>
@@ -990,24 +1047,24 @@ For each configuration of parameters present in the table, the `Speedup` column
 
 |GPUs |Precision      |Speedup |
 |-----|---------------|--------|
-|1    |TF32           |1.212   |
-|1    |AMP            |1.180   |
-|8    |TF32           |1.163   |
-|8    |AMP            |1.214   |
+|1    |TF32           |1.208   |
+|1    |AMP            |1.178   |
+|8    |TF32           |1.232   |
+|8    |AMP            |1.119   |
 </details>
 
 &nbsp;
 
 ##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
 
-|GPUs |XLA  |Throughput - FP32 (samples/s)  |Throughput - mixed precision (samples/s) |Throughput speedup (mixed precision / FP32) | Strong scaling - FP32 | Strong scaling - mixed precision |
-|-----|-----|--------------------|------------------------------|---------------------------------------------|----------|-------------|
-|1    |OFF  |210772.27           |312580.01                     |1.48                                         | 1.00 | 1.00 |
-|1    |ON   |248514.27           |358305.52                     |1.44                                         | 1.00 | 1.00 |
-|8    |OFF  |1357463.39          |1785361.62                    |1.32                                         | 6.44 | 5.71 |
-|8    |ON   |1584757.09          |2091403.04                    |1.32                                         | 7.52 | 6.69 |
-|16   |OFF  |2319719.76          |2837309.15                    |1.22                                         | 11.00 | 9.08 |
-|16   |ON   |2681789.69          |3168488.89                    |1.18                                         | 12.73 | 10.14 |
+|   GPUs |   XLA |   Throughput - FP32 (samples/s) |   Throughput - mixed precision (samples/s) |   Throughput speedup (mixed precision / FP32) |   Strong scaling - FP32 |   Strong scaling - mixed precision |
+|-------:|------:|--------------------------------:|-------------------------------------------:|----------------------------------------------:|------------------------:|-----------------------------------:|
+|      1 |     OFF |                       209376.38 |                                  309752.48 |                                          1.48 |                    1.00 |                               1.00 |
+|      1 |     ON |                       245414.62 |                                  348945.59 |                                          1.42 |                    1.00 |                               1.00 |
+|      8 |     OFF |                      1310239.01 |                                 1689602.79 |                                          1.29 |                    6.26 |                               5.45 |
+|      8 |     ON |                      1483120.32 |                                 1962226.32 |                                          1.32 |                    6.04 |                               5.62 |
+|     16 |     OFF |                      2127221.65 |                                 2555926.79 |                                          1.20 |                   10.16 |                               8.25 |
+|     16 |     ON |                      2450499.40 |                                 2788997.07 |                                          1.14 |                    9.99 |                               7.99 |
 
 <details>
 <summary><b>
@@ -1018,12 +1075,12 @@ For each configuration of parameters present in the table, the `Speedup` column
 
 |GPUs |AMP                 |Speedup        |
 |-----|--------------------|---------------|
-|1    |FP32                |1.179          |
-|1    |AMP                 |1.146          |
-|8    |FP32                |1.167          |
-|8    |AMP                 |1.171          |
-|16   |FP32                |1.156          |
-|16   |AMP                 |1.117          |
+|1    |FP32                |1.172          |
+|1    |AMP                 |1.127          |
+|8    |FP32                |1.132          |
+|8    |AMP                 |1.161          |
+|16   |FP32                |1.152          |
+|16   |AMP                 |1.091          |
 </details>
 
 &nbsp;
@@ -1033,16 +1090,17 @@ For each configuration of parameters present in the table, the `Speedup` column
 NVIDIA DGX A100 / DGX-2 (Ampere / Volta) training speedup
 </b></summary>
 
-|GPUs |XLA    |Precision       |Speedup|
-|-----|-------|---------------|-------|
-|1    |OFF    |TF32/FP32      |1.809  |
-|1    |OFF    |AMP            |1.550  |
-|1    |ON     |TF32/FP32      |1.860  |
-|1    |ON     |AMP            |1.596  |
-|8    |OFF    |TF32/FP32      |1.697  |
-|8    |OFF    |AMP            |1.387  |
-|8    |ON     |TF32/FP32      |1.691  |
-|8    |ON     |AMP            |1.437  |
+
+|   GPUs |   XLA | Precision   |   Speedup |
+|-------:|------:|:------------|----------:|
+|      1 |     OFF | TF32/FP32   |     1.802 |
+|      1 |     OFF | AMP         |     1.549 |
+|      1 |     ON | TF32/FP32   |     1.857 |
+|      1 |     ON | AMP         |     1.620 |
+|      8 |     OFF | TF32/FP32   |     1.650 |
+|      8 |     OFF | AMP         |     1.541 |
+|      8 |     ON | TF32/FP32   |     1.795 |
+|      8 |     ON | AMP         |     1.518 |
 
 </details>
 
@@ -1060,74 +1118,44 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ##### Inference performance: NVIDIA DGX A100 (8x A100 80GB)
 
-|GPUs |Global batch size|XLA  |Throughput - TF32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (mixed precision / TF32)  | Strong scaling - TF32 | Strong scaling - mixed precision |
-|-----|----------|-----|---------------|----------------------------|---------------------------------------------|----------------|---------|
-|1    |4096      |ON   |561967.1       |535674.63                   |0.95                                         | 1.00 | 1.00 |
-|1    |8192      |ON   |670885.47      |758801.43                   |1.13                                         | 1.00 | 1.00 |
-|1    |16384     |ON   |788890.79      |920695.88                   |1.17                                         | 1.00 | 1.00 |
-|1    |32768     |ON   |855056.39      |1035530.23                  |1.21                                         | 1.00 | 1.00 |
-|1    |65536     |ON   |918649.98      |1081408.05                  |1.18                                         | 1.00 | 1.00 |
-|1    |131072    |ON   |918555.37      |771119.78                   |0.84                                         | 1.00 | 1.00 |
-|8    |4096      |ON   |1130031.99     |935848.52                   |0.83                                         | 2.01 | 1.75 |
-|8    |8192      |ON   |2246441.94     |1885511.32                  |0.84                                         | 3.64 | 2.48 |
-|8    |16384     |ON   |4000071.31     |3303417.5                   |0.83                                         | 5.07 | 3.59 |
-|8    |32768     |ON   |5479754.01     |5762298.42                  |1.05                                         | 6.41 | 5.56 |
-|8    |65536     |ON   |6736333.91     |7869825.77                  |1.17                                         | 7.33 | 7.28 |
-|8    |131072    |ON   |7598665.72     |9002545.49                  |1.18                                         | 8.27 | 11.67 |
+|   Batch Size |   XLA |   Throughput - TF32 (samples/s) |   Throughput - mixed precision (samples/s) |   Throughput speedup (mixed precision / TF32) |
+|--------------------:|------:|--------------------------------:|-------------------------------------------:|----------------------------------------------:|
+|                4096 |     ON |                       618547.45 |                                  669640.65 |                                          1.08 |
+|                8192 |     ON |                       722801.14 |                                  849101.88 |                                          1.17 |
+|               16384 |     ON |                       859418.77 |                                 1051361.67 |                                          1.22 |
+|               32768 |     ON |                       976771.70 |                                 1269000.97 |                                          1.30 |
+|               65536 |     ON |                      1082688.51 |                                 1444729.52 |                                          1.33 |
+|              131072 |     ON |                      1094733.64 |                                 1483542.86 |                                          1.36 |
 
 <details>
 <summary><b> Complete table of DGX A100 inference performance results </b></summary>
 
-|GPUSs|Global Batch Size   |XLA    |Precision      |Throughput  (samples/s)           |
-|-----|--------------------|-------|---------------|-----------------------|
-|1    |4096                |OFF    |TF32           |585246.51 ± 10513.06   |
-|1    |8192                |OFF    |TF32           |750729.14 ± 17029.41   |
-|1    |16384               |OFF    |TF32           |803593.59 ± 11207.58   |
-|1    |32768               |OFF    |TF32           |822162.85 ± 5071.85    |
-|1    |65536               |OFF    |TF32           |775748.42 ± 36821.04   |
-|1    |131072              |OFF    |TF32           |644740.49 ± 31148.79   |
-|1    |4096                |OFF    |AMP            |516164.09 ± 9916.80    |
-|1    |8192                |OFF    |AMP            |778740.41 ± 19384.36   |
-|1    |16384               |OFF    |AMP            |932211.18 ± 20331.07   |
-|1    |32768               |OFF    |AMP            |990696.89 ± 11554.34   |
-|1    |65536               |OFF    |AMP            |715678.16 ± 30944.63   |
-|1    |131072              |OFF    |AMP            |611740.50 ± 21392.81   |
-|1    |4096                |ON     |TF32           |561967.10 ± 18100.55   |
-|1    |8192                |ON     |TF32           |670885.47 ± 11149.51   |
-|1    |16384               |ON     |TF32           |788890.79 ± 10058.99   |
-|1    |32768               |ON     |TF32           |855056.39 ± 14349.13   |
-|1    |65536               |ON     |TF32           |918649.98 ± 7571.32    |
-|1    |131072              |ON     |TF32           |918555.37 ± 15036.89   |
-|1    |4096                |ON     |AMP            |535674.63 ± 14003.35   |
-|1    |8192                |ON     |AMP            |758801.43 ± 15225.76   |
-|1    |16384               |ON     |AMP            |920695.88 ± 15325.29   |
-|1    |32768               |ON     |AMP            |1035530.23 ± 16055.40  |
-|1    |65536               |ON     |AMP            |1081408.05 ± 41906.29  |
-|1    |131072              |ON     |AMP            |771119.78 ± 79589.50   |
-|8    |4096                |OFF    |TF32           |765154.17 ± 30582.87   |
-|8    |8192                |OFF    |TF32           |1396414.24 ± 99987.01  |
-|8    |16384               |OFF    |TF32           |2281597.86 ± 77483.79  |
-|8    |32768               |OFF    |TF32           |3555014.42 ± 145944.33 |
-|8    |65536               |OFF    |TF32           |4792413.60 ± 203285.21 |
-|8    |131072              |OFF    |TF32           |5941195.01 ± 182519.72 |
-|8    |4096                |OFF    |AMP            |642706.11 ± 28063.45   |
-|8    |8192                |OFF    |AMP            |1197789.38 ± 47262.95  |
-|8    |16384               |OFF    |AMP            |1961353.19 ± 49818.70  |
-|8    |32768               |OFF    |AMP            |3267263.60 ± 130680.70 |
-|8    |65536               |OFF    |AMP            |4847783.16 ± 257991.99 |
-|8    |131072              |OFF    |AMP            |6413842.15 ± 289543.64 |
-|8    |4096                |ON     |TF32           |1130031.99 ± 75271.24  |
-|8    |8192                |ON     |TF32           |2246441.94 ± 26132.90  |
-|8    |16384               |ON     |TF32           |4000071.31 ± 48054.68  |
-|8    |32768               |ON     |TF32           |5479754.01 ± 170421.20 |
-|8    |65536               |ON     |TF32           |6736333.91 ± 153745.68 |
-|8    |131072              |ON     |TF32           |7598665.72 ± 174188.78 |
-|8    |4096                |ON     |AMP            |935848.52 ± 14583.48   |
-|8    |8192                |ON     |AMP            |1885511.32 ± 22206.00  |
-|8    |16384               |ON     |AMP            |3303417.50 ± 210306.61 |
-|8    |32768               |ON     |AMP            |5762298.42 ± 140412.56 |
-|8    |65536               |ON     |AMP            |7869825.77 ± 305838.69 |
-|8    |131072              |ON     |AMP            |9002545.49 ± 438204.32 |
+|   Batch Size | XLA   | Precision   | Throughput  (samples/s)   |
+|-------------:|:------|:------------|:--------------------------|
+|         4096 | OFF   | TF32        | 708349.73 ± 14161.58      |
+|         8192 | OFF   | TF32        | 873335.82 ± 8539.56       |
+|        16384 | OFF   | TF32        | 937987.79 ± 12114.34      |
+|        32768 | OFF   | TF32        | 943313.07 ± 8631.81       |
+|        65536 | OFF   | TF32        | 960794.46 ± 7388.45       |
+|       131072 | OFF   | TF32        | 966245.27 ± 8637.82       |
+|         4096 | OFF   | AMP         | 645394.94 ± 14844.27      |
+|         8192 | OFF   | AMP         | 919410.07 ± 11355.28      |
+|        16384 | OFF   | AMP         | 1136346.66 ± 14529.91     |
+|        32768 | OFF   | AMP         | 1216810.45 ± 21013.12     |
+|        65536 | OFF   | AMP         | 1287305.05 ± 19373.18     |
+|       131072 | OFF   | AMP         | 1298478.97 ± 10733.67     |
+|         4096 | ON    | TF32        | 618547.45 ± 6569.97       |
+|         8192 | ON    | TF32        | 722801.14 ± 9448.19       |
+|        16384 | ON    | TF32        | 859418.77 ± 10012.61      |
+|        32768 | ON    | TF32        | 976771.70 ± 13377.36      |
+|        65536 | ON    | TF32        | 1082688.51 ± 8523.55      |
+|       131072 | ON    | TF32        | 1094733.64 ± 11157.18     |
+|         4096 | ON    | AMP         | 669640.65 ± 9319.68       |
+|         8192 | ON    | AMP         | 849101.88 ± 14068.04      |
+|        16384 | ON    | AMP         | 1051361.67 ± 15310.42     |
+|        32768 | ON    | AMP         | 1269000.97 ± 23971.56     |
+|        65536 | ON    | AMP         | 1444729.52 ± 18011.54     |
+|       131072 | ON    | AMP         | 1483542.86 ± 6751.29      |
 
 </details>
 
@@ -1138,32 +1166,20 @@ DGX A100 XLA-ON / XLA-OFF inference Speedup
 
 For each configuration of parameters present in the table, the `Speedup` column shows the speedup achieved by turning on XLA.
 
-|GPUs |Global Batch Size   |Precision      |Speedup |
-|-----|--------------------|---------------|--------|
-|1    |4096                |TF32           |0.960   |
-|1    |8192                |TF32           |0.894   |
-|1    |16384               |TF32           |0.982   |
-|1    |32768               |TF32           |1.040   |
-|1    |65536               |TF32           |1.184   |
-|1    |131072              |TF32           |1.425   |
-|1    |4096                |AMP            |1.038   |
-|1    |8192                |AMP            |0.974   |
-|1    |16384               |AMP            |0.988   |
-|1    |32768               |AMP            |1.045   |
-|1    |65536               |AMP            |1.511   |
-|1    |131072              |AMP            |1.261   |
-|8    |4096                |TF32           |1.477   |
-|8    |8192                |TF32           |1.609   |
-|8    |16384               |TF32           |1.753   |
-|8    |32768               |TF32           |1.541   |
-|8    |65536               |TF32           |1.406   |
-|8    |131072              |TF32           |1.279   |
-|8    |4096                |AMP            |1.456   |
-|8    |8192                |AMP            |1.574   |
-|8    |16384               |AMP            |1.684   |
-|8    |32768               |AMP            |1.764   |
-|8    |65536               |AMP            |1.623   |
-|8    |131072              |AMP            |1.404   |
+|Batch Size   |Precision      |Speedup |
+|--------------------|---------------|--------|
+|4096                |TF32           |0.873   |
+|8192                |TF32           |0.828   |
+|16384               |TF32           |0.916   |
+|32768               |TF32           |1.035   |
+|65536               |TF32           |1.127   |
+|131072              |TF32           |1.133   |
+|4096                |AMP            |1.038   |
+|8192                |AMP            |0.924   |
+|16384               |AMP            |0.925   |
+|32768               |AMP            |1.043   |
+|65536               |AMP            |1.187   |
+|131072              |AMP            |1.143   |
 
 </details>
 
@@ -1171,153 +1187,69 @@ For each configuration of parameters present in the table, the `Speedup` column
 
 ##### Inference performance: NVIDIA DGX-2 (16x V100 32GB)
 
-|GPUs |Global batch size|XLA  |Throughput - FP32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (mixed precision / FP32)  | Strong scaling - FP32 | Strong scaling - mixed precision |
-|-----|----------|-----|---------------|----------------------------|---------------------------------------------|--------|--------|
-|1    |4096      |ON   |403479.95      |479051.62                   |1.19                                         | 1.00 | 1.00 |
-|1    |8192      |ON   |480491.12      |600002.95                   |1.25                                         | 1.00 | 1.00 |
-|1    |16384     |ON   |538737.44      |713203.59                   |1.32                                         | 1.00 | 1.00 |
-|1    |32768     |ON   |580958.93      |790782.1                    |1.36                                         | 1.00 | 1.00 |
-|1    |65536     |ON   |586275.07      |818038.44                   |1.40                                         | 1.00 | 1.00 |
-|1    |131072    |ON   |613524.11      |734034.26                   |1.20                                         | 1.00 | 1.00 |
-|8    |4096      |ON   |1059775.22     |909719.3                    |0.86                                         | 2.63 | 1.90 |
-|8    |8192      |ON   |1845819.99     |1752510.62                  |0.95                                         | 3.84 | 2.92 |
-|8    |16384     |ON   |2801114.77     |2898423.08                  |1.03                                         | 5.20 | 4.06 |
-|8    |32768     |ON   |3396766.27     |4102026.01                  |1.21                                         | 5.85 | 5.19 |
-|8    |65536     |ON   |3911994.39     |4725023.23                  |1.21                                         | 6.67 | 5.78 |
-|8    |131072    |ON   |4197603.74     |5413542.58                  |1.29                                         | 6.84 | 7.38 |
-|16   |4096      |ON   |1142272.86     |924525.38                   |0.81                                         | 2.83 | 1.93 |
-|16   |8192      |ON   |2068920.7      |1917814.81                  |0.93                                         | 4.31 | 3.20 |
-|16   |16384     |ON   |3091676.83     |3496153.45                  |1.13                                         | 5.74 | 4.90 |
-|16   |32768     |ON   |5132772.75     |5063615.77                  |0.99                                         | 8.84 | 6.40 |
-|16   |65536     |ON   |6553882.87     |8247475.75                  |1.26                                         | 11.18 | 10.08 |
-|16   |131072    |ON   |7555906.17     |9571965.84                  |1.27                                         | 12.32 | 13.04 |
+|   Batch Size |   XLA |   Throughput - FP32 (samples/s) |   Throughput - mixed precision (samples/s) |   Throughput speedup (mixed precision / FP32) |
+|--------------------:|------:|--------------------------------:|-------------------------------------------:|----------------------------------------------:|
+|                4096 |     ON |                       444532.22 |                                  541975.24 |                                          1.22 |
+|                8192 |     ON |                       505047.64 |                                  642784.48 |                                          1.27 |
+|               16384 |     ON |                       549325.54 |                                  727077.63 |                                          1.32 |
+|               32768 |     ON |                       587452.73 |                                  788606.35 |                                          1.34 |
+|               65536 |     ON |                       605187.67 |                                  832651.59 |                                          1.38 |
+|              131072 |     ON |                       599557.03 |                                  840602.90 |                                          1.40 |
 
 <details>
 <summary><b>
-Complete table of DGX2 inference performance results
+Complete table of DGX-2 inference performance results
 </b></summary>
 
-|GPUs |Global Batch Size   |XLA    |Precision      |Throughput (samples/s)           |
-|-----|--------------------|-------|---------------|-----------------------|
-|1    |4096                |OFF    |FP32           |459149.07 ± 20971.34   |
-|1    |8192                |OFF    |FP32           |488763.98 ± 15037.09   |
-|1    |16384               |OFF    |FP32           |516804.05 ± 8355.49    |
-|1    |32768               |OFF    |FP32           |534387.97 ± 4763.49    |
-|1    |65536               |OFF    |FP32           |536215.89 ± 5794.77    |
-|1    |131072              |OFF    |FP32           |538646.76 ± 6359.47    |
-|1    |4096                |OFF    |AMP            |488475.14 ± 6226.30    |
-|1    |8192                |OFF    |AMP            |632098.48 ± 27370.49   |
-|1    |16384               |OFF    |AMP            |705878.12 ± 7852.19    |
-|1    |32768               |OFF    |AMP            |739740.73 ± 6866.73    |
-|1    |65536               |OFF    |AMP            |618291.18 ± 26749.52   |
-|1    |131072              |OFF    |AMP            |544071.41 ± 19200.23   |
-|1    |4096                |ON     |FP32           |403479.95 ± 4079.19    |
-|1    |8192                |ON     |FP32           |480491.12 ± 6828.93    |
-|1    |16384               |ON     |FP32           |538737.44 ± 10932.49   |
-|1    |32768               |ON     |FP32           |580958.93 ± 9544.37    |
-|1    |65536               |ON     |FP32           |586275.07 ± 7640.59    |
-|1    |131072              |ON     |FP32           |613524.11 ± 7931.04    |
-|1    |4096                |ON     |AMP            |479051.62 ± 6076.26    |
-|1    |8192                |ON     |AMP            |600002.95 ± 16380.88   |
-|1    |16384               |ON     |AMP            |713203.59 ± 9515.25    |
-|1    |32768               |ON     |AMP            |790782.10 ± 10788.69   |
-|1    |65536               |ON     |AMP            |818038.44 ± 14132.80   |
-|1    |131072              |ON     |AMP            |734034.26 ± 34664.74   |
-|8    |4096                |OFF    |FP32           |502947.25 ± 105758.96  |
-|8    |8192                |OFF    |FP32           |809285.58 ± 112765.45  |
-|8    |16384               |OFF    |FP32           |1974085.95 ± 476616.90 |
-|8    |32768               |OFF    |FP32           |2990517.14 ± 645490.89 |
-|8    |65536               |OFF    |FP32           |3662830.22 ± 191010.11 |
-|8    |131072              |OFF    |FP32           |3978985.17 ± 142801.19 |
-|8    |4096                |OFF    |AMP            |596945.98 ± 92977.56   |
-|8    |8192                |OFF    |AMP            |730694.36 ± 67972.28   |
-|8    |16384               |OFF    |AMP            |1758189.25 ± 340547.41 |
-|8    |32768               |OFF    |AMP            |3873856.45 ± 528746.35 |
-|8    |65536               |OFF    |AMP            |4863371.50 ± 297299.34 |
-|8    |131072              |OFF    |AMP            |5134261.52 ± 473726.31 |
-|8    |4096                |ON     |FP32           |1059775.22 ± 24386.54  |
-|8    |8192                |ON     |FP32           |1845819.99 ± 250767.40 |
-|8    |16384               |ON     |FP32           |2801114.77 ± 210397.18 |
-|8    |32768               |ON     |FP32           |3396766.27 ± 221795.61 |
-|8    |65536               |ON     |FP32           |3911994.39 ± 239259.17 |
-|8    |131072              |ON     |FP32           |4197603.74 ± 158110.80 |
-|8    |4096                |ON     |AMP            |909719.30 ± 135634.13  |
-|8    |8192                |ON     |AMP            |1752510.62 ± 87042.91  |
-|8    |16384               |ON     |AMP            |2898423.08 ± 231659.28 |
-|8    |32768               |ON     |AMP            |4102026.01 ± 254242.94 |
-|8    |65536               |ON     |AMP            |4725023.23 ± 322597.53 |
-|8    |131072              |ON     |AMP            |5413542.58 ± 364633.26 |
-|16   |4096                |OFF    |FP32           |865109.29 ± 40032.58   |
-|16   |8192                |OFF    |FP32           |1565843.18 ± 305582.99 |
-|16   |16384               |OFF    |FP32           |3109303.21 ± 240314.57 |
-|16   |32768               |OFF    |FP32           |5750753.42 ± 898435.09 |
-|16   |65536               |OFF    |FP32           |6456324.48 ± 730326.61 |
-|16   |131072              |OFF    |FP32           |7415730.04 ± 434928.14 |
-|16   |4096                |OFF    |AMP            |742890.53 ± 27541.80   |
-|16   |8192                |OFF    |AMP            |1468615.49 ± 67548.46  |
-|16   |16384               |OFF    |AMP            |2591245.05 ± 394504.75 |
-|16   |32768               |OFF    |AMP            |4671719.91 ± 721705.81 |
-|16   |65536               |OFF    |AMP            |7982733.55 ± 1242742.25|
-|16   |131072              |OFF    |AMP            |9867894.78 ± 679119.71 |
-|16   |4096                |ON     |FP32           |1142272.86 ± 43154.49  |
-|16   |8192                |ON     |FP32           |2068920.70 ± 130214.35 |
-|16   |16384               |ON     |FP32           |3091676.83 ± 991449.61 |
-|16   |32768               |ON     |FP32           |5132772.75 ± 525201.10 |
-|16   |65536               |ON     |FP32           |6553882.87 ± 400638.86 |
-|16   |131072              |ON     |FP32           |7555906.17 ± 626110.02 |
-|16   |4096                |ON     |AMP            |924525.38 ± 163488.57  |
-|16   |8192                |ON     |AMP            |1917814.81 ± 59114.71  |
-|16   |16384               |ON     |AMP            |3496153.45 ± 190771.71 |
-|16   |32768               |ON     |AMP            |5063615.77 ± 1281699.58|
-|16   |65536               |ON     |AMP            |8247475.75 ± 539827.60 |
-|16   |131072              |ON     |AMP            |9571965.84 ± 764075.50 |
+|   Batch Size | XLA   | Precision   | Throughput  (samples/s)   |
+|-------------:|:------|:------------|:--------------------------|
+|         4096 | OFF   | FP32        | 459175.30 ± 23184.33      |
+|         8192 | OFF   | FP32        | 499179.20 ± 15967.26      |
+|        16384 | OFF   | FP32        | 525180.72 ± 2521.56       |
+|        32768 | OFF   | FP32        | 532042.10 ± 4020.44       |
+|        65536 | OFF   | FP32        | 534307.20 ± 7276.26       |
+|       131072 | OFF   | FP32        | 532311.44 ± 6195.16       |
+|         4096 | OFF   | AMP         | 581771.66 ± 6163.50       |
+|         8192 | OFF   | AMP         | 665048.04 ± 4607.95       |
+|        16384 | OFF   | AMP         | 716355.19 ± 7174.98       |
+|        32768 | OFF   | AMP         | 741642.61 ± 4981.04       |
+|        65536 | OFF   | AMP         | 755141.25 ± 6175.05       |
+|       131072 | OFF   | AMP         | 744459.46 ± 8183.17       |
+|         4096 | ON    | FP32        | 444532.22 ± 6239.01       |
+|         8192 | ON    | FP32        | 505047.64 ± 6543.06       |
+|        16384 | ON    | FP32        | 549325.54 ± 2841.21       |
+|        32768 | ON    | FP32        | 587452.73 ± 2366.43       |
+|        65536 | ON    | FP32        | 605187.67 ± 3740.07       |
+|       131072 | ON    | FP32        | 599557.03 ± 11811.28      |
+|         4096 | ON    | AMP         | 541975.24 ± 4441.93       |
+|         8192 | ON    | AMP         | 642784.48 ± 4721.08       |
+|        16384 | ON    | AMP         | 727077.63 ± 5332.80       |
+|        32768 | ON    | AMP         | 788606.35 ± 11705.36      |
+|        65536 | ON    | AMP         | 832651.59 ± 10401.17      |
+|       131072 | ON    | AMP         | 840602.90 ± 16358.73      |
 </details>
 
 <details>
 <summary><b>
-DGX A100 XLA-ON / XLA-OFF inference speedup
+DGX-2 XLA-ON / XLA-OFF inference speedup
 </b></summary>
 
 For each configuration of parameters present in the table, the `Speedup` column shows the speedup achieved by turning on XLA.
 
-|GPUs |Global Batch Size   |Precision      |Speedup |
-|-----|--------------------|---------------|--------|
-|1    |4096                |FP32           |0.879   |
-|1    |8192                |FP32           |0.983   |
-|1    |16384               |FP32           |1.042   |
-|1    |32768               |FP32           |1.087   |
-|1    |65536               |FP32           |1.093   |
-|1    |131072              |FP32           |1.139   |
-|1    |4096                |AMP            |0.981   |
-|1    |8192                |AMP            |0.949   |
-|1    |16384               |AMP            |1.010   |
-|1    |32768               |AMP            |1.069   |
-|1    |65536               |AMP            |1.323   |
-|1    |131072              |AMP            |1.349   |
-|8    |4096                |FP32           |2.107   |
-|8    |8192                |FP32           |2.281   |
-|8    |16384               |FP32           |1.419   |
-|8    |32768               |FP32           |1.136   |
-|8    |65536               |FP32           |1.068   |
-|8    |131072              |FP32           |1.055   |
-|8    |4096                |AMP            |1.524   |
-|8    |8192                |AMP            |2.398   |
-|8    |16384               |AMP            |1.649   |
-|8    |32768               |AMP            |1.059   |
-|8    |65536               |AMP            |0.972   |
-|8    |131072              |AMP            |1.054   |
-|16   |4096                |FP32           |1.320   |
-|16   |8192                |FP32           |1.321   |
-|16   |16384               |FP32           |0.994   |
-|16   |32768               |FP32           |0.893   |
-|16   |65536               |FP32           |1.015   |
-|16   |131072              |FP32           |1.019   |
-|16   |4096                |AMP            |1.244   |
-|16   |8192                |AMP            |1.306   |
-|16   |16384               |AMP            |1.349   |
-|16   |32768               |AMP            |1.084   |
-|16   |65536               |AMP            |1.033   |
-|16   |131072              |AMP            |0.970   |
+|Batch Size   |Precision      |Speedup |
+|--------------------|---------------|--------|
+|4096                |FP32           |0.968   |
+|8192                |FP32           |1.012   |
+|16384               |FP32           |1.046   |
+|32768               |FP32           |1.104   |
+|65536               |FP32           |1.133   |
+|131072              |FP32           |1.126   |
+|4096                |AMP            |0.932   |
+|8192                |AMP            |0.967   |
+|16384               |AMP            |1.015   |
+|32768               |AMP            |1.063   |
+|65536               |AMP            |1.103   |
+|131072              |AMP            |1.129   |
 </details>
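For clarity, each `Speedup` value is simply the ratio of XLA-ON to XLA-OFF throughput. A quick check against the FP32, batch-size-32768 rows of the complete table above:

```python
# Speedup = throughput with XLA ON / throughput with XLA OFF
# (values taken from the FP32, batch size 32768 rows of the complete table)
xla_off = 532042.10  # samples/s, XLA OFF
xla_on = 587452.73   # samples/s, XLA ON
print(round(xla_on / xla_off, 3))  # 1.104
```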
 
 &nbsp;
@@ -1327,56 +1259,32 @@ For each configuration of parameters present in the table, the `Speedup` column
 NVIDIA A100 / DGX-2 (Ampere / Volta) inference speedup
 </b></summary>
 
-|GPUs |Global Batch Size   |XLA    |Precision      |Speedup |
-|-----|--------------------|-------|---------------|--------|
-|1    |4096                |OFF    |TF32/FP32      |1.275   |
-|1    |8192                |OFF    |TF32/FP32      |1.536   |
-|1    |16384               |OFF    |TF32/FP32      |1.555   |
-|1    |32768               |OFF    |TF32/FP32      |1.539   |
-|1    |65536               |OFF    |TF32/FP32      |1.447   |
-|1    |131072              |OFF    |TF32/FP32      |1.197   |
-|1    |4096                |OFF    |AMP            |1.057   |
-|1    |8192                |OFF    |AMP            |1.232   |
-|1    |16384               |OFF    |AMP            |1.321   |
-|1    |32768               |OFF    |AMP            |1.339   |
-|1    |65536               |OFF    |AMP            |1.158   |
-|1    |131072              |OFF    |AMP            |1.124   |
-|1    |4096                |ON     |TF32/FP32      |1.393   |
-|1    |8192                |ON     |TF32/FP32      |1.396   |
-|1    |16384               |ON     |TF32/FP32      |1.464   |
-|1    |32768               |ON     |TF32/FP32      |1.472   |
-|1    |65536               |ON     |TF32/FP32      |1.567   |
-|1    |131072              |ON     |TF32/FP32      |1.497   |
-|1    |4096                |ON     |AMP            |1.118   |
-|1    |8192                |ON     |AMP            |1.265   |
-|1    |16384               |ON     |AMP            |1.291   |
-|1    |32768               |ON     |AMP            |1.310   |
-|1    |65536               |ON     |AMP            |1.322   |
-|1    |131072              |ON     |AMP            |1.051   |
-|8    |4096                |OFF    |TF32/FP32      |1.521   |
-|8    |8192                |OFF    |TF32/FP32      |1.725   |
-|8    |16384               |OFF    |TF32/FP32      |1.156   |
-|8    |32768               |OFF    |TF32/FP32      |1.189   |
-|8    |65536               |OFF    |TF32/FP32      |1.308   |
-|8    |131072              |OFF    |TF32/FP32      |1.493   |
-|8    |4096                |OFF    |AMP            |1.077   |
-|8    |8192                |OFF    |AMP            |1.639   |
-|8    |16384               |OFF    |AMP            |1.116   |
-|8    |32768               |OFF    |AMP            |0.843   |
-|8    |65536               |OFF    |AMP            |0.997   |
-|8    |131072              |OFF    |AMP            |1.249   |
-|8    |4096                |ON     |TF32/FP32      |1.066   |
-|8    |8192                |ON     |TF32/FP32      |1.217   |
-|8    |16384               |ON     |TF32/FP32      |1.428   |
-|8    |32768               |ON     |TF32/FP32      |1.613   |
-|8    |65536               |ON     |TF32/FP32      |1.722   |
-|8    |131072              |ON     |TF32/FP32      |1.810   |
-|8    |4096                |ON     |AMP            |1.029   |
-|8    |8192                |ON     |AMP            |1.076   |
-|8    |16384               |ON     |AMP            |1.140   |
-|8    |32768               |ON     |AMP            |1.405   |
-|8    |65536               |ON     |AMP            |1.666   |
-|8    |131072              |ON     |AMP            |1.663   |
+|   Batch Size | XLA   | Precision   |   Speedup |
+|-------------:|:------|:------------|----------:|
+|         4096 | OFF   | TF32/FP32   |      1.54 |
+|         8192 | OFF   | TF32/FP32   |      1.75 |
+|        16384 | OFF   | TF32/FP32   |      1.79 |
+|        32768 | OFF   | TF32/FP32   |      1.77 |
+|        65536 | OFF   | TF32/FP32   |      1.80 |
+|       131072 | OFF   | TF32/FP32   |      1.81 |
+|         4096 | OFF   | AMP         |      1.11 |
+|         8192 | OFF   | AMP         |      1.38 |
+|        16384 | OFF   | AMP         |      1.59 |
+|        32768 | OFF   | AMP         |      1.64 |
+|        65536 | OFF   | AMP         |      1.71 |
+|       131072 | OFF   | AMP         |      1.74 |
+|         4096 | ON    | TF32/FP32   |      1.39 |
+|         8192 | ON    | TF32/FP32   |      1.43 |
+|        16384 | ON    | TF32/FP32   |      1.56 |
+|        32768 | ON    | TF32/FP32   |      1.66 |
+|        65536 | ON    | TF32/FP32   |      1.79 |
+|       131072 | ON    | TF32/FP32   |      1.83 |
+|         4096 | ON    | AMP         |      1.24 |
+|         8192 | ON    | AMP         |      1.32 |
+|        16384 | ON    | AMP         |      1.45 |
+|        32768 | ON    | AMP         |      1.61 |
+|        65536 | ON    | AMP         |      1.74 |
+|       131072 | ON    | AMP         |      1.76 |
 </details>
 
 &nbsp;
@@ -1388,10 +1296,12 @@ NVIDIA A100 / DGX-2 (Ampere / Volta) inference speedup
 May 2022
 - Initial release
 
-### Known issues
+November 2022
+- Moved batching and padding operations to preprocessing
+- Added support for prebatched samples during dataloading
+- Reduced throughput variance (previously appearing mainly during inference)
 
-- While benchmarking inference on a single GPU, sometimes throughput drops drastically in the middle of the epoch and remains low until the end of the epoch.
-- On a multi-GPU setup, the summary of throughput (in the last line of the logfile) is lower than it would result from each step`s throughput (sample/s). It is probably the case when a single GPU is slower than the one on the logging node. In this case, the overhead for synchronization before the final throughput calculation is higher than usual.
+### Known issues
- The SIM model results are non-deterministic, even when using the same random seed. The source of this non-determinism is the [tf.math.unsorted_segment_sum](https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum) operation called within an optimization step. Its influence depends on the distribution of categorical data within a batch, and the issue is more severe for momentum-based optimizers. A potential solution is to use a deterministic version of this op, which allows perfect reproduction but makes training up to six times slower.
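A minimal NumPy sketch of what `unsorted_segment_sum` computes (illustration only; on GPU, TensorFlow implements this accumulation with atomic adds, so the floating-point summation order, and therefore the result, can vary between runs):

```python
import numpy as np

def unsorted_segment_sum(data, segment_ids, num_segments):
    # Deterministic host-side emulation: accumulate each element into the
    # output row selected by its segment id, in a fixed iteration order.
    out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
    for value, seg in zip(data, segment_ids):
        out[seg] += value
    return out

data = np.array([1.0, 2.0, 3.0, 4.0])
segment_ids = np.array([0, 1, 0, 1])
print(unsorted_segment_sum(data, segment_ids, 2))  # [4. 6.]
```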
 
 

+ 43 - 20
TensorFlow2/Recommendation/SIM/main.py

@@ -93,8 +93,8 @@ def init_logger(results_dir, filename):
 
 
 # In the future, select one of available dataloaders there (tfrecord, csv, etc...)
-def get_data_iterator(paths, feature_spec, batch_size, num_gpus, long_seq_length, prefetch_size, repeat_count=0,
-                      drop_remainder=False, amp=False, disable_cache=False):
+def get_data_iterator(paths, feature_spec, batch_size, num_gpus, long_seq_length, prefetch_size, num_parallel_calls=None, repeat_count=0,
+                      drop_remainder=False, amp=False, disable_cache=False, prebatch_size=0):
     return get_dataloader_tfrecord(
         paths,
         feature_spec=feature_spec,
@@ -105,7 +105,9 @@ def get_data_iterator(paths, feature_spec, batch_size, num_gpus, long_seq_length
         drop_remainder=drop_remainder,
         repeat_count=repeat_count,
         disable_cache=disable_cache,
-        prefetch_buffer_size=prefetch_size
+        prefetch_buffer_size=prefetch_size,
+        num_parallel_calls=num_parallel_calls,
+        prebatch_size=prebatch_size
     )
 
 
@@ -243,10 +245,24 @@ def eval(model_fn, data_iterator, num_thresholds=8000, prefix=""):
         local_targets.append(targets)
         local_total_losses.append(loss_dict["total_loss"])
 
-    # concat all local variables into a single tensor
-    logits = tf.concat(local_logits, 0)
-    targets = tf.concat(local_targets, 0)
-    total_losses = tf.concat(local_total_losses, 0)
+    locals = [local_logits, local_targets, local_total_losses]
+    for i, local in enumerate(locals):
+
+        # wrap empty lists in tensor to allow tf.concat
+        if len(local) == 0:
+            local = tf.constant(local)
+
+        # concat all local variables into a single tensor
+        local = tf.concat(local, 0)
+
+        # for single element lists, tf.concat will produce shape=() instead of shape=(1,).
+        # reshape it for hvd.allgather to work
+        if len(local.shape) == 0:
+            local = tf.reshape(local, -1)
+
+        locals[i] = local
+    
+    logits, targets, total_losses = locals
 
     if distributed:
         # gather from all nodes
@@ -455,6 +471,9 @@ def inference(model, data_iterator, benchmark, performance_calculator):
 @click.option(
     "--global_batch_size", default=131072, help="Batch size used to train/eval the model.", type=int
 )
+@click.option(
+    "--num_parallel_calls", default=None, help="Parallelism level for tf.data API. If None, heuristic based on number of CPUs and number of GPUs will be used."
+)
 @click.option(
     "--epochs", default=3, help="Train for the following number of epochs.", type=int
 )
@@ -521,10 +540,8 @@ def inference(model, data_iterator, benchmark, performance_calculator):
 )
 @click.option(
     "--prefetch_train_size",
-    default=-1,
+    default=10,
     help="Number of batches to prefetch in training. "
-    "If == 0: No prefetching is done. "
-    "If < 0: Prefetch size is set to train_dataset_size // global_batch_size. ",
 )
 @click.option(
     "--prefetch_test_size",
@@ -532,9 +549,14 @@ def inference(model, data_iterator, benchmark, performance_calculator):
     help="Number of batches to prefetch in testing"
 )
 @click.option(
-    "--train_dataset_size",
-    default=11796480,
-    help="Number of train samples. Used to set prefetching size (see --prefetch_train_size for more information."
+    "--prebatch_train_size",
+    default=0,
+    help="Information about batch size applied during preprocessing to train dataset"
+)
+@click.option(
+    "--prebatch_test_size",
+    default=0,
+    help="Information about batch size applied during preprocessing to test dataset"
 )
 def main(
         mode: str,
@@ -554,6 +576,7 @@ def main(
         weight_decay: float,
         embedding_dim: int,
         global_batch_size: int,
+        num_parallel_calls: int,
         epochs: int,
         disable_cache: bool,
         drop_remainder: bool,
@@ -570,7 +593,8 @@ def main(
         intra_op_parallelism: int,
         prefetch_train_size: int,
         prefetch_test_size: int,
-        train_dataset_size: int
+        prebatch_train_size: int,
+        prebatch_test_size: int
 ):
     hvd.init()
 
@@ -636,20 +660,19 @@ def main(
     # since each tfrecord file must include all of the features, it is enough to read first chunk for each split. 
     train_files = [dataset_dir / file for file in feature_spec.source_spec[TRAIN_MAPPING][0][FILES_SELECTOR]]
 
-    if prefetch_train_size < 0:
-        prefetch_train_size = train_dataset_size // global_batch_size
-
     data_iterator_train = get_data_iterator(
         train_files, feature_spec, batch_size, num_gpus, long_seq_length,
         repeat_count=repeat_count, drop_remainder=drop_remainder,
-        amp=amp, disable_cache=disable_cache, prefetch_size=prefetch_train_size
+        amp=amp, disable_cache=disable_cache, prefetch_size=prefetch_train_size,
+        num_parallel_calls=num_parallel_calls, prebatch_size=prebatch_train_size
     )
 
     if mode == "train":
         test_files = [dataset_dir / file for file in feature_spec.source_spec[TEST_MAPPING][0][FILES_SELECTOR]]
         data_iterator_test = get_data_iterator(
             test_files, feature_spec, batch_size, num_gpus, long_seq_length,
-            amp=amp, disable_cache=disable_cache, prefetch_size=prefetch_test_size
+            amp=amp, disable_cache=disable_cache, prefetch_size=prefetch_test_size, num_parallel_calls=num_parallel_calls,
+            prebatch_size=prebatch_test_size
         )
     else:
         data_iterator_test = []  # otherwise not used
@@ -689,4 +712,4 @@ def main(
 
 
 if __name__ == "__main__":
-    main()
+    main()

+ 33 - 0
TensorFlow2/Recommendation/SIM/preprocessing/ops.py

@@ -73,6 +73,39 @@ def _preserve_data(offsets, values, new_values):
             new_values[i] = values[rowid]
 
 
+@numba.cuda.jit
+def _slice_rjust(max_elements, offsets, elements, new_offsets, new_elements):
+    rowid = numba.cuda.grid(1)
+    if rowid < new_offsets.size - 1:
+        row_size = min(offsets[rowid + 1] - offsets[rowid], max_elements)
+        offset = offsets[rowid + 1] - row_size
+        new_start = new_offsets[rowid + 1] - row_size
+
+        for i in range(row_size):
+            new_elements[new_start + i] = elements[offset + i]
+
+
+def slice_and_pad_left(seq_col, max_elements, pad_value=0):
+    c = seq_col._column
+    offsets = c.offsets.values
+    elements = c.elements.values
+
+    threads = THREADS
+    blocks = (offsets.size + threads - 1) // threads
+
+    new_offsets = cupy.arange(offsets.size, dtype=offsets.dtype) * max_elements
+
+    new_elements = cupy.full(
+        new_offsets[-1].item(), fill_value=pad_value, dtype=elements.dtype
+    )
+    _slice_rjust[blocks, threads](
+        max_elements, offsets, elements, new_offsets, new_elements
+    )
+
+    new_col = nvt_build_list_column(new_elements, new_offsets)
+    return new_col
+
+
 class ExplodeSequence:
     """
     For each row create a new one with a subsequence of the original list columns.

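The `slice_and_pad_left` helper introduced above keeps at most the last `max_elements` entries of each variable-length row, right-justified, filling the left side with `pad_value`. A host-side NumPy sketch of the same semantics (the CUDA kernel is the actual implementation; this is illustration only):

```python
import numpy as np

def slice_and_pad_left(rows, max_elements, pad_value=0):
    # Keep the tail (most recent history) of each row, right-justified,
    # left-padded with pad_value up to max_elements.
    out = np.full((len(rows), max_elements), pad_value, dtype=np.int64)
    for i, row in enumerate(rows):
        kept = row[-max_elements:]                 # last max_elements items
        out[i, max_elements - len(kept):] = kept   # right-justify
    return out

print(slice_and_pad_left([[5, 6, 7, 8], [9]], 3))
# [[6 7 8]
#  [0 0 9]]
```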
+ 108 - 68
TensorFlow2/Recommendation/SIM/preprocessing/parquet_to_tfrecord.py

@@ -21,9 +21,11 @@ from functools import partial
 
 import click
 import pandas as pd
+import numpy as np
 import tensorflow as tf
 
 from sim.data.feature_spec import FeatureSpec
+from sim.data.defaults import TRAIN_MAPPING, TEST_MAPPING, REMAINDER_FILENAME, FILES_SELECTOR
 
 # Docker image sets it to "python" for NVTabular purposes (bugfix), which slows down the script 20x
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp"
@@ -34,46 +36,31 @@ logging.basicConfig(
     format="[%(asctime)s] %(levelname)s: %(message)s",
 )
 
-
-def _int64_feature(value, islist=False):
-    """Returns an int64_list from a bool / enum / int / uint."""
-    if not islist:
-        value = [value]
-    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-
-
-def process_chunk(df, sequential_data_start):
-    feature_values_lists = [df.iloc[:, i].values for i in range(sequential_data_start)]
-
-    for i in range(sequential_data_start, df.shape[1]):
-        values = df.iloc[:, i].values.tolist()
-        feature_values_lists.append(values)
-
-    return zip(*feature_values_lists)
-
-
-def prepare_record(sample, all_feature_names, sequential_data_start):
-
+def prepare_record(sample, all_feature_names, sequential_data_start, prebatch):
     feature = {}
-    for idx, (f_name, data) in enumerate(zip(all_feature_names, sample)):
-        islist = idx >= sequential_data_start
-        feature[f_name] = _int64_feature(data, islist)
+    for idx, (f_name, data) in enumerate(zip(all_feature_names, sample.values())):
+        if idx >= sequential_data_start:
+            if prebatch:
+                data = np.array(data).flatten()
+        else:
+            if not prebatch:
+                data = [data]
 
-    record_bytes = tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
-    return record_bytes
+        feature[f_name] = tf.train.Feature(int64_list=tf.train.Int64List(value=data))
 
+    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
 
-def create_default_feature_spec(user_features_cardinalities, item_features_cardinalities,
-                                max_seq_len, tfrecord_output_dir, train_output_file, test_output_file):
+def save_records(output_path, records, base_output_path, feature_spec, mapping):
 
-    train_output = tfrecord_output_dir / train_output_file
-    test_output = tfrecord_output_dir / test_output_file
+    with tf.io.TFRecordWriter(str(output_path)) as file_writer:
+        for record_bytes in records:
+            file_writer.write(record_bytes)
 
-    f_spec = FeatureSpec.get_default_feature_spec(user_features_cardinalities, item_features_cardinalities,
-                                                  max_seq_len, train_output, test_output)
+    feature_spec.source_spec[mapping][0][FILES_SELECTOR].append(
+        str(output_path.relative_to(base_output_path))
+    )
 
-    save_path = tfrecord_output_dir / 'feature_spec.yaml'
-    f_spec.to_yaml(save_path)
+    logging.info(f'Created: {output_path}')
 
 
 @click.command()
@@ -91,14 +78,15 @@ def create_default_feature_spec(user_features_cardinalities, item_features_cardi
 )
 @click.option(
     "--number_of_user_features",
-    required=True,
-    help="number of user specific features.",
+    default=1,
+    help="number of user specific features. Default is 1 for amazon books dataset (user_id).",
     type=int
 )
 @click.option(
     "--max_seq_len",
     default=100,
-    help="maximum possible length of history. (Entries will be padded to that length later)."
+    help="maximum possible length of history. (Entries will be padded to that length later).",
+    type=int
 )
 @click.option(
     "--n_proc",
@@ -109,30 +97,57 @@ def create_default_feature_spec(user_features_cardinalities, item_features_cardi
 @click.option(
     "--train_split_dir",
     default='train',
-    help="name of directory within amazon dataset directory containing train data."
+    help="Name of directory within amazon dataset directory containing train data.",
+    type=str
 )
 @click.option(
     "--test_split_dir",
     default='test',
-    help="name of directory within amazon dataset directory containing test data."
+    help="Name of directory within amazon dataset directory containing test data.",
+    type=str,
 )
 @click.option(
     "--metadata_file",
     default='metadata.json',
-    help="name of metadata file within amazon dataset directory (containing feature cardinalities)."
+    help="Name of metadata file within amazon dataset directory (containing feature cardinalities).",
+    type=str
 )
 @click.option(
-    "--train_output_file",
-    default='train.tfrecord',
-    help='name of train file within output directory.',
+    "--train_output_dir",
+    default='train',
+    help="Name of train directory within output directory.",
     type=str
 )
 @click.option(
-    "--test_output_file",
-    default='test.tfrecord',
-    help='name of test file within output directory.',
+    "--test_output_dir",
+    default='test',
+    help='Name of test directory within output directory.',
     type=str
 )
+@click.option(
+    "--train_parts",
+    default=8,
+    help="Number of output train files.",
+    type=int
+)
+@click.option(
+    "--test_parts",
+    default=4,
+    help="Number of output test files.",
+    type=int
+)
+@click.option(
+    "--prebatch_train_size",
+    default=0,
+    help='Prebatch size applied to train data in preprocessing. If prebatch_train_size == 0, no prebatching is done.',
+    type=int
+)
+@click.option(
+    "--prebatch_test_size",
+    default=0,
+    help='Prebatch size applied to test data in preprocessing. If prebatch_test_size == 0, no prebatching is done.',
+    type=int
+)
 def main(
         amazon_dataset_path: str,
         tfrecord_output_dir: str,
@@ -142,8 +157,12 @@ def main(
         train_split_dir: str,
         test_split_dir: str,
         metadata_file: str,
-        train_output_file: str,
-        test_output_file: str
+        train_output_dir: str,
+        test_output_dir: str,
+        train_parts: int,
+        test_parts: int,
+        prebatch_train_size: int,
+        prebatch_test_size: int
 ):
     """
     read_parquet()
@@ -160,11 +179,12 @@ def main(
         amazon_dataset_path / test_split_dir
     ]
 
-    os.makedirs(tfrecord_output_dir, exist_ok=True)
     output_splits = [
-        tfrecord_output_dir / train_output_file,
-        tfrecord_output_dir / test_output_file
+        tfrecord_output_dir / train_output_dir,
+        tfrecord_output_dir / test_output_dir
     ]
+    for split_dir in output_splits:
+        os.makedirs(split_dir, exist_ok=True)
 
     with open(amazon_dataset_path / metadata_file, 'r') as file:
         metadata = json.load(file)
@@ -176,35 +196,55 @@ def main(
     user_features_cardinalities = feature_cardinalities[:number_of_user_features]
     item_features_cardinalities = feature_cardinalities[number_of_user_features:]
 
-    create_default_feature_spec(user_features_cardinalities, item_features_cardinalities, max_seq_len,
-                                tfrecord_output_dir, train_output_file, test_output_file)
+    feature_spec = FeatureSpec.get_default_feature_spec(user_features_cardinalities, item_features_cardinalities, max_seq_len)
 
     number_of_item_features = len(item_features_cardinalities)
     sequential_data_start = 1 + number_of_user_features + number_of_item_features
     all_feature_names = FeatureSpec.get_default_features_names(number_of_user_features, number_of_item_features)
-    prepare_record_function = partial(prepare_record, all_feature_names=all_feature_names,
-                                      sequential_data_start=sequential_data_start)
+
+    prebatch_per_split = [prebatch_train_size, prebatch_test_size]
+    parts_per_split = [train_parts, test_parts]
+    mappings = [TRAIN_MAPPING, TEST_MAPPING]
+
+    for mapping, input_dir, output_dir, output_parts, prebatch_size in zip(mappings, input_splits, output_splits, parts_per_split, prebatch_per_split):
+
+        prebatch = prebatch_size > 0
+        prepare_record_function = partial(prepare_record, all_feature_names=all_feature_names,
+                                        sequential_data_start=sequential_data_start, prebatch=prebatch)
+        save_records_function = partial(save_records, base_output_path=tfrecord_output_dir, feature_spec=feature_spec, mapping=mapping)
+
+        logging.info(f"Started conversion, will output to {output_dir}")
+
+        df = pd.read_parquet(input_dir, engine='pyarrow')
+
+        logging.info("Parquet loaded")
+
+        if prebatch:
+            remainder = None
+            df['batch_index'] = df.index // prebatch_size
+            df = df.groupby('batch_index').agg(list)
+            if len(df.iloc[-1, 0]) < prebatch_size:
+                remainder = df[-1:].to_dict('records')[0]
+                remainder = prepare_record_function(remainder)
 
-    for input_dir, output_file in zip(input_splits, output_splits):
+                df = df[:-1]
 
-        files = input_dir.glob("part.*.parquet")
-        def num_order(p): return int(p.name.split(".")[1])
-        paths = sorted(files, key=num_order)
+            logging.info("Prebatching applied")
 
-        logging.info(f"Started conversion, will output to {output_file}")
+        df = df.to_dict('records')
+        with multiprocessing.Pool(n_proc) as pool:
+            records = pool.map(prepare_record_function, df)
 
-        with tf.io.TFRecordWriter(str(output_file)) as file_writer:
-            with multiprocessing.Pool(n_proc) as pool:
-                for path in paths:
-                    df = pd.read_parquet(path)
+        logging.info("Records created")
 
-                    zipped_data = process_chunk(df, sequential_data_start)
+        records = np.array_split(records, output_parts)
+        for i, records_part in enumerate(records):
+            if len(records_part) > 0:
+                save_records_function(output_dir / f'part_{i}.tfrecord', records_part)
 
-                    records = pool.map(prepare_record_function, zipped_data)
-                    for record_bytes in records:
-                        file_writer.write(record_bytes)
+        if prebatch and remainder is not None:
+            save_records_function(output_dir / REMAINDER_FILENAME, [remainder])
 
-                    logging.info(f"Processed {path}")
+    feature_spec.to_yaml(tfrecord_output_dir / 'feature_spec.yaml')
 
 
 if __name__ == "__main__":
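
The prebatching applied in `main()` above groups consecutive rows into fixed-size batches and saves an incomplete trailing batch as a separate remainder. A minimal pure-Python sketch of that grouping (pandas-free; the helper name and row data are hypothetical):

```python
def prebatch(rows, prebatch_size):
    """Group rows into fixed-size batches; return (full_batches, remainder)."""
    batches = [rows[i:i + prebatch_size] for i in range(0, len(rows), prebatch_size)]
    remainder = None
    if batches and len(batches[-1]) < prebatch_size:
        # An incomplete trailing batch is split off and stored separately,
        # mirroring the REMAINDER_FILENAME record written above.
        remainder = batches.pop()
    return batches, remainder

batches, remainder = prebatch(list(range(10)), prebatch_size=4)
# batches -> [[0, 1, 2, 3], [4, 5, 6, 7]], remainder -> [8, 9]
```

When the row count divides evenly, no remainder is produced and only full batches are written.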

+ 9 - 7
TensorFlow2/Recommendation/SIM/preprocessing/sim_preprocessing.py

@@ -25,7 +25,7 @@ import dask_cudf
 import rmm
 
 from preprocessing.io import load_metadata, load_review_data, save_metadata
-from preprocessing.ops import ExplodeSequence, add_negative_sequence, list_slice
+from preprocessing.ops import ExplodeSequence, add_negative_sequence, list_slice, slice_and_pad_left
 
 DASK_TRAIN_DATASET_CHUNKSIZE = 15_000
 TRAIN_DATA_DIR = "train"
@@ -179,11 +179,12 @@ def add_negative_sampling(df: cudf.DataFrame, sampling_df: cudf.DataFrame) -> cu
     return df
 
 
-def slice_sequences(df: cudf.DataFrame, max_elements: int) -> cudf.DataFrame:
-    df["item_sequence"] = list_slice(df["item_sequence"], -max_elements)
-    df["cat_sequence"] = list_slice(df["cat_sequence"], -max_elements)
-    df["neg_item_sequence"] = list_slice(df["neg_item_sequence"], -max_elements)
-    df["neg_cat_sequence"] = list_slice(df["neg_cat_sequence"], -max_elements)
+def pad_with_zeros(df: cudf.DataFrame, max_elements: int) -> cudf.DataFrame:
+    df["item_sequence"] = slice_and_pad_left(df["item_sequence"], max_elements)
+    df["cat_sequence"] = slice_and_pad_left(df["cat_sequence"], max_elements)
+    df["neg_item_sequence"] = slice_and_pad_left(df["neg_item_sequence"], max_elements)
+    df["neg_cat_sequence"] = slice_and_pad_left(df["neg_cat_sequence"], max_elements)
+
     return df
 
 
@@ -202,6 +203,7 @@ def create_train_dataset(
 
         df = explode_sequence(df, min_elements, max_elements)
         df = add_negative_sampling(df, sampling_df)
+        df = pad_with_zeros(df, max_elements)
         df = df.sort_values(by=["uid"])
         df.reset_index(drop=True, inplace=True)
         df = df[list(OUTPUT_META)]
@@ -222,7 +224,7 @@ def create_test_dataset(
     output_path: str,
 ) -> None:
     df = add_negative_sampling(df, sampling_df)
-    df = slice_sequences(df, max_elements)
+    df = pad_with_zeros(df, max_elements)
     df = df.sort_values(by=["uid"])
     df.reset_index(drop=True, inplace=True)
     df = df[list(OUTPUT_META)]
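
The new `pad_with_zeros()` replaces plain slicing with `slice_and_pad_left` from `preprocessing/ops.py`, which is not shown in this diff. Assuming it keeps the last `max_elements` entries of each sequence and left-pads shorter sequences with zeros, a pure-Python sketch of that semantics on a single list:

```python
def slice_and_pad_left(seq, max_elements, pad_value=0):
    """Keep the last `max_elements` items and left-pad to a fixed length."""
    sliced = seq[-max_elements:]
    return [pad_value] * (max_elements - len(sliced)) + sliced

slice_and_pad_left([5, 6, 7], 5)           # -> [0, 0, 5, 6, 7]
slice_and_pad_left([1, 2, 3, 4, 5, 6], 5)  # -> [2, 3, 4, 5, 6]
```

Padding at preprocessing time is what lets the dataloader parse fixed-shape `FixedLenFeature`s instead of ragged tensors.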

+ 5 - 1
TensorFlow2/Recommendation/SIM/scripts/run_model.sh

@@ -30,6 +30,8 @@ Usage: bash scripts/run_model.sh
 --log_filename          Name of output log file within results_dir. Default: log.json.
 --save_checkpoint_path  Path to output checkpoint after training.
 --load_checkpoint_path  Path from which to restore checkpoint for inference or suspend/resume training.
+--prebatch_train_size  Prebatch size used for train data, if the dataset was prebatched in preprocessing.
+--prebatch_test_size   Prebatch size used for test data, if the dataset was prebatched in preprocessing.
 EOF
 }
 
@@ -82,10 +84,12 @@ results_dir_option=$(get_option_or_use_default --results_dir $results_dir)
 log_filename_option=$(get_option_or_use_default --log_filename $log_filename)
 save_checkpoint_path_option=$(get_option_or_use_default --save_checkpoint_path $save_checkpoint_path)
 load_checkpoint_path_option=$(get_option_or_use_default --load_checkpoint_path $load_checkpoint_path)
+prebatch_train_size_option=$(get_option_or_use_default --prebatch_train_size $prebatch_train_size)
+prebatch_test_size_option=$(get_option_or_use_default --prebatch_test_size $prebatch_test_size)
 
 command="mpiexec --allow-run-as-root --bind-to socket -np ${gpus} python main.py --dataset_dir ${data_path} --drop_remainder ${epochs_option} 
 ${xla_arg} ${amp_arg} ${benchmark_arg} ${mode_option} ${benchmark_steps_option} ${batch_size_option} ${results_dir_option} ${log_filename_option}
-${save_checkpoint_path_option} ${load_checkpoint_path_option}"
+${save_checkpoint_path_option} ${load_checkpoint_path_option} ${prebatch_train_size_option} ${prebatch_test_size_option}"
 
 printf "[INFO] Running:\n%s\n" "${command}"
 # run

+ 62 - 21
TensorFlow2/Recommendation/SIM/sim/data/dataloader.py

@@ -18,12 +18,7 @@ from functools import partial
 import tensorflow as tf
 
 from sim.data.defaults import (DIMENSIONS_SELECTOR, LABEL_CHANNEL, NEGATIVE_HISTORY_CHANNEL, POSITIVE_HISTORY_CHANNEL,
-                               TARGET_ITEM_FEATURES_CHANNEL, USER_FEATURES_CHANNEL)
-
-
-def _pad_ragged_infront(x, pad_length):
-    x = tf.reverse(x, axis=[1])
-    return tf.reverse(x.to_tensor(shape=(None, pad_length)), axis=[1])
+                               TARGET_ITEM_FEATURES_CHANNEL, USER_FEATURES_CHANNEL, REMAINDER_FILENAME)
 
 
 def _remap_column_values_tfrecord(sample, feature_spec, long_seq_length):
@@ -32,20 +27,20 @@ def _remap_column_values_tfrecord(sample, feature_spec, long_seq_length):
     features = feature_spec.feature_spec
 
     user_features = {
-        f_name: sample[f_name] for f_name in channel_spec[USER_FEATURES_CHANNEL]
+        f_name: tf.reshape(sample[f_name], [-1]) for f_name in channel_spec[USER_FEATURES_CHANNEL]
     }
 
     target_item_features = {
-        f_name: sample[f_name] for f_name in channel_spec[TARGET_ITEM_FEATURES_CHANNEL]
+        f_name: tf.reshape(sample[f_name], [-1]) for f_name in channel_spec[TARGET_ITEM_FEATURES_CHANNEL]
     }
 
     padded_positive = {
-        f_name: _pad_ragged_infront(sample[f_name], features[f_name][DIMENSIONS_SELECTOR][0])
+        f_name: tf.reshape(sample[f_name], [-1, features[f_name][DIMENSIONS_SELECTOR][0]])
         for f_name in channel_spec[POSITIVE_HISTORY_CHANNEL]
     }
 
     padded_negative = {
-        f_name: _pad_ragged_infront(sample[f_name], features[f_name][DIMENSIONS_SELECTOR][0])
+        f_name: tf.reshape(sample[f_name], [-1, features[f_name][DIMENSIONS_SELECTOR][0]])
         for f_name in channel_spec[NEGATIVE_HISTORY_CHANNEL]
     }
 
@@ -70,7 +65,7 @@ def _remap_column_values_tfrecord(sample, feature_spec, long_seq_length):
     short_sequence_mask = history_mask[:, long_seq_length:]
 
     label_name = channel_spec[LABEL_CHANNEL][0]
-    target = sample[label_name]
+    target = tf.reshape(sample[label_name], [-1])
 
     return {
         "user_features": user_features,
@@ -84,6 +79,14 @@ def _remap_column_values_tfrecord(sample, feature_spec, long_seq_length):
     }, target
 
 
+def split_prebatch(sample, split_into):
+    res = {}
+    for f_name, val in sample.items():
+        res[f_name] = tf.reshape(val, [split_into, -1])
+
+    return tf.data.Dataset.from_tensor_slices(res)
+
+
 def get_dataloader_tfrecord(
     file_paths,
     feature_spec,
@@ -94,36 +97,74 @@ def get_dataloader_tfrecord(
     drop_remainder=False,
     repeat_count=0,
     prefetch_buffer_size=90,
-    disable_cache=False):
+    num_parallel_calls=None,
+    disable_cache=False,
+    prebatch_size=0,
+):
 
     features = feature_spec.feature_spec
+    prebatched = prebatch_size > 0
+
+    remainder_file = None
+    if file_paths[-1].name == REMAINDER_FILENAME:
+        remainder_file = file_paths[-1:]
+        file_paths = file_paths[:-1]
 
     tf_feature_spec = {}
     for name, feature in features.items():
         dimensions = feature.get(DIMENSIONS_SELECTOR)
         if dimensions is None:
-            tf_feature_spec[name] = tf.io.FixedLenFeature([], tf.int64)
-        else:
-            tf_feature_spec[name] = tf.io.RaggedFeature(tf.int64)
+            dimensions = [1] if prebatched else []
+
+        if prebatched:
+            dimensions = dimensions.copy()
+            dimensions[0] *= prebatch_size
 
-    num_cpus = multiprocessing.cpu_count()
+        tf_feature_spec[name] = tf.io.FixedLenFeature(dimensions, tf.int64)
 
-    dataset = tf.data.TFRecordDataset(file_paths)
+    if num_parallel_calls is None:
+        num_cpus = multiprocessing.cpu_count()
+        num_parallel_calls = 4 * num_cpus // num_gpus
+
+    dataset = tf.data.TFRecordDataset(file_paths, num_parallel_reads=num_parallel_calls)
 
     dataset = dataset.shard(num_gpus, id)
 
-    dataset = dataset.apply(
-        tf.data.experimental.dense_to_ragged_batch(batch_size, drop_remainder=drop_remainder)
+    splitting_function = None
+    if prebatched:
+        if batch_size >= prebatch_size:
+            batch_size = batch_size // prebatch_size
+        else:
+            split_into = prebatch_size // batch_size
+            splitting_function = partial(split_prebatch, split_into=split_into)
+            batch_size = 1
+
+    dataset = dataset.batch(
+        batch_size, drop_remainder=drop_remainder, num_parallel_calls=num_parallel_calls
     )
 
     dataset = dataset.map(
         map_func=partial(tf.io.parse_example, features=tf_feature_spec),
-        num_parallel_calls=num_cpus//num_gpus
+        num_parallel_calls=num_parallel_calls
     )
 
+    if splitting_function is not None:
+        dataset = dataset.flat_map(splitting_function)
+
+    if not drop_remainder and id == 0 and remainder_file is not None:
+        tf_feature_spec_remainder = {
+            name: tf.io.RaggedFeature(tf.int64) for name in tf_feature_spec
+        }
+        remainder = tf.data.TFRecordDataset(remainder_file)
+        remainder = remainder.map(
+            map_func=partial(tf.io.parse_example, features=tf_feature_spec_remainder)
+        )
+
+        dataset = dataset.concatenate(remainder)
+
     dataset = dataset.map(
         map_func=partial(_remap_column_values_tfrecord, feature_spec=feature_spec, long_seq_length=long_seq_length),
-        num_parallel_calls=num_cpus//num_gpus
+        num_parallel_calls=num_parallel_calls
     )
 
     if repeat_count > 0:

+ 2 - 0
TensorFlow2/Recommendation/SIM/sim/data/defaults.py

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+REMAINDER_FILENAME = 'remainder.tfrecord'
+
 USER_FEATURES_CHANNEL = 'user_features'
 TARGET_ITEM_FEATURES_CHANNEL = 'target_item_features'
 POSITIVE_HISTORY_CHANNEL = 'positive_history'

+ 3 - 4
TensorFlow2/Recommendation/SIM/sim/data/feature_spec.py

@@ -72,8 +72,7 @@ class FeatureSpec:
         return [label_feature_name] + user_features_names + item_features_names
 
     @staticmethod
-    def get_default_feature_spec(user_features_cardinalities, item_features_cardinalities,
-                                 max_seq_len, train_output, test_output):
+    def get_default_feature_spec(user_features_cardinalities, item_features_cardinalities, max_seq_len):
 
         number_of_user_features = len(user_features_cardinalities)
         number_of_item_features = len(item_features_cardinalities)
@@ -127,9 +126,9 @@ class FeatureSpec:
                 {
                     'type': 'tfrecord',
                     'features': all_features_names,
-                    'files': [filepath.name]
+                    'files': []
                 }
-            ] for split, filepath in zip([TRAIN_MAPPING, TEST_MAPPING], [train_output, test_output])
+            ] for split in [TRAIN_MAPPING, TEST_MAPPING]
         }
 
         return FeatureSpec(feature_spec=feature_spec, channel_spec=channel_spec, source_spec=source_spec)