Jelajahi Sumber

[WIDENDEEP/TF2] Initial release with multihot and triton support

Tomasz Jakubek 3 tahun lalu
induk
melakukan
146fa3c86b
100 mengubah file dengan 9163 tambahan dan 3281 penghapusan
  1. 1 1
      TensorFlow2/Recommendation/WideAndDeep/.dockerignore
  2. 1 1
      TensorFlow2/Recommendation/WideAndDeep/.gitignore
  3. 3 3
      TensorFlow2/Recommendation/WideAndDeep/Dockerfile
  4. 1 1
      TensorFlow2/Recommendation/WideAndDeep/LICENSE
  5. 158 158
      TensorFlow2/Recommendation/WideAndDeep/README.md
  6. 18 4
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/dataloader.py
  7. 19 4
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/features.py
  8. 2 2
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/preproc.py
  9. 4 4
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/arguments.py
  10. 1 42
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/feature_description.py
  11. 2 2
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/setup.py
  12. 132 50
      TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py
  13. 1 1
      TensorFlow2/Recommendation/WideAndDeep/hvd_wrapper.sh
  14. 543 376
      TensorFlow2/Recommendation/WideAndDeep/img/amp_influence.svg
  15. 420 377
      TensorFlow2/Recommendation/WideAndDeep/img/learning_curve.svg
  16. 0 1502
      TensorFlow2/Recommendation/WideAndDeep/img/model.svg
  17. 445 522
      TensorFlow2/Recommendation/WideAndDeep/img/training_stability.svg
  18. 3 2
      TensorFlow2/Recommendation/WideAndDeep/main.py
  19. 14 0
      TensorFlow2/Recommendation/WideAndDeep/requirements.txt
  20. 1 1
      TensorFlow2/Recommendation/WideAndDeep/scripts/evaluating_benchmark.sh
  21. 1 1
      TensorFlow2/Recommendation/WideAndDeep/scripts/preproc.sh
  22. 1 1
      TensorFlow2/Recommendation/WideAndDeep/scripts/training_benchmark.sh
  23. 1 1
      TensorFlow2/Recommendation/WideAndDeep/scripts/training_full.sh
  24. 379 0
      TensorFlow2/Recommendation/WideAndDeep/trainer/model/layers.py
  25. 35 38
      TensorFlow2/Recommendation/WideAndDeep/trainer/model/widedeep.py
  26. 23 4
      TensorFlow2/Recommendation/WideAndDeep/trainer/run.py
  27. 17 6
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/arguments.py
  28. 5 5
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/benchmark.py
  29. 32 27
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/evaluator.py
  30. 473 97
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/gpu_affinity.py
  31. 5 7
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/schedulers.py
  32. 19 4
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/setup.py
  33. 57 37
      TensorFlow2/Recommendation/WideAndDeep/trainer/utils/trainer.py
  34. 46 0
      TensorFlow2/Recommendation/WideAndDeep/triton/Dockerfile
  35. 1321 0
      TensorFlow2/Recommendation/WideAndDeep/triton/README.md
  36. 97 0
      TensorFlow2/Recommendation/WideAndDeep/triton/calculate_metrics.py
  37. 63 0
      TensorFlow2/Recommendation/WideAndDeep/triton/dataloader.py
  38. 13 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/__init__.py
  39. 136 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/args.py
  40. 270 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/core.py
  41. 253 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/dump.py
  42. 82 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/extensions.py
  43. 13 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/__init__.py
  44. 237 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/onnx.py
  45. 232 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/tensorrt.py
  46. 462 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/tf.py
  47. 129 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/utils.py
  48. 61 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/report.py
  49. 14 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/__init__.py
  50. 51 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/base.py
  51. 238 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/grpc.py
  52. 190 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/http.py
  53. 78 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/runner.py
  54. 14 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/__init__.py
  55. 14 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/__init__.py
  56. 39 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/exceptions.py
  57. 89 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/model_analyzer.py
  58. 113 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/model_analyzer_config.py
  59. 296 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/runner.py
  60. 15 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/__init__.py
  61. 41 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/exceptions.py
  62. 159 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/perf_analyzer.py
  63. 216 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/perf_config.py
  64. 183 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/runner.py
  65. 99 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/warmup.py
  66. 117 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/runner.py
  67. 64 0
      TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/utils.py
  68. 156 0
      TensorFlow2/Recommendation/WideAndDeep/triton/export_model.py
  69. 89 0
      TensorFlow2/Recommendation/WideAndDeep/triton/metrics.py
  70. 73 0
      TensorFlow2/Recommendation/WideAndDeep/triton/model.py
  71. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png
  72. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png
  73. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png
  74. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png
  75. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png
  76. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png
  77. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png
  78. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png
  79. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png
  80. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png
  81. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png
  82. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png
  83. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png
  84. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png
  85. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png
  86. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png
  87. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png
  88. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png
  89. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png
  90. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png
  91. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png
  92. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png
  93. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png
  94. TEMPAT SAMPAH
      TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png
  95. 25 0
      TensorFlow2/Recommendation/WideAndDeep/triton/requirements.txt
  96. 140 0
      TensorFlow2/Recommendation/WideAndDeep/triton/run_inference_on_fw.py
  97. 146 0
      TensorFlow2/Recommendation/WideAndDeep/triton/run_inference_on_triton.py
  98. 196 0
      TensorFlow2/Recommendation/WideAndDeep/triton/run_performance_on_triton.py
  99. 13 0
      TensorFlow2/Recommendation/WideAndDeep/triton/runner/__init__.py
  100. 63 0
      TensorFlow2/Recommendation/WideAndDeep/triton/runner/__main__.py

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/.dockerignore

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/.gitignore

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 3 - 3
TensorFlow2/Recommendation/WideAndDeep/Dockerfile

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/merlin/merlin-tensorflow-training:21.09
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/merlin/merlin-tensorflow-training:22.03
 
 FROM ${FROM_IMAGE_NAME}
 
@@ -27,4 +27,4 @@ WORKDIR  /wd
 
 COPY . .
 
-RUN cd /nvtabular && git checkout v0.6.1
+RUN cd /nvtabular && git checkout v0.7.1

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/LICENSE

@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2021 NVIDIA Corporation
+   Copyright [yyyy] [name of copyright owner]
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

+ 158 - 158
TensorFlow2/Recommendation/WideAndDeep/README.md

@@ -56,7 +56,7 @@ Recommendation systems drive engagement on many of the most popular online platf
 Google's [Wide & Deep Learning for Recommender Systems](https://arxiv.org/abs/1606.07792) has emerged as a popular model for Click Through Rate (CTR) prediction tasks thanks to its power of generalization (deep part) and memorization (wide part).
 The differences between this Wide & Deep Recommender Model and the model from the paper is the size of the deep part of the model. Originally, in Google's paper, the fully connected part was three layers of 1024, 512, and 256 neurons. Our model consists of 5 layers each of 1024 neurons.
 
-This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and NVIDIA Ampere GPU architectures. Therefore, researchers can get results 4.5 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3.5 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
 
 ### Model architecture
 
@@ -68,10 +68,6 @@ Wide & Deep refers to a class of networks that use the output of two parts worki
 Figure 1. The architecture of the Wide & Deep model.</a>
 </p>
 
-
-
-
-
 ### Applications and dataset
 
 As a reference dataset, we used a subset of [the features engineered](https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine) by the 19th place finisher in the [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). This competition challenged competitors to predict the likelihood with which a particular ad on a website's display would be clicked on. Competitors were given information about the user, display, document, and ad in order to train their models. More information can be found [here](https://www.kaggle.com/c/outbrain-click-prediction/data).
@@ -83,23 +79,25 @@ The Outbrain Dataset is preprocessed in order to get features input to the model
 Features:
 - Request Level:
     * 5 scalar numeric features `dtype=float32`
-    * 8 categorical features `dtype=int32`
-    * 8 trainable embeddings of (dimension, cardinality of categorical variable): (128,300000), (19,4), (128,100000), (64,4000), (64,1000), (64,2500), (64,300), (64,2000)
-    * 8  trainable embeddings for wide part of size 1 (serving as an embedding from the categorical to scalar space for input to the wide portion of the model)
+    * 8 one-hot categorical features `dtype=int32`
+    * 3 multi-hot categorical features `dtype=int32`
+    * 11 trainable embeddings of (dimension, cardinality of categorical variable, hotness for multi-hot): \
+      (128,300000), (19,4), (128,100000), (64,4000), (64,1000), (64,2500), (64,300), (64,2000), (64, 350, 3), (64, 10000, 3), (64, 100, 3)
+    * 11 trainable embeddings for wide part of size 1 (serving as an embedding from the categorical to scalar space for input to the wide portion of the model)
 
 - Item Level:
     * 8 scalar numeric features `dtype=float32`
-    * 5 categorical features `dtype=int32`
+    * 5 one-hot categorical features `dtype=int32`
     * 5 trainable embeddings of  (dimension, cardinality of categorical variable): (128,250000), (64,2500), (64,4000), (64,1000), (128,5000)
     * 5 trainable embeddings for wide part of size 1 (working as trainable one-hot embeddings)
 
 Features describe both the user (Request Level features) and Item (Item Level Features).
 
 - Model:
-    * Input dimension is 26 (13 categorical and 13 numerical features)
-    * Total embedding dimension is 1043
+    * Input dimension is 29 (16 categorical and 13 numerical features)
+    * Total embedding dimension is 1235
     * 5 hidden layers each with size 1024
-    * Total number of model parameter is ~90M
+    * Total number of model parameter is ~92M
     * Output dimension is 1 (`y` is the probability of click given Request-level and Item-level features)
     * Loss function: Binary Crossentropy
 
@@ -143,7 +141,7 @@ For more information:
 * Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
 * How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
 
-For information on the influence of mixed precision training on model accuracy in train and inference, go to [Training accuracy results](Training-accuracy-results).
+For information on the influence of mixed precision training on model accuracy in train and inference, go to [Training accuracy results](#training-accuracy-results).
 
 #### Enabling mixed precision
 
@@ -174,7 +172,7 @@ The following section lists the requirements that you need to meet in order to s
 
 This repository contains Dockerfile which extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 - [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-- [21.09 Merlin Tensorflow Training](https://ngc.nvidia.com/catalog/containers/nvidia:merlin:merlin-tensorflow-training) NGC container
+- [22.03 Merlin Tensorflow Training](https://ngc.nvidia.com/catalog/containers/nvidia:merlin:merlin-tensorflow-training) NGC container
 
 Supported GPUs:
 - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
@@ -205,7 +203,7 @@ cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
 
 3. Download the Outbrain dataset.
 
-The Outbrain dataset can be downloaded from Kaggle (requires Kaggle account). Unzip the downloaded archive (for example, to `/raid/outbrain/orig`) and set the `HOST_OUTBRAIN_PATH` variable to the parent directory:
+The Outbrain dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/outbrain-click-prediction/) (requires Kaggle account). Unzip the downloaded archive into `orig` directory (for example, to `/raid/outbrain/orig`) and set the `HOST_OUTBRAIN_PATH` variable to the parent directory:
 ```
 HOST_OUTBRAIN_PATH=/raid/outbrain
 ```
@@ -262,7 +260,7 @@ If you want to run validation or evaluation, you can either:
 * use the checkpoint obtained from the training commands above, or
 * download the pretrained checkpoint from NGC.
 
-In order to download the checkpoint from NGC, visit [ngc.nvidia.com](https://ngc.nvidia.com) website and browse the available models. Download the checkpoint files and unzip them to some path, for example, to `$HOST_OUTBRAIN_PATH/checkpoints/` (which is the default path for storing the checkpoints during training). The checkpoint requires around 700MB disk space.
+In order to download the checkpoint from NGC, visit [ngc.nvidia.com](https://catalog.ngc.nvidia.com/orgs/nvidia/models/widedeep_tf2_amp_base_128k_nvtabular) website and browse the available models. Download the checkpoint files and unzip them to some path, for example, to `$HOST_OUTBRAIN_PATH/checkpoints/` (which is the default path for storing the checkpoints during training). The checkpoint requires around 700MB disk space.
 
 8. Start validation/evaluation.
 In order to validate the checkpoint on the evaluation set, run the `main.py` script with the `--evaluate` and `--use_checkpoint` flags.
@@ -271,7 +269,7 @@ In order to validate the checkpoint on the evaluation set, run the `main.py` scr
 horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --evaluate --use_checkpoint
 ```
 
-Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark yours performance to [Training and evaluation performance benchmark](#training-and-evaluation-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
+Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance to [Training and evaluation performance benchmark](#training-and-evaluation-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
 
 ## Advanced
 
@@ -310,11 +308,12 @@ These are model parameters in the `main.py` script:
 |training parameters|--deep_warmup_epochs DEEP_WARMUP_EPOCHS|Number of learning rate warmup epochs for deep model | 6
 |model construction|--deep_hidden_units DEEP_HIDDEN_UNITS [DEEP_HIDDEN_UNITS ...]|Hidden units per layer for deep model, separated by spaces|[1024, 1024, 1024, 1024, 1024]
 |model construction|--deep_dropout DEEP_DROPOUT|Dropout regularization for deep model|0.1
+|model construction|--combiner {mean,sum}|Type of aggregation used for multi hot categorical features|sum
 |run mode parameters|--evaluate|Only perform an evaluation on the validation dataset, don't train | False
 |run mode parameters|--benchmark|Run training or evaluation benchmark to collect performance metrics | False
 |run mode parameters|--benchmark_warmup_steps BENCHMARK_WARMUP_STEPS|Number of warmup steps before start of the benchmark | 500
 |run mode parameters|--benchmark_steps BENCHMARK_STEPS|Number of steps for performance benchmark | 1000
-|run mode parameters|--affinity{socket,single,single_unique,<br>socket_unique_interleaved,<br>socket_unique_continuous,disabled}|Type of CPU affinity | socket_unique_interleaved
+|run mode parameters|--affinity {all,single,single_unique,<br>unique_interleaved,unique_contiguous,disabled}|Type of CPU affinity | unique_interleaved
 
 
 ### Command-line options
@@ -339,25 +338,26 @@ The original data is stored in several separate files:
 * `promoted_content.csv` - metadata about the ads
 * `document_meta.csv`, `document_topics.csv`, `document_entities.csv`, `document_categories.csv` - metadata about the documents
 
-During the preprocessing stage, the data is transformed into 87M rows tabular data of 26 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. Splitting into train and eval is done in this way so that random 80% of daily events for the first 10 days of the dataset form a training set and remaining part (20% of events daily for the first 10 days and all events in the last two days) form an evaluation set. Eventually the dataset is saved in NVTabular parquet format.
+During the preprocessing stage, the data is transformed into 87M rows tabular data of 29 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. Splitting into train and eval is done in this way so that random 80% of daily events for the first 10 days of the dataset form a training set and remaining part (20% of events daily for the first 10 days and all events in the last two days) form an evaluation set. Eventually the dataset is saved in NVTabular parquet format.
 
 #### Dataset preprocessing
 
-Dataset preprocessing aims in creating in total 26 features: 13 categorical and 13 numerical. These features are obtained from the original Outbrain dataset in [NVTabular](https://nvidia.github.io/NVTabular/v0.6.1/index.html) preprocessing.
+Dataset preprocessing aims in creating in total 29 features: 16 categorical and 13 numerical. These features are obtained from the original Outbrain dataset in [NVTabular](https://nvidia-merlin.github.io/NVTabular/v0.7.1/Introduction.html) preprocessing.
 
 ##### NVTabular GPU preprocessing
 
 The NVTabular dataset is preprocessed using the script provided in `data/outbrain/nvtabular`. The workflow consists of:
 * separating out the validation set for cross-validation
-* filling missing data with themode, median, or imputed values most frequent value
+* filling missing data with the mode, median, or most frequent value
 * joining click data, ad metadata, and document category, topic, and entity tables to create an enriched table
 * computing 7 click-through rates (CTR) for ads grouped by 7 different contexts
 * computing cosine similarity between the attributes of the landing page and the ad to be featured on the page
+* extracting multi-hot categorical values
 * math transformations of the numeric features (logarithmic, normalization)
 * categorifying data using hash-bucketing
 * storing the result in a Parquet format
 
-Most of the code describing operations in this workflow are in `data/outbrain/nvtabular/utils/workflow.py` and leverage NVTabular v0.6.1. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library.
+Most of the code describing operations in this workflow are in `data/outbrain/nvtabular/utils/workflow.py` and leverage NVTabular v0.7.1. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library.
 The NVTabular Outbrain workflow has been successfully tested on DGX-1 V100 and DGX A100 for single and multigpu preprocessing.
 
 For more information about NVTabular, refer to the [NVTabular documentation](https://github.com/NVIDIA/NVTabular).
@@ -427,13 +427,12 @@ The following sections provide details on how we achieved our performance and ac
 
 Our results were obtained by running the `main.py` training script in the TensorFlow2 NGC container on NVIDIA DGX A100 with (8x A100 80GB) GPUs.
 
-| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12) | Accuracy - mixed precision (MAP@12) |  Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
-| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- |  ----------------------------------------------- |
-1|131072|Yes|0.65656|0.65654|13.40|9.48|1.41
-1|131072|No |0.65662|0.65656|17.75|13.38|1.33
-8|16384|Yes |0.65672|0.65665|4.82|4.50|1.07
-8|16384|No  |0.65671|0.65655|5.71|5.72|1.00
-
+| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12) | Accuracy - mixed precision (MAP@12) | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- | ------- | ------- |
+| 1   | 131072           | Yes  | 0.65728        | 0.65728        | 17.05   | 13.12   | 1.30    |
+| 1   | 131072           | No   | 0.65734        | 0.65732        | 21.75   | 17.50   | 1.24    |
+| 8   | 16384            | Yes  | 0.65754        | 0.65751        | 6.48    | 6.33    | 1.02    |
+| 8   | 16384            | No   | 0.65750        | 0.65754        | 8.07    | 7.87    | 1.03    |
 
 To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
@@ -442,12 +441,12 @@ To achieve the same results, follow the steps in the [Quick Start Guide](#quick-
 Our results were obtained by running the main.py training script in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
 
 
-| GPUs | Batch size / GPU | XLA | Accuracy - FP32 (MAP@12) | Accuracy - mixed precision (MAP@12) |  Time to train - FP32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (FP32 to mixed precision) |
-| ---- | ---------------- | --- | --------------|---|------- |  ----------------------------------------- | ----------------------------------------------- |
-1|131072|Yes |0.65658|0.65664|62.89|18.65|3.37
-1|131072|No  |0.65662|0.65658|71.53|25.18|2.84
-8|16384|Yes  |0.65668|0.65655|12.21|8.89|1.37
-8|16384|No   |0.65665|0.65654|14.38|7.17|2.01
+| GPUs | Batch size / GPU | XLA | Accuracy - FP32 (MAP@12) | Accuracy - mixed precision (MAP@12) | Time to train - FP32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (FP32 to mixed precision) |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- | ------- | ------- |
+| 1   | 131072           | Yes  | 0.65736        | 0.65731        | 72.38   | 24.60   | 2.94    |
+| 1   | 131072           | No   | 0.65736        | 0.65735        | 80.53   | 31.60   | 2.55    |
+| 8   | 16384            | Yes  | 0.65751        | 0.65752        | 15.62   | 10.13   | 1.54    |
+| 8   | 16384            | No   | 0.65749        | 0.65752        | 18.37   | 12.45   | 1.48    |
 
 
 To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -466,7 +465,7 @@ The plot represents MAP@12 in a function of steps (step is single batch) during
 
 ##### Training stability test
 
-Training of the model is stable for multiple configurations achieving the standard deviation of 10e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9140 training steps (20 epochs, 457 batches in each epoch, every batch containing 131072), starting from 20 different initial random seeds for each setup. The training was performed in the 21.09 Merlin Tensorflow Training NGC container on NVIDIA DGX A100 80GB and DGX-1 32GB machines with and without mixed precision enabled, with and without XLA enabled for NVTabular generated dataset. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. In columns there is single vs 8 GPU training, in rows DGX A100 and DGX-1 V100.
+Training of the model is stable for multiple configurations achieving the standard deviation of 1e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9140 training steps (20 epochs, 457 batches in each epoch, every batch containing 131072), starting from 20 different initial random seeds for each setup. The training was performed in the 22.03 Merlin Tensorflow Training NGC container on NVIDIA DGX A100 80GB and DGX-1 32GB machines with and without mixed precision enabled, with and without XLA enabled for NVTabular generated dataset. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. In columns there is single vs 8 GPU training, in rows DGX A100 and DGX-1 V100.
 
 <p align="center">
   <img width="100%" src="./img/training_stability.svg" />
@@ -480,24 +479,24 @@ Training stability was also compared in terms of point statistics for MAP@12 dis
 <details>
 <summary>Full tabular data for training stability tests</summary>
 
-| | GPUs | Precicision | XLA | Mean | Std | Min | Max | 
-| -------- | --- |  --------- | ---- | ------ | ------ | ------ | ------ |
-|DGX A100|1|TF32|Yes   |0.65656|0.00016|0.6563|0.6569
-|DGX A100|1|TF32|No    |0.65662|0.00013|0.6563|0.6568
-|DGX A100|1|AMP|Yes    |0.65654|0.00010|0.6563|0.6567
-|DGX A100|1|AMP|No     |0.65656|0.00011|0.6564|0.6568
-|DGX A100|8|TF32|Yes   |0.65672|0.00012|0.6565|0.6570
-|DGX A100|8|TF32|No    |0.65671|0.00013|0.6565|0.6569
-|DGX A100|8|AMP|Yes    |0.65665|0.00014|0.6564|0.6569
-|DGX A100|8|AMP|No     |0.65655|0.00012|0.6564|0.6568
-|DGX-1 V100|1|FP32|Yes |0.65658|0.00013|0.6563|0.6568
-|DGX-1 V100|1|FP32|No  |0.65662|0.00011|0.6564|0.6568
-|DGX-1 V100|1|AMP|Yes  |0.65664|0.00011|0.6564|0.6568
-|DGX-1 V100|1|AMP|No   |0.65658|0.00011|0.6564|0.6568
-|DGX-1 V100|8|FP32|Yes |0.65668|0.00016|0.6564|0.6570
-|DGX-1 V100|8|FP32|No  |0.65665|0.00019|0.6564|0.6570
-|DGX-1 V100|8|AMP|Yes  |0.65655|0.00012|0.6563|0.6567
-|DGX-1 V100|8|AMP|No   |0.65654|0.00013|0.6563|0.6567
+|  | GPUs | Precision | XLA | Mean | Std | Min | Max |
+| ---------- | --- | ----- | ---- | -------------- | -------------- | ------------- | ------------- |
+| DGX A100   | 1   | TF32  | Yes  | 0.65728        | 0.00014        | 0.6571        | 0.6575        |
+| DGX A100   | 1   | TF32  | No   | 0.65734        | 0.00007        | 0.6572        | 0.6575        |
+| DGX A100   | 1   | AMP   | Yes  | 0.65728        | 0.00011        | 0.6571        | 0.6575        |
+| DGX A100   | 1   | AMP   | No   | 0.65732        | 0.00009        | 0.6572        | 0.6575        |
+| DGX A100   | 8   | TF32  | Yes  | 0.65754        | 0.00014        | 0.6573        | 0.6579        |
+| DGX A100   | 8   | TF32  | No   | 0.65750        | 0.00011        | 0.6573        | 0.6577        |
+| DGX A100   | 8   | AMP   | Yes  | 0.65751        | 0.00013        | 0.6573        | 0.6577        |
+| DGX A100   | 8   | AMP   | No   | 0.65754        | 0.00013        | 0.6573        | 0.6578        |
+| DGX-1 V100 | 1   | FP32  | Yes  | 0.65736        | 0.00011        | 0.6572        | 0.6576        |
+| DGX-1 V100 | 1   | FP32  | No   | 0.65736        | 0.00009        | 0.6572        | 0.6575        |
+| DGX-1 V100 | 1   | AMP   | Yes  | 0.65731        | 0.00013        | 0.6571        | 0.6576        |
+| DGX-1 V100 | 1   | AMP   | No   | 0.65735        | 0.00011        | 0.6571        | 0.6575        |
+| DGX-1 V100 | 8   | FP32  | Yes  | 0.65751        | 0.00011        | 0.6574        | 0.6578        |
+| DGX-1 V100 | 8   | FP32  | No   | 0.65749        | 0.00014        | 0.6572        | 0.6577        |
+| DGX-1 V100 | 8   | AMP   | Yes  | 0.65752        | 0.00012        | 0.6573        | 0.6578        |
+| DGX-1 V100 | 8   | AMP   | No   | 0.65752        | 0.00013        | 0.6573        | 0.6577        |
 </details>
 
 
@@ -520,16 +519,16 @@ Distribution scores for full precision training and AMP training were compared i
 <details>
 <summary>Full tabular data for AMP influence on MAP@12</summary>
 
-|              | GPUs                   |  XLA    | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
-| ------------ | ---------------------- |  ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------- | ------------------ | ---------------------------------- |
-| DGX A100   | 1    | Yes     |0.65656|0.00016|0.65654|0.00010|0.10000 (0.99999)
-| DGX A100   | 8    | Yes     |0.65672|0.00012|0.65665|0.00014|0.40000 (0.08106)
-| DGX A100   | 1    | No      |0.65662|0.00013|0.65656|0.00011|0.35000 (0.17453)
-| DGX A100   | 8    | No      |0.65671|0.00013|0.65655|0.00012|0.35000 (0.17453)
-| DGX-1 V100 | 1    | Yes     |0.65658|0.00013|0.65664|0.00011|0.25000 (0.57134)
-| DGX-1 V100 | 8    | Yes     |0.65668|0.00016|0.65655|0.00012|0.30000 (0.33559)
-| DGX-1 V100 | 1    | No      |0.65662|0.00011|0.65658|0.00011|0.20000 (0.83197)
-| DGX-1 V100 | 8    | No      |0.65665|0.00019|0.65654|0.00013|0.40000 (0.08106)
+|  | GPUs | XLA | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
+| ---------- | --- | ---- | -------------- | -------------- | -------------- | -------------- | ------------------------- |
+| DGX A100   | 1   | Yes  | 0.65728        | 0.00014        | 0.65728        | 0.00011        | 0.15000 (0.98314)         |
+| DGX A100   | 8   | Yes  | 0.65754        | 0.00014        | 0.65751        | 0.00013        | 0.10000 (0.99999)         |
+| DGX A100   | 1   | No   | 0.65734        | 0.00007        | 0.65732        | 0.00009        | 0.20000 (0.83197)         |
+| DGX A100   | 8   | No   | 0.65750        | 0.00011        | 0.65754        | 0.00013        | 0.15000 (0.98314)         |
+| DGX-1 V100 | 1   | Yes  | 0.65736        | 0.00011        | 0.65731        | 0.00013        | 0.20000 (0.83197)         |
+| DGX-1 V100 | 8   | Yes  | 0.65751        | 0.00011        | 0.65752        | 0.00012        | 0.10000 (0.99999)         |
+| DGX-1 V100 | 1   | No   | 0.65736        | 0.00009        | 0.65735        | 0.00011        | 0.05000 (1.00000)         |
+| DGX-1 V100 | 8   | No   | 0.65749        | 0.00014        | 0.65752        | 0.00013        | 0.15000 (0.98314)         |
 
 </details>
 
@@ -539,24 +538,24 @@ Distribution scores for full precision training and AMP training were compared i
 
 Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX A100 with (8x A100 80GB) GPUs. 
 
-|GPUs | Batch size / GPU | XLA | Throughput - TF32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (TF32 - mixed precision)| Strong scaling - TF32|Strong scaling - mixed precision
-| ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
-|1|131,072|Yes|2026524|3069487|1.51|1.00|1.00
-|1|131,072|No |1379960|1928375|1.40|1.00|1.00
-|8|16,384|Yes |6892010|7574174|1.10|3.40|2.47
-|8|16,384|No  |5124054|5120040|1.00|3.71|2.66
+| GPUs | Batch size / GPU | XLA | Throughput - TF32 (samples/s) | Throughput - mixed precision (samples/s) | Throughput speedup (TF32 - mixed precision) | Strong scaling - TF32 | Strong scaling - mixed precision |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- | ------- | ------- |
+| 1   | 131072           | Yes  | 1655113        | 2346864        | 1.42    | 1.00    | 1.00    |
+| 1   | 131072           | No   | 1198447        | 1568767        | 1.31    | 1.00    | 1.00    |
+| 8   | 16384            | Yes  | 5364411        | 5852297        | 1.09    | 3.24    | 2.49    |
+| 8   | 16384            | No   | 3955617        | 4048638        | 1.02    | 3.30    | 2.58    |
 
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
 
 Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
 
-|GPUs | Batch size / GPU | XLA | Throughput - FP32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (FP32 - mixed precision)| Strong scaling - FP32|Strong scaling - mixed precision
-| ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
-|1|131,072|Yes|378918|1405633|3.71|1.00|1.00
-|1|131,072|No |323817|969824|2.99|1.00|1.00
-|8|16,384|Yes |2196648|4332939|1.97|5.80|3.08
-|8|16,384|No  |1772485|3058944|1.73|5.47|3.15
+| GPUs | Batch size / GPU | XLA | Throughput - FP32 (samples/s) | Throughput - mixed precision (samples/s) | Throughput speedup (FP32 - mixed precision) | Strong scaling - FP32 | Strong scaling - mixed precision |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- | ------- | ------- |
+| 1   | 131072           | Yes  | 338245         | 1111894        | 3.29    | 1.00    | 1.00    |
+| 1   | 131072           | No   | 293062         | 814952         | 2.78    | 1.00    | 1.00    |
+| 8   | 16384            | Yes  | 1869462        | 3549165        | 1.90    | 5.53    | 3.19    |
+| 8   | 16384            | No   | 1489016        | 2491795        | 1.67    | 5.08    | 3.06    |
 
 
 #### Evaluation performance results
@@ -566,52 +565,49 @@ Our results were obtained by running the benchmark script (`main.py --benchmark`
 Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX A100 with 8x A100 80GB GPUs. 
 
 
-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
-|----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|NO    |1107650|1028782|0.93|
-|1|8192|NO    |1783848|1856528|1.04|
-|1|16384|NO   |2295874|2409601|1.05|
-|1|32768|NO   |2367142|2583293|1.09|
-|1|65536|NO   |3044662|3471619|1.14|
-|1|131072|NO  |3229625|3823612|1.18|
-|8|4096|NO    |5503985|5333228|0.97|
-|8|8192|NO    |12251675|12386870|1.01|
-|8|16384|NO   |16020973|16438269|1.03|
-|8|32768|NO   |17225168|18667798|1.08|
-|8|65536|NO   |19969248|22270424|1.12|
-|8|131072|NO  |19929457|22496045|1.13|
+| GPUs | Batch size / GPU | XLA | Throughput [samples/s] TF32 | Throughput [samples/s] AMP | Throughput speedup AMP to TF32 |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- |
+| 1   | 4096             | No   | 631542         | 605132         | 0.96    |
+| 1   | 8192             | No   | 1003923        | 1025958        | 1.02    |
+| 1   | 16384            | No   | 1436331        | 1465785        | 1.02    |
+| 1   | 32768            | No   | 1807615        | 1965822        | 1.09    |
+| 1   | 65536            | No   | 2114939        | 2320347        | 1.10    |
+| 1   | 131072           | No   | 2343520        | 2638773        | 1.13    |
+| 8   | 4096             | No   | 4474162        | 4129841        | 0.92    |
+| 8   | 8192             | No   | 6984567        | 6977303        | 1.00    |
+| 8   | 16384            | No   | 10398419       | 10872412       | 1.05    |
+| 8   | 32768            | No   | 13896799       | 13704361       | 0.99    |
+| 8   | 65536            | No   | 15933755       | 17760589       | 1.11    |
 
 For more results go to the expandable table below.
 
 <details>
 <summary>Full tabular data for evaluation performance results for DGX A100</summary>
 
-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
-|----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|YES   |1344225|1501677|1.12|
-|1|4096|NO    |1107650|1028782|0.93|
-|1|8192|YES   |2220721|2545781|1.15|
-|1|8192|NO    |1783848|1856528|1.04|
-|1|16384|YES  |2730441|3230949|1.18|
-|1|16384|NO   |2295874|2409601|1.05|
-|1|32768|YES  |2527368|2974417|1.18|
-|1|32768|NO   |2367142|2583293|1.09|
-|1|65536|YES  |3163906|3935731|1.24|
-|1|65536|NO   |3044662|3471619|1.14|
-|1|131072|YES |3171670|4064426|1.28|
-|1|131072|NO  |3229625|3823612|1.18|
-|8|4096|YES   |6243348|6553485|1.05|
-|8|4096|NO    |5503985|5333228|0.97|
-|8|8192|YES   |14995914|16222429|1.08|
-|8|8192|NO    |12251675|12386870|1.01|
-|8|16384|YES  |14584474|16543902|1.13|
-|8|16384|NO   |16020973|16438269|1.03|
-|8|32768|YES  |17840220|21537660|1.21|
-|8|32768|NO   |17225168|18667798|1.08|
-|8|65536|YES  |20732672|24082577|1.16|
-|8|65536|NO   |19969248|22270424|1.12|
-|8|131072|YES |20104010|24157900|1.20|
-|8|131072|NO  |19929457|22496045|1.13|
+| GPUs | Batch size / GPU | XLA | Throughput [samples/s] TF32 | Throughput [samples/s] AMP | Throughput speedup AMP to TF32 |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- |
+| 1   | 4096             | Yes  | 765213         | 802188         | 1.05    |
+| 1   | 4096             | No   | 631542         | 605132         | 0.96    |
+| 1   | 8192             | Yes  | 1162267        | 1233427        | 1.06    |
+| 1   | 8192             | No   | 1003923        | 1025958        | 1.02    |
+| 1   | 16384            | Yes  | 1643782        | 1824973        | 1.11    |
+| 1   | 16384            | No   | 1436331        | 1465785        | 1.02    |
+| 1   | 32768            | Yes  | 2014538        | 2248111        | 1.12    |
+| 1   | 32768            | No   | 1807615        | 1965822        | 1.09    |
+| 1   | 65536            | Yes  | 2308737        | 2666944        | 1.16    |
+| 1   | 65536            | No   | 2114939        | 2320347        | 1.10    |
+| 1   | 131072           | Yes  | 2515197        | 2944289        | 1.17    |
+| 1   | 131072           | No   | 2343520        | 2638773        | 1.13    |
+| 8   | 4096             | Yes  | 5235260        | 5386308        | 1.03    |
+| 8   | 4096             | No   | 4474162        | 4129841        | 0.92    |
+| 8   | 8192             | Yes  | 8438479        | 8625083        | 1.02    |
+| 8   | 8192             | No   | 6984567        | 6977303        | 1.00    |
+| 8   | 16384            | Yes  | 12629246       | 12146912       | 0.96    |
+| 8   | 16384            | No   | 10398419       | 10872412       | 1.05    |
+| 8   | 32768            | Yes  | 14908125       | 17372751       | 1.17    |
+| 8   | 32768            | No   | 13896799       | 13704361       | 0.99    |
+| 8   | 65536            | Yes  | 17899139       | 19909649       | 1.11    |
+| 8   | 65536            | No   | 15933755       | 17760589       | 1.11    |
  </details>
 
 
@@ -619,20 +615,19 @@ For more results go to the expandable table below.
 
 Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
 
-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
-|----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|NO    |499442|718163|1.44|
-|1|8192|NO    |670906|1144640|1.71|
-|1|16384|NO   |802366|1599006|1.99|
-|1|32768|NO   |856130|1795285|2.10|
-|1|65536|NO   |934394|2221221|2.38|
-|1|131072|NO  |965293|2403829|2.49|
-|8|4096|NO    |2840155|3602516|1.27|
-|8|8192|NO    |4810100|7912019|1.64|
-|8|16384|NO   |5939908|10876135|1.83|
-|8|32768|NO   |6489446|12593087|1.94|
-|8|65536|NO   |6614453|14742844|2.23|
-|8|131072|NO  |7133219|15524549|2.18|
+| GPUs | Batch size / GPU | XLA | Throughput [samples/s] FP32 | Throughput [samples/s] AMP | Throughput speedup AMP to FP32 |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- |
+| 1   | 4096             | No   | 311886         | 363685         | 1.17    |
+| 1   | 8192             | No   | 454822         | 639173         | 1.41    |
+| 1   | 16384            | No   | 594582         | 959301         | 1.61    |
+| 1   | 32768            | No   | 705038         | 1279068        | 1.81    |
+| 1   | 65536            | No   | 748398         | 1510412        | 2.02    |
+| 1   | 131072           | No   | 787982         | 1677366        | 2.13    |
+| 8   | 4096             | No   | 2210862        | 2548723        | 1.15    |
+| 8   | 8192             | No   | 3408621        | 4474287        | 1.31    |
+| 8   | 16384            | No   | 4368245        | 6518982        | 1.49    |
+| 8   | 32768            | No   | 5153906        | 8689990        | 1.69    |
+| 8   | 65536            | No   | 5393286        | 11071794       | 2.05    |
 
 
 
@@ -641,40 +636,42 @@ For more results go to the expandable table below.
 <details>
 <summary>Full tabular data for evaluation performance for DGX-1 V100 results</summary>
 
-|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
-|----|----------------|---|------------------------------|-----------------------------|-------------------------------
-|1|4096|YES   |573285|919150|1.60|
-|1|4096|NO    |499442|718163|1.44|
-|1|8192|YES   |753993|1486867|1.97|
-|1|8192|NO    |670906|1144640|1.71|
-|1|16384|YES  |859699|1945700|2.26|
-|1|16384|NO   |802366|1599006|1.99|
-|1|32768|YES  |904255|1995194|2.21|
-|1|32768|NO   |856130|1795285|2.10|
-|1|65536|YES  |982448|2608010|2.65|
-|1|65536|NO   |934394|2221221|2.38|
-|1|131072|YES |926734|2621095|2.83|
-|1|131072|NO  |965293|2403829|2.49|
-|8|4096|YES   |3102948|4083015|1.32|
-|8|4096|NO    |2840155|3602516|1.27|
-|8|8192|YES   |5536556|10094905|1.82|
-|8|8192|NO    |4810100|7912019|1.64|
-|8|16384|YES  |5722386|10524548|1.84|
-|8|16384|NO   |5939908|10876135|1.83|
-|8|32768|YES  |6813318|14356608|2.11|
-|8|32768|NO   |6489446|12593087|1.94|
-|8|65536|YES  |6918413|16227668|2.35|
-|8|65536|NO   |6614453|14742844|2.23|
-|8|131072|YES |6910518|16423342|2.38|
-|8|131072|NO  |7133219|15524549|2.18|
+| GPUs | Batch size / GPU | XLA | Throughput [samples/s] FP32 | Throughput [samples/s] AMP | Throughput speedup AMP to FP32 |
+| --- | ---------------- | ---- | -------------- | -------------- | ------- |
+| 1   | 4096             | Yes  | 349110         | 419470         | 1.20    |
+| 1   | 4096             | No   | 311886         | 363685         | 1.17    |
+| 1   | 8192             | Yes  | 495663         | 738806         | 1.49    |
+| 1   | 8192             | No   | 454822         | 639173         | 1.41    |
+| 1   | 16384            | Yes  | 641953         | 1112849        | 1.73    |
+| 1   | 16384            | No   | 594582         | 959301         | 1.61    |
+| 1   | 32768            | Yes  | 737395         | 1442387        | 1.96    |
+| 1   | 32768            | No   | 705038         | 1279068        | 1.81    |
+| 1   | 65536            | Yes  | 794009         | 1693861        | 2.13    |
+| 1   | 65536            | No   | 748398         | 1510412        | 2.02    |
+| 1   | 131072           | Yes  | 819904         | 1887338        | 2.30    |
+| 1   | 131072           | No   | 787982         | 1677366        | 2.13    |
+| 8   | 4096             | Yes  | 2505902        | 3165730        | 1.26    |
+| 8   | 4096             | No   | 2210862        | 2548723        | 1.15    |
+| 8   | 8192             | Yes  | 3759356        | 5289218        | 1.41    |
+| 8   | 8192             | No   | 3408621        | 4474287        | 1.31    |
+| 8   | 16384            | Yes  | 4686372        | 7551041        | 1.61    |
+| 8   | 16384            | No   | 4368245        | 6518982        | 1.49    |
+| 8   | 32768            | Yes  | 5398782        | 9615114        | 1.78    |
+| 8   | 32768            | No   | 5153906        | 8689990        | 1.69    |
+| 8   | 65536            | Yes  | 5642629        | 11907666       | 2.11    |
+| 8   | 65536            | No   | 5393286        | 11071794       | 2.05    |
  </details>
 
 ## Release notes
 
 ### Changelog
 
-February 2021
-- Initial release
+May 2022
+- Added multi-hot categorical features
+- Added triton inference
+- Updated model architecture figure
+- Updated NVTabular to v0.7.1
+- Updated readme numbers
 
 November 2021
 - Refresh release with performance optimizations
@@ -684,5 +681,8 @@ November 2021
 - Updated readme numbers
 - Changed V100 cards from 16GB to 32GB
 
+February 2021
+- Initial release
+
 ### Known issues
 * In this model the TF32 precision can in some cases be as fast as the FP16 precision on Ampere GPUs. This is because TF32 also uses Tensor Cores and doesn't need any additional logic such as maintaining FP32 master weights and casts. However, please note that W&D is, by modern recommender standards, a very small model. Larger models should still see significant benefits of using FP16 math.

+ 18 - 4
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/dataloader.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 import cupy
 import horovod.tensorflow as hvd
 import tensorflow as tf
-from data.outbrain.features import CATEGORICAL_COLUMNS, NUMERIC_COLUMNS
+from data.outbrain.features import CATEGORICAL_COLUMNS, MULTIHOT_COLUMNS, NUMERIC_COLUMNS
 from nvtabular.loader.tensorflow import KerasSequenceLoader
 
 cupy.random.seed(None)
@@ -37,7 +37,7 @@ def seed_fn():
 
 
 def train_input_fn(
-    train_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=True
+        train_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=True
 ):
     train_dataset_tf = KerasSequenceLoader(
         train_paths,
@@ -58,7 +58,7 @@ def train_input_fn(
 
 
 def eval_input_fn(
-    valid_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=False
+        valid_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=False
 ):
     valid_dataset_tf = KerasSequenceLoader(
         valid_paths,
@@ -76,3 +76,17 @@ def eval_input_fn(
     )
 
     return valid_dataset_tf
+
+
+@tf.function(experimental_relax_shapes=True)
+def pad_batch(batch):
+    batch = batch.copy()
+    for feature, hotness in MULTIHOT_COLUMNS.items():
+        multihot_tuple = batch[feature]
+        values = multihot_tuple[0][:, 0]
+        row_lengths = multihot_tuple[1][:, 0]
+        padded = tf.RaggedTensor.from_row_lengths(
+            values, row_lengths, validate=False
+        ).to_tensor(default_value=-1, shape=[None, hotness])
+        batch[feature] = padded
+    return batch

+ 19 - 4
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/features.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ NUMERIC_COLUMNS = [
     "publish_time_promo_days_since_published",
 ]
 
-CATEGORICAL_COLUMNS = [
+ONEHOT_COLUMNS = [
     "ad_id",
     "document_id",
     "platform",
@@ -50,6 +50,15 @@ CATEGORICAL_COLUMNS = [
     "publisher_id_promo",
 ]
 
+# Multihot columns with their hotness
+MULTIHOT_COLUMNS = {
+    "topic_id_list": 3,
+    "entity_id_list": 3,
+    "category_id_list": 3
+}
+
+CATEGORICAL_COLUMNS = ONEHOT_COLUMNS + list(MULTIHOT_COLUMNS.keys())
+
 HASH_BUCKET_SIZES = {
     "document_id": 300000,
     "ad_id": 250000,
@@ -64,6 +73,9 @@ HASH_BUCKET_SIZES = {
     "geo_location_country": 300,
     "platform": 4,
     "campaign_id": 5000,
+    "topic_id_list": 350,
+    "entity_id_list": 10000,
+    "category_id_list": 100,
 }
 
 EMBEDDING_DIMENSIONS = {
@@ -80,6 +92,9 @@ EMBEDDING_DIMENSIONS = {
     "geo_location_country": 64,
     "platform": 19,
     "campaign_id": 128,
+    "topic_id_list": 64,
+    "entity_id_list": 64,
+    "category_id_list": 64,
 }
 
 EMBEDDING_TABLE_SHAPES = {
@@ -92,7 +107,7 @@ def get_features_keys():
     return CATEGORICAL_COLUMNS + NUMERIC_COLUMNS + [DISPLAY_ID_COLUMN]
 
 
-def get_feature_columns():
+def get_feature_columns(combiner):
     logger = logging.getLogger("tensorflow")
     wide_columns, deep_columns = [], []
 
@@ -104,7 +119,7 @@ def get_feature_columns():
             wrapped_column = tf.feature_column.embedding_column(
                 categorical_column,
                 dimension=EMBEDDING_TABLE_SHAPES[column_name][1],
-                combiner="mean",
+                combiner=combiner,
             )
         else:
             raise ValueError(f"Unexpected categorical column found {column_name}")

+ 2 - 2
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/preproc.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +16,9 @@ import logging
 import os
 
 os.environ["TF_MEMORY_ALLOCATION"] = "0.0"
-from data.outbrain.nvtabular.utils.workflow import execute_pipeline
 from data.outbrain.nvtabular.utils.arguments import parse_args
 from data.outbrain.nvtabular.utils.setup import create_config
+from data.outbrain.nvtabular.utils.workflow import execute_pipeline
 
 
 def is_empty(path):

+ 4 - 4
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/arguments.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,10 +36,10 @@ def parse_args():
         nargs="+",
     )
     parser.add_argument(
-        '--use_dask',
+        "--use_dask",
         default=False,
-        action='store_true',
-        help='Use multi-gpu preprocessing for nvTabular workflow'
+        action="store_true",
+        help="Use multi-gpu preprocessing for nvTabular workflow",
     )
 
     return parser.parse_args()

+ 1 - 42
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/feature_description.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -81,44 +81,3 @@ EXCLUDE_COLUMNS = [
     "uuid",
     "day_event",
 ]
-
-nvt_to_spark = {
-    "ad_id": "ad_id",
-    "clicked": "label",
-    "display_id": "display_id",
-    "document_id": "doc_event_id",
-    "platform": "event_platform",
-    "document_id_promo": "doc_id",
-    "campaign_id": "campaign_id",
-    "advertiser_id": "ad_advertiser",
-    "source_id": "doc_event_source_id",
-    "publisher_id": "doc_event_publisher_id",
-    "source_id_promo": "doc_ad_source_id",
-    "publisher_id_promo": "doc_ad_publisher_id",
-    "geo_location": "event_geo_location",
-    "geo_location_country": "event_country",
-    "geo_location_state": "event_country_state",
-    "document_id_promo_ctr": "pop_document_id",
-    "publisher_id_promo_ctr": "pop_publisher_id",
-    "source_id_promo_ctr": "pop_source_id",
-    "document_id_promo_count": "doc_views_log_01scaled",
-    "publish_time_days_since_published": "doc_event_days_since_published_log_01scaled",
-    "ad_id_ctr": "pop_ad_id",
-    "advertiser_id_ctr": "pop_advertiser_id",
-    "campaign_id_ctr": "pop_campain_id",
-    "ad_id_count": "ad_views_log_01scaled",
-    "publish_time_promo_days_since_published": "doc_ad_days_since_published_log_01scaled",
-    "document_id_document_id_promo_sim_categories": "doc_event_doc_ad_sim_categories",
-    "document_id_document_id_promo_sim_topics": "doc_event_doc_ad_sim_topics",
-    "document_id_document_id_promo_sim_entities": "doc_event_doc_ad_sim_entities",
-}
-
-spark_to_nvt = {item: key for key, item in nvt_to_spark.items()}
-
-
-def transform_nvt_to_spark(column):
-    return nvt_to_spark[column]
-
-
-def transform_spark_to_nvt(column):
-    return spark_to_nvt[column]

+ 2 - 2
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/setup.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ def create_config(args):
         "valid_path": valid_path,
         "output_valid_folder": output_valid_folder,
         "hash_spec": hash_spec,
-        "dask": args.use_dask
+        "dask": args.use_dask,
     }
 
     return config

+ 132 - 50
TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,28 +17,19 @@ import shutil
 
 import cudf
 import cupy
+import numpy as np
 import nvtabular as nvt
 import rmm
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
+from data.outbrain.features import get_features_keys
 from data.outbrain.nvtabular.utils.feature_description import (
-    CATEGORICAL_COLUMNS,
-    DISPLAY_ID_COLUMN,
-    CTR_INPUTS,
-)
+    CATEGORICAL_COLUMNS, CTR_INPUTS, DISPLAY_ID_COLUMN)
 from nvtabular import ColumnGroup
 from nvtabular.io import Shuffle
-from nvtabular.ops import (
-    FillMedian,
-    LogOp,
-    Rename,
-    JoinGroupby,
-    LambdaOp,
-    FillMissing,
-    HashBucket,
-    Normalize,
-)
-from nvtabular.ops import Operator
+from nvtabular.ops import (Categorify, ColumnSelector, FillMedian, FillMissing,
+                           HashBucket, JoinExternal, JoinGroupby, LambdaOp,
+                           ListSlice, LogOp, Normalize, Operator, Rename)
 from nvtabular.ops.column_similarity import ColumnSimilarity
 from nvtabular.utils import device_mem_size, get_rmm_size
 
@@ -51,7 +42,7 @@ def get_devices():
             int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
         ]
     except KeyError:
-        from pynvml import nvmlInit, nvmlDeviceGetCount
+        from pynvml import nvmlDeviceGetCount, nvmlInit
 
         nvmlInit()
         devices = list(range(nvmlDeviceGetCount()))
@@ -60,7 +51,7 @@ def get_devices():
 
 class DaysSincePublished(Operator):
     def transform(self, columns, gdf):
-        for column in columns:
+        for column in columns.names:
             col = gdf[column]
             col.loc[col == ""] = None
             col = col.astype("datetime64[ns]")
@@ -72,7 +63,9 @@ class DaysSincePublished(Operator):
         return gdf
 
     def output_column_names(self, columns):
-        return [column + "_days_since_published" for column in columns]
+        return ColumnSelector(
+            [column + "_days_since_published" for column in columns.names]
+        )
 
     def dependencies(self):
         return ["timestamp"]
@@ -109,6 +102,7 @@ def create_client(devices, local_directory):
 
 def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
     rmm.reinitialize(managed_memory=False)
+
     documents_categories_path = os.path.join(
         data_bucket_folder, "documents_categories.csv"
     )
@@ -121,6 +115,39 @@ def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, das
     documents_entities_cudf["entity_id"] = (
         documents_entities_cudf["entity_id"].astype("category").cat.codes
     )
+    documents_categories_grouped = (
+        documents_categories_cudf.groupby("document_id")
+            .agg({"category_id": "collect", "confidence_level": "collect"})
+            .reset_index()
+    )
+    documents_categories_grouped = documents_categories_grouped.rename(
+        columns={
+            "category_id": "category_id_list",
+            "confidence_level": "confidence_level_cat_list",
+        }
+    )
+    documents_entities_grouped = (
+        documents_entities_cudf.groupby("document_id")
+            .agg({"entity_id": "collect", "confidence_level": "collect"})
+            .reset_index()
+    )
+    documents_entities_grouped = documents_entities_grouped.rename(
+        columns={
+            "entity_id": "entity_id_list",
+            "confidence_level": "confidence_level_ent_list",
+        }
+    )
+    documents_topics_grouped = (
+        documents_topics_cudf.groupby("document_id")
+            .agg({"topic_id": "collect", "confidence_level": "collect"})
+            .reset_index()
+    )
+    documents_topics_grouped = documents_topics_grouped.rename(
+        columns={
+            "topic_id": "topic_id_list",
+            "confidence_level": "confidence_level_top_list",
+        }
+    )
 
     categories = _df_to_coo(documents_categories_cudf, col="category_id")
     topics = _df_to_coo(documents_topics_cudf, col="topic_id")
@@ -136,47 +163,54 @@ def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, das
         "document_id_promo": 5,
     }
 
-    ctr_inputs = ColumnGroup(CTR_INPUTS)
     cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)
 
+    def get_slice(num_char):
+        def lambda_slice(col, gdf):
+            return col.str.slice(0, num_char)
+
+        return lambda_slice
+
     geo_location = ColumnGroup(["geo_location"])
-    country = (
-            geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
-    )
-    state = (
-            geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
-    )
+    country = geo_location >> LambdaOp(get_slice(2)) >> Rename(postfix="_country")
+    state = geo_location >> LambdaOp(get_slice(5)) >> Rename(postfix="_state")
     geo_features = geo_location + country + state
 
     dates = ["publish_time", "publish_time_promo"]
     date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp
 
+    ctr_inputs = ColumnGroup(CTR_INPUTS)
+
     stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
+
+    def calculate_ctr_with_filter(col, gdf):
+        col = col.astype(np.float32)
+        ctr_col_name = col.name.replace("_clicked_sum", "")
+        ctr_count_name = col.name.replace("_clicked_sum", "_count")
+
+        col = col / gdf[ctr_count_name]  # CTR
+        col = col.where(gdf[ctr_count_name] >= ctr_thresh[ctr_col_name], 0)  # Filter
+
+        return col
+
+    ctr_selected_features = [column + "_clicked_sum" for column in ctr_inputs.names]
+    dependency_features = [column + "_count" for column in ctr_inputs.names]
+
     ctr_cols = (
-            stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
+            stat_cols[ctr_selected_features]
             >> LambdaOp(
-        f=lambda col, gdf: (
-                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
-        ).where(
-            gdf[col.name.replace("_clicked_sum", "_count")]
-            >= ctr_thresh[col.name.replace("_clicked_sum", "")],
-            0,
-        ),
-        dependency=stat_cols
-                   - [column + "clicked_sum" for column in ctr_inputs.flattened_columns],
+        calculate_ctr_with_filter, dependency=stat_cols[dependency_features]
     )
             >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
     )
 
     stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
+
     ctr_cols = ctr_cols >> FillMissing()
 
-    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)
+    cat_cols = cat_cols + geo_features >> HashBucket(dict(list(hash_spec.items())[:-3]))
 
-    features = (
-            date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]
-    )
-    sim_features_categ = (
+    sim_features_categories = (
             [["document_id", "document_id_promo"]]
             >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
             >> Rename(postfix="_categories")
@@ -191,11 +225,61 @@ def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, das
             >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
             >> Rename(postfix="_entities")
     )
-    sim_features = sim_features_categ + sim_features_topics + sim_features_entities
 
-    client = create_client(devices=devices, local_directory=local_directory) if dask else None
+    sim_features = sim_features_categories + sim_features_topics + sim_features_entities
+
+    joined = ["document_id"] >> JoinExternal(
+        documents_categories_grouped,
+        on=["document_id"],
+        on_ext=["document_id"],
+        how="left",
+        columns_ext=["category_id_list", "confidence_level_cat_list", "document_id"],
+        cache="device",
+    )
+
+    joined = joined >> JoinExternal(
+        documents_entities_grouped,
+        on=["document_id"],
+        on_ext=["document_id"],
+        how="left",
+        columns_ext=["entity_id_list", "confidence_level_ent_list", "document_id"],
+        cache="device",
+    )
+    joined = joined >> JoinExternal(
+        documents_topics_grouped,
+        on=["document_id"],
+        on_ext=["document_id"],
+        how="left",
+        columns_ext=["topic_id_list", "confidence_level_top_list", "document_id"],
+        cache="device",
+    )
+
+    categorified_multihots = (
+            joined[["topic_id_list", "entity_id_list", "category_id_list"]]
+            >> Categorify()
+            >> FillMissing()
+            >> ListSlice(3)
+            >> HashBucket(dict(list(hash_spec.items())[-3:]))
+    )
 
-    workflow = nvt.Workflow(column_group=features + sim_features, client=client)
+    features = (
+            date_features
+            + ctr_cols
+            + stat_cols
+            + cat_cols
+            + sim_features
+            + categorified_multihots
+            + ["clicked", "display_id"]
+    )
+
+    client = (
+        create_client(devices=devices, local_directory=local_directory)
+        if dask
+        else None
+    )
+    required_features = get_features_keys() + ["clicked"]
+
+    workflow = nvt.Workflow(features[required_features], client=client)
 
     return workflow
 
@@ -207,9 +291,7 @@ def create_parquets(data_bucket_folder, train_path, valid_path):
     clicks_train_path = os.path.join(data_bucket_folder, "clicks_train.csv")
     events_path = os.path.join(data_bucket_folder, "events.csv")
     promoted_content_path = os.path.join(data_bucket_folder, "promoted_content.csv")
-
     documents_meta = cudf.read_csv(documents_meta_path, na_values=["\\N", ""])
-    documents_meta = documents_meta.dropna(subset="source_id")
     documents_meta["publisher_id"].fillna(
         documents_meta["publisher_id"].isnull().cumsum()
         + documents_meta["publisher_id"].max()
@@ -268,7 +350,7 @@ def save_stats(
         stats_file,
         hash_spec,
         local_directory,
-        dask
+        dask,
 ):
     devices = get_devices()
     shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True
@@ -278,10 +360,10 @@ def save_stats(
         hash_spec=hash_spec,
         devices=devices,
         local_directory=local_directory,
-        dask=dask
+        dask=dask,
     )
 
-    train_dataset = nvt.Dataset(train_path, part_size="1GB")
+    train_dataset = nvt.Dataset(train_path, part_size="150MB")
     valid_dataset = nvt.Dataset(valid_path, part_size="150MB")
     workflow.fit(train_dataset)
     workflow.transform(train_dataset).to_parquet(
@@ -323,7 +405,7 @@ def execute_pipeline(config):
         stats_file=config["stats_file"],
         hash_spec=config["hash_spec"],
         local_directory=config["temporary_folder"],
-        dask=config["dask"]
+        dask=config["dask"],
     )
 
     clean(config["temporary_folder"])

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/hvd_wrapper.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

File diff ditekan karena terlalu besar
+ 543 - 376
TensorFlow2/Recommendation/WideAndDeep/img/amp_influence.svg


File diff ditekan karena terlalu besar
+ 420 - 377
TensorFlow2/Recommendation/WideAndDeep/img/learning_curve.svg


File diff ditekan karena terlalu besar
+ 0 - 1502
TensorFlow2/Recommendation/WideAndDeep/img/model.svg


File diff ditekan karena terlalu besar
+ 445 - 522
TensorFlow2/Recommendation/WideAndDeep/img/training_stability.svg


+ 3 - 2
TensorFlow2/Recommendation/WideAndDeep/main.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,8 @@
 
 import os
 
-os.environ["TF_MEMORY_ALLOCATION"] = "0.6"  # fraction of free memory
+os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
+
 import nvtabular as nvt
 
 from trainer.model.widedeep import wide_deep_model

+ 14 - 0
TensorFlow2/Recommendation/WideAndDeep/requirements.txt

@@ -1 +1,15 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 git+https://github.com/NVIDIA/[email protected]#egg=dllogger

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/scripts/evaluating_benchmark.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/scripts/preproc.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/scripts/training_benchmark.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 1 - 1
TensorFlow2/Recommendation/WideAndDeep/scripts/training_full.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

+ 379 - 0
TensorFlow2/Recommendation/WideAndDeep/trainer/model/layers.py

@@ -0,0 +1,379 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.feature_column import feature_column_v2 as fc
+
+
+# pylint has issues with TF array ops, so disable checks until fixed:
+# https://github.com/PyCQA/pylint/issues/3613
+# pylint: disable=no-value-for-parameter, unexpected-keyword-arg
+
+
+def _sort_columns(feature_columns):
+    return sorted(feature_columns, key=lambda col: col.name)
+
+
+def _validate_numeric_column(feature_column):
+    if len(feature_column.shape) > 1:
+        return (
+            "Matrix numeric features are not allowed, "
+            "found feature {} with shape {}".format(
+                feature_column.key, feature_column.shape
+            )
+        )
+
+
def _validate_categorical_column(feature_column):
    """Return an error message unless the column is an identity categorical.

    Embedding/indicator columns must wrap identity categorical columns;
    any other categorical transformation is expected to happen offline in
    NVTabular. ``None`` means the column is acceptable.
    """
    if isinstance(feature_column, fc.IdentityCategoricalColumn):
        return None
    return (
        "Only acceptable categorical columns for feeding "
        "embeddings are identity, found column {} of type {}. "
        "Consider using NVTabular online preprocessing to perform "
        "categorical transformations".format(
            feature_column.name, type(feature_column).__name__
        )
    )
+
+
def _validate_dense_feature_columns(feature_columns):
    """Validate that every column can be consumed by ``DenseFeatures``.

    Accepted columns are numeric columns (scalar/vector) and
    embedding/indicator columns wrapping identity categoricals.

    Raises:
        ValueError: listing every offending column, if any.
    """
    _errors = []
    for feature_column in feature_columns:
        if isinstance(feature_column, fc.CategoricalColumn):
            if not isinstance(feature_column, fc.BucketizedColumn):
                _errors.append(
                    "All feature columns must be dense, found categorical "
                    "column {} of type {}. Please wrap categorical columns "
                    "in embedding or indicator columns before passing".format(
                        feature_column.name, type(feature_column).__name__
                    )
                )
            else:
                _errors.append(
                    "Found bucketized column {}. DenseFeatures layer "
                    "cannot apply bucketization preprocessing. Consider using "
                    "NVTabular to do preprocessing offline".format(feature_column.name)
                )
        elif isinstance(feature_column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            # The wrapped categorical must itself be an identity column.
            _errors.append(
                _validate_categorical_column(feature_column.categorical_column)
            )

        elif isinstance(feature_column, fc.NumericColumn):
            _errors.append(_validate_numeric_column(feature_column))

    # Validators return None for acceptable columns; keep only real errors.
    _errors = [error for error in _errors if error is not None]
    if _errors:
        # Bug fix: previously the raw list was raised and the formatted
        # message was built but discarded. Raise the readable message.
        msg = "Found issues with columns passed to DenseFeatures:\n\t"
        msg += "\n\t".join(_errors)
        raise ValueError(msg)
+
+
def _validate_stack_dimensions(feature_columns):
    """Ensure every feature yields vectors of one common width.

    'stack' aggregation stacks features along a new axis, which is only
    possible when all embedding widths / one-hot widths / continuous
    widths agree.

    Raises:
        ValueError: when the widths differ.
    """
    dims = []
    for feature_column in feature_columns:
        if isinstance(feature_column, fc.EmbeddingColumn):
            dims.append(feature_column.dimension)
        elif isinstance(feature_column, fc.IndicatorColumn):
            # Indicator columns produce one-hot rows of num_buckets width.
            dims.append(feature_column.categorical_column.num_buckets)
        else:
            dims.append(feature_column.shape[0])

    first = dims[0]
    if any(dim != first for dim in dims[1:]):
        raise ValueError(
            "'stack' aggregation requires all categorical "
            "embeddings and continuous features to have same "
            "size. Found dimensions {}".format(", ".join(map(str, dims)))
        )
+
+
def _categorical_embedding_lookup(table, inputs, feature_name, combiner):
    """Look up embedding rows for one categorical feature.

    One-hot features (second dim == 1) use a plain gather. Multi-hot
    features gather a row per candidate id, zero out padded slots
    (encoded as negative ids), and pool across the hotness axis with
    ``sum`` or ``mean`` according to *combiner*.
    """
    values = inputs[feature_name]

    if values.shape[1] <= 1:
        # One-hot: a single id per sample.
        return tf.gather(table, values[:, 0])

    # Multi-hot: one embedding row per candidate id.
    gathered = tf.gather(table, values)

    # Zero out embeddings that came from padded (negative) ids.
    valid = tf.cast(values >= 0, gathered.dtype)
    gathered = tf.math.multiply(gathered, tf.expand_dims(valid, -1))

    # Pool across the hotness axis.
    pooled = tf.reduce_sum(gathered, axis=1)
    if combiner == "mean":
        # Divide by the number of valid (non-padded) ids per sample.
        counts = tf.reduce_sum(valid, axis=1)
        counts = tf.expand_dims(tf.cast(counts, pooled.dtype), -1)
        pooled = tf.math.divide_no_nan(pooled, counts)

    return pooled
+
+
+def _handle_continuous_feature(inputs, feature_column):
+    if feature_column.shape[0] > 1:
+        x = inputs[feature_column.name]
+        if isinstance(x, tuple):
+            x = x[0]
+        return tf.reshape(x, (-1, feature_column.shape[0]))
+    return inputs[feature_column.name]
+
+
class DenseFeatures(tf.keras.layers.Layer):
    """
    Layer which maps a dictionary of input tensors to a dense, continuous
    vector digestible by a neural network. Meant to reproduce the API exposed
    by `tf.keras.layers.DenseFeatures` while reducing overhead for the
    case of one-hot categorical and scalar numeric features.
    Uses TensorFlow `feature_column` to represent inputs to the layer, but
    does not perform any preprocessing associated with those columns. As such,
    it should only be passed `numeric_column` objects and their subclasses,
    `embedding_column` and `indicator_column`. Preprocessing functionality should
    be moved to NVTabular.
    For multi-hot categorical or vector continuous data, represent the data for
    a feature with a dictionary entry `"<feature_name>__values"` corresponding
    to the flattened array of all values in the batch. For multi-hot categorical
    data, there should be a corresponding `"<feature_name>__nnzs"` entry that
    describes how many categories are present in each sample (and so has length
    `batch_size`).
    Note that categorical columns should be wrapped in embedding or
    indicator columns first, consistent with the API used by
    `tf.keras.layers.DenseFeatures`.
    Example usage::
        column_a = tf.feature_column.numeric_column("a", (1,))
        column_b = tf.feature_column.categorical_column_with_identity("b", 100)
        column_b_embedding = tf.feature_column.embedding_column(column_b, 4)
        inputs = {
            "a": tf.keras.Input(name="a", shape=(1,), dtype=tf.float32),
            "b": tf.keras.Input(name="b", shape=(1,), dtype=tf.int64)
        }
        x = DenseFeatures([column_a, column_b_embedding])(inputs)
    Parameters
    ----------
    feature_columns : list of `tf.feature_column`
        feature columns describing the inputs to the layer
    aggregation : str in ("concat", "stack")
        how to combine the embeddings from multiple features
    """

    def __init__(self, feature_columns, aggregation="concat", name=None, **kwargs):
        # sort feature columns to make layer independent of column order
        feature_columns = _sort_columns(feature_columns)
        _validate_dense_feature_columns(feature_columns)

        if aggregation == "stack":
            # Stacking requires every feature to produce the same width.
            _validate_stack_dimensions(feature_columns)
        elif aggregation != "concat":
            raise ValueError(
                "Unrecognized aggregation {}, must be stack or concat".format(
                    aggregation
                )
            )

        self.feature_columns = feature_columns
        self.aggregation = aggregation
        super(DenseFeatures, self).__init__(name=name, **kwargs)

    def build(self, input_shapes):
        """Create one embedding table per non-numeric feature column.

        Embedding columns get a trainable ``(num_buckets, dimension)``
        table; indicator columns get a frozen identity matrix so lookups
        yield one-hot rows.
        """
        self.embedding_tables = {}
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                continue

            feature_name = feature_column.categorical_column.key
            num_buckets = feature_column.categorical_column.num_buckets
            if isinstance(feature_column, fc.EmbeddingColumn):
                self.embedding_tables[feature_name] = self.add_weight(
                    name="{}/embedding_weights".format(feature_name),
                    trainable=True,
                    initializer="glorot_normal",
                    shape=(num_buckets, feature_column.dimension),
                )
            else:
                # Indicator column: frozen identity matrix => one-hot rows.
                self.embedding_tables[feature_name] = self.add_weight(
                    name="{}/embedding_weights".format(feature_name),
                    trainable=False,
                    initializer=tf.constant_initializer(np.eye(num_buckets)),
                    shape=(num_buckets, num_buckets),
                )
        self.built = True

    def call(self, inputs):
        """Map the input dict to a single dense tensor.

        Returns a ``(batch, sum_of_widths)`` tensor for "concat"
        aggregation or ``(batch, n_features, width)`` for "stack".
        """
        features = []
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                x = _handle_continuous_feature(inputs, feature_column)
                features.append(x)
            else:
                feature_name = feature_column.categorical_column.name
                table = self.embedding_tables[feature_name]
                # Indicator columns have no combiner attribute; default to sum.
                combiner = getattr(feature_column, "combiner", "sum")
                embeddings = _categorical_embedding_lookup(
                    table, inputs, feature_name, combiner
                )
                features.append(embeddings)

        if self.aggregation == "stack":
            return tf.stack(features, axis=1)
        return tf.concat(features, axis=1)

    def compute_output_shape(self, input_shapes):
        # Bug fix: the previous implementation referenced
        # self.numeric_features and self.embedding_shapes, neither of which
        # is ever defined on this class (it would raise AttributeError).
        # Derive the per-feature output widths from feature_columns instead.
        input_shape = list(input_shapes.values())[0]
        dims = []
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                dims.append(feature_column.shape[0])
            elif isinstance(feature_column, fc.EmbeddingColumn):
                dims.append(feature_column.dimension)
            else:
                # Indicator column: one-hot width equals num_buckets.
                dims.append(feature_column.categorical_column.num_buckets)

        if self.aggregation == "concat":
            return (input_shape[0], sum(dims))
        # "stack": all widths are equal (validated in __init__).
        return (input_shape[0], len(dims), dims[0])

    def get_config(self):
        return {
            "feature_columns": self.feature_columns,
            "aggregation": self.aggregation,
        }
+
+
def _validate_linear_feature_columns(feature_columns):
    """Validate that every column can be consumed by ``LinearFeatures``.

    Only plain numeric columns and identity categorical columns are
    accepted; embedding/indicator wrappers must not be used here.

    Raises:
        ValueError: listing every offending column, if any.
    """
    _errors = []
    for feature_column in feature_columns:
        if isinstance(feature_column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            # Bug fix: the original message ended "of type" with no
            # placeholder; include the offending type name.
            _errors.append(
                "Only pass categorical or numeric columns to ScalarLinearFeatures "
                "layer, found column {} of type {}".format(
                    feature_column, type(feature_column).__name__
                )
            )
        elif isinstance(feature_column, fc.NumericColumn):
            _errors.append(_validate_numeric_column(feature_column))
        else:
            _errors.append(_validate_categorical_column(feature_column))

    # Validators return None for acceptable columns; keep only real errors.
    _errors = [error for error in _errors if error is not None]
    if _errors:
        # Bug fix: previously the raw list was raised and the formatted
        # message was built but discarded. Raise the readable message.
        msg = "Found issues with columns passed to ScalarDenseFeatures:\n\t"
        msg += "\n\t".join(_errors)
        raise ValueError(msg)
+
+
+# TODO: is there a clean way to combine these two layers
+# into one, maybe with a "sum" aggregation? Major differences
+# seem to be whether categorical columns are wrapped in
+# embeddings and the numeric matmul, both of which seem
+# reasonably easy to check. At the very least, we should
+# be able to subclass I think?
class LinearFeatures(tf.keras.layers.Layer):
    """
    Layer which implements a linear combination of one-hot categorical
    and scalar numeric features. Based on the "wide" branch of the Wide & Deep
    network architecture.
    Uses TensorFlow ``feature_column``s to represent inputs to the layer, but
    does not perform any preprocessing associated with those columns. As such,
    it should only be passed ``numeric_column`` and
    ``categorical_column_with_identity``. Preprocessing functionality should
    be moved to NVTabular.
    Also note that, unlike ScalarDenseFeatures, categorical columns should
    NOT be wrapped in embedding or indicator columns first.
    Example usage::
        column_a = tf.feature_column.numeric_column("a", (1,))
        column_b = tf.feature_column.categorical_column_with_identity("b", 100)
        inputs = {
            "a": tf.keras.Input(name="a", shape=(1,), dtype=tf.float32),
            "b": tf.keras.Input(name="b", shape=(1,), dtype=tf.int64)
        }
        x = ScalarLinearFeatures([column_a, column_b])(inputs)
    Parameters
    ----------
    feature_columns : list of tf.feature_column
        feature columns describing the inputs to the layer
    """

    def __init__(self, feature_columns, name=None, **kwargs):
        feature_columns = _sort_columns(feature_columns)
        _validate_linear_feature_columns(feature_columns)

        self.feature_columns = feature_columns
        super(LinearFeatures, self).__init__(name=name, **kwargs)

    def build(self, input_shapes):
        """Create per-feature weight vectors plus a shared bias.

        Categorical features get a ``(num_buckets, 1)`` table (the linear
        weight of each category); all numeric features share one stacked
        ``(total_numeric_width, 1)`` kernel.
        """
        # TODO: I've tried combining all the categorical tables
        # into a single giant lookup op, but it ends up turning
        # out the adding the offsets to lookup indices at call
        # time ends up being much slower due to kernel overhead
        # Still, a better (and probably custom) solutions would
        # probably be desirable
        numeric_kernel_dim = 0
        self.embedding_tables = {}
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                numeric_kernel_dim += feature_column.shape[0]
                continue

            self.embedding_tables[feature_column.key] = self.add_weight(
                name="{}/embedding_weights".format(feature_column.key),
                initializer="zeros",
                trainable=True,
                shape=(feature_column.num_buckets, 1),
            )
        if numeric_kernel_dim > 0:
            self.embedding_tables["numeric"] = self.add_weight(
                name="numeric/embedding_weights",
                initializer="zeros",
                trainable=True,
                shape=(numeric_kernel_dim, 1),
            )

        self.bias = self.add_weight(
            name="bias", initializer="zeros", trainable=True, shape=(1,)
        )
        self.built = True

    def call(self, inputs):
        """Return bias + sum of categorical weights + numeric matmul.

        Output shape is ``(batch_size, 1)``.
        """
        x = self.bias
        numeric_inputs = []
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                numeric_inputs.append(
                    _handle_continuous_feature(inputs, feature_column)
                )
            else:
                table = self.embedding_tables[feature_column.key]
                embeddings = _categorical_embedding_lookup(
                    table, inputs, feature_column.key, "sum"
                )
                x = x + embeddings

        if len(numeric_inputs) > 0:
            numerics = tf.concat(numeric_inputs, axis=1)
            x = x + tf.matmul(numerics, self.embedding_tables["numeric"])
        return x

    def compute_output_shape(self, input_shapes):
        # Bug fix: values of input_shapes are shapes already (see
        # DenseFeatures.compute_output_shape); the previous `.shape[0]`
        # attribute access would fail on a TensorShape.
        batch_size = list(input_shapes.values())[0][0]
        return (batch_size, 1)

    def get_config(self):
        return {
            "feature_columns": self.feature_columns,
        }

+ 35 - 38
TensorFlow2/Recommendation/WideAndDeep/trainer/model/widedeep.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,53 +13,45 @@
 # limitations under the License.
 
 import tensorflow as tf
-from data.outbrain.features import (
-    CATEGORICAL_COLUMNS,
-    NUMERIC_COLUMNS,
-    get_feature_columns,
-)
-from nvtabular.framework_utils.tensorflow import layers as nvtlayers
+from data.outbrain.features import (CATEGORICAL_COLUMNS, MULTIHOT_COLUMNS,
+                                    NUMERIC_COLUMNS, ONEHOT_COLUMNS,
+                                    get_feature_columns)
 
+from trainer.model import layers as nvtlayers
 
-def get_inputs_columns():
-    wide_columns, deep_columns = get_feature_columns()
 
-    wide_columns_dict = {}
-    deep_columns_dict = {}
+def get_inputs_columns(combiner):
+    wide_columns, deep_columns = get_feature_columns(combiner)
+
     features = {}
 
-    for col in wide_columns:
-        features[col.key] = tf.keras.Input(
-            shape=(1,),
-            batch_size=None,
-            name=col.key,
-            dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
-            sparse=False,
+    # Numerical
+    for feature in NUMERIC_COLUMNS:
+        features[feature] = tf.keras.Input(
+            shape=(1,), batch_size=None, name=feature, dtype=tf.float32, sparse=False
         )
-        wide_columns_dict[col.key] = col
 
-    for col in deep_columns:
-        is_embedding_column = "key" not in dir(col)
-        key = col.categorical_column.key if is_embedding_column else col.key
-
-        if key not in features:
-            features[key] = tf.keras.Input(
-                shape=(1,),
-                batch_size=None,
-                name=key,
-                dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
-                sparse=False,
-            )
-        deep_columns_dict[key] = col
+    # Categorical (One-hot)
+    for feature in ONEHOT_COLUMNS:
+        features[feature] = tf.keras.Input(
+            shape=(1,), batch_size=None, name=feature, dtype=tf.int32, sparse=False
+        )
 
-    deep_columns = list(deep_columns_dict.values())
-    wide_columns = list(wide_columns_dict.values())
+    # Categorical (Multi-hot)
+    for feature, hotness in MULTIHOT_COLUMNS.items():
+        features[feature] = tf.keras.Input(
+            shape=(hotness,),
+            batch_size=None,
+            name=f"{feature}",
+            dtype=tf.int32,
+            sparse=False,
+        )
 
     return deep_columns, wide_columns, features
 
 
 def wide_deep_model(args):
-    deep_columns, wide_columns, features = get_inputs_columns()
+    deep_columns, wide_columns, features = get_inputs_columns(combiner=args.combiner)
 
     wide = nvtlayers.LinearFeatures(wide_columns, name="wide_linear")(features)
 
@@ -85,7 +77,12 @@ def get_dummy_inputs(batch_size):
     for cat in CATEGORICAL_COLUMNS:
         inputs[cat] = tf.zeros(shape, dtype=tf.dtypes.int32)
 
-    for cat in NUMERIC_COLUMNS:
-        inputs[cat] = tf.zeros(shape, dtype=tf.dtypes.float32)
+    for num in NUMERIC_COLUMNS:
+        inputs[num] = tf.zeros(shape, dtype=tf.dtypes.float32)
+
+    for mul, hotness in MULTIHOT_COLUMNS.items():
+        inputs[mul] = tf.zeros((batch_size, hotness), dtype=tf.dtypes.int32)
+
+    labels = tf.zeros(shape, dtype=tf.dtypes.int32)
 
-    return inputs
+    return inputs, labels

+ 23 - 4
TensorFlow2/Recommendation/WideAndDeep/trainer/run.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 
 import horovod.tensorflow as hvd
 import tensorflow as tf
+
 from trainer.utils.benchmark import ThroughputCalculator
 from trainer.utils.evaluator import Evaluator
 from trainer.utils.schedulers import LearningRateScheduler
@@ -24,6 +25,8 @@ def run(args, model, config):
     train_dataset = config["train_dataset"]
     eval_dataset = config["eval_dataset"]
     steps_per_epoch = len(train_dataset)
+    steps_per_epoch = min(hvd.allgather(tf.constant([steps_per_epoch], dtype=tf.int32)))
+    steps_per_epoch = steps_per_epoch.numpy()
 
     steps = int(steps_per_epoch * args.num_epochs)
     deep_optimizer = tf.keras.optimizers.RMSprop(
@@ -33,8 +36,12 @@ def run(args, model, config):
     wide_optimizer = tf.keras.optimizers.Ftrl(learning_rate=args.linear_learning_rate)
 
     if not args.cpu:
-        deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)
-        wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
+        deep_optimizer = hvd.DistributedOptimizer(
+            deep_optimizer, compression=hvd.Compression.fp16
+        )
+        wide_optimizer = hvd.DistributedOptimizer(
+            wide_optimizer, compression=hvd.Compression.fp16
+        )
 
     if args.amp:
         deep_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
@@ -56,7 +63,6 @@ def run(args, model, config):
         throughput_calculator=throughput_calculator,
         eval_dataset=eval_dataset,
         compiled_loss=compiled_loss,
-        steps=steps,
         args=args,
     )
 
@@ -75,6 +81,19 @@ def run(args, model, config):
 
     trainer.maybe_restore_checkpoint()
 
+    # Wrap datasets with .epochs(n) method to speed up data loading
+    current_epoch = trainer.current_epoch
+    trainer.prepare_dataset(current_epoch)
+    evaluator.prepare_dataset(current_epoch)
+
+    # Update max_steps to make sure that all workers finish training at the same time
+    max_training_steps = len(trainer.train_dataset)
+    max_training_steps = min(
+        hvd.allgather(tf.constant([max_training_steps], dtype=tf.int32))
+    )
+    max_training_steps = int(max_training_steps.numpy())
+    trainer.max_steps = max_training_steps
+
     if args.evaluate:
         evaluator.eval(trainer.current_step_var)
     else:

+ 17 - 6
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/arguments.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ def parse_args():
         "--eval_data_pattern",
         type=str,
         default=f"{DEFAULT_DIR}/data/valid/*.parquet",
-        help="Pattern of eval file names. For example if training files are part_0.parquet, "
+        help="Pattern of eval file names. For example if evaluation files are part_0.parquet, "
              "part_0.parquet then --eval_data_pattern is *.parquet",
     )
 
@@ -143,6 +143,17 @@ def parse_args():
         help="Dropout regularization for deep model",
     )
 
+    model_construction.add_argument(
+        "--combiner",
+        type=str,
+        default="sum",
+        choices=[
+            "mean",
+            "sum",
+        ],
+        help="Type of aggregation used for multi hot categorical features",
+    )
+
     run_params = parser.add_argument_group("run mode parameters")
 
     run_params.add_argument(
@@ -176,13 +187,13 @@ def parse_args():
     run_params.add_argument(
         "--affinity",
         type=str,
-        default="socket_unique_interleaved",
+        default="unique_interleaved",
         choices=[
-            "socket",
+            "all",
             "single",
             "single_unique",
-            "socket_unique_interleaved",
-            "socket_unique_continuous",
+            "unique_interleaved",
+            "unique_contiguous",
             "disabled",
         ],
         help="Type of CPU affinity",

+ 5 - 5
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/benchmark.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,10 +31,10 @@ class ThroughputCalculator:
             self.samples = tf.Variable(0, trainable=False, dtype=tf.int64)
 
     def _init_benchmark(self):
-        self.t0 = time.time()
+        self.t0 = time.perf_counter()
 
     def on_epoch_end_log(self, step, shape):
-        batch_time = time.time() - self.start_batch_time
+        batch_time = time.perf_counter() - self.start_batch_time
         self.samples.assign_add(shape)
         workers = hvd.size() if not self.args.cpu else 1
         samplesps = shape * workers / batch_time
@@ -42,7 +42,7 @@ class ThroughputCalculator:
             dllogger.log(data={"batch_samplesps": samplesps}, step=(1, step))
 
     def on_benchmark_end_log(self, eval_benchmark=False):
-        train_time = time.time() - self.t0
+        train_time = time.perf_counter() - self.t0
         hvd.join()
         if not self.args.cpu:
             all_samples = hvd.allreduce(self.samples, op=Sum)
@@ -67,4 +67,4 @@ class ThroughputCalculator:
                     exit(0)
 
             self.step += 1
-            self.start_batch_time = time.time()
+            self.start_batch_time = time.perf_counter()

+ 32 - 27
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/evaluator.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,23 +15,23 @@
 import dllogger
 import horovod.tensorflow as hvd
 import tensorflow as tf
+from data.outbrain.dataloader import pad_batch
 from data.outbrain.features import DISPLAY_ID_COLUMN
-from horovod.tensorflow.mpi_ops import Sum, Average
+from horovod.tensorflow.mpi_ops import Average, Sum
 
 
 class Evaluator:
     def __init__(
-        self,
-        model,
-        throughput_calculator,
-        eval_dataset,
-        compiled_loss,
-        steps,
-        args,
+            self,
+            model,
+            throughput_calculator,
+            eval_dataset,
+            compiled_loss,
+            args,
     ):
 
         self.model = model
-        self.steps = steps
+        self.steps_per_epoch = len(eval_dataset)
         self.args = args
         self.throughput_calculator = throughput_calculator
         self.compiled_loss = compiled_loss
@@ -57,11 +57,16 @@ class Evaluator:
         self.current_step_var.assign(1)
         self.streaming_map.assign(1)
 
+    def prepare_dataset(self, current_epoch):
+        benchmark_needed_steps = self.args.benchmark_steps // self.steps_per_epoch + 1
+        n = 1 if self.args.evaluate and not self.args.benchmark else self.args.num_epochs - current_epoch \
+            if not self.args.benchmark else max(benchmark_needed_steps, self.args.num_epochs)
+        self.eval_dataset = self.eval_dataset.epochs(n)
+
     @tf.function
-    def _calculate_map(self, x, y, predictions):
+    def _calculate_map(self, y, predictions, display_ids):
         predictions = tf.reshape(predictions, [-1])
         predictions = tf.cast(predictions, tf.float64)
-        display_ids = x[DISPLAY_ID_COLUMN]
         display_ids = tf.reshape(display_ids, [-1])
         labels = tf.reshape(y, [-1])
         sorted_ids = tf.argsort(display_ids)
@@ -96,8 +101,8 @@ class Evaluator:
         self.display_id_counter.assign_add(shape)
         self.streaming_map.assign_add(ap_sum)
 
-    @tf.function
-    def _execute_step_calculations(self, x, y):
+    @tf.function(experimental_relax_shapes=True)
+    def _execute_step_calculations(self, x, y, display_ids):
         predictions = self.model(x, training=False)
 
         with tf.device("/CPU:0"):
@@ -105,7 +110,7 @@ class Evaluator:
             for metric in self.metrics:
                 metric.update_state(y, predictions)
             self.eval_loss.update_state(loss)
-            self._calculate_map(x, y, predictions)
+            self._calculate_map(y, predictions, display_ids)
 
         return loss
 
@@ -114,9 +119,7 @@ class Evaluator:
         if not self.args.cpu:
             all_streaming_map = hvd.allreduce(self.streaming_map, op=Sum)
             all_display_id_counter = hvd.allreduce(self.display_id_counter, op=Sum)
-            eval_loss = hvd.allreduce(
-                self.eval_loss.result(), op=Average
-            )
+            eval_loss = hvd.allreduce(self.eval_loss.result(), op=Average)
         else:
             all_streaming_map = self.streaming_map
             all_display_id_counter = self.display_id_counter
@@ -128,11 +131,11 @@ class Evaluator:
         return map_metric, eval_loss
 
     @staticmethod
-    def log(eval_data, step, steps):
-        dllogger.log(data=eval_data, step=(step, steps))
+    def log(eval_data, step):
+        dllogger.log(data=eval_data, step=(step,))
 
-    def eval_step(self, x, y):
-        self._execute_step_calculations(x, y)
+    def eval_step(self, x, y, display_ids):
+        self._execute_step_calculations(x, y, display_ids)
 
         if self.args.benchmark:
             self.throughput_calculator(y.shape[0], eval_benchmark=True)
@@ -141,12 +144,14 @@ class Evaluator:
 
         eval_data = {}
         self._reset_states()
-        range_val = 1 if not self.args.benchmark else 100
 
         # Graph mode part
-        for _ in range(range_val):
-            for x, y in self.eval_dataset:
-                self.eval_step(x, y)
+        for i, (x, y) in enumerate(self.eval_dataset, 1):
+            x = pad_batch(x)
+            display_ids = x.pop(DISPLAY_ID_COLUMN)
+            self.eval_step(x, y, display_ids)
+            if i == self.steps_per_epoch and not self.args.benchmark:
+                break
 
         map_metric, eval_loss = self._reduce_results()
 
@@ -159,6 +164,6 @@ class Evaluator:
                     "streaming_map_val": f"{map_metric.numpy():.4f}",
                 }
 
-                self.log(eval_data, current_step, self.steps)
+                self.log(eval_data, current_step)
 
         return eval_data

+ 473 - 97
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/gpu_affinity.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,42 +13,49 @@
 # limitations under the License.
 
 import collections
-import math
+import functools
+import itertools
+import operator
 import os
 import pathlib
 import re
 
 import pynvml
 
-pynvml.nvmlInit()
 
-
-def systemGetDriverVersion():
-    return pynvml.nvmlSystemGetDriverVersion()
-
-
-def deviceGetCount():
-    return pynvml.nvmlDeviceGetCount()
-
-
-class device:
+class Device:
     # assume nvml returns list of 64 bit ints
-    _nvml_affinity_elements = math.ceil(os.cpu_count() / 64)
+    _nvml_bit_affinity = 64
+
+    _nvml_affinity_elements = (
+        os.cpu_count() + _nvml_bit_affinity - 1
+    ) // _nvml_bit_affinity
 
     def __init__(self, device_idx):
         super().__init__()
         self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
 
-    def getName(self):
+    def get_name(self):
         return pynvml.nvmlDeviceGetName(self.handle)
 
-    def getCpuAffinity(self):
+    def get_uuid(self):
+        return pynvml.nvmlDeviceGetUUID(self.handle)
+
+    def get_cpu_affinity(self, scope):
+        if scope == "socket":
+            nvml_scope = pynvml.NVML_AFFINITY_SCOPE_SOCKET
+        elif scope == "node":
+            nvml_scope = pynvml.NVML_AFFINITY_SCOPE_NODE
+        else:
+            raise RuntimeError("Unknown scope")
+
         affinity_string = ""
-        for j in pynvml.nvmlDeviceGetCpuAffinity(
-                self.handle, device._nvml_affinity_elements
+        for j in pynvml.nvmlDeviceGetCpuAffinityWithinScope(
+            self.handle, Device._nvml_affinity_elements, nvml_scope
         ):
             # assume nvml returns list of 64 bit ints
             affinity_string = "{:064b}".format(j) + affinity_string
+
         affinity_list = [int(x) for x in affinity_string]
         affinity_list.reverse()  # so core 0 is in 0th element of list
 
@@ -56,109 +63,478 @@ class device:
         return ret
 
 
-def set_socket_affinity(gpu_id):
-    dev = device(gpu_id)
-    affinity = dev.getCpuAffinity()
-    os.sched_setaffinity(0, affinity)
+def get_thread_siblings_list():
+    """
+    Returns a list of 2-element integer tuples representing pairs of
+    hyperthreading cores.
+    """
+    path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
+    thread_siblings_list = []
+    pattern = re.compile(r"(\d+)\D(\d+)")
+    for fname in pathlib.Path(path[0]).glob(path[1:]):
+        with open(fname) as f:
+            content = f.read().strip()
+            res = pattern.findall(content)
+            if res:
+                pair = tuple(sorted(map(int, res[0])))
+                thread_siblings_list.append(pair)
+    thread_siblings_list = list(set(thread_siblings_list))
+    return thread_siblings_list
+
+
+def build_thread_siblings_dict(siblings_list):
+    siblings_dict = {}
+    for siblings_tuple in siblings_list:
+        for core in siblings_tuple:
+            siblings_dict[core] = siblings_tuple
 
+    return siblings_dict
 
-def set_single_affinity(gpu_id):
-    dev = device(gpu_id)
-    affinity = dev.getCpuAffinity()
-    os.sched_setaffinity(0, affinity[:1])
 
+def group_list_by_key(the_list, key):
+    sorted_list = sorted(the_list, key=key)
+    grouped = [tuple(group) for key, group in itertools.groupby(sorted_list, key=key)]
+    return grouped
 
-def set_single_unique_affinity(gpu_id, nproc_per_node):
-    devices = [device(i) for i in range(nproc_per_node)]
-    socket_affinities = [dev.getCpuAffinity() for dev in devices]
 
+def group_by_siblings(affinities):
     siblings_list = get_thread_siblings_list()
-    siblings_dict = dict(siblings_list)
+    siblings_dict = build_thread_siblings_dict(siblings_list)
+    siblings_key = lambda x: siblings_dict.get(x, (x,))
+    affinities = [
+        tuple(group_list_by_key(affinity, key=siblings_key)) for affinity in affinities
+    ]
+    return affinities
 
-    # remove siblings
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(
-            set(socket_affinity) - set(siblings_dict.values())
-        )
 
-    affinities = []
-    assigned = []
+def group_by_node(socket_affinities, node_affinities):
+    socket_node_assigned_cores = collections.defaultdict(list)
+    for socket, node_cores in zip(socket_affinities, node_affinities):
+        socket_node_assigned_cores[socket].extend(node_cores)
 
-    for socket_affinity in socket_affinities:
-        for core in socket_affinity:
-            if core not in assigned:
-                affinities.append([core])
-                assigned.append(core)
-                break
-    os.sched_setaffinity(0, affinities[gpu_id])
+    socket_node_assigned_cores = {
+        key: tuple(sorted(set(value)))
+        for key, value in socket_node_assigned_cores.items()
+    }
 
+    node_grouping = collections.defaultdict(list)
 
-def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
-    device_ids = [device(i) for i in range(nproc_per_node)]
-    socket_affinities = [dev.getCpuAffinity() for dev in device_ids]
+    for socket_cores, assigned_cores in socket_node_assigned_cores.items():
+        unassigned_cores = sorted(list(set(socket_cores) - set(assigned_cores)))
 
-    siblings_list = get_thread_siblings_list()
-    siblings_dict = dict(siblings_list)
+        for assigned_core in assigned_cores:
+            node_grouping[assigned_core].append(assigned_core)
+
+        for assigned, unassigned in zip(
+            itertools.cycle(assigned_cores), unassigned_cores
+        ):
+            node_grouping[assigned].append(unassigned)
+
+    node_grouping = {key: tuple(value) for key, value in node_grouping.items()}
+
+    grouped_affinities = [
+        tuple(node_grouping[item] for item in node_affinity)
+        for node_affinity in node_affinities
+    ]
+    return grouped_affinities
+
+
+def ungroup_by_nodes(affinities, scope):
+    if scope == "socket":
+        affinities = [list(itertools.chain(*zip(*affinity))) for affinity in affinities]
+    elif scope == "node":
+        affinities = [[group[0] for group in affinity] for affinity in affinities]
+    return affinities
+
+
+def ungroup_by_siblings(affinities, cores):
+    if cores == "all_logical":
+        affinities = [list(itertools.chain(*affinity)) for affinity in affinities]
+    elif cores == "single_logical":
+        affinities = [[group[0] for group in affinity] for affinity in affinities]
+    else:
+        raise RuntimeError("Unknown cores mode")
+    return affinities
+
+
+def check_core_count(affinities, min_cores=1, max_cores=None):
+    for gpu_id, affinity in enumerate(affinities):
+        if len(affinity) < min_cores:
+            raise RuntimeError(
+                f"Number of available physical cores for GPU {gpu_id} is less "
+                f"the predefinied minimum, min_cores={min_cores}, available "
+                f"physical cores: {affinity} (count={len(affinity)})"
+            )
+
+    if max_cores is not None:
+        affinities = [affinity[:max_cores] for affinity in affinities]
 
-    # remove siblings
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(
-            set(socket_affinity) - set(siblings_dict.values())
+    return affinities
+
+
+def ungroup_all_and_check_count(affinities, scope, cores, min_cores=1, max_cores=None):
+    affinities = ungroup_by_nodes(affinities, scope)
+    affinities = check_core_count(affinities, min_cores, max_cores)
+    affinities = ungroup_by_siblings(affinities, cores)
+    return affinities
+
+
+def check_affinities(affinities):
+    # sets of cores should be either identical or disjoint
+    for i, j in itertools.product(affinities, affinities):
+        if not set(i) == set(j) and not set(i).isdisjoint(set(j)):
+            raise RuntimeError(
+                f"Sets of cores should be either identical or disjoint, "
+                f"but got {i} and {j}."
+            )
+
+
+def get_affinities(nproc_per_node, scope, exclude_unavailable_cores=True):
+    devices = [Device(i) for i in range(nproc_per_node)]
+    affinities = [dev.get_cpu_affinity(scope) for dev in devices]
+
+    if exclude_unavailable_cores:
+        available_cores = os.sched_getaffinity(0)
+        affinities = [
+            sorted(list(set(affinity) & available_cores)) for affinity in affinities
+        ]
+
+    check_affinities(affinities)
+
+    return affinities
+
+
+def get_grouped_affinities(nproc_per_node, exclude_unavailable_cores=True):
+    socket_affinities = get_affinities(
+        nproc_per_node, "socket", exclude_unavailable_cores
+    )
+    node_affinities = get_affinities(nproc_per_node, "node", exclude_unavailable_cores)
+
+    sibling_socket_affinities = group_by_siblings(socket_affinities)
+    sibling_node_affinities = group_by_siblings(node_affinities)
+
+    grouped_affinities = group_by_node(
+        sibling_socket_affinities, sibling_node_affinities
+    )
+
+    return grouped_affinities
+
+
+def get_all(nproc_per_node, scope, cores, min_cores, max_cores):
+    """
+    The process is assigned with all available physical CPU cores recommended by
+    pynvml for the GPU with a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
+
+    Args:
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+    """
+    affinities = get_affinities(nproc_per_node, scope)
+
+    affinities = group_by_siblings(affinities)
+
+    node_affinities = group_by_siblings(get_affinities(nproc_per_node, "node"))
+    all_node_affinities = functools.reduce(operator.add, node_affinities)
+
+    affinities = [
+        tuple(
+            sorted(
+                affinity,
+                key=lambda x: (
+                    0 if x in all_node_affinities else 1,
+                    x,
+                ),
+            )
         )
+        for affinity in affinities
+    ]
+
+    affinities = check_core_count(affinities, min_cores, max_cores)
+    affinities = ungroup_by_siblings(affinities, cores)
+    return affinities
+
+
+def get_single(nproc_per_node, scope, cores, min_cores=1, max_cores=1):
+    """
+    The process is assigned with the first available physical CPU core from the
+    list of all physical CPU cores recommended by pynvml for the GPU with a
+    given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
+
+    Args:
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+    """
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
+    ungrouped_affinities = ungroup_all_and_check_count(
+        grouped_affinities, scope, cores, min_cores, max_cores
+    )
+    return ungrouped_affinities
+
+
+def get_single_unique(nproc_per_node, scope, cores, min_cores=1, max_cores=1):
+    """
+    The process is assigned with a single unique available physical CPU core
+    from the list of all physical CPU cores recommended by pynvml for the GPU
+    with a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
+
+    Args:
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+    """
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
 
-    socket_affinities_to_device_ids = collections.defaultdict(list)
+    affinities = []
+    assigned_groups = set()
 
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx)
+    for grouped_affinity in grouped_affinities:
+        for group in grouped_affinity:
+            if group not in assigned_groups:
+                affinities.append([group])
+                assigned_groups.add(group)
+                break
 
-    for socket_affinity, device_ids in socket_affinities_to_device_ids.items():
+    ungrouped_affinities = ungroup_all_and_check_count(
+        affinities, scope, cores, min_cores, max_cores
+    )
+
+    return ungrouped_affinities
+
+
+def get_unique(
+    nproc_per_node,
+    scope,
+    cores,
+    mode,
+    min_cores,
+    max_cores,
+    balanced=True,
+):
+    """
+    The process is assigned with a unique subset of available physical CPU
+    cores from the list of all CPU cores recommended by pynvml for the GPU with
+    a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
+
+    Args:
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+        mode: 'unique_contiguous' or 'unique_interleaved'
+        balanced: assign an equal number of physical cores to each process,
+    """
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
+
+    grouped_affinities_to_device_ids = collections.defaultdict(list)
+
+    for idx, grouped_affinity in enumerate(grouped_affinities):
+        grouped_affinities_to_device_ids[tuple(grouped_affinity)].append(idx)
+
+    # compute minimal number of physical cores per GPU across all GPUs and
+    # sockets, code assigns this number of cores per GPU if balanced == True
+    min_physical_cores_per_gpu = min(
+        [
+            len(cores) // len(gpus)
+            for cores, gpus in grouped_affinities_to_device_ids.items()
+        ]
+    )
+
+    grouped_unique_affinities = [None] * nproc_per_node
+
+    for (
+        grouped_affinity,
+        device_ids,
+    ) in grouped_affinities_to_device_ids.items():
         devices_per_group = len(device_ids)
-        cores_per_device = len(socket_affinity) // devices_per_group
-        for group_id, device_id in enumerate(device_ids):
-            if device_id == gpu_id:
-                if mode == "interleaved":
-                    affinity = list(socket_affinity[group_id::devices_per_group])
-                elif mode == "continuous":
-                    affinity = list(
-                        socket_affinity[group_id * cores_per_device:(group_id + 1) * cores_per_device]
-                    )
-                else:
-                    raise RuntimeError("Unknown set_socket_unique_affinity mode")
-
-                # reintroduce siblings
-                affinity += [
-                    siblings_dict[aff] for aff in affinity if aff in siblings_dict
-                ]
-                os.sched_setaffinity(0, affinity)
+        if balanced:
+            cores_per_device = min_physical_cores_per_gpu
+            grouped_affinity = grouped_affinity[
+                : devices_per_group * min_physical_cores_per_gpu
+            ]
+        else:
+            cores_per_device = len(grouped_affinity) // devices_per_group
+
+        for subgroup_id, device_id in enumerate(device_ids):
+            # In theory there should be no difference in performance between
+            # 'interleaved' and 'contiguous' pattern on Intel-based DGX-1,
+            # but 'contiguous' should be better for DGX A100 because on AMD
+            # Rome 4 consecutive cores are sharing L3 cache.
+            # TODO: code doesn't attempt to automatically detect layout of
+            # L3 cache, also external environment may already exclude some
+            # cores, this code makes no attempt to detect it and to align
+            # mapping to multiples of 4.
+
+            if mode == "unique_interleaved":
+                unique_grouped_affinity = list(
+                    grouped_affinity[subgroup_id::devices_per_group]
+                )
+            elif mode == "unique_contiguous":
+                unique_grouped_affinity = list(
+                    grouped_affinity[
+                        subgroup_id
+                        * cores_per_device: (subgroup_id + 1)
+                        * cores_per_device
+                    ]
+                )
+            else:
+                raise RuntimeError("Unknown set_unique mode")
+
+            grouped_unique_affinities[device_id] = unique_grouped_affinity
+
+    ungrouped_affinities = ungroup_all_and_check_count(
+        grouped_unique_affinities, scope, cores, min_cores, max_cores
+    )
+    return ungrouped_affinities
+
+
+def set_affinity(
+    gpu_id,
+    nproc_per_node,
+    *,
+    mode="unique_contiguous",
+    scope="node",
+    cores="all_logical",
+    balanced=True,
+    min_cores=1,
+    max_cores=None,
+):
+    """
+    The process is assigned with a proper CPU affinity that matches CPU-GPU
+    hardware architecture on a given platform. Usually, setting proper affinity
+    improves and stabilizes the performance of deep learning training workloads.
+
+    This function assumes that the workload runs in multi-process single-device
+    mode (there are multiple training processes, and each process is running on
+    a single GPU). This is typical for multi-GPU data-parallel training
+    workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).
+
+    Available affinity modes:
+    * 'all' - the process is assigned with all available physical CPU cores
+    recommended by pynvml for the GPU with a given id.
+    * 'single' - the process is assigned with the first available
+    physical CPU core from the list of all physical CPU cores recommended by
+    pynvml for the GPU with a given id (multiple GPUs could be assigned with
+    the same CPU core).
+    * 'single_unique' - the process is assigned with a single unique
+    available physical CPU core from the list of all CPU cores recommended by
+    pynvml for the GPU with a given id.
+    * 'unique_interleaved' - the process is assigned with a unique subset of
+    available physical CPU cores from the list of all physical CPU cores
+    recommended by pynvml for the GPU with a given id, cores are assigned with
+    interleaved indexing pattern
+    * 'unique_contiguous' - (the default mode) the process is assigned with a
+    unique subset of available physical CPU cores from the list of all physical
+    CPU cores recommended by pynvml for the GPU with a given id, cores are
+    assigned with contiguous indexing pattern
+
+    Available "scope" modes:
+    * 'node' - sets the scope for pynvml affinity queries to NUMA node
+    * 'socket' - sets the scope for pynvml affinity queries to processor socket
+
+    Available "cores" modes:
+    * 'all_logical' - assigns the process with all logical cores associated with
+    a given corresponding physical core (i.e., automatically includes all
+    available hyperthreading siblings)
+    * 'single_logical' - assigns the process with only one logical core
+    associated with a given corresponding physical core (i.e., excludes
+    hyperthreading siblings)
+
+    'unique_contiguous' is the recommended mode for deep learning
+    training workloads on NVIDIA DGX machines.
+
+    Args:
+        gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
+        nproc_per_node: number of processes per node
+        mode: affinity mode
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+        balanced: assign an equal number of physical cores to each process,
+            affects only 'unique_interleaved' and
+            'unique_contiguous' affinity modes
+        min_cores: (default=1) the intended minimum number of physical cores per
+            process, code raises RuntimeError if the number of available cores
+            is less than 'min_cores'
+        max_cores: (default=None) the intended maxmimum number of physical cores
+            per process, the list of assigned cores is trimmed to the first
+            'max_cores' cores if max_cores is not None
+
+    Returns a set of logical CPU cores on which the process is eligible to run.
+
+    WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
+    set_affinity with scope='node' restricts execution only to the CPU cores
+    directly connected to GPUs. On DGX A100, it will limit the code to half of
+    the CPU cores and half of CPU memory bandwidth (which may be fine for many
+    DL models). Use scope='socket' to use all available DGX A100 CPU cores.
+
+    WARNING: Intel's OpenMP implementation resets affinity on the first call to
+    an OpenMP function after a fork. It's recommended to run with env variable:
+    `KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
+    preserved after a fork (e.g. in PyTorch DataLoader workers).
+
+    Example:
+
+    import argparse
+    import os
+
+    import gpu_affinity
+    import torch
+
+
+    def main():
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            '--local_rank',
+            type=int,
+            default=os.getenv('LOCAL_RANK', 0),
+        )
+        args = parser.parse_args()
 
+        nproc_per_node = torch.cuda.device_count()
+
+        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node)
+        print(f'{args.local_rank}: core affinity: {affinity}')
 
-def get_thread_siblings_list():
-    path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
-    thread_siblings_list = []
-    pattern = re.compile(r"(\d+)\D(\d+)")
-    for fname in pathlib.Path(path[0]).glob(path[1:]):
-        with open(fname) as f:
-            content = f.read().strip()
-            res = pattern.findall(content)
-            if res:
-                pair = tuple(map(int, res[0]))
-                thread_siblings_list.append(pair)
-    return thread_siblings_list
 
+    if __name__ == "__main__":
+        main()
 
-def set_affinity(gpu_id, nproc_per_node, mode="socket"):
-    if mode == "socket":
-        set_socket_affinity(gpu_id)
+    Launch the example with:
+    python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py
+    """
+    pynvml.nvmlInit()
+
+    if mode == "all":
+        affinity = get_all(nproc_per_node, scope, cores, min_cores, max_cores)
     elif mode == "single":
-        set_single_affinity(gpu_id)
+        affinity = get_single(nproc_per_node, scope, cores)
     elif mode == "single_unique":
-        set_single_unique_affinity(gpu_id, nproc_per_node)
-    elif mode == "socket_unique_interleaved":
-        set_socket_unique_affinity(gpu_id, nproc_per_node, "interleaved")
-    elif mode == "socket_unique_continuous":
-        set_socket_unique_affinity(gpu_id, nproc_per_node, "continuous")
+        affinity = get_single_unique(nproc_per_node, scope, cores)
+    elif mode == "unique_interleaved" or mode == "unique_contiguous":
+        affinity = get_unique(
+            nproc_per_node,
+            scope,
+            cores,
+            mode,
+            min_cores,
+            max_cores,
+            balanced,
+        )
     else:
         raise RuntimeError("Unknown affinity mode")
 
-    affinity = os.sched_getaffinity(0)
-    return affinity
+    os.sched_setaffinity(0, affinity[gpu_id])
+    set_affinity = os.sched_getaffinity(0)
+    return set_affinity

+ 5 - 7
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/schedulers.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,14 +17,12 @@ import tensorflow as tf
 
 class LearningRateScheduler:
     def __init__(self, args, steps_per_epoch, optimizer):
-        assert (
-            args.deep_warmup_epochs <= args.num_epochs
-        ), "Number of warmup epochs cannot be higher than training epochs"
+        assert args.deep_warmup_epochs <= args.num_epochs, \
+            "Number of warmup epochs cannot be higher than training epochs"
         self.base_lr = args.deep_learning_rate
         self.warmup_steps = args.deep_warmup_epochs * steps_per_epoch
-        bound_epoch = (
-            args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
-        )
+        bound_epoch = args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
+
         self.boundaries = [bound_epoch * steps_per_epoch]
         self.values = [self.base_lr / 4, self.base_lr / 8]
         self.optimizer = optimizer

+ 19 - 4
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/setup.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,12 +15,13 @@
 import glob
 import json
 import logging
+import multiprocessing
 import os
 
 import dllogger
 import horovod.tensorflow.keras as hvd
 import tensorflow as tf
-from data.outbrain.dataloader import train_input_fn, eval_input_fn
+from data.outbrain.dataloader import eval_input_fn, train_input_fn
 from trainer.utils.gpu_affinity import set_affinity
 
 
@@ -45,12 +46,26 @@ def init_gpu(args, logger):
         )
         logger.warning(f"{gpu_id}: thread affinity: {affinity}")
 
+    tf.config.threading.set_intra_op_parallelism_threads(1)
+    tf.config.threading.set_inter_op_parallelism_threads(
+        max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
+    )
+
     if args.amp:
         tf.keras.mixed_precision.set_global_policy("mixed_float16")
 
     if args.xla:
         tf.config.optimizer.set_jit(True)
 
+    # Max out L2 cache
+    import ctypes
+
+    _libcudart = ctypes.CDLL("libcudart.so")
+    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
+    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
+    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
+    assert pValue.contents.value == 128
+
 
 def init_logger(args, full, logger):
     if full:
@@ -80,10 +95,10 @@ def init_logger(args, full, logger):
 
 def create_config(args):
     assert not (
-        args.cpu and args.amp
+            args.cpu and args.amp
     ), "Automatic mixed precision conversion works only with GPU"
     assert (
-        not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps
+            not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps
     ), "Number of benchmark steps must be higher than warmup steps"
 
     logger = logging.getLogger("tensorflow")

+ 57 - 37
TensorFlow2/Recommendation/WideAndDeep/trainer/utils/trainer.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,21 +18,23 @@ import os
 import dllogger
 import horovod.tensorflow as hvd
 import tensorflow as tf
+from data.outbrain.dataloader import pad_batch
+from trainer.model.widedeep import get_dummy_inputs
 
 
 class Trainer:
     def __init__(
-        self,
-        model,
-        scheduler,
-        deep_optimizer,
-        wide_optimizer,
-        throughput_calculator,
-        compiled_loss,
-        steps,
-        args,
-        train_dataset,
-        evaluator,
+            self,
+            model,
+            scheduler,
+            deep_optimizer,
+            wide_optimizer,
+            throughput_calculator,
+            compiled_loss,
+            steps,
+            args,
+            train_dataset,
+            evaluator,
     ):
         self.model = model
         self.scheduler = scheduler
@@ -40,6 +42,7 @@ class Trainer:
         self.wide_optimizer = wide_optimizer
         self.throughput_calculator = throughput_calculator
         self.steps = steps
+        self.steps_per_epoch = steps // args.num_epochs
         self.args = args
         self.train_dataset = train_dataset
         self.evaluator = evaluator
@@ -53,6 +56,7 @@ class Trainer:
             )
 
         self._init_checkpoint_manager()
+        self.max_steps = steps
 
     def _init_checkpoint_manager(self):
         self.checkpoint = tf.train.Checkpoint(
@@ -67,6 +71,24 @@ class Trainer:
             max_to_keep=1,
         )
 
+    @property
+    def current_epoch(self):
+        return int(self.current_step_var.numpy()) // self.steps
+
+    @property
+    def max_steps(self):
+        return self.__max_steps
+
+    @max_steps.setter
+    def max_steps(self, steps):
+        self.__max_steps = min(self.steps, steps)
+
+    def prepare_dataset(self, current_epoch):
+        benchmark_needed_steps = self.args.benchmark_steps // self.steps_per_epoch + 1
+        n = self.args.num_epochs - current_epoch if not self.args.benchmark \
+            else max(benchmark_needed_steps, self.args.num_epochs)
+        self.train_dataset = self.train_dataset.epochs(n)
+
     def maybe_restore_checkpoint(self):
         if self.args.use_checkpoint:
             self.checkpoint.restore(self.manager.latest_checkpoint).expect_partial()
@@ -94,7 +116,12 @@ class Trainer:
             )
 
         if not self.args.cpu:
-            tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)
+            tape = hvd.DistributedGradientTape(
+                tape,
+                sparse_as_dense=True,
+                num_groups=1,
+                compression=hvd.Compression.fp16,
+            )
 
         linear_vars = self.model.linear_model.trainable_variables
         dnn_vars = self.model.dnn_model.trainable_variables
@@ -115,7 +142,7 @@ class Trainer:
 
         return loss
 
-    @tf.function
+    @tf.function(experimental_relax_shapes=True)
     def _execute_step_calculations(self, x, y):
         loss = self(x, y)
         with tf.device("/CPU:0"):
@@ -126,7 +153,7 @@ class Trainer:
 
     def log(self, current_step, loss):
         train_data = {"loss": f"{loss:.4f}"}
-        dllogger.log(data=train_data, step=(current_step, self.steps))
+        dllogger.log(data=train_data, step=(current_step, self.max_steps))
 
     def train_step(self, x, y):
 
@@ -140,31 +167,24 @@ class Trainer:
         elif (self.args.cpu or hvd.rank() == 0) and current_step % 100 == 0:
             self.log(current_step, loss.numpy())
 
-    def join_and_broadcast(self):
-        hvd.join()
-        if not self.args.benchmark:
-            hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
-            hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
-            hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
-            hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
-
     def run_loop(self):
         eval_data = {}
-        current_epoch = int(self.current_step_var.numpy()) // len(self.train_dataset) + 1
-
-        for _ in range(current_epoch, self.args.num_epochs + 1):
-            range_val = 1 if not self.args.benchmark else 100
+        current_step = int(self.current_step_var.numpy()) + 1
 
-            # Graph mode part
-            for _ in range(range_val):
-                for x, y in self.train_dataset:
-                    self.train_step(x, y)
-                self.join_and_broadcast()
-
-            eval_data = self.evaluator.eval(self.current_step_var)
-
-            if self.args.cpu or hvd.rank() == 0:
-                self.manager.save()
+        # Graph mode part
+        for i, (x, y) in enumerate(self.train_dataset, current_step):
+            x = pad_batch(x)
+            self.train_step(x, y)
+            if not self.args.benchmark and (
+                    i % self.steps_per_epoch == 0 or i == self.max_steps
+            ):
+                eval_data = self.evaluator.eval(self.current_step_var)
+
+                if self.args.cpu or hvd.rank() == 0:
+                    self.manager.save()
+
+                if i == self.max_steps:
+                    break
 
         if self.args.cpu or hvd.rank() == 0:
             dllogger.log(data=eval_data, step=tuple())

+ 46 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/Dockerfile

@@ -0,0 +1,46 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/merlin/merlin-tensorflow-training:22.03
+FROM ${FROM_IMAGE_NAME}
+
+ENV HOROVOD_CYCLE_TIME=0.1
+ENV HOROVOD_FUSION_THRESHOLD=67108864
+ENV HOROVOD_NUM_STREAMS=2
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DCGM_VERSION=2.2.9
+
+# Install perf_client required library
+RUN apt-get update && \
+    apt-get install -y libb64-dev libb64-0d curl && \
+    curl -s -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
+    dpkg -i datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
+    rm datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set workdir and python path
+WORKDIR /workspace/wd2
+ENV PYTHONPATH /workspace/wd2
+
+# Install requirements
+ADD requirements.txt requirements.txt
+ADD triton/requirements.txt triton/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir -r triton/requirements.txt
+
+# Add model files to workspace
+COPY . .

+ 1321 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/README.md

@@ -0,0 +1,1321 @@
+# Deploying the Wide & Deep model on Triton Inference Server
+
+This folder contains instructions for deployment to run inference
+on Triton Inference Server as well as a detailed performance analysis.
+The purpose of this document is to help you with achieving
+the best inference performance.
+
+## Table of contents
+  - [Solution overview](#solution-overview)
+    - [Introduction](#introduction)
+    - [Deployment process](#deployment-process)
+  - [Setup](#setup)
+  - [Quick Start Guide](#quick-start-guide)
+  - [Performance](#performance)
+    - [Offline scenario](#offline-scenario)
+        - [Offline: NVIDIA A30, TensorFlow with FP32](#offline-nvidia-a30-tensorflow-with-fp32)
+        - [Offline: NVIDIA A30, NVIDIA TensorRT with FP16](#offline-nvidia-a30-nvidia-tensorrt-with-fp16)
+        - [Offline: NVIDIA DGX-1 (1x V100 32GB), TensorFlow with FP32](#offline-nvidia-dgx-1-1x-v100-32gb-tensorflow-with-fp32)
+        - [Offline: NVIDIA DGX-1 (1x V100 32GB), NVIDIA TensorRT with FP16](#offline-nvidia-dgx-1-1x-v100-32gb-nvidia-tensorrt-with-fp16)
+        - [Offline: NVIDIA DGX A100 (1x A100 80GB), TensorFlow with FP32](#offline-nvidia-dgx-a100-1x-a100-80gb-tensorflow-with-fp32)
+        - [Offline: NVIDIA DGX A100 (1x A100 80GB), NVIDIA TensorRT with FP16](#offline-nvidia-dgx-a100-1x-a100-80gb-nvidia-tensorrt-with-fp16)
+        - [Offline: NVIDIA T4, TensorFlow with FP32](#offline-nvidia-t4-tensorflow-with-fp32)
+        - [Offline: NVIDIA T4, NVIDIA TensorRT with FP16](#offline-nvidia-t4-nvidia-tensorrt-with-fp16)
+    - [Online scenario](#online-scenario)
+        - [Online: NVIDIA A30, TensorFlow with FP32](#online-nvidia-a30-tensorflow-with-fp32)
+        - [Online: NVIDIA A30, NVIDIA TensorRT with FP16](#online-nvidia-a30-nvidia-tensorrt-with-fp16)
+        - [Online: NVIDIA DGX-1 (1x V100 32GB), TensorFlow with FP32](#online-nvidia-dgx-1-1x-v100-32gb-tensorflow-with-fp32)
+        - [Online: NVIDIA DGX-1 (1x V100 32GB), NVIDIA TensorRT with FP16](#online-nvidia-dgx-1-1x-v100-32gb-nvidia-tensorrt-with-fp16)
+        - [Online: NVIDIA DGX A100 (1x A100 80GB), TensorFlow with FP32](#online-nvidia-dgx-a100-1x-a100-80gb-tensorflow-with-fp32)
+        - [Online: NVIDIA DGX A100 (1x A100 80GB), NVIDIA TensorRT with FP16](#online-nvidia-dgx-a100-1x-a100-80gb-nvidia-tensorrt-with-fp16)
+        - [Online: NVIDIA T4, TensorFlow with FP32](#online-nvidia-t4-tensorflow-with-fp32)
+        - [Online: NVIDIA T4, NVIDIA TensorRT with FP16](#online-nvidia-t4-nvidia-tensorrt-with-fp16)
+  - [Advanced](#advanced)
+    - [Step by step deployment process](#step-by-step-deployment-process)
+    - [Latency explanation](#latency-explanation)
+  - [Release notes](#release-notes)
+    - [Changelog](#changelog)
+    - [Known issues](#known-issues)
+
+
+## Solution overview
+### Introduction
+The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server)
+provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs.
+The server provides an inference service via an HTTP or gRPC endpoint,
+allowing remote clients to request inferencing for any number of GPU
+or CPU models being managed by the server.
+
+This README provides step-by-step deployment instructions for models generated
+during training (as described in the [model README](../README.md)).
+Additionally, this README provides the corresponding deployment scripts that
+ensure optimal GPU utilization during inferencing on Triton Inference Server.
+
+### Deployment process
+
+The deployment process consists of two steps:
+
+1. Conversion.
+
+   The purpose of conversion is to find the best performing model
+   format supported by Triton Inference Server.
+   Triton Inference Server uses a number of runtime backends such as
+   [TensorRT](https://developer.nvidia.com/tensorrt),
+   [LibTorch](https://github.com/triton-inference-server/pytorch_backend) and 
+   [ONNX Runtime](https://github.com/triton-inference-server/onnxruntime_backend)
+   to support various model types. Refer to the
+   [Triton documentation](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
+   for a list of available backends.
+
+2. Configuration.
+
+   Model configuration on Triton Inference Server, which generates
+   necessary [configuration files](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md).
+
+After deployment, Triton Inference Server is used for evaluation of the converted model in two steps:
+
+1. Correctness tests.
+
+   Produce results which are tested against given correctness thresholds.
+
+2. Performance tests.
+
+   Produce latency and throughput results for offline (static batching)
+   and online (dynamic batching) scenarios.
+
+
+All steps are executed by the provided runner script. Refer to the [Quick Start Guide](#quick-start-guide)
+
+
+## Setup
+Ensure you have the following components:
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [NVIDIA TensorFlow NGC container 22.02](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorflow)
+* [NVIDIA Triton Inference Server NGC container 22.02](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
+* [NVIDIA CUDA](https://docs.nvidia.com/cuda/archive//index.html)
+* [NVIDIA Ampere](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/), [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+
+
+
+## Quick Start Guide
+Running the following scripts will build and launch the container with all required dependencies for native TensorFlow2 as well as Triton Inference Server. This is necessary for running inference and can also be used for data download, processing, and training of the model.
+
+1. Clone the repository.
+
+```
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
+```
+
+2. Prepare the dataset.
+
+This assumes that the outbrain dataset has already been generated inside `${HOST_OUTBRAIN_PATH}/data` (using `scripts/preproc.sh`; see the [model README](../README.md#quick-start-guide)).
+```
+mkdir -p ./datasets/outbrain
+cp -R ${HOST_OUTBRAIN_PATH}/data/valid ./datasets/outbrain
+```
+
+3. Build and run a container that extends NGC TensorFlow2 with the Triton client libraries and necessary dependencies.
+
+```
+./triton/scripts/docker/build.sh
+./triton/scripts/docker/interactive.sh
+```
+
+4. Execute the runner script (note that the run scripts are prepared per NVIDIA GPU).
+
+```
+NVIDIA A30: ./triton/runner/start_NVIDIA-A30.sh
+
+NVIDIA DGX-1 (1x V100 32GB): ./triton/runner/start_NVIDIA-DGX-1-\(1x-V100-32GB\).sh
+
+NVIDIA DGX A100 (1x A100 80GB): ./triton/runner/start_NVIDIA-DGX-A100-\(1x-A100-80GB\).sh
+
+NVIDIA T4: ./triton/runner/start_NVIDIA-T4.sh
+```
+## Performance
+The performance measurements in this document were conducted at the time of publication and may not reflect
+the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to
+[NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
+### Offline scenario
+
+The offline scenario assumes the client and server are located on the same host. The tests use:
+- tensors passed through shared memory between the client and server; the Perf Analyzer flag `shared-memory=system` is used
+- a single request sent from the client to the server with a static batch size
+
+
+#### Offline: NVIDIA A30, TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA A30            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              517.00 |               0.02 |                            0.24 |                0.02 |                        0.05 |                        1.59 |                         0.00 |               0.00 |               1.94 |               2.06 |               2.10 |               2.17 |               1.93 |
+|   16384 |             1 |          2654210.00 |               0.03 |                            0.29 |                0.04 |                        0.35 |                        5.44 |                         0.01 |               0.00 |               6.16 |               6.42 |               6.45 |               6.56 |               6.17 |
+|   32768 |             1 |          2916350.00 |               0.04 |                            0.39 |                0.05 |                        0.95 |                        9.73 |                         0.01 |               0.00 |              11.00 |              11.63 |              12.11 |              14.03 |              11.18 |
+|   49152 |             1 |          2973700.00 |               0.03 |                            0.40 |                0.07 |                        1.86 |                       14.02 |                         0.02 |               0.00 |              16.05 |              18.00 |              19.22 |              19.92 |              16.40 |
+|   65536 |             1 |          3058350.00 |               0.05 |                            0.54 |                0.07 |                        2.43 |                       18.16 |                         0.03 |               0.00 |              21.15 |              22.10 |              22.49 |              26.05 |              21.28 |
+|   81920 |             1 |          3139220.00 |               0.06 |                            0.54 |                0.07 |                        2.85 |                       22.37 |                         0.05 |               0.00 |              25.67 |              27.64 |              28.84 |              31.78 |              25.94 |
+|   98304 |             1 |          3244030.00 |               0.05 |                            0.48 |                0.07 |                        3.29 |                       26.28 |                         0.06 |               0.00 |              29.93 |              32.33 |              33.39 |              37.83 |              30.22 |
+|  114688 |             1 |          3297280.00 |               0.04 |                            0.38 |                0.07 |                        3.73 |                       30.39 |                         0.06 |               0.00 |              34.49 |              35.92 |              38.31 |              40.42 |              34.68 |
+|  131072 |             1 |          3308740.00 |               0.04 |                            0.42 |                0.08 |                        4.27 |                       34.47 |                         0.08 |               0.00 |              39.15 |              41.44 |              42.82 |              45.15 |              39.35 |
+
+</details>
+
+
+
+#### Offline: NVIDIA A30, NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA A30            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |             1455.00 |               0.02 |                            0.19 |                0.02 |                        0.22 |                        0.23 |                         0.01 |               0.00 |               0.69 |               0.70 |               0.71 |               0.73 |               0.68 |
+|   16384 |             1 |          4849660.00 |               0.05 |                            0.33 |                0.02 |                        0.51 |                        2.43 |                         0.03 |               0.00 |               3.41 |               3.53 |               3.58 |               3.61 |               3.37 |
+|   32768 |             1 |          6193150.00 |               0.03 |                            0.27 |                0.02 |                        0.68 |                        4.25 |                         0.02 |               0.00 |               5.30 |               5.42 |               5.44 |               5.46 |               5.27 |
+|   49152 |             1 |          5210110.00 |               0.03 |                            0.44 |                0.03 |                        0.82 |                        8.07 |                         0.02 |               0.00 |               9.47 |               9.69 |               9.73 |               9.77 |               9.43 |
+|   65536 |             1 |          6750210.00 |               0.06 |                            0.52 |                0.06 |                        0.96 |                        8.05 |                         0.03 |               0.00 |               9.70 |               9.91 |               9.95 |              10.00 |               9.68 |
+|   81920 |             1 |          4505600.00 |               0.06 |                            0.51 |                0.06 |                        1.03 |                       16.38 |                         0.04 |               0.00 |              18.07 |              18.39 |              18.51 |              18.82 |              18.07 |
+|   98304 |             1 |          5357570.00 |               0.06 |                            0.52 |                0.06 |                        1.20 |                       16.35 |                         0.04 |               0.00 |              18.24 |              18.51 |              18.59 |              18.74 |              18.23 |
+|  114688 |             1 |          6193150.00 |               0.06 |                            0.54 |                0.07 |                        1.47 |                       16.32 |                         0.05 |               0.00 |              18.52 |              18.81 |              18.86 |              19.08 |              18.51 |
+|  131072 |             1 |          7077890.00 |               0.06 |                            0.54 |                0.07 |                        1.65 |                       15.98 |                         0.06 |               0.00 |              18.36 |              18.66 |              18.72 |              18.94 |              18.36 |
+
+</details>
+
+
+
+#### Offline: NVIDIA DGX-1 (1x V100 32GB), TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX-1 (1x V100 32GB)            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              294.70 |               0.05 |                            0.42 |                0.08 |                        0.06 |                        2.76 |                         0.00 |               0.00 |               3.34 |               3.66 |               3.76 |               4.07 |               3.38 |
+|   16384 |             1 |          2146300.00 |               0.07 |                            0.45 |                0.11 |                        0.34 |                        6.63 |                         0.01 |               0.00 |               7.57 |               7.84 |               7.93 |               8.21 |               7.60 |
+|   32768 |             1 |          2669260.00 |               0.06 |                            0.48 |                0.11 |                        0.73 |                       10.85 |                         0.02 |               0.00 |              12.19 |              12.76 |              12.99 |              13.33 |              12.25 |
+|   49152 |             1 |          2947650.00 |               0.06 |                            0.46 |                0.11 |                        1.09 |                       14.87 |                         0.02 |               0.00 |              16.57 |              17.34 |              17.51 |              17.94 |              16.60 |
+|   65536 |             1 |          3145730.00 |               0.05 |                            0.43 |                0.07 |                        1.45 |                       18.66 |                         0.03 |               0.00 |              20.60 |              21.49 |              21.70 |              22.36 |              20.70 |
+|   81920 |             1 |          3222190.00 |               0.06 |                            0.49 |                0.11 |                        1.91 |                       22.64 |                         0.03 |               0.00 |              25.24 |              26.01 |              26.17 |              27.37 |              25.25 |
+|   98304 |             1 |          3309570.00 |               0.06 |                            0.46 |                0.11 |                        2.18 |                       26.57 |                         0.05 |               0.00 |              29.38 |              30.30 |              30.45 |              31.26 |              29.43 |
+|  114688 |             1 |          3354620.00 |               0.05 |                            0.44 |                0.11 |                        2.89 |                       30.49 |                         0.06 |               0.00 |              33.92 |              34.80 |              35.03 |              36.68 |              34.05 |
+|  131072 |             1 |          3309570.00 |               0.07 |                            0.52 |                0.12 |                        3.68 |                       34.82 |                         0.07 |               0.00 |              39.21 |              40.06 |              40.17 |              40.56 |              39.28 |
+
+</details>
+
+
+
+#### Offline: NVIDIA DGX-1 (1x V100 32GB), NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX-1 (1x V100 32GB)            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              749.25 |               0.07 |                            0.41 |                0.06 |                        0.35 |                        0.41 |                         0.03 |               0.00 |               1.32 |               1.41 |               1.44 |               1.53 |               1.33 |
+|   16384 |             1 |          3768320.00 |               0.05 |                            0.47 |                0.11 |                        0.66 |                        2.99 |                         0.05 |               0.00 |               4.33 |               4.42 |               4.46 |               4.65 |               4.34 |
+|   32768 |             1 |          4849660.00 |               0.05 |                            0.45 |                0.11 |                        0.85 |                        5.21 |                         0.06 |               0.00 |               6.72 |               6.82 |               6.84 |               6.90 |               6.72 |
+|   49152 |             1 |          4030460.00 |               0.06 |                            0.49 |                0.13 |                        1.41 |                        9.97 |                         0.10 |               0.00 |              12.14 |              12.28 |              12.32 |              12.52 |              12.16 |
+|   65536 |             1 |          5373950.00 |               0.06 |                            0.48 |                0.12 |                        1.55 |                        9.91 |                         0.06 |               0.00 |              12.17 |              12.32 |              12.36 |              12.93 |              12.19 |
+|   81920 |             1 |          3604480.00 |               0.07 |                            0.53 |                0.13 |                        2.39 |                       19.50 |                         0.09 |               0.00 |              22.64 |              22.85 |              22.92 |              24.87 |              22.70 |
+|   98304 |             1 |          4323940.00 |               0.08 |                            0.52 |                0.13 |                        2.30 |                       19.52 |                         0.08 |               0.00 |              22.46 |              23.03 |              23.41 |              26.04 |              22.63 |
+|  114688 |             1 |          5046270.00 |               0.06 |                            0.44 |                0.11 |                        2.66 |                       19.35 |                         0.10 |               0.00 |              22.67 |              22.87 |              23.08 |              23.96 |              22.72 |
+|  131072 |             1 |          5417640.00 |               0.07 |                            0.55 |                0.13 |                        4.23 |                       19.06 |                         0.12 |               0.00 |              24.35 |              24.47 |              24.63 |              25.48 |              24.17 |
+
+</details>
+
+
+
+#### Offline: NVIDIA DGX A100 (1x A100 80GB), TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX A100 (1x A100 80GB)            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              445.00 |               0.02 |                            0.23 |                0.02 |                        0.06 |                        1.91 |                         0.00 |               0.00 |               2.24 |               2.39 |               2.42 |               2.56 |               2.24 |
+|   16384 |             1 |          3440640.00 |               0.03 |                            0.27 |                0.02 |                        0.45 |                        3.98 |                         0.01 |               0.00 |               4.74 |               5.03 |               5.06 |               5.19 |               4.75 |
+|   32768 |             1 |          4554750.00 |               0.03 |                            0.28 |                0.02 |                        0.81 |                        6.04 |                         0.01 |               0.00 |               7.18 |               7.50 |               7.55 |               7.65 |               7.18 |
+|   49152 |             1 |          5013500.00 |               0.03 |                            0.26 |                0.02 |                        1.25 |                        8.20 |                         0.02 |               0.00 |               9.82 |              10.06 |              10.24 |              10.36 |               9.78 |
+|   65536 |             1 |          5174760.00 |               0.03 |                            0.27 |                0.02 |                        1.82 |                       10.46 |                         0.03 |               0.00 |              12.66 |              12.98 |              13.14 |              13.23 |              12.63 |
+|   81920 |             1 |          5160960.00 |               0.03 |                            0.33 |                0.03 |                        2.67 |                       12.72 |                         0.06 |               0.00 |              15.84 |              16.23 |              16.35 |              16.76 |              15.84 |
+|   98304 |             1 |          5455870.00 |               0.03 |                            0.31 |                0.04 |                        2.63 |                       14.86 |                         0.05 |               0.00 |              17.88 |              18.43 |              18.67 |              19.16 |              17.91 |
+|  114688 |             1 |          5657940.00 |               0.05 |                            0.36 |                0.04 |                        2.95 |                       16.76 |                         0.07 |               0.00 |              20.29 |              20.66 |              20.78 |              21.07 |              20.23 |
+|  131072 |             1 |          5546870.00 |               0.07 |                            0.44 |                0.04 |                        3.34 |                       19.59 |                         0.09 |               0.00 |              22.89 |              24.23 |              29.68 |              34.16 |              23.56 |
+
+</details>
+
+
+
+#### Offline: NVIDIA DGX A100 (1x A100 80GB), NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX A100 (1x A100 80GB)            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |             1108.00 |               0.02 |                            0.26 |                0.02 |                        0.34 |                        0.25 |                         0.02 |               0.00 |               0.89 |               0.91 |               0.97 |               1.35 |               0.90 |
+|   16384 |             1 |          7192580.00 |               0.02 |                            0.27 |                0.02 |                        0.52 |                        1.41 |                         0.03 |               0.00 |               2.24 |               2.31 |               2.37 |               3.36 |               2.27 |
+|   32768 |             1 |          9043970.00 |               0.02 |                            0.34 |                0.03 |                        0.72 |                        2.46 |                         0.05 |               0.00 |               3.57 |               3.67 |               3.75 |               5.35 |               3.62 |
+|   49152 |             1 |          7962620.00 |               0.02 |                            0.28 |                0.03 |                        1.17 |                        4.57 |                         0.05 |               0.00 |               5.97 |               6.14 |               6.28 |               9.31 |               6.13 |
+|   65536 |             1 |          9764860.00 |               0.02 |                            0.28 |                0.03 |                        1.77 |                        4.51 |                         0.06 |               0.00 |               6.59 |               7.01 |               7.24 |               7.59 |               6.68 |
+|   81920 |             1 |          7045120.00 |               0.02 |                            0.28 |                0.03 |                        2.49 |                        8.66 |                         0.07 |               0.00 |              11.45 |              12.10 |              12.34 |              12.60 |              11.56 |
+|   98304 |             1 |          8110080.00 |               0.02 |                            0.28 |                0.03 |                        3.02 |                        8.65 |                         0.08 |               0.00 |              11.97 |              12.66 |              13.00 |              13.19 |              12.08 |
+|  114688 |             1 |          9175040.00 |               0.02 |                            0.29 |                0.03 |                        3.40 |                        8.64 |                         0.09 |               0.00 |              12.43 |              12.69 |              12.77 |              12.89 |              12.48 |
+|  131072 |             1 |         10354700.00 |               0.02 |                            0.27 |                0.03 |                        3.84 |                        8.37 |                         0.10 |               0.00 |              12.57 |              12.77 |              13.02 |              13.16 |              12.63 |
+
+</details>
+
+
+
+#### Offline: NVIDIA T4, TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA T4            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              181.00 |               0.09 |                            0.78 |                0.13 |                        0.19 |                        4.32 |                         0.02 |               0.00 |               5.52 |               6.18 |               6.30 |               6.59 |               5.52 |
+|   16384 |             1 |          1023490.00 |               0.12 |                            0.96 |                0.17 |                        0.86 |                       13.82 |                         0.04 |               0.00 |              15.95 |              16.92 |              17.16 |              17.49 |              15.98 |
+|   32768 |             1 |          1201090.00 |               0.12 |                            0.96 |                0.18 |                        1.50 |                       24.31 |                         0.06 |               0.00 |              27.14 |              28.06 |              28.18 |              28.40 |              27.12 |
+|   49152 |             1 |          1265350.00 |               0.12 |                            0.96 |                0.18 |                        2.30 |                       35.08 |                         0.07 |               0.00 |              38.60 |              39.79 |              40.11 |              43.47 |              38.70 |
+|   65536 |             1 |          1288870.00 |               0.12 |                            0.94 |                0.18 |                        3.13 |                       46.14 |                         0.11 |               0.00 |              50.54 |              51.51 |              51.68 |              57.69 |              50.63 |
+|   81920 |             1 |          1310530.00 |               0.12 |                            0.94 |                0.18 |                        3.86 |                       56.84 |                         0.13 |               0.00 |              61.96 |              63.21 |              63.36 |              64.08 |              62.06 |
+|   98304 |             1 |          1314650.00 |               0.12 |                            1.01 |                0.18 |                        4.38 |                       68.40 |                         0.14 |               0.00 |              74.34 |              75.17 |              75.40 |              76.45 |              74.24 |
+|  114688 |             1 |          1312390.00 |               0.13 |                            1.00 |                0.16 |                        5.75 |                       79.94 |                         0.19 |               0.00 |              87.31 |              88.67 |              89.27 |              89.89 |              87.18 |
+|  131072 |             1 |          1310590.00 |               0.13 |                            1.03 |                0.17 |                        6.29 |                       91.81 |                         0.20 |               0.00 |              99.64 |             101.02 |             101.41 |             101.68 |              99.63 |
+
+</details>
+
+
+
+#### Offline: NVIDIA T4, NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA T4            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td><img src="./reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png"></td>
+    <td><img src="./reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|       1 |             1 |              564.00 |               0.05 |                            0.61 |                0.15 |                        0.56 |                        0.37 |                         0.02 |               0.00 |               1.77 |               1.88 |               1.91 |               1.95 |               1.77 |
+|   16384 |             1 |          1916930.00 |               0.11 |                            0.89 |                0.18 |                        1.19 |                        6.08 |                         0.06 |               0.00 |               8.55 |               8.75 |               8.79 |               8.91 |               8.51 |
+|   32768 |             1 |          2129920.00 |               0.12 |                            0.92 |                0.18 |                        1.84 |                       12.18 |                         0.07 |               0.00 |              15.32 |              15.56 |              15.66 |              15.82 |              15.32 |
+|   49152 |             1 |          1703370.00 |               0.12 |                            0.94 |                0.18 |                        2.51 |                       24.94 |                         0.08 |               0.00 |              28.76 |              29.70 |              29.74 |              29.94 |              28.78 |
+|   65536 |             1 |          2228220.00 |               0.12 |                            0.97 |                0.18 |                        3.22 |                       24.59 |                         0.11 |               0.00 |              29.08 |              30.25 |              30.35 |              30.47 |              29.20 |
+|   81920 |             1 |          1447010.00 |               0.12 |                            0.99 |                0.18 |                        4.04 |                       51.04 |                         0.13 |               0.00 |              56.53 |              57.58 |              57.85 |              58.43 |              56.51 |
+|   98304 |             1 |          1720030.00 |               0.13 |                            1.00 |                0.18 |                        4.96 |                       50.51 |                         0.15 |               0.00 |              56.84 |              57.84 |              57.93 |              58.35 |              56.92 |
+|  114688 |             1 |          1987590.00 |               0.13 |                            1.04 |                0.19 |                        5.89 |                       50.14 |                         0.18 |               0.00 |              57.58 |              58.78 |              58.81 |              58.91 |              57.56 |
+|  131072 |             1 |          2271540.00 |               0.12 |                            0.98 |                0.19 |                        6.93 |                       49.07 |                         0.16 |               0.00 |              57.34 |              58.56 |              58.79 |              58.89 |              57.45 |
+
+</details>
+
+
+
+
+### Online scenario
+
+The online scenario assumes the client and server are located on different hosts. The tests use:
+- tensors are passed through HTTP from client to server
+- concurrent requests are sent from client to server; the final batch is created on the server side
+
+
+#### Online: NVIDIA A30, TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA A30            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_a30_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          2205700.00 |               0.46 |                            2.09 |                0.99 |                        0.31 |                        3.53 |                         0.02 |               0.00 |               7.91 |               8.42 |               8.74 |               9.36 |               7.40 |
+|    2048 |            16 |          2686980.00 |               0.46 |                            2.83 |                2.38 |                        0.51 |                        5.91 |                         0.03 |               0.00 |              12.64 |              13.41 |              13.90 |              15.69 |              12.12 |
+|    2048 |            24 |          2658300.00 |               0.47 |                            4.46 |                3.75 |                        1.25 |                        8.24 |                         0.05 |               0.00 |              18.65 |              20.78 |              22.22 |              27.96 |              18.21 |
+|    2048 |            32 |          2672640.00 |               0.47 |                            4.46 |                6.46 |                        1.74 |                       11.02 |                         0.08 |               0.00 |              24.53 |              27.26 |              28.82 |              30.28 |              24.23 |
+|    2048 |            40 |          3217410.00 |               0.47 |                            5.12 |                3.96 |                        1.78 |                       13.76 |                         0.07 |               0.00 |              24.11 |              29.70 |              31.31 |              32.02 |              25.17 |
+|    2048 |            48 |          3246080.00 |               0.50 |                            5.77 |                5.01 |                        2.45 |                       15.96 |                         0.10 |               0.00 |              28.87 |              36.61 |              39.55 |              44.82 |              29.78 |
+|    2048 |            56 |          3391490.00 |               0.48 |                            5.52 |                5.74 |                        2.21 |                       19.18 |                         0.10 |               0.00 |              32.74 |              36.93 |              39.33 |              44.67 |              33.24 |
+|    2048 |            64 |          3481600.00 |               0.50 |                            5.98 |                6.83 |                        2.90 |                       20.61 |                         0.12 |               0.00 |              36.78 |              39.41 |              41.34 |              44.04 |              36.94 |
+|    2048 |            72 |          3532800.00 |               0.51 |                            7.84 |                5.61 |                        2.75 |                       23.65 |                         0.14 |               0.00 |              40.06 |              42.18 |              43.18 |              45.15 |              40.49 |
+|    2048 |            80 |          3551230.00 |               0.51 |                            8.02 |                8.24 |                        3.04 |                       25.05 |                         0.14 |               0.00 |              44.82 |              46.05 |              46.43 |              47.17 |              45.01 |
+|    2048 |            88 |          3491840.00 |               0.55 |                            6.85 |               10.81 |                        3.81 |                       27.98 |                         0.14 |               0.00 |              49.97 |              51.88 |              52.12 |              54.34 |              50.13 |
+|    2048 |            96 |          3678210.00 |               0.49 |                            6.44 |               10.60 |                        2.42 |                       31.40 |                         0.13 |               0.00 |              51.33 |              52.85 |              53.52 |              55.37 |              51.48 |
+|    2048 |           104 |          3627010.00 |               0.51 |                            8.84 |               11.81 |                        3.21 |                       32.91 |                         0.13 |               0.00 |              56.68 |              59.57 |              65.27 |              69.32 |              57.42 |
+|    2048 |           112 |          3670020.00 |               0.50 |                           10.27 |               11.60 |                        3.22 |                       35.39 |                         0.17 |               0.00 |              60.96 |              62.94 |              63.78 |              66.09 |              61.14 |
+|    2048 |           120 |          3596290.00 |               0.53 |                            8.14 |               15.83 |                        3.52 |                       37.44 |                         0.18 |               0.00 |              65.69 |              68.82 |              69.33 |              70.23 |              65.64 |
+|    2048 |           128 |          3747840.00 |               0.53 |                            9.94 |               13.78 |                        3.35 |                       39.42 |                         0.18 |               0.00 |              67.36 |              68.44 |              68.70 |              69.57 |              67.19 |
+|    2048 |           136 |          3708930.00 |               0.50 |                           11.62 |               15.82 |                        4.05 |                       40.59 |                         0.22 |               0.00 |              73.04 |              76.44 |              77.91 |              78.35 |              72.81 |
+|    2048 |           144 |          3631100.00 |               0.53 |                           13.62 |               17.34 |                        4.16 |                       42.39 |                         0.27 |               0.00 |              78.38 |              81.03 |              81.55 |              82.67 |              78.31 |
+|    2048 |           152 |          3624960.00 |               0.51 |                           16.29 |               16.20 |                        4.06 |                       45.15 |                         0.25 |               0.00 |              82.34 |              87.68 |              95.84 |             107.03 |              82.47 |
+|    2048 |           160 |          3598340.00 |               0.52 |                           12.15 |               19.21 |                        4.13 |                       49.93 |                         0.26 |               0.00 |              88.03 |              91.12 |              92.91 |              94.12 |              86.20 |
+|    2048 |           168 |          3715450.00 |               0.53 |                           15.01 |               17.67 |                        4.03 |                       50.90 |                         0.24 |               0.00 |              89.14 |              92.45 |              93.39 |              95.30 |              88.37 |
+|    2048 |           176 |          3653630.00 |               0.56 |                           10.28 |               23.72 |                        4.36 |                       52.77 |                         0.29 |               0.00 |              93.17 |              94.98 |              95.73 |              96.99 |              91.98 |
+|    2048 |           184 |          3700740.00 |               0.58 |                           15.49 |               20.40 |                        4.19 |                       55.47 |                         0.24 |               0.00 |              96.35 |             101.44 |             102.26 |             103.61 |              96.37 |
+|    2048 |           192 |          3764220.00 |               0.56 |                           12.25 |               26.51 |                        5.04 |                       56.14 |                         0.24 |               0.00 |             100.51 |             103.64 |             104.54 |             107.29 |             100.76 |
+|    2048 |           200 |          3538940.00 |               0.58 |                           10.53 |               34.43 |                        4.16 |                       55.98 |                         0.26 |               0.00 |             101.11 |             130.28 |             133.07 |             139.67 |             105.94 |
+|    2048 |           208 |          3535410.00 |               0.63 |                           10.26 |               39.10 |                        4.42 |                       57.79 |                         0.26 |               0.00 |             104.99 |             137.09 |             138.30 |             139.86 |             112.48 |
+|    2048 |           216 |          3538940.00 |               0.58 |                           13.14 |               40.62 |                        5.04 |                       55.45 |                         0.28 |               0.00 |             106.08 |             135.93 |             137.98 |             138.84 |             115.12 |
+|    2048 |           224 |          3407870.00 |               0.70 |                           12.87 |               46.33 |                        4.61 |                       57.95 |                         0.26 |               0.00 |             130.57 |             142.24 |             143.32 |             147.15 |             122.72 |
+|    2048 |           232 |          3670020.00 |               0.54 |                           14.55 |               46.11 |                        4.51 |                       57.72 |                         0.25 |               0.00 |             131.59 |             138.97 |             139.92 |             141.23 |             123.68 |
+|    2048 |           240 |          3565570.00 |               0.56 |                           13.52 |               51.26 |                        4.62 |                       56.97 |                         0.25 |               0.00 |             134.50 |             138.74 |             140.46 |             143.67 |             127.18 |
+|    2048 |           248 |          3670020.00 |               0.63 |                           17.72 |               50.87 |                        4.79 |                       58.02 |                         0.27 |               0.00 |             135.65 |             139.44 |             140.59 |             142.06 |             132.28 |
+|    2048 |           256 |          3670020.00 |               0.60 |                           12.77 |               61.03 |                        4.50 |                       57.72 |                         0.27 |               0.00 |             135.72 |             142.43 |             143.26 |             145.82 |             136.88 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA A30, NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA A30            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_a30_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          3377150.00 |               0.46 |                            2.17 |                0.48 |                        0.42 |                        1.30 |                         0.01 |               0.00 |               4.86 |               5.62 |               6.05 |               6.97 |               4.84 |
+|    2048 |            16 |          4280320.00 |               0.46 |                            2.99 |                0.95 |                        0.70 |                        2.47 |                         0.02 |               0.00 |               7.60 |               8.85 |               9.48 |              10.04 |               7.59 |
+|    2048 |            24 |          4155390.00 |               0.46 |                            3.78 |                2.06 |                        1.18 |                        4.26 |                         0.04 |               0.00 |              12.31 |              12.58 |              12.70 |              13.29 |              11.79 |
+|    2048 |            32 |          4634620.00 |               0.46 |                            4.33 |                2.49 |                        1.30 |                        5.42 |                         0.03 |               0.00 |              15.96 |              16.51 |              16.60 |              16.79 |              14.02 |
+|    2048 |            40 |          4114430.00 |               0.47 |                            4.86 |                4.83 |                        1.50 |                        7.99 |                         0.03 |               0.00 |              20.33 |              20.85 |              21.28 |              22.78 |              19.68 |
+|    2048 |            48 |          4751360.00 |               0.47 |                            5.56 |                4.41 |                        1.65 |                        8.20 |                         0.03 |               0.00 |              20.72 |              21.29 |              21.86 |              27.29 |              20.33 |
+|    2048 |            56 |          4876290.00 |               0.47 |                            6.45 |                4.78 |                        1.79 |                        9.54 |                         0.04 |               0.00 |              21.37 |              29.68 |              30.19 |              32.92 |              23.07 |
+|    2048 |            64 |          4403200.00 |               0.50 |                            8.18 |                6.24 |                        2.32 |                       12.24 |                         0.06 |               0.00 |              30.45 |              34.45 |              36.88 |              43.85 |              29.54 |
+|    2048 |            72 |          4696060.00 |               0.49 |                            7.73 |                6.31 |                        2.75 |                       13.64 |                         0.06 |               0.00 |              31.34 |              35.32 |              38.51 |              45.91 |              30.99 |
+|    2048 |            80 |          4929540.00 |               0.53 |                            8.75 |                5.59 |                        2.72 |                       14.38 |                         0.08 |               0.00 |              33.10 |              42.91 |              44.74 |              51.17 |              32.06 |
+|    2048 |            88 |          4378620.00 |               0.50 |                           12.86 |                7.76 |                        3.36 |                       15.10 |                         0.26 |               0.00 |              43.88 |              49.60 |              51.20 |              56.70 |              39.84 |
+|    2048 |            96 |          5371900.00 |               0.51 |                            7.79 |                6.89 |                        3.41 |                       17.18 |                         0.15 |               0.00 |              36.46 |              48.51 |              53.61 |              59.81 |              35.93 |
+|    2048 |           104 |          5129210.00 |               0.51 |                           10.65 |                9.44 |                        3.37 |                       16.40 |                         0.07 |               0.00 |              42.08 |              48.28 |              52.47 |              57.71 |              40.44 |
+|    2048 |           112 |          5058560.00 |               0.50 |                            9.38 |               10.30 |                        3.84 |                       19.75 |                         0.09 |               0.00 |              44.99 |              57.46 |              58.44 |              59.22 |              43.86 |
+|    2048 |           120 |          5435390.00 |               0.50 |                           12.86 |               10.68 |                        3.58 |                       16.98 |                         0.09 |               0.00 |              45.01 |              50.08 |              50.68 |              63.46 |              44.68 |
+|    2048 |           128 |          5499520.00 |               0.57 |                            9.42 |               11.85 |                        4.21 |                       20.00 |                         0.11 |               0.00 |              45.22 |              58.71 |              61.23 |              71.79 |              46.15 |
+|    2048 |           136 |          5584900.00 |               0.56 |                            7.95 |               14.70 |                        4.27 |                       21.17 |                         0.10 |               0.00 |              52.76 |              59.25 |              61.29 |              66.22 |              48.75 |
+|    2048 |           144 |          5828610.00 |               0.58 |                            8.76 |               14.21 |                        4.44 |                       21.67 |                         0.10 |               0.00 |              53.10 |              60.64 |              62.39 |              65.12 |              49.75 |
+|    2048 |           152 |          5812220.00 |               0.52 |                           12.79 |               13.75 |                        4.01 |                       21.15 |                         0.08 |               0.00 |              54.56 |              60.15 |              62.76 |              67.47 |              52.30 |
+|    2048 |           160 |          6000640.00 |               0.53 |                           13.68 |               13.01 |                        4.91 |                       21.32 |                         0.10 |               0.00 |              55.18 |              62.53 |              63.20 |              70.26 |              53.55 |
+|    2048 |           168 |          6053890.00 |               0.56 |                           11.52 |               15.04 |                        4.25 |                       22.97 |                         0.10 |               0.00 |              57.53 |              65.93 |              67.38 |              73.08 |              54.43 |
+|    2048 |           176 |          6443010.00 |               0.54 |                           10.17 |               16.84 |                        4.78 |                       22.56 |                         0.10 |               0.00 |              56.70 |              66.88 |              68.40 |              74.31 |              54.98 |
+|    2048 |           184 |          6369280.00 |               0.55 |                           11.80 |               17.61 |                        4.75 |                       22.30 |                         0.11 |               0.00 |              59.55 |              69.48 |              72.12 |              75.43 |              57.12 |
+|    2048 |           192 |          6166530.00 |               0.55 |                           13.54 |               19.58 |                        5.12 |                       22.33 |                         0.11 |               0.00 |              62.62 |              73.35 |              75.14 |              78.02 |              61.23 |
+|    2048 |           200 |          6432770.00 |               0.53 |                           12.88 |               20.48 |                        4.67 |                       23.44 |                         0.10 |               0.00 |              63.49 |              75.39 |              76.63 |              82.79 |              62.12 |
+|    2048 |           208 |          6539260.00 |               0.50 |                           17.18 |               18.68 |                        3.94 |                       22.89 |                         0.09 |               0.00 |              64.74 |              73.25 |              73.92 |              75.78 |              63.28 |
+|    2048 |           216 |          6420200.00 |               0.53 |                           14.62 |               23.30 |                        3.98 |                       24.26 |                         0.08 |               0.00 |              71.64 |              76.78 |              79.58 |              81.42 |              66.76 |
+|    2048 |           224 |          6457340.00 |               0.51 |                           13.34 |               26.25 |                        4.30 |                       23.93 |                         0.08 |               0.00 |              73.35 |              76.42 |              78.63 |              81.02 |              68.41 |
+|    2048 |           232 |          6793220.00 |               0.60 |                           12.23 |               25.87 |                        4.19 |                       24.82 |                         0.09 |               0.00 |              72.37 |              76.42 |              79.96 |              82.30 |              67.80 |
+|    2048 |           240 |          6778880.00 |               0.51 |                           16.46 |               23.31 |                        4.16 |                       24.70 |                         0.09 |               0.00 |              72.48 |              76.24 |              77.42 |              81.06 |              69.23 |
+|    2048 |           248 |          6877180.00 |               0.51 |                           14.99 |               25.03 |                        4.06 |                       25.86 |                         0.09 |               0.00 |              72.49 |              74.72 |              75.13 |              76.35 |              70.53 |
+|    2048 |           256 |          7071740.00 |               0.51 |                           14.85 |               26.94 |                        3.84 |                       25.88 |                         0.09 |               0.00 |              72.08 |              74.62 |              75.67 |              78.03 |              72.11 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA DGX-1 (1x V100 32GB), TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX-1 (1x V100 32GB)            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          1406980.00 |               1.04 |                            2.73 |                1.62 |                        0.28 |                        5.88 |                         0.02 |               0.00 |              12.16 |              13.50 |              13.82 |              14.58 |              11.58 |
+|    2048 |            16 |          1937410.00 |               1.12 |                            3.69 |                3.49 |                        0.81 |                        7.54 |                         0.04 |               0.00 |              17.57 |              18.46 |              18.68 |              19.28 |              16.69 |
+|    2048 |            24 |          2236420.00 |               1.12 |                            5.16 |                4.61 |                        0.99 |                        9.81 |                         0.04 |               0.00 |              22.39 |              23.75 |              24.48 |              25.40 |              21.73 |
+|    2048 |            32 |          2439170.00 |               1.19 |                            6.46 |                5.89 |                        1.47 |                       11.61 |                         0.08 |               0.00 |              27.56 |              28.64 |              29.38 |              30.56 |              26.71 |
+|    2048 |            40 |          2586620.00 |               1.23 |                            6.53 |                5.38 |                        1.94 |                       15.81 |                         0.09 |               0.00 |              31.88 |              34.85 |              35.49 |              41.48 |              30.98 |
+|    2048 |            48 |          3145730.00 |               1.14 |                            5.45 |                4.67 |                        1.84 |                       17.55 |                         0.08 |               0.00 |              30.73 |              32.24 |              32.60 |              33.67 |              30.74 |
+|    2048 |            56 |          3211260.00 |               1.19 |                            6.09 |                5.79 |                        2.07 |                       19.84 |                         0.10 |               0.00 |              35.02 |              36.32 |              36.56 |              39.19 |              35.08 |
+|    2048 |            64 |          3229700.00 |               1.24 |                            7.60 |                5.88 |                        2.54 |                       22.48 |                         0.12 |               0.00 |              39.91 |              40.87 |              41.03 |              41.50 |              39.85 |
+|    2048 |            72 |          3231740.00 |               1.26 |                            7.51 |                7.54 |                        3.11 |                       24.84 |                         0.12 |               0.00 |              44.69 |              45.61 |              46.08 |              47.34 |              44.40 |
+|    2048 |            80 |          3325950.00 |               1.32 |                            7.15 |                9.10 |                        3.48 |                       27.39 |                         0.14 |               0.00 |              48.57 |              49.50 |              49.63 |              49.94 |              48.58 |
+|    2048 |            88 |          3303420.00 |               1.34 |                            8.98 |                9.23 |                        3.66 |                       29.86 |                         0.15 |               0.00 |              53.21 |              54.16 |              54.30 |              54.82 |              53.22 |
+|    2048 |            96 |          3407870.00 |               1.35 |                            9.52 |                9.82 |                        3.98 |                       31.35 |                         0.16 |               0.00 |              56.17 |              57.28 |              57.66 |              58.45 |              56.19 |
+|    2048 |           104 |          3352580.00 |               1.34 |                           10.83 |               10.69 |                        4.78 |                       33.99 |                         0.21 |               0.00 |              61.75 |              63.06 |              63.46 |              63.92 |              61.84 |
+|    2048 |           112 |          3299330.00 |               1.34 |                            9.79 |               13.48 |                        4.76 |                       36.84 |                         0.21 |               0.00 |              66.32 |              67.74 |              68.13 |              68.99 |              66.43 |
+|    2048 |           120 |          3483650.00 |               1.40 |                           10.80 |               13.38 |                        5.05 |                       37.15 |                         0.22 |               0.00 |              67.95 |              69.06 |              69.59 |              70.84 |              68.01 |
+|    2048 |           128 |          3391490.00 |               1.44 |                           12.91 |               14.60 |                        5.72 |                       40.50 |                         0.23 |               0.00 |              74.83 |              80.32 |              85.15 |              87.77 |              75.40 |
+|    2048 |           136 |          3339000.00 |               1.43 |                           11.07 |               18.41 |                        5.60 |                       42.67 |                         0.23 |               0.00 |              78.96 |              81.42 |              82.95 |              83.83 |              79.42 |
+|    2048 |           144 |          3430400.00 |               1.36 |                           13.13 |               15.70 |                        6.08 |                       45.65 |                         0.25 |               0.00 |              81.96 |              83.69 |              84.16 |              85.02 |              82.17 |
+|    2048 |           152 |          3424260.00 |               1.38 |                           14.29 |               19.05 |                        5.81 |                       46.75 |                         0.25 |               0.00 |              87.30 |              90.17 |              91.32 |              93.17 |              87.54 |
+|    2048 |           160 |          3522560.00 |               1.34 |                           12.27 |               20.53 |                        6.77 |                       48.22 |                         0.33 |               0.00 |              89.83 |              91.81 |              93.60 |              94.84 |              89.47 |
+|    2048 |           168 |          3475460.00 |               1.34 |                           16.24 |               18.55 |                        6.26 |                       51.10 |                         0.33 |               0.00 |              93.67 |              96.58 |              97.13 |              98.62 |              93.82 |
+|    2048 |           176 |          3352580.00 |               1.42 |                           14.59 |               24.17 |                        6.50 |                       54.21 |                         0.29 |               0.00 |             101.40 |             102.82 |             104.31 |             106.19 |             101.17 |
+|    2048 |           184 |          3391490.00 |               1.39 |                           15.43 |               26.57 |                        6.64 |                       55.30 |                         0.29 |               0.00 |             105.87 |             107.14 |             107.83 |             110.67 |             105.62 |
+|    2048 |           192 |          3291940.00 |               1.34 |                           17.09 |               24.89 |                        7.73 |                       58.10 |                         0.38 |               0.00 |             109.48 |             111.30 |             112.27 |             114.72 |             109.53 |
+|    2048 |           200 |          3407870.00 |               1.35 |                           15.22 |               33.43 |                        7.71 |                       55.65 |                         0.40 |               0.00 |             109.84 |             137.25 |             141.99 |             145.01 |             113.76 |
+|    2048 |           208 |          3276800.00 |               1.33 |                           16.02 |               37.77 |                        7.48 |                       56.28 |                         0.41 |               0.00 |             111.49 |             144.69 |             145.60 |             146.99 |             119.30 |
+|    2048 |           216 |          3403780.00 |               1.35 |                           16.62 |               41.90 |                        7.68 |                       55.20 |                         0.42 |               0.00 |             114.31 |             145.71 |             148.51 |             151.30 |             123.17 |
+|    2048 |           224 |          3407870.00 |               1.34 |                           16.04 |               42.59 |                        7.21 |                       58.50 |                         0.34 |               0.00 |             133.67 |             144.04 |             144.44 |             145.45 |             126.03 |
+|    2048 |           232 |          3538940.00 |               1.28 |                           19.49 |               43.11 |                        7.58 |                       55.67 |                         0.40 |               0.00 |             135.89 |             141.84 |             143.25 |             145.40 |             127.54 |
+|    2048 |           240 |          3407870.00 |               1.32 |                           20.15 |               46.31 |                        7.00 |                       57.81 |                         0.32 |               0.00 |             140.71 |             142.56 |             143.03 |             145.84 |             132.92 |
+|    2048 |           248 |          3538940.00 |               1.35 |                           21.56 |               50.58 |                        6.84 |                       56.74 |                         0.32 |               0.00 |             140.91 |             144.90 |             145.50 |             147.79 |             137.40 |
+|    2048 |           256 |          3407870.00 |               1.36 |                           19.44 |               57.60 |                        7.14 |                       58.67 |                         0.35 |               0.00 |             144.46 |             146.25 |             147.18 |             148.69 |             144.56 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA DGX-1 (1x V100 32GB), NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX-1 (1x V100 32GB)            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          2048000.00 |               1.06 |                            2.99 |                0.85 |                        0.66 |                        2.39 |                         0.02 |               0.00 |               7.68 |              10.34 |              10.43 |              10.81 |               7.97 |
+|    2048 |            16 |          3145730.00 |               1.09 |                            3.97 |                1.30 |                        0.74 |                        3.20 |                         0.02 |               0.00 |              10.40 |              11.76 |              12.62 |              13.54 |              10.33 |
+|    2048 |            24 |          3498580.00 |               1.08 |                            4.65 |                1.85 |                        1.07 |                        5.31 |                         0.03 |               0.00 |              14.55 |              14.74 |              14.80 |              14.96 |              14.00 |
+|    2048 |            32 |          3442690.00 |               1.10 |                            5.76 |                3.37 |                        1.58 |                        7.15 |                         0.05 |               0.00 |              20.14 |              22.84 |              25.34 |              30.42 |              19.01 |
+|    2048 |            40 |          3143680.00 |               1.08 |                            6.13 |                6.01 |                        2.04 |                       10.28 |                         0.08 |               0.00 |              26.23 |              26.40 |              26.45 |              26.53 |              25.61 |
+|    2048 |            48 |          3667970.00 |               1.18 |                            7.33 |                5.16 |                        2.39 |                       10.00 |                         0.06 |               0.00 |              26.91 |              34.01 |              37.07 |              44.18 |              26.13 |
+|    2048 |            56 |          3358720.00 |               1.25 |                            9.03 |                7.05 |                        3.40 |                       13.12 |                         0.19 |               0.00 |              33.38 |              39.30 |              42.76 |              43.68 |              34.04 |
+|    2048 |            64 |          3710980.00 |               1.22 |                            9.79 |                6.91 |                        3.24 |                       13.76 |                         0.08 |               0.00 |              35.20 |              46.53 |              51.79 |              58.32 |              35.00 |
+|    2048 |            72 |          3532800.00 |               1.27 |                            9.94 |                9.88 |                        4.50 |                       15.88 |                         0.10 |               0.00 |              42.32 |              45.32 |              47.01 |              48.25 |              41.57 |
+|    2048 |            80 |          3665920.00 |               1.22 |                           11.04 |                9.87 |                        4.50 |                       17.26 |                         0.11 |               0.00 |              43.84 |              54.40 |              57.38 |              69.41 |              44.01 |
+|    2048 |            88 |          3731460.00 |               1.23 |                           12.38 |                9.62 |                        4.46 |                       18.84 |                         0.13 |               0.00 |              49.49 |              60.37 |              64.00 |              70.11 |              46.66 |
+|    2048 |            96 |          3596290.00 |               1.30 |                           16.28 |               10.75 |                        5.33 |                       19.79 |                         0.12 |               0.00 |              56.59 |              60.01 |              64.17 |              65.67 |              53.58 |
+|    2048 |           104 |          4042750.00 |               1.27 |                           11.76 |               11.13 |                        5.64 |                       21.24 |                         0.11 |               0.00 |              51.11 |              63.48 |              68.62 |              81.35 |              51.16 |
+|    2048 |           112 |          4302850.00 |               1.26 |                           13.81 |               10.84 |                        5.57 |                       20.82 |                         0.14 |               0.00 |              52.92 |              65.63 |              70.87 |              73.82 |              52.42 |
+|    2048 |           120 |          4065280.00 |               1.32 |                           15.44 |               14.97 |                        5.56 |                       21.00 |                         0.12 |               0.00 |              67.00 |              71.61 |              73.35 |              74.64 |              58.40 |
+|    2048 |           128 |          4298750.00 |               1.33 |                           11.38 |               14.66 |                        6.38 |                       24.46 |                         0.14 |               0.00 |              57.55 |              74.42 |              75.14 |              75.96 |              58.34 |
+|    2048 |           136 |          4440060.00 |               1.26 |                           14.78 |               14.21 |                        6.41 |                       23.96 |                         0.13 |               0.00 |              65.35 |              75.63 |              79.32 |              84.87 |              60.76 |
+|    2048 |           144 |          4425730.00 |               1.24 |                           18.63 |               15.28 |                        6.32 |                       22.56 |                         0.16 |               0.00 |              67.99 |              76.48 |              78.16 |              82.24 |              64.18 |
+|    2048 |           152 |          4554750.00 |               1.27 |                           16.37 |               15.28 |                        6.73 |                       25.72 |                         0.16 |               0.00 |              67.57 |              76.59 |              78.25 |              89.80 |              65.55 |
+|    2048 |           160 |          4818940.00 |               1.31 |                           14.22 |               16.23 |                        7.65 |                       25.71 |                         0.16 |               0.00 |              67.81 |              78.92 |              83.34 |             108.24 |              65.27 |
+|    2048 |           168 |          4800510.00 |               1.28 |                           18.55 |               16.74 |                        7.45 |                       25.97 |                         0.15 |               0.00 |              72.54 |              85.59 |              90.37 |              99.32 |              70.16 |
+|    2048 |           176 |          4806660.00 |               1.27 |                           17.32 |               19.00 |                        7.16 |                       25.83 |                         0.14 |               0.00 |              73.55 |              85.24 |              86.53 |              89.98 |              70.73 |
+|    2048 |           184 |          4990980.00 |               1.29 |                           16.53 |               21.14 |                        7.73 |                       26.74 |                         0.17 |               0.00 |              76.09 |              89.39 |              94.63 |             107.31 |              73.61 |
+|    2048 |           192 |          4716540.00 |               1.30 |                           19.81 |               22.68 |                        8.54 |                       25.97 |                         0.19 |               0.00 |              79.06 |              96.15 |              97.55 |             102.86 |              78.48 |
+|    2048 |           200 |          5038080.00 |               1.26 |                           18.76 |               25.63 |                        7.40 |                       27.24 |                         0.16 |               0.00 |              84.41 |              94.34 |              95.84 |             102.56 |              80.47 |
+|    2048 |           208 |          4812800.00 |               1.32 |                           17.13 |               27.08 |                        8.30 |                       28.11 |                         0.16 |               0.00 |              87.32 |              96.77 |             107.36 |             120.31 |              82.10 |
+|    2048 |           216 |          4954110.00 |               1.26 |                           19.71 |               27.37 |                        7.18 |                       28.52 |                         0.18 |               0.00 |              87.99 |             101.20 |             106.30 |             126.02 |              84.20 |
+|    2048 |           224 |          5228540.00 |               1.31 |                           19.02 |               27.14 |                        7.92 |                       29.33 |                         0.15 |               0.00 |              90.37 |              99.31 |             105.15 |             114.43 |              84.87 |
+|    2048 |           232 |          5242880.00 |               1.28 |                           20.97 |               25.23 |                        7.82 |                       30.44 |                         0.19 |               0.00 |              93.14 |              96.63 |              98.39 |             100.47 |              85.92 |
+|    2048 |           240 |          5398530.00 |               1.29 |                           19.49 |               29.21 |                        7.61 |                       30.08 |                         0.17 |               0.00 |              92.47 |              95.17 |              95.65 |              98.29 |              87.85 |
+|    2048 |           248 |          5275650.00 |               1.32 |                           23.67 |               29.61 |                        8.34 |                       28.65 |                         0.17 |               0.00 |              93.53 |              97.05 |              99.55 |             100.99 |              91.75 |
+|    2048 |           256 |          5261310.00 |               1.25 |                           31.75 |               25.61 |                        7.47 |                       28.55 |                         0.16 |               0.00 |              95.47 |             113.51 |             118.44 |             122.77 |              94.79 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA DGX A100 (1x A100 80GB), TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX A100 (1x A100 80GB)            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          2117630.00 |               0.39 |                            1.86 |                1.24 |                        0.32 |                        3.89 |                         0.02 |               0.00 |               7.91 |               9.30 |               9.66 |              10.36 |               7.72 |
+|    2048 |            16 |          3072000.00 |               0.45 |                            2.50 |                2.32 |                        0.58 |                        4.76 |                         0.02 |               0.00 |              10.98 |              12.15 |              12.70 |              13.56 |              10.64 |
+|    2048 |            24 |          3350530.00 |               0.46 |                            3.55 |                3.22 |                        0.96 |                        6.28 |                         0.04 |               0.00 |              15.04 |              16.18 |              16.47 |              17.53 |              14.50 |
+|    2048 |            32 |          3788800.00 |               0.47 |                            3.52 |                3.79 |                        1.35 |                        7.92 |                         0.07 |               0.00 |              17.34 |              19.63 |              19.96 |              21.71 |              17.11 |
+|    2048 |            40 |          4411390.00 |               0.45 |                            4.82 |                3.28 |                        1.25 |                        8.54 |                         0.07 |               0.00 |              18.94 |              21.93 |              22.89 |              25.76 |              18.41 |
+|    2048 |            48 |          5271550.00 |               0.44 |                            3.44 |                3.06 |                        1.88 |                        9.59 |                         0.07 |               0.00 |              18.58 |              19.17 |              19.40 |              19.91 |              18.48 |
+|    2048 |            56 |          5116930.00 |               0.44 |                            5.64 |                3.41 |                        1.96 |                       10.72 |                         0.09 |               0.00 |              21.15 |              27.85 |              29.67 |              35.70 |              22.26 |
+|    2048 |            64 |          5462700.00 |               0.45 |                            4.74 |                3.81 |                        2.23 |                       12.30 |                         0.10 |               0.00 |              23.67 |              24.60 |              24.85 |              25.19 |              23.63 |
+|    2048 |            72 |          5603330.00 |               0.49 |                            4.72 |                4.88 |                        2.57 |                       13.15 |                         0.13 |               0.00 |              26.01 |              26.96 |              27.19 |              27.60 |              25.94 |
+|    2048 |            80 |          5730300.00 |               0.49 |                            5.77 |                4.66 |                        2.69 |                       14.52 |                         0.13 |               0.00 |              28.26 |              28.98 |              29.24 |              29.64 |              28.26 |
+|    2048 |            88 |          5304320.00 |               0.56 |                            6.82 |                6.48 |                        3.50 |                       15.72 |                         0.15 |               0.00 |              32.96 |              41.00 |              43.34 |              50.28 |              33.24 |
+|    2048 |            96 |          6078460.00 |               0.47 |                            7.48 |                5.44 |                        3.06 |                       15.20 |                         0.14 |               0.00 |              30.81 |              40.15 |              41.58 |              45.13 |              31.80 |
+|    2048 |           104 |          5795840.00 |               0.51 |                            7.24 |                6.82 |                        3.19 |                       17.82 |                         0.17 |               0.00 |              36.06 |              44.67 |              48.78 |              50.98 |              35.75 |
+|    2048 |           112 |          6309890.00 |               0.48 |                            8.32 |                6.55 |                        3.03 |                       17.26 |                         0.16 |               0.00 |              35.22 |              40.60 |              45.25 |              54.10 |              35.79 |
+|    2048 |           120 |          6070350.00 |               0.48 |                            7.32 |                8.34 |                        4.02 |                       19.39 |                         0.22 |               0.00 |              39.67 |              52.07 |              55.22 |              62.96 |              39.78 |
+|    2048 |           128 |          5603330.00 |               0.48 |                           11.37 |                9.76 |                        3.65 |                       19.80 |                         0.21 |               0.00 |              45.55 |              56.76 |              57.75 |              60.84 |              45.28 |
+|    2048 |           136 |          6342660.00 |               0.47 |                           10.50 |                7.40 |                        3.40 |                       20.62 |                         0.19 |               0.00 |              42.67 |              43.36 |              43.68 |              44.46 |              42.58 |
+|    2048 |           144 |          6160380.00 |               0.51 |                            9.38 |                9.72 |                        3.96 |                       22.94 |                         0.22 |               0.00 |              47.19 |              50.34 |              53.58 |              62.89 |              46.73 |
+|    2048 |           152 |          6162430.00 |               0.50 |                            9.35 |               11.24 |                        4.06 |                       24.05 |                         0.22 |               0.00 |              49.62 |              50.93 |              51.40 |              52.12 |              49.43 |
+|    2048 |           160 |          6594560.00 |               0.48 |                            9.26 |               10.48 |                        4.33 |                       23.77 |                         0.23 |               0.00 |              48.82 |              49.97 |              50.25 |              51.14 |              48.55 |
+|    2048 |           168 |          6289410.00 |               0.54 |                            8.81 |               14.30 |                        4.31 |                       25.26 |                         0.23 |               0.00 |              53.23 |              54.47 |              54.93 |              64.09 |              53.46 |
+|    2048 |           176 |          6547460.00 |               0.51 |                            9.67 |               13.64 |                        4.92 |                       24.76 |                         0.27 |               0.00 |              54.30 |              56.66 |              58.01 |              60.22 |              53.78 |
+|    2048 |           184 |          6520830.00 |               0.53 |                            9.43 |               14.56 |                        4.54 |                       27.26 |                         0.25 |               0.00 |              57.16 |              59.69 |              60.11 |              60.62 |              56.57 |
+|    2048 |           192 |          6547460.00 |               0.51 |                            9.44 |               16.16 |                        4.73 |                       27.80 |                         0.25 |               0.00 |              58.92 |              59.96 |              60.35 |              62.24 |              58.90 |
+|    2048 |           200 |          6160380.00 |               0.55 |                            9.65 |               23.18 |                        6.02 |                       25.12 |                         0.33 |               0.00 |              62.63 |              79.47 |              81.42 |              83.06 |              64.86 |
+|    2048 |           208 |          6553600.00 |               0.51 |                            7.52 |               23.98 |                        5.24 |                       25.65 |                         0.28 |               0.00 |              59.00 |              77.14 |              77.89 |              79.00 |              63.17 |
+|    2048 |           216 |          6422530.00 |               0.51 |                            9.04 |               23.01 |                        4.66 |                       27.98 |                         0.27 |               0.00 |              59.66 |              77.53 |              77.99 |              78.71 |              65.46 |
+|    2048 |           224 |          6422530.00 |               0.52 |                            9.61 |               24.15 |                        4.55 |                       28.86 |                         0.24 |               0.00 |              70.81 |              78.24 |              78.68 |              80.45 |              67.94 |
+|    2048 |           232 |          6422530.00 |               0.51 |                            9.64 |               28.58 |                        4.57 |                       28.17 |                         0.26 |               0.00 |              78.30 |              79.89 |              80.26 |              81.71 |              71.72 |
+|    2048 |           240 |          6684670.00 |               0.50 |                           11.40 |               26.54 |                        4.61 |                       27.96 |                         0.25 |               0.00 |              74.96 |              77.42 |              79.14 |              80.80 |              71.26 |
+|    2048 |           248 |          6408190.00 |               0.49 |                           12.28 |               29.09 |                        4.86 |                       28.87 |                         0.26 |               0.00 |              77.54 |              81.01 |              82.15 |              82.76 |              75.85 |
+|    2048 |           256 |          6553600.00 |               0.50 |                           10.44 |               32.74 |                        4.35 |                       29.02 |                         0.25 |               0.00 |              77.27 |              78.51 |              78.74 |              80.09 |              77.31 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA DGX A100 (1x A100 80GB), NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA DGX A100 (1x A100 80GB)            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          3344380.00 |               0.39 |                            2.26 |                0.59 |                        0.58 |                        1.05 |                         0.02 |               0.00 |               4.83 |               6.14 |               6.32 |               6.84 |               4.89 |
+|    2048 |            16 |          5148670.00 |               0.40 |                            3.14 |                0.78 |                        0.69 |                        1.31 |                         0.03 |               0.00 |               6.21 |               7.78 |               8.12 |               9.11 |               6.34 |
+|    2048 |            24 |          6113280.00 |               0.42 |                            3.33 |                1.07 |                        0.98 |                        2.18 |                         0.03 |               0.00 |               8.18 |               9.32 |              10.14 |              11.37 |               8.00 |
+|    2048 |            32 |          6434820.00 |               0.45 |                            4.10 |                1.43 |                        1.26 |                        2.84 |                         0.04 |               0.00 |              10.59 |              12.07 |              12.65 |              14.35 |              10.10 |
+|    2048 |            40 |          6946820.00 |               0.46 |                            4.01 |                2.14 |                        1.49 |                        3.59 |                         0.04 |               0.00 |              12.16 |              14.78 |              15.71 |              17.81 |              11.72 |
+|    2048 |            48 |          6770690.00 |               0.43 |                            5.27 |                2.43 |                        1.80 |                        4.39 |                         0.05 |               0.00 |              14.98 |              16.24 |              16.47 |              19.71 |              14.38 |
+|    2048 |            56 |          7225340.00 |               0.44 |                            6.06 |                2.32 |                        2.28 |                        4.63 |                         0.06 |               0.00 |              16.07 |              18.89 |              20.43 |              22.38 |              15.79 |
+|    2048 |            64 |          7217150.00 |               0.46 |                            6.95 |                2.74 |                        2.32 |                        5.57 |                         0.09 |               0.00 |              18.45 |              22.95 |              24.41 |              29.97 |              18.11 |
+|    2048 |            72 |          7436290.00 |               0.46 |                            6.99 |                3.44 |                        2.32 |                        6.45 |                         0.08 |               0.00 |              21.05 |              25.17 |              27.20 |              32.09 |              19.74 |
+|    2048 |            80 |          7757820.00 |               0.46 |                            7.62 |                3.36 |                        2.31 |                        6.90 |                         0.10 |               0.00 |              21.30 |              27.73 |              29.03 |              32.30 |              20.75 |
+|    2048 |            88 |          8118270.00 |               0.46 |                            6.24 |                4.01 |                        3.14 |                        8.00 |                         0.10 |               0.00 |              21.97 |              30.04 |              32.84 |              35.90 |              21.94 |
+|    2048 |            96 |          7417860.00 |               0.47 |                            9.43 |                3.91 |                        3.66 |                        8.74 |                         0.11 |               0.00 |              27.65 |              28.81 |              29.30 |              29.67 |              26.31 |
+|    2048 |           104 |          7948290.00 |               0.46 |                           10.29 |                3.97 |                        3.18 |                        8.49 |                         0.09 |               0.00 |              29.04 |              32.34 |              33.58 |              35.17 |              26.48 |
+|    2048 |           112 |          8038400.00 |               0.44 |                            9.26 |                5.20 |                        3.61 |                        9.38 |                         0.09 |               0.00 |              30.38 |              35.36 |              36.63 |              40.85 |              28.00 |
+|    2048 |           120 |          8720380.00 |               0.46 |                            8.97 |                5.44 |                        3.47 |                        9.39 |                         0.10 |               0.00 |              29.91 |              34.33 |              36.08 |              38.36 |              27.84 |
+|    2048 |           128 |          8339460.00 |               0.47 |                           11.57 |                5.64 |                        3.92 |                        9.35 |                         0.11 |               0.00 |              33.52 |              38.02 |              39.32 |              42.58 |              31.06 |
+|    2048 |           136 |          9078780.00 |               0.47 |                           11.30 |                5.39 |                        3.76 |                        9.01 |                         0.11 |               0.00 |              32.31 |              34.56 |              34.98 |              36.55 |              30.03 |
+|    2048 |           144 |          8794110.00 |               0.50 |                           10.94 |                7.06 |                        4.39 |                        9.72 |                         0.10 |               0.00 |              37.18 |              41.52 |              42.72 |              45.80 |              32.73 |
+|    2048 |           152 |          9527300.00 |               0.52 |                            9.28 |                7.14 |                        4.84 |                       10.36 |                         0.12 |               0.00 |              32.24 |              43.32 |              46.39 |              49.35 |              32.26 |
+|    2048 |           160 |          8984580.00 |               0.50 |                           13.36 |                7.18 |                        4.37 |                       10.19 |                         0.11 |               0.00 |              38.15 |              45.08 |              48.00 |              54.98 |              35.71 |
+|    2048 |           168 |          9719810.00 |               0.46 |                           14.35 |                5.22 |                        4.25 |                       10.02 |                         0.12 |               0.00 |              39.62 |              40.55 |              40.89 |              42.70 |              34.42 |
+|    2048 |           176 |         10377200.00 |               0.49 |                           10.02 |                7.91 |                        4.47 |                       10.81 |                         0.11 |               0.00 |              35.38 |              43.50 |              45.14 |              47.50 |              33.80 |
+|    2048 |           184 |          9897980.00 |               0.51 |                           12.32 |                8.22 |                        5.05 |                       10.56 |                         0.10 |               0.00 |              37.49 |              46.92 |              48.81 |              51.65 |              36.76 |
+|    2048 |           192 |         10129400.00 |               0.51 |                           12.08 |                9.12 |                        5.20 |                       10.59 |                         0.13 |               0.00 |              39.06 |              46.15 |              47.62 |              50.35 |              37.64 |
+|    2048 |           200 |         10266600.00 |               0.48 |                           13.34 |                9.49 |                        4.87 |                       10.76 |                         0.12 |               0.00 |              40.57 |              48.12 |              50.15 |              54.61 |              39.06 |
+|    2048 |           208 |         10154000.00 |               0.52 |                           15.22 |                9.31 |                        5.52 |                       10.54 |                         0.13 |               0.00 |              43.40 |              48.65 |              50.03 |              54.64 |              41.25 |
+|    2048 |           216 |         10244100.00 |               0.49 |                           14.22 |               11.24 |                        5.25 |                       10.88 |                         0.12 |               0.00 |              44.13 |              49.72 |              52.48 |              56.64 |              42.20 |
+|    2048 |           224 |         10235900.00 |               0.45 |                           18.12 |                9.39 |                        5.08 |                       10.62 |                         0.11 |               0.00 |              45.97 |              53.80 |              55.77 |              59.17 |              43.79 |
+|    2048 |           232 |         10397700.00 |               0.47 |                           17.96 |               10.05 |                        5.68 |                       10.37 |                         0.12 |               0.00 |              46.76 |              57.00 |              59.62 |              63.52 |              44.64 |
+|    2048 |           240 |         10287100.00 |               0.46 |                           21.07 |                9.12 |                        5.01 |                       10.69 |                         0.13 |               0.00 |              47.68 |              58.98 |              60.64 |              63.76 |              46.48 |
+|    2048 |           248 |         11300900.00 |               0.50 |                           12.09 |               14.32 |                        5.37 |                       11.27 |                         0.12 |               0.00 |              44.80 |              46.68 |              47.27 |              49.97 |              43.66 |
+|    2048 |           256 |         11272200.00 |               0.50 |                           11.16 |               16.80 |                        5.26 |                       11.49 |                         0.11 |               0.00 |              45.30 |              47.72 |              49.84 |              56.30 |              45.34 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA T4, TensorFlow with FP32
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA T4            |
+| Backend                      |TensorFlow        |
+| Backend accelerator          |Automatic FP16|
+| Precision                    |FP32      |
+| Model format                 |TensorFlow SavedModel   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_t4_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |           865871.00 |               1.10 |                            4.48 |                3.04 |                        0.57 |                        9.61 |                         0.04 |               0.00 |              19.84 |              22.06 |              22.56 |              23.29 |              18.84 |
+|    2048 |            16 |          1089540.00 |               1.09 |                            5.22 |                7.21 |                        1.12 |                       15.25 |                         0.06 |               0.00 |              31.43 |              33.89 |              34.91 |              36.03 |              29.95 |
+|    2048 |            24 |          1099780.00 |               1.31 |                            6.78 |               10.59 |                        1.88 |                       22.41 |                         0.10 |               0.00 |              44.68 |              47.61 |              48.06 |              48.61 |              43.06 |
+|    2048 |            32 |          1171460.00 |               1.37 |                            8.07 |               13.38 |                        2.46 |                       28.96 |                         0.12 |               0.00 |              56.02 |              59.78 |              60.17 |              60.71 |              54.35 |
+|    2048 |            40 |          1325780.00 |               1.40 |                            6.04 |               13.19 |                        2.44 |                       37.20 |                         0.12 |               0.00 |              60.64 |              63.12 |              63.94 |              71.32 |              60.39 |
+|    2048 |            48 |          1376260.00 |               1.39 |                            8.23 |               12.70 |                        2.71 |                       44.43 |                         0.14 |               0.00 |              69.42 |              71.25 |              71.74 |              72.17 |              69.59 |
+|    2048 |            56 |          1376260.00 |               1.44 |                            8.59 |               18.14 |                        2.68 |                       50.12 |                         0.14 |               0.00 |              81.22 |              82.90 |              83.64 |              85.12 |              81.11 |
+|    2048 |            64 |          1368060.00 |               1.51 |                            8.70 |               21.25 |                        3.35 |                       57.52 |                         0.18 |               0.00 |              92.50 |              94.70 |              95.23 |              96.06 |              92.51 |
+|    2048 |            72 |          1372160.00 |               1.51 |                            9.72 |               24.49 |                        3.77 |                       63.79 |                         0.19 |               0.00 |             103.07 |             107.19 |             107.84 |             108.11 |             103.48 |
+|    2048 |            80 |          1310720.00 |               1.38 |                            9.70 |               27.25 |                        4.10 |                       72.40 |                         0.22 |               0.00 |             114.95 |             117.67 |             118.11 |             118.94 |             115.04 |
+|    2048 |            88 |          1308670.00 |               1.58 |                           11.56 |               26.68 |                        4.21 |                       81.20 |                         0.25 |               0.00 |             125.08 |             129.18 |             129.83 |             130.91 |             125.48 |
+|    2048 |            96 |          1347580.00 |               1.65 |                           11.22 |               32.70 |                        4.69 |                       87.01 |                         0.27 |               0.00 |             137.81 |             139.51 |             140.49 |             143.02 |             137.55 |
+|    2048 |           104 |          1347580.00 |               1.69 |                            9.35 |               40.72 |                        4.42 |                       90.71 |                         0.25 |               0.00 |             147.06 |             149.22 |             149.70 |             150.16 |             147.15 |
+|    2048 |           112 |          1314820.00 |               1.67 |                           11.60 |               42.33 |                        5.27 |                       97.35 |                         0.28 |               0.00 |             160.13 |             165.58 |             174.67 |             182.71 |             158.50 |
+|    2048 |           120 |          1259520.00 |               1.68 |                           12.02 |               45.84 |                        5.43 |                      105.70 |                         0.30 |               0.00 |             170.64 |             174.06 |             175.21 |             176.62 |             170.98 |
+|    2048 |           128 |          1318910.00 |               1.80 |                           11.93 |               50.38 |                        5.84 |                      112.15 |                         0.32 |               0.00 |             182.70 |             186.44 |             187.30 |             187.74 |             182.42 |
+|    2048 |           136 |          1314820.00 |               1.70 |                           17.22 |               46.92 |                        6.63 |                      120.48 |                         0.44 |               0.00 |             192.88 |             196.29 |             196.85 |             201.14 |             193.39 |
+|    2048 |           144 |          1311460.00 |               1.68 |                           16.08 |               51.66 |                        6.63 |                      127.27 |                         0.39 |               0.00 |             203.93 |             207.14 |             208.27 |             210.94 |             203.72 |
+|    2048 |           152 |          1267710.00 |               1.66 |                           15.52 |               58.86 |                        6.65 |                      133.29 |                         0.38 |               0.00 |             216.69 |             221.59 |             228.32 |             228.91 |             216.36 |
+|    2048 |           160 |          1200130.00 |               1.67 |                           15.44 |               63.33 |                        6.73 |                      140.23 |                         0.38 |               0.00 |             228.08 |             230.84 |             232.18 |             235.98 |             227.78 |
+|    2048 |           168 |          1290240.00 |               1.72 |                           15.64 |               65.90 |                        7.50 |                      147.90 |                         0.40 |               0.00 |             239.57 |             242.45 |             246.57 |             251.30 |             239.07 |
+|    2048 |           176 |          1317590.00 |               1.64 |                           14.87 |               72.50 |                        7.94 |                      153.87 |                         0.41 |               0.00 |             251.88 |             256.37 |             259.48 |             260.15 |             251.23 |
+|    2048 |           184 |          1247230.00 |               1.72 |                           14.28 |               75.90 |                        8.05 |                      162.36 |                         0.44 |               0.00 |             263.65 |             265.82 |             266.30 |             268.95 |             262.75 |
+|    2048 |           192 |          1251330.00 |               1.69 |                           15.09 |               79.04 |                        9.36 |                      168.48 |                         0.47 |               0.00 |             274.96 |             277.44 |             278.19 |             279.32 |             274.14 |
+|    2048 |           200 |          1179650.00 |               1.66 |                           14.45 |               93.11 |                        7.82 |                      167.90 |                         0.44 |               0.00 |             274.52 |             358.83 |             362.49 |             364.92 |             285.37 |
+|    2048 |           208 |          1179650.00 |               1.59 |                           14.07 |              104.92 |                        8.14 |                      168.38 |                         0.46 |               0.00 |             276.92 |             363.75 |             364.94 |             367.04 |             297.58 |
+|    2048 |           216 |          1179650.00 |               1.66 |                           15.02 |              115.94 |                        7.78 |                      166.93 |                         0.50 |               0.00 |             277.43 |             364.02 |             365.33 |             366.67 |             307.84 |
+|    2048 |           224 |          1178470.00 |               1.64 |                           14.27 |              128.81 |                        8.77 |                      166.54 |                         0.47 |               0.00 |             358.49 |             366.57 |             367.23 |             368.10 |             320.50 |
+|    2048 |           232 |          1179650.00 |               1.51 |                           20.32 |              132.74 |                        8.31 |                      169.39 |                         0.44 |               0.00 |             362.49 |             369.42 |             370.47 |             372.11 |             332.71 |
+|    2048 |           240 |          1179650.00 |               1.58 |                           18.17 |              146.59 |                        8.71 |                      168.74 |                         0.44 |               0.00 |             365.72 |             368.24 |             369.50 |             372.40 |             344.22 |
+|    2048 |           248 |          1179650.00 |               1.58 |                           20.87 |              154.53 |                        8.20 |                      168.54 |                         0.44 |               0.00 |             363.30 |             371.63 |             373.75 |             376.55 |             354.16 |
+|    2048 |           256 |          1179650.00 |               1.66 |                           17.51 |              167.41 |                        7.93 |                      169.97 |                         0.44 |               0.00 |             365.42 |             367.29 |             367.73 |             369.40 |             364.92 |
+
+</details>
+
+
+
+
+#### Online: NVIDIA T4, NVIDIA TensorRT with FP16
+
+Our results were obtained using the following configuration:
+
+| Parameter Name               | Parameter Value              |
+|:-----------------------------|:-----------------------------|
+| GPU                          |NVIDIA T4            |
+| Backend                      |NVIDIA TensorRT        |
+| Backend accelerator          |-|
+| Precision                    |FP16      |
+| Model format                 |NVIDIA TensorRT   |
+| Max batch size               |131072 |
+| Number of model instances    |2|
+| Export Format | TensorFlow SavedModel    |
+| NVIDIA TensorRT Capture CUDA Graph | Enabled    |
+| Device Kind | gpu                 |
+| Torch Jit | none                 |
+
+
+<table>
+<tbody>
+  <tr>
+    <td colspan="2" align="center"><img src="./reports/nvidia_t4_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png"></td>
+  </tr>
+</tbody>
+</table>
+
+<details>
+<summary>Results Table</summary>
+
+|   Batch |   Concurrency |   Inferences/Second |   Client Send (ms) |   Network+Server Send/Recv (ms) |   Server Queue (ms) |   Server Compute Input (ms) |   Server Compute Infer (ms) |   Server Compute Output (ms) |   Client Recv (ms) |   p50 latency (ms) |   p90 latency (ms) |   p95 latency (ms) |   p99 latency (ms) |   avg latency (ms) |
+|--------:|--------------:|--------------------:|-------------------:|--------------------------------:|--------------------:|----------------------------:|----------------------------:|-----------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|    2048 |             8 |          1689960.00 |               0.82 |                            3.48 |                1.44 |                        0.82 |                        3.02 |                         0.06 |               0.00 |              10.52 |              11.08 |              11.28 |              11.82 |               9.64 |
+|    2048 |            16 |          1585610.00 |               1.10 |                            5.57 |                3.93 |                        1.59 |                        8.20 |                         0.06 |               0.00 |              19.40 |              25.78 |              26.49 |              29.85 |              20.45 |
+|    2048 |            24 |          1564670.00 |               1.43 |                            6.54 |                7.94 |                        2.13 |                       12.68 |                         0.07 |               0.00 |              32.02 |              32.82 |              32.97 |              33.25 |              30.80 |
+|    2048 |            32 |          1525760.00 |               1.55 |                            8.59 |                8.88 |                        2.97 |                       19.30 |                         0.08 |               0.00 |              45.15 |              50.58 |              57.80 |              61.77 |              41.38 |
+|    2048 |            40 |          1583100.00 |               1.55 |                           10.34 |               10.41 |                        3.47 |                       24.54 |                         0.09 |               0.00 |              50.92 |              61.64 |              73.60 |              85.84 |              50.41 |
+|    2048 |            48 |          1640450.00 |               1.60 |                           10.56 |               13.58 |                        4.51 |                       28.45 |                         0.12 |               0.00 |              61.22 |              74.89 |              86.59 |              91.35 |              58.82 |
+|    2048 |            56 |          1525760.00 |               1.64 |                           13.66 |               10.72 |                        4.76 |                       40.94 |                         0.14 |               0.00 |              78.29 |              90.64 |              91.98 |              97.42 |              71.86 |
+|    2048 |            64 |          1574910.00 |               1.59 |                           12.86 |               13.92 |                        6.62 |                       46.63 |                         0.17 |               0.00 |              84.43 |              91.45 |             112.34 |             125.38 |              81.79 |
+|    2048 |            72 |          1473090.00 |               1.69 |                           15.22 |               20.89 |                        6.43 |                       48.72 |                         0.20 |               0.00 |              95.13 |             120.03 |             122.96 |             124.02 |              93.14 |
+|    2048 |            80 |          1662980.00 |               1.57 |                           17.32 |               21.28 |                        6.73 |                       46.90 |                         0.21 |               0.00 |              95.96 |             132.60 |             135.03 |             148.41 |              94.02 |
+|    2048 |            88 |          1624060.00 |               1.61 |                           16.58 |               24.76 |                        7.94 |                       50.47 |                         0.20 |               0.00 |             101.01 |             137.54 |             140.87 |             143.96 |             101.56 |
+|    2048 |            96 |          1703940.00 |               1.61 |                           17.20 |               25.42 |                        7.61 |                       54.91 |                         0.20 |               0.00 |             110.98 |             135.92 |             151.28 |             165.95 |             106.95 |
+|    2048 |           104 |          1622020.00 |               1.89 |                           17.01 |               41.48 |                        7.07 |                       53.83 |                         0.19 |               0.00 |             122.34 |             135.69 |             146.57 |             168.18 |             121.46 |
+|    2048 |           112 |          1945600.00 |               1.74 |                           13.44 |               28.63 |                        7.23 |                       60.03 |                         0.18 |               0.00 |             111.46 |             142.73 |             151.17 |             171.38 |             111.26 |
+|    2048 |           120 |          1919100.00 |               1.74 |                           13.70 |               32.97 |                        7.68 |                       61.34 |                         0.18 |               0.00 |             115.54 |             146.44 |             149.95 |             170.00 |             117.61 |
+|    2048 |           128 |          1933310.00 |               1.68 |                           15.30 |               38.92 |                        7.28 |                       61.93 |                         0.21 |               0.00 |             127.46 |             148.73 |             167.49 |             180.54 |             125.32 |
+|    2048 |           136 |          1732920.00 |               1.79 |                           16.22 |               52.00 |                        9.77 |                       65.01 |                         0.22 |               0.00 |             161.86 |             173.24 |             173.96 |             174.94 |             145.03 |
+|    2048 |           144 |          1802240.00 |               1.74 |                           19.45 |               55.78 |                        8.68 |                       67.15 |                         0.20 |               0.00 |             162.88 |             172.74 |             173.50 |             177.37 |             153.00 |
+|    2048 |           152 |          1898500.00 |               1.64 |                           16.21 |               58.72 |                        8.35 |                       68.42 |                         0.20 |               0.00 |             163.08 |             172.43 |             173.68 |             178.57 |             153.55 |
+|    2048 |           160 |          2060290.00 |               1.74 |                           15.49 |               51.38 |                       10.67 |                       68.51 |                         0.32 |               0.00 |             163.39 |             174.03 |             175.48 |             176.47 |             148.11 |
+|    2048 |           168 |          1961980.00 |               1.57 |                           22.56 |               58.75 |                       10.48 |                       68.02 |                         0.21 |               0.00 |             166.14 |             177.22 |             180.09 |             182.40 |             161.58 |
+|    2048 |           176 |          2166780.00 |               1.64 |                           14.96 |               45.06 |                       10.78 |                       81.05 |                         0.21 |               0.00 |             136.12 |             200.28 |             201.15 |             204.05 |             153.70 |
+|    2048 |           184 |          2119680.00 |               1.60 |                           18.60 |               57.29 |                        9.85 |                       80.64 |                         0.27 |               0.00 |             171.14 |             213.86 |             218.87 |             249.21 |             168.25 |
+|    2048 |           192 |          2097150.00 |               1.59 |                           15.68 |               56.32 |                       10.56 |                       82.88 |                         0.22 |               0.00 |             194.18 |             201.81 |             202.93 |             206.86 |             167.26 |
+|    2048 |           200 |          2097150.00 |               1.58 |                           17.20 |               61.80 |                       10.67 |                       82.66 |                         0.28 |               0.00 |             197.23 |             214.77 |             220.22 |             223.59 |             174.20 |
+|    2048 |           208 |          2097150.00 |               1.55 |                           15.34 |               70.57 |                       11.21 |                       81.81 |                         0.24 |               0.00 |             198.06 |             220.45 |             222.52 |             224.45 |             180.73 |
+|    2048 |           216 |          2103300.00 |               1.60 |                           16.60 |               76.06 |                       10.58 |                       82.43 |                         0.24 |               0.00 |             199.23 |             223.14 |             224.37 |             225.89 |             187.51 |
+|    2048 |           224 |          2097150.00 |               1.52 |                           16.82 |               81.37 |                        9.81 |                       82.91 |                         0.22 |               0.00 |             210.20 |             220.22 |             220.76 |             221.99 |             192.66 |
+|    2048 |           232 |          2095060.00 |               1.52 |                           17.79 |               88.51 |                       10.20 |                       82.63 |                         0.24 |               0.00 |             218.66 |             222.50 |             223.32 |             227.20 |             200.89 |
+|    2048 |           240 |          2095060.00 |               1.47 |                           18.26 |               93.63 |                       10.26 |                       82.72 |                         0.25 |               0.00 |             219.27 |             222.50 |             223.44 |             226.30 |             206.61 |
+|    2048 |           248 |          2076670.00 |               1.42 |                           25.49 |               95.51 |                       11.06 |                       81.93 |                         0.23 |               0.00 |             221.54 |             224.98 |             227.86 |             232.00 |             215.63 |
+|    2048 |           256 |          2095060.00 |               1.46 |                           17.32 |              109.94 |                       10.63 |                       82.65 |                         0.24 |               0.00 |             222.16 |             225.26 |             226.11 |             229.25 |             222.25 |
+
+</details>
+
+
+
+
+## Advanced
+|  Inference runtime | Mnemonic used in scripts |
+|--------------------|--------------------------|
+| [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model) | `tf-savedmodel`  |
+| [TensorFlow TensorRT](https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html) | `tf-trt` |
+| [ONNX](https://onnx.ai) | `onnx` |
+| [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) | `trt` |
+
+
+### Step by step deployment process
+Commands described below can be used for exporting, converting and profiling the model.
+
+#### Clone Repository
+IMPORTANT: This step is executed on the host computer.
+<details>
+<summary>Clone Repository Command</summary>
+
+```shell
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
+```
+</details>
+
+#### Start Triton Inference Server
+Setup the environment in the host computer and start Triton Inference Server.
+<details>
+<summary>Setup Environment and Start Triton Inference Server Command</summary>
+
+```shell
+source ./triton/scripts/setup_environment.sh
+./triton/scripts/docker/triton_inference_server.sh
+```
+</details>
+
+#### Setup Container
+Build and run a container that extends the NGC TensorFlow2 container with the Triton Inference Server client libraries and dependencies.
+<details>
+<summary>Setup Container Command</summary>
+
+Build container:
+
+```shell
+./triton/scripts/docker/build.sh
+```
+
+Run container in interactive mode:
+
+```shell
+./triton/scripts/docker/interactive.sh
+```
+
+Setup environment in order to share artifacts in steps and with Triton Inference Server:
+
+```shell
+source ./triton/scripts/setup_environment.sh
+```
+
+</details>
+
+
+#### Export Model
+Export the model from Python source to the desired format (e.g. SavedModel or TorchScript).
+<details>
+<summary>Export Model Command</summary>
+
+```shell
+python3 triton/export_model.py \
+    --input-path triton/model.py \
+    --input-type tf-keras \
+    --output-path ${SHARED_DIR}/exported_model.savedmodel \
+    --output-type tf-savedmodel \
+    --ignore-unknown-parameters \
+    \
+    --checkpoint-dir ${CHECKPOINTS_DIR}/widedeep_tf2_amp_base_128k_nvtabular/checkpoint \
+    --batch-size 131072 \
+    --precision fp32 \
+    \
+    --dataloader triton/dataloader.py \
+    --batch-size 131072 \
+    --data-pattern "${DATASETS_DIR}/outbrain/valid/*.parquet"
+```
+
+</details>
+
+
+
+#### Convert Model
+Convert the model from training to inference format (e.g. TensorRT).
+<details>
+<summary>Convert Model Command</summary>
+
+```shell
+model-navigator convert \
+    --model-name WidenDeep \
+    --model-path ${SHARED_DIR}/exported_model.savedmodel \
+    --output-path ${SHARED_DIR}/converted_model \
+    --target-formats tf-savedmodel \
+    --target-precisions fp32 \
+    --launch-mode local \
+    --override-workspace \
+    --verbose \
+    \
+    --onnx-opsets 13 \
+    --max-batch-size 131072 \
+    --max-workspace-size 8589934592 \
+    --atol wide_deep_model=0.015 \
+    --rtol wide_deep_model=12.0
+```
+
+</details>
+
+
+#### Deploy Model
+Configure the model on Triton Inference Server.
+Generate the configuration from your model repository.
+<details>
+
+<summary>Deploy Model Command</summary>
+
+```shell
+model-navigator triton-config-model \
+    --model-repository ${MODEL_REPOSITORY_PATH} \
+    --model-name WidenDeep \
+    --model-version 1 \
+    --model-path ${SHARED_DIR}/converted_model \
+    --model-format tf-savedmodel \
+    --model-control-mode explicit \
+    --load-model \
+    --load-model-timeout-s 120 \
+    --verbose \
+    \
+    --batching dynamic \
+    --backend-accelerator amp \
+    --tensorrt-precision fp32 \
+    --tensorrt-capture-cuda-graph \
+    --max-batch-size 131072 \
+    --preferred-batch-sizes 131072 \
+    --engine-count-per-device gpu=2
+```
+
+</details>
+
+
+
+
+#### Triton Performance Offline Test
+We want to maximize throughput. This scenario assumes that your data is available
+for inference and that it saturates the maximum batch size quickly.
+Triton Inference Server supports offline scenarios with static batching.
+Static batching allows inference requests to be served
+as they are received. The largest improvements to throughput come
+from increasing the batch size due to efficiency gains in the GPU with larger
+batches.
+<details>
+<summary>Triton Performance Offline Test Command</summary>
+
+```shell
+python triton/run_performance_on_triton.py \
+    --model-repository ${MODEL_REPOSITORY_PATH} \
+    --model-name WidenDeep \
+    --input-data random \
+    --batch-sizes 1 16384 32768 49152 65536 81920 98304 114688 131072 \
+    --concurrency 1 \
+    --performance-tool perf_analyzer \
+    --measurement-request-count 100 \
+    --evaluation-mode offline \
+    --warmup \
+    --result-path ${SHARED_DIR}/triton_performance_offline.csv
+```
+
+ </details>
+
+
+
+#### Triton Performance Online Test
+We want to maximize throughput within latency budget constraints.
+Dynamic batching is a feature of Triton Inference Server that allows
+inference requests to be combined by the server, so that a batch is
+created dynamically, resulting in a reduced average latency.
+<details>
+<summary>Triton Performance Online Test</summary>
+
+```shell
+python triton/run_performance_on_triton.py \
+    --model-repository ${MODEL_REPOSITORY_PATH} \
+    --model-name WidenDeep \
+    --input-data random \
+    --batch-sizes 2048 \
+    --concurrency 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 \
+    --performance-tool perf_analyzer \
+    --measurement-request-count 500 \
+    --evaluation-mode online \
+    --warmup \
+    --result-path ${SHARED_DIR}/triton_performance_online.csv
+```
+
+
+</details>
+
+### Latency explanation
+A typical Triton Inference Server pipeline can be broken down into the following steps:
+
+1. The client serializes the inference request into a message and sends it to
+the server (Client Send).
+2. The message travels over the network from the client to the server (Network).
+3. The message arrives at the server and is deserialized (Server Receive).
+4. The request is placed on the queue (Server Queue).
+5. The request is removed from the queue and computed (Server Compute).
+6. The completed request is serialized in a message and sent back to
+the client (Server Send).
+7. The completed message then travels over the network from the server
+to the client (Network).
+8. The completed message is deserialized by the client and processed as
+a completed inference request (Client Receive).
+
+Generally, for local clients, steps 1-4 and 6-8 will only occupy
+a small fraction of time, compared to step 5. In distributed systems and online processing,
+where the client and server sides are connected through a network, the send and receive steps might have an impact
+on overall processing performance. In order to analyze the possible bottlenecks, the detailed
+charts are presented in online scenario cases.
+
+
+
+## Release Notes
+We’re constantly refining and improving our performance on AI
+and HPC workloads even on the same hardware with frequent updates
+to our software stack. For our latest performance data, refer
+to these pages for
+[AI](https://developer.nvidia.com/deep-learning-performance-training-inference)
+and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
+
+### Changelog
+
+May 2022
+- Initial release
+
+### Known issues
+
+- There are no known issues with this model.

+ 97 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/calculate_metrics.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Using `calculate_metrics.py` script, you can obtain model accuracy/error metrics using defined `MetricsCalculator` class.
+
+Data provided to `MetricsCalculator` are obtained from dump files
+stored in directory pointed by `--dump-dir` argument.
+Above files are prepared by `run_inference_on_fw.py` and `run_inference_on_triton.py` scripts.
+
+Output data is stored in csv file pointed by `--csv` argument.
+
+Example call:
+
+```shell script
+python ./triton/calculate_metrics.py \
+    --dump-dir /results/dump_triton \
+    --csv /results/accuracy_results.csv \
+    --metrics metrics.py \
+    --metric-class-param1 value
+```
+"""
+
+import argparse
+import csv
+import logging
+import string
+from pathlib import Path
+
+# method from PEP-366 to support relative import in executed modules
+
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from .deployment_toolkit.args import ArgParserGenerator
+from .deployment_toolkit.core import BaseMetricsCalculator, load_from_file
+from .deployment_toolkit.dump import JsonDumpReader
+
+LOGGER = logging.getLogger("calculate_metrics")
+TOTAL_COLUMN_NAME = "_total_"
+
+
def main():
    """CLI entry point: compute metrics from dumped inference data and write them to a CSV."""
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Run models with given dataloader", allow_abbrev=False)
    parser.add_argument("--metrics", help="Path to python module containing metrics calculator", required=True)
    parser.add_argument("--csv", help="Path to csv file", required=True)
    parser.add_argument("--dump-dir", help="Path to directory with dumped outputs (and labels)", required=True)

    # First pass: parse only the known args so the metrics module can be located;
    # its MetricsCalculator class then contributes additional CLI arguments.
    args, *_ = parser.parse_known_args()

    MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
    ArgParserGenerator(MetricsCalculator).update_argparser(parser)

    # Second pass: parse the full argument set, including calculator-specific args.
    args = parser.parse_args()

    LOGGER.info("args:")
    for key, value in vars(args).items():
        LOGGER.info(f"    {key} = {value}")

    MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
    metrics_calculator: BaseMetricsCalculator = ArgParserGenerator(MetricsCalculator).from_args(args)

    # Feed every dumped batch (ids/inputs/labels/outputs) into the calculator.
    reader = JsonDumpReader(args.dump_dir)
    for ids, x, y_true, y_pred in reader.iterate_over(["ids", "inputs", "labels", "outputs"]):
        ids = list(ids["ids"]) if ids is not None else None
        metrics_calculator.update(ids=ids, x=x, y_pred=y_pred, y_real=y_true)
    metrics = metrics_calculator.metrics

    # Reject metric names containing whitespace before writing the CSV header.
    metric_names_with_space = [name for name in metrics if any([c in string.whitespace for c in name])]
    if metric_names_with_space:
        raise ValueError(f"Metric names shall have no spaces; Incorrect names: {', '.join(metric_names_with_space)}")

    csv_path = Path(args.csv)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(metrics.keys()))
        writer.writeheader()
        writer.writerow(metrics)
+
+
+if __name__ == "__main__":
+    main()

+ 63 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/dataloader.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+
+import tensorflow as tf
+from triton.tf_dataloader import eval_input_fn
+
+
def get_dataloader_fn(
        *,
        data_pattern: str,
        batch_size: int,
):
    """Return a zero-arg factory producing a generator of (ids, inputs, labels) numpy batches."""
    matched_files = glob.glob(data_pattern)
    assert len(matched_files), "Expected at least 1 parquet file, found 0"
    # keep the input pipeline on CPU; the model itself may run elsewhere
    with tf.device('/cpu:0'):
        dataset = eval_input_fn(
            files_path=matched_files,
            records_batch_size=batch_size,
        )

    def _get_dataloader():
        for features, labels, sample_ids in dataset:
            yield (
                sample_ids.numpy(),
                {name: tensor.numpy() for name, tensor in features.items()},
                {'wide_deep_model': labels.numpy()},
            )

    return _get_dataloader
+
+
def main():
    """Smoke-test entry point: iterate the dataloader and print each batch."""
    import argparse

    parser = argparse.ArgumentParser(description="short_description")
    parser.add_argument("--data_pattern", required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    args = parser.parse_args()

    dataloader = get_dataloader_fn(
        data_pattern=args.data_pattern,
        batch_size=args.batch_size,
    )

    for _ids, x, y in dataloader():
        print(x, y)
+
+
+if __name__ == "__main__":
+    main()

+ 13 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 136 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/args.py

@@ -0,0 +1,136 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import inspect
+import logging
+from typing import Callable, Dict, Optional, Union
+
+from model_navigator.utils.cli import is_dict_generic, is_list_generic, is_optional_generic
+
+from .core import GET_ARGPARSER_FN_NAME, load_from_file
+
+LOGGER = logging.getLogger(__name__)
+
+
def str2bool(v):
    """Parse a boolean CLI value; accepts common yes/no spellings (case-insensitive)."""
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
def filter_fn_args(args: Union[dict, argparse.Namespace], fn: Callable) -> dict:
    """Keep only the entries of `args` whose keys are parameters of `fn`."""
    accepted = set(inspect.signature(fn).parameters)
    if isinstance(args, argparse.Namespace):
        args = vars(args)
    return {name: value for name, value in args.items() if name in accepted}
+
+
def add_args_for_fn_signature(parser, fn) -> argparse.ArgumentParser:
    """Add one CLI argument per parameter of `fn`'s signature.

    Argument types are derived from annotations: Optional[...] is unwrapped,
    List[cls] maps to repeated `cls` values, bool is parsed via str2bool.
    Parameters without defaults become required flags; dict-typed parameters
    are unsupported and raise RuntimeError.
    """
    parser.conflict_handler = "resolve"
    signature = inspect.signature(fn)
    for parameter in signature.parameters.values():
        # skip implicit/catch-all parameters that never map to CLI flags
        if parameter.name in ["self", "args", "kwargs"]:
            continue
        argument_kwargs = {}
        if parameter.annotation != inspect.Parameter.empty:

            is_optional = is_optional_generic(parameter.annotation)
            if is_optional:
                annotation = parameter.annotation.__args__[0]  # Optional[cls] will be changed into Union[cls, None]
            else:
                annotation = parameter.annotation

            is_list = is_list_generic(annotation)
            is_dict = is_dict_generic(annotation)

            # NOTE(review): bool checks use the raw annotation, so Optional[bool]
            # falls through to the generic branch — confirm that is intended.
            if parameter.annotation == bool:
                argument_kwargs["type"] = str2bool
                argument_kwargs["choices"] = [0, 1]
            elif is_list:
                argument_kwargs["type"] = annotation.__args__[0]  # List[cls] -> cls
            elif is_dict:
                raise RuntimeError(
                    f"Could not prepare argument parser for {parameter.name}: {parameter.annotation} in {fn}"
                )
            else:
                argument_kwargs["type"] = annotation

        if parameter.default != inspect.Parameter.empty:
            if parameter.annotation == bool:
                argument_kwargs["default"] = str2bool(parameter.default)
            else:
                argument_kwargs["default"] = parameter.default
        else:
            # no default value -> the CLI flag is mandatory
            argument_kwargs["required"] = True
        name = parameter.name.replace("_", "-")
        LOGGER.debug(f"Adding argument {name} with {argument_kwargs}")
        parser.add_argument(f"--{name}", **argument_kwargs)
    return parser
+
+
class ArgParserGenerator:
    """Builds argparse arguments from a function/class signature and
    instantiates the wrapped target from the parsed CLI arguments."""

    def __init__(self, cls_or_fn, module_path: Optional[str] = None):
        self._cls_or_fn = cls_or_fn

        init_method_name = "__init__"
        # for classes, the __init__ signature drives argument generation
        self._handle = cls_or_fn if inspect.isfunction(cls_or_fn) else getattr(cls_or_fn, init_method_name, None)
        input_is_python_file = module_path and module_path.endswith(".py")
        self._input_path = module_path if input_is_python_file else None
        self._required_fn_name_for_signature_parsing = getattr(
            cls_or_fn, "required_fn_name_for_signature_parsing", None
        )

    def update_argparser(self, parser):
        """Add an argument group for the wrapped callable's parameters to `parser`."""
        name = self._handle.__name__
        group_parser = parser.add_argument_group(name)
        add_args_for_fn_signature(group_parser, fn=self._handle)
        self._update_argparser(group_parser)

    def get_args(self, args: argparse.Namespace):
        """Return kwargs accepted by the wrapped callable, merged with any
        custom arguments contributed by the source module's parser hook."""
        filtered_args = filter_fn_args(args, fn=self._handle)

        # a throwaway parser is used solely to discover the custom argument names
        tmp_parser = argparse.ArgumentParser(allow_abbrev=False)
        self._update_argparser(tmp_parser)
        custom_names = [
            p.dest.replace("-", "_") for p in tmp_parser._actions if not isinstance(p, argparse._HelpAction)
        ]
        custom_params = {n: getattr(args, n) for n in custom_names}
        filtered_args = {**filtered_args, **custom_params}
        return filtered_args

    def from_args(self, args: Union[argparse.Namespace, Dict]):
        """Instantiate (or call) the wrapped class/function using parsed args."""
        args = self.get_args(args)
        LOGGER.info(f"Initializing {self._cls_or_fn.__name__}({args})")
        return self._cls_or_fn(**args)

    def _update_argparser(self, parser):
        # If the target came from a .py file, let that module customize the
        # parser via an `update_argparser` hook, or fall back to parsing the
        # signature of its declared `required_fn_name_for_signature_parsing`.
        label = "argparser_update"
        if self._input_path:
            update_argparser_handle = load_from_file(self._input_path, label=label, target=GET_ARGPARSER_FN_NAME)
            if update_argparser_handle:
                update_argparser_handle(parser)
            elif self._required_fn_name_for_signature_parsing:
                fn_handle = load_from_file(
                    self._input_path, label=label, target=self._required_fn_name_for_signature_parsing
                )
                if fn_handle:
                    add_args_for_fn_signature(parser, fn_handle)

+ 270 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/core.py

@@ -0,0 +1,270 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+import importlib
+import logging
+import os
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
+
+import numpy as np
+
+LOGGER = logging.getLogger(__name__)
+DATALOADER_FN_NAME = "get_dataloader_fn"
+GET_MODEL_FN_NAME = "get_model"
+GET_SERVING_INPUT_RECEIVER_FN = "get_serving_input_receiver_fn"
+GET_ARGPARSER_FN_NAME = "update_argparser"
+
+
class TensorSpec(NamedTuple):
    """Describes a model input/output tensor: name, dtype string, and shape."""

    name: str
    dtype: str
    shape: Tuple
+
+
class Parameter(Enum):
    """Base enum for deployment parameters; sorts and prints by its string value."""

    def __lt__(self, other: "Parameter") -> bool:
        # ordering by underlying value enables sorted listings of parameters
        return self.value < other.value

    def __str__(self):
        return self.value
+
+
class BackendAccelerator(Parameter):
    """Accelerator attached to the Triton backend (none, AMP, or TensorRT)."""

    NONE = "none"
    AMP = "amp"
    TRT = "trt"
+
+
class ExportPrecision(Parameter):
    """Numeric precision used when exporting the model."""

    FP16 = "fp16"
    FP32 = "fp32"
+
+
class Precision(Parameter):
    """Numeric precision of the deployed model."""

    INT8 = "int8"
    FP16 = "fp16"
    FP32 = "fp32"
+
+
class DeviceKind(Parameter):
    """Kind of device the model runs on."""

    CPU = "cpu"
    GPU = "gpu"
+
+
class ModelInputType(Parameter):
    """Source framework/format of the model provided for export."""

    TF_GRAPHDEF = "tf-graphdef"
    TF_ESTIMATOR = "tf-estimator"
    TF_KERAS = "tf-keras"
    PYT = "pyt"
+
+
class Format(Parameter):
    """Model formats handled by the conversion/deployment pipeline."""

    TF_SAVEDMODEL = "tf-savedmodel"
    TF_TRT = "tf-trt"
    ONNX = "onnx"
    TORCHSCRIPT = "torchscript"
    TRT = "trt"
    FASTERTRANSFORMER = "fastertransformer"

    # deprecated, backward compatibility only
    TS_TRACE = "ts-trace"
    TS_SCRIPT = "ts-script"
+
+
class ExportFormat(Parameter):
    """Model formats supported by the export step."""

    TF_SAVEDMODEL = "tf-savedmodel"
    TORCHSCRIPT = "torchscript"
    ONNX = "onnx"

    # deprecated, backward compatibility only
    TS_TRACE = "ts-trace"
    TS_SCRIPT = "ts-script"
+
+
class TorchJit(Parameter):
    """TorchScript capture mode for PyTorch models (trace or script)."""

    NONE = "none"
    TRACE = "trace"
    SCRIPT = "script"
+
+
class Model(NamedTuple):
    """A loaded model handle together with its precision and I/O signature."""

    handle: object
    # TODO: precision should be removed
    precision: Optional[Precision]
    inputs: Dict[str, TensorSpec]
    outputs: Dict[str, TensorSpec]
+
+
def load_from_file(file_path, label, target):
    """Import the module at `file_path` under name `label` and return its
    attribute `target`, or None when the attribute is absent."""
    spec = importlib.util.spec_from_file_location(name=label, location=file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # pytype: disable=attribute-error
    return getattr(module, target, None)
+
+
class BaseLoader(abc.ABC):
    """Interface for loading a model from a file into a `Model` handle."""

    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def load(self, model_path: Union[str, Path], **kwargs) -> Model:
        """
        Load and process a model from `model_path` based on the given set of args.
        """
        pass
+
+
class BaseSaver(abc.ABC):
    """Interface for serializing a `Model` to a file."""

    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> None:
        """
        Save `model` to `model_path`; `dataloader_fn` may supply sample data.
        """
        pass
+
+
class BaseRunner(abc.ABC):
    """Interface for runners that execute inference for a loaded `Model`."""

    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def init_inference(self, model: Model):
        # presumably returns an inference session object — see concrete runners
        raise NotImplementedError
+
+
class BaseRunnerSession(abc.ABC):
    """Context-manager session that runs inference for a loaded `Model`."""

    def __init__(self, model: Model):
        self._model = model

    @abc.abstractmethod
    def __enter__(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError()

    @abc.abstractmethod
    def __call__(self, x: Dict[str, object]):
        """Run inference on the input dict `x`."""
        raise NotImplementedError()

    def _set_env_variables(self) -> Dict[str, object]:
        """this method not remove values; fix it if needed"""
        # NOTE(review): `to_set` is an empty placeholder here; subclasses are
        # expected to fill it with env vars needed for the session. Returns the
        # previous values so they can be restored later.
        to_set = {}
        old_values = {k: os.environ.pop(k, None) for k in to_set}
        os.environ.update(to_set)
        return old_values

    def _recover_env_variables(self, old_envs: Dict[str, object]):
        # restore the environment captured by _set_env_variables
        for name, value in old_envs.items():
            if value is None:
                del os.environ[name]
            else:
                os.environ[name] = str(value)
+
+
class BaseConverter(abc.ABC):
    """Interface for model format/precision converters."""

    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def convert(self, model: Model, dataloader_fn) -> Model:
        """Convert `model` to the target format; `dataloader_fn` may supply samples."""
        raise NotImplementedError()

    @staticmethod
    def required_source_model_precision(requested_model_precision: Precision) -> Precision:
        # by default a conversion accepts the source model at the requested precision
        return requested_model_precision
+
+
class BaseMetricsCalculator(abc.ABC):
    """Interface for accumulating accuracy/error metrics over inference batches."""

    required_fn_name_for_signature_parsing: Optional[str] = None

    def calc(
        self,
        *,
        ids: List[Any],
        y_pred: Dict[str, np.ndarray],
        x: Optional[Dict[str, np.ndarray]],
        y_real: Optional[Dict[str, np.ndarray]],
    ) -> Dict[str, float]:
        """
        Calculates error/accuracy metrics
        Args:
            ids: List of ids identifying each sample in the batch
            y_pred: model output as dict where key is output name and value is output value
            x: model input as dict where key is input name and value is input value
            y_real: input ground truth as dict where key is output name and value is output value
        Returns:
            dictionary where key is metric name and value is its value
        """
        # NOTE(review): body is intentionally empty (implicitly returns None);
        # subclasses implement update()/metrics instead — confirm calc() is legacy.
        pass

    @abc.abstractmethod
    def update(
        self,
        ids: List[Any],
        y_pred: Dict[str, np.ndarray],
        x: Optional[Dict[str, np.ndarray]],
        y_real: Optional[Dict[str, np.ndarray]],
    ):
        # incrementally accumulate metric state from one batch
        pass

    @property
    @abc.abstractmethod
    def metrics(self) -> Dict[str, Any]:
        # final metric-name -> value mapping over all batches seen so far
        pass
+
+
class ShapeSpec(NamedTuple):
    """Minimum/optimal/maximum shape triple (e.g. for optimization profiles)."""

    min: Tuple
    opt: Tuple
    max: Tuple
+
+
class MeasurementMode(Enum):
    """
    Available measurement stabilization modes
    """

    # presumably stabilizes over a fixed number of measurement windows — see perf_analyzer docs
    COUNT_WINDOWS = "count_windows"
    # presumably stabilizes over fixed-duration time windows — see perf_analyzer docs
    TIME_WINDOWS = "time_windows"
+
+
class PerformanceTool(Enum):
    """
    Available performance evaluation tools
    """

    MODEL_ANALYZER = "model_analyzer"
    PERF_ANALYZER = "perf_analyzer"
+
+
class EvaluationMode(Enum):
    """
    Available evaluation modes (static batching vs dynamic batching scenarios)
    """

    OFFLINE = "offline"
    ONLINE = "online"
+
+
class OfflineMode(Enum):
    """
    Available offline mode for memory
    """

    SYSTEM = "system"
    CUDA = "cuda"

+ 253 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/dump.py

@@ -0,0 +1,253 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import abc
+import json
+import pickle
+import threading
+from pathlib import Path
+from typing import Dict, Iterator, List, Union
+
+import numpy as np
+
+MB2B = 2 ** 20
+B2MB = 1 / MB2B
+FLUSH_THRESHOLD_B = 256 * MB2B
+
+
+def _validate_batch(name: str, value: Union[list, np.ndarray]):
+    if not isinstance(value, (list, np.ndarray)):
+        raise ValueError(f"Values shall be lists or np.ndarrays; current type {type(value)}")
+
+
+def _validate_prefix_data(prefix_data: Dict[str, List[np.ndarray]]):
+    batch_sizes_per_io_name = {name: [len(batch) for batch in batches] for name, batches in prefix_data.items()}
+    names = list(batch_sizes_per_io_name)
+    for io_name in names:
+        for batch_idx, batch_size in enumerate(batch_sizes_per_io_name[io_name]):
+            if not all([batch_sizes_per_io_name[other_name][batch_idx] == batch_size for other_name in names]):
+                non_equal_batch_sizes = {
+                    other_name: batch_sizes_per_io_name[other_name][batch_idx] for other_name in names
+                }
+                non_equal_batch_sizes_str = ", ".join(
+                    [f"{name}={batch_size}" for name, batch_size in non_equal_batch_sizes.items()]
+                )
+                raise ValueError(
+                    "All inputs/outputs should have same number of batches with equal batch_size. "
+                    f"At batch_idx={batch_idx} there are batch_sizes: {non_equal_batch_sizes_str}"
+                )
+        # ensure if each io has same number of batches with equal size
+
+
+def _get_nitems_and_batches(prefix_data: Dict[str, List[np.ndarray]]):
+    nitems = 0
+    nbatches = 0
+
+    if prefix_data:
+        nitems_per_io_name = {name: sum(len(batch) for batch in batches) for name, batches in prefix_data.items()}
+        nbatches_per_io_name = {name: len(batches) for name, batches in prefix_data.items()}
+        nitems = list(nitems_per_io_name.values())[0]
+        nbatches = list(nbatches_per_io_name.values())[0]
+    return nitems, nbatches
+
+
+class BaseDumpWriter(abc.ABC):
+    FILE_SUFFIX = ".abstract"
+
+    def __init__(self, output_dir: Union[str, Path]):
+        self._output_dir = Path(output_dir)
+        # outer dict key is prefix (i.e. input/output/labels/...), inner dict key is input/output name
+        # list is list of batches
+        self._items_cache: Dict[str, Dict[str, List[np.ndarray]]] = {}
+        # key is prefix
+        self._items_counters: Dict[str, int] = {}
+        self._cache_lock = threading.RLock()
+        self._flush_threshold_b = FLUSH_THRESHOLD_B
+
+    @property
+    def cache_size(self):
+        def _get_bytes_size(name, batch):
+            _validate_batch(name, batch)
+            if not isinstance(batch, np.ndarray):
+                batch = np.narray(batch)
+
+            return batch.nbytes
+
+        with self._cache_lock:
+            return {
+                prefix: sum(_get_bytes_size(name, batch) for name, batches in data.items() for batch in batches)
+                for prefix, data in self._items_cache.items()
+            }
+
+    def _append_to_cache(self, prefix, prefix_data):
+        if prefix_data is None:
+            return
+
+        if not isinstance(prefix_data, dict):
+            raise ValueError(f"{prefix} data to store shall be dict")
+
+        with self._cache_lock:
+            cached_prefix_data = self._items_cache.setdefault(prefix, {})
+            for name, batch in prefix_data.items():
+                _validate_batch(name, batch)
+                if not isinstance(batch, np.ndarray):
+                    batch = np.array(batch)
+
+                cached_batches = cached_prefix_data.setdefault(name, [])
+                cached_batches += [batch]
+
+    def write(self, **kwargs):
+        with self._cache_lock:
+            for prefix, prefix_data in kwargs.items():
+                self._append_to_cache(prefix, prefix_data)
+
+            biggest_prefix_data_size = max(self.cache_size.values())
+            if biggest_prefix_data_size > self._flush_threshold_b:
+                self.flush()
+
+    def flush(self):
+        with self._cache_lock:
+            for prefix, prefix_data in self._items_cache.items():
+                _validate_prefix_data(prefix_data)
+
+                output_path = self._output_dir / self._get_filename(prefix)
+                self._dump(prefix_data, output_path)
+
+                nitems, nbatches = _get_nitems_and_batches(prefix_data)
+                self._items_counters[prefix] += nitems
+            self._items_cache = {}
+
+    def _get_filename(self, prefix):
+        """Return the next dump filename for *prefix*, indexed by the number
+        of items flushed so far (zero-padded to 12 digits)."""
+        idx = self._items_counters.setdefault(prefix, 0)
+        return f"{prefix}-{idx:012d}{self.FILE_SUFFIX}"
+
+    @abc.abstractmethod
+    def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path):
+        """Serialize *prefix_data* to *output_path*; implemented by subclasses."""
+        pass
+
+    def __enter__(self):
+        # Refuse to write into a non-empty directory so dumps from separate
+        # runs cannot get mixed together.
+        if self._output_dir.exists() and len(list(self._output_dir.iterdir())):
+            raise ValueError(f"{self._output_dir.as_posix()} is not empty")
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Persist whatever is still cached, even when exiting on an exception.
+        self.flush()
+
+
+class PickleDumpWriter(BaseDumpWriter):
+    """Dump writer that serializes cached batches with pickle."""
+
+    FILE_SUFFIX = ".pkl"
+
+    def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path):
+        """Pickle the whole {io_name: [batches]} dict into *output_path*."""
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with output_path.open("wb") as pickle_file:
+            pickle.dump(prefix_data, pickle_file)
+
+
+class JsonDumpWriter(BaseDumpWriter):
+    """Dump writer emitting JSON files in Triton perf_analyzer's input format."""
+
+    FILE_SUFFIX = ".json"
+
+    def _dump(self, prefix_data: Dict[str, List[np.ndarray]], output_path: Path):
+        """Repack *prefix_data* into perf_analyzer layout and write it as JSON."""
+        repacked_prefix_data = self._format_data(prefix_data)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with output_path.open("w") as json_file:
+            json.dump(repacked_prefix_data, json_file)
+
+    def _format_data(self, prefix_data: Dict[str, List[np.ndarray]]) -> Dict:
+        """Transpose {io_name: [batches]} into {"data": [{io_name: batch}, ...]},
+        flattening each batch and recording its shape/dtype explicitly."""
+        def _format_batch_for_perf_analyzer_json_format(batch: np.ndarray):
+            return {
+                "content": batch.flatten().tolist(),
+                "shape": list(batch.shape),
+                "dtype": str(batch.dtype),
+            }
+
+        _, nbatches = _get_nitems_and_batches(prefix_data)
+        batches = [{} for _ in range(nbatches)]
+        for io_name, batches_per_io in prefix_data.items():
+            for batch_idx, batch in enumerate(batches_per_io):
+                batches[batch_idx][io_name] = _format_batch_for_perf_analyzer_json_format(batch)
+
+        return {"data": batches}
+
+
+class BaseDumpReader(abc.ABC):
+    """Base class for iterating over dump files produced by BaseDumpWriter."""
+
+    FILE_SUFFIX = ".abstract"
+
+    def __init__(self, dump_dir: Union[Path, str]):
+        self._dump_dir = Path(dump_dir)
+
+    def get(self, prefix: str) -> Iterator[Dict[str, np.ndarray]]:
+        """Yield one {io_name: batch} dict per stored batch for *prefix*,
+        reading dump files in sorted (i.e. write-order) filename order."""
+        dump_files_paths = sorted(self._dump_dir.glob(f"{prefix}*{self.FILE_SUFFIX}"))
+        for dump_file_path in dump_files_paths:
+            prefix_data = self._load_file(dump_file_path)
+            nitems, nbatches = _get_nitems_and_batches(prefix_data)
+            for batch_idx in range(nbatches):
+                yield {io_name: prefix_data[io_name][batch_idx] for io_name in prefix_data}
+
+    @abc.abstractmethod
+    def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]:
+        """Deserialize one dump file into {io_name: [batches]}."""
+        pass
+
+    def iterate_over(self, prefix_list: List[str]) -> Iterator:
+        """Iterate several prefixes in lockstep; each yield is a list aligned
+        with *prefix_list*, with None entries for already-exhausted prefixes."""
+        iterators = [self.get(prefix) for prefix in prefix_list]
+        empty_iterators = [False] * len(iterators)
+        while not all(empty_iterators):
+            # values reset every round; exhausted iterators contribute None
+            values = [None] * len(iterators)
+            for idx, iterator in enumerate(iterators):
+                if empty_iterators[idx]:
+                    continue
+                try:
+                    values[idx] = next(iterator)
+                except StopIteration:
+                    empty_iterators[idx] = True
+                    if all(empty_iterators):
+                        break
+
+            if not all(empty_iterators):
+                yield values
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class PickleDumpReader(BaseDumpReader):
+    """Reader for dump files written by PickleDumpWriter."""
+
+    FILE_SUFFIX = ".pkl"
+
+    def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]:
+        """Unpickle one dump file back into {io_name: [batches]}."""
+        with dump_file_path.open("rb") as pickle_file:
+            return pickle.load(pickle_file)
+
+
+class JsonDumpReader(BaseDumpReader):
+    """Reader for perf_analyzer-format JSON dumps written by JsonDumpWriter."""
+
+    FILE_SUFFIX = ".json"
+
+    def _load_file(self, dump_file_path: Path) -> Dict[str, List[np.ndarray]]:
+        """Load one JSON dump file and repack it into {io_name: [batches]}."""
+        with dump_file_path.open("rb") as json_file:
+            data = json.load(json_file)
+            return self._repack_data(data)
+
+    def _repack_data(self, data: Dict) -> Dict[str, List[np.ndarray]]:
+        """Invert JsonDumpWriter._format_data: rebuild ndarrays from the
+        flattened content plus the recorded shape/dtype metadata."""
+        result: Dict[str, List[np.ndarray]] = {}
+        batches = data["data"]
+        for batch in batches:
+            for io_name, batch_as_dict in batch.items():
+                io_batches = result.setdefault(io_name, [])
+                flat_array = batch_as_dict["content"]
+                shape = batch_as_dict["shape"]
+                dtype = batch_as_dict["dtype"]
+                batch_as_array = np.array(flat_array).reshape(shape).astype(dtype)
+                io_batches.append(batch_as_array)
+        return result

+ 82 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/extensions.py

@@ -0,0 +1,82 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import logging
+import os
+import re
+from pathlib import Path
+from typing import List
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ExtensionManager:
+    def __init__(self, name: str):
+        self._name = name
+        self._registry = {}
+
+    def register_extension(self, extension: str, clazz):
+        already_registered_class = self._registry.get(extension, None)
+        if already_registered_class and already_registered_class.__module__ != clazz.__module__:
+            raise RuntimeError(
+                f"Conflicting extension {self._name}/{extension}; "
+                f"{already_registered_class.__module__}.{already_registered_class.__name} "
+                f"and "
+                f"{clazz.__module__}.{clazz.__name__}"
+            )
+        elif already_registered_class is None:
+            clazz_full_name = f"{clazz.__module__}.{clazz.__name__}" if clazz is not None else "None"
+            LOGGER.debug(f"Registering extension {self._name}/{extension}: {clazz_full_name}")
+            self._registry[extension] = clazz
+
+    def get(self, extension):
+        if extension not in self._registry:
+            raise RuntimeError(f"Missing extension {self._name}/{extension}")
+        return self._registry[extension]
+
+    @property
+    def supported_extensions(self):
+        return list(self._registry)
+
+    @staticmethod
+    def scan_for_extensions(extension_dirs: List[Path]):
+        register_pattern = r".*\.register_extension\(.*"
+
+        for extension_dir in extension_dirs:
+            for python_path in extension_dir.rglob("*.py"):
+                if not python_path.is_file():
+                    continue
+                payload = python_path.read_text()
+                if re.findall(register_pattern, payload):
+                    import_path = python_path.relative_to(toolkit_root_dir.parent)
+                    package = import_path.parent.as_posix().replace(os.sep, ".")
+                    package_with_module = f"{package}.{import_path.stem}"
+                    spec = importlib.util.spec_from_file_location(name=package_with_module, location=python_path)
+                    my_module = importlib.util.module_from_spec(spec)
+                    my_module.__package__ = package
+
+                    try:
+                        spec.loader.exec_module(my_module)  # pytype: disable=attribute-error
+                    except ModuleNotFoundError as e:
+                        LOGGER.error(
+                            f"Could not load extensions from {import_path} due to missing python packages; {e}"
+                        )
+
+
+# Singleton registries; extension modules self-register into these during the
+# directory scan triggered below at import time.
+runners = ExtensionManager("runners")
+loaders = ExtensionManager("loaders")
+savers = ExtensionManager("savers")
+toolkit_root_dir = (Path(__file__).parent / "..").resolve()
+ExtensionManager.scan_for_extensions([toolkit_root_dir])

+ 13 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 237 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/onnx.py

@@ -0,0 +1,237 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+import numpy as np
+
+# pytype: disable=import-error
+import onnx
+import onnx.shape_inference
+import onnxruntime
+from google.protobuf import text_format
+from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
+
+from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
+from ..extensions import loaders, runners, savers
+from .utils import infer_precision
+
+# pytype: enable=import-error
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _value_info2tensor_spec(value_info: onnx.ValueInfoProto):
+    """Convert an ONNX ValueInfoProto into a TensorSpec.
+
+    ONNX elem-type names are mapped to numpy dtype names ("float" ->
+    "float32", "double" -> "float64"); symbolic/dynamic dims become None.
+    """
+    onnx_data_type_map = {"float": "float32", "double": "float64"}
+
+    elem_type_name = onnx.TensorProto.DataType.Name(value_info.type.tensor_type.elem_type).lower()
+    dtype = onnx_data_type_map.get(elem_type_name, elem_type_name)
+
+    def _get_dim(dim):
+        which = dim.WhichOneof("value")
+        if which is not None:  # which is None when dim is None
+            dim = getattr(dim, which)
+        # a str/bytes value means a named (dynamic) dimension -> None
+        return None if isinstance(dim, (str, bytes)) else dim
+
+    shape = value_info.type.tensor_type.shape
+    shape = tuple(_get_dim(d) for d in shape.dim)
+    return TensorSpec(value_info.name, dtype=dtype, shape=shape)
+
+
+def _infer_graph_precision(onnx_graph: onnx.GraphProto) -> Optional[Precision]:
+    """Infer the dominant compute precision (FP32/FP16) of an ONNX graph.
+
+    Builds a networkx DiGraph of nodes/inputs/outputs annotated with dtypes
+    and delegates the vote to ``infer_precision``; returns None when no
+    dominant float dtype can be determined.
+    """
+    import networkx as nx
+
+    # build directed graph
+    nx_graph = nx.DiGraph()
+
+    def _get_dtype(vi):
+        # Map a ValueInfo's tensor elem_type to the corresponding numpy dtype.
+        t = vi.type
+        if hasattr(t, "tensor_type"):
+            type_id = t.tensor_type.elem_type
+        else:
+            raise NotImplementedError("Not implemented yet")
+        return TENSOR_TYPE_TO_NP_TYPE[type_id]
+
+    node_output2type = {vi.name: _get_dtype(vi) for vi in onnx_graph.value_info}
+
+    node_outputs2node = {output_name: node for node in onnx_graph.node for output_name in node.output}
+    node_inputs2node = {input_name: node for node in onnx_graph.node for input_name in node.input}
+
+    for node in onnx_graph.node:
+        # multi-output nodes are keyed by the "+"-joined output names
+        node_dtype = node_output2type.get("+".join(node.output), None)
+        nx_graph.add_node(
+            node.name,
+            op=node.op_type,
+            attr={a.name: a for a in node.attribute},
+            dtype=node_dtype,
+        )
+        for input_name in node.input:
+            prev_node = node_outputs2node.get(input_name, None)
+            if prev_node:
+                nx_graph.add_edge(prev_node.name, node.name)
+
+    for input_node in onnx_graph.input:
+        input_name = input_node.name
+        nx_graph.add_node(input_name, op="input", dtype=_get_dtype(input_node))
+        next_node = node_inputs2node.get(input_name, None)
+        if next_node:
+            nx_graph.add_edge(input_name, next_node.name)
+
+    for output in onnx_graph.output:
+        output_name = output.name
+        nx_graph.add_node(output_name, op="output", dtype=_get_dtype(output))
+        prev_node = node_outputs2node.get(output_name, None)
+        if prev_node:
+            nx_graph.add_edge(prev_node.name, output_name)
+        else:
+            LOGGER.warning(f"Could not find previous node for {output_name}")
+
+    input_names = [n.name for n in onnx_graph.input]
+    output_names = [n.name for n in onnx_graph.output]
+    most_common_dtype = infer_precision(nx_graph, input_names, output_names, lambda node: node.get("dtype", None))
+    if most_common_dtype is not None:
+        precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
+    else:
+        precision = None
+    return precision
+
+
+class OnnxLoader(BaseLoader):
+    """Load an ONNX model from disk into the toolkit's Model wrapper."""
+
+    def load(self, model_path: Union[str, Path], **_) -> Model:
+        """Load, check, and shape-infer the model; extract io TensorSpecs and
+        infer its precision from the graph."""
+        if isinstance(model_path, Path):
+            model_path = model_path.as_posix()
+
+        model = onnx.load(model_path)
+        onnx.checker.check_model(model)
+        # NOTE(review): onnx.helper.strip_doc_string was removed in newer onnx
+        # releases -- confirm the pinned onnx version still provides it
+        onnx.helper.strip_doc_string(model)
+        model = onnx.shape_inference.infer_shapes(model)
+
+        # TODO: probably modification of onnx model ios causes error on optimize
+        # from onnx.utils import polish_model
+        # model = polish_model(model)  # run checker, docs strip, optimizer and shape inference
+
+        inputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.input}
+        outputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.output}
+
+        precision = _infer_graph_precision(model.graph)
+
+        return Model(model, precision, inputs, outputs)
+
+
+class OnnxSaver(BaseSaver):
+    """Serialize an in-memory ONNX model to disk (binary or text protobuf)."""
+
+    def __init__(self, as_text: bool = False):
+        self._as_text = as_text
+
+    def save(self, model: Model, model_path: Union[str, Path], dataloader_fn) -> None:
+        """Write model.handle (an onnx.ModelProto) to *model_path*;
+        *dataloader_fn* is unused but part of the BaseSaver interface."""
+        model_path = Path(model_path)
+        LOGGER.debug(f"Saving ONNX model to {model_path.as_posix()}")
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        onnx_model: onnx.ModelProto = model.handle
+        if self._as_text:
+            with model_path.open("w") as f:
+                f.write(text_format.MessageToString(onnx_model))
+        else:
+            with model_path.open("wb") as f:
+                f.write(onnx_model.SerializeToString())
+
+
+"""
+ExecutionProviders on onnxruntime 1.4.0
+['TensorrtExecutionProvider',
+ 'CUDAExecutionProvider',
+ 'MIGraphXExecutionProvider',
+ 'NGRAPHExecutionProvider',
+ 'OpenVINOExecutionProvider',
+ 'DnnlExecutionProvider',
+ 'NupharExecutionProvider',
+ 'VitisAIExecutionProvider',
+ 'ArmNNExecutionProvider',
+ 'ACLExecutionProvider',
+ 'CPUExecutionProvider']
+"""
+
+
+def _check_providers(providers):
+    providers = providers or []
+    if not isinstance(providers, (list, tuple)):
+        providers = [providers]
+    available_providers = onnxruntime.get_available_providers()
+    unavailable = set(providers) - set(available_providers)
+    if unavailable:
+        raise RuntimeError(f"Unavailable providers {unavailable}")
+    return providers
+
+
+class OnnxRunner(BaseRunner):
+    """Runner factory producing onnxruntime inference sessions."""
+
+    def __init__(self, verbose_runtime_logs: bool = False):
+        self._providers = None
+        self._verbose_runtime_logs = verbose_runtime_logs
+
+    def init_inference(self, model: Model):
+        """Create an OnnxRunnerSession for *model* (handle must be ModelProto)."""
+        assert isinstance(model.handle, onnx.ModelProto)
+        return OnnxRunnerSession(
+            model=model, providers=self._providers, verbose_runtime_logs=self._verbose_runtime_logs
+        )
+
+
+class OnnxRunnerSession(BaseRunnerSession):
+    """Context-managed onnxruntime inference session over a loaded model."""
+
+    def __init__(self, model: Model, providers, verbose_runtime_logs: bool = False):
+        super().__init__(model)
+        self._input_names = None
+        self._output_names = None
+        self._session = None
+        self._providers = providers
+        self._verbose_runtime_logs = verbose_runtime_logs
+        self._old_env_values = {}
+
+    def __enter__(self):
+        # _set_env_variables/_recover_env_variables are provided by
+        # BaseRunnerSession (not visible here)
+        self._old_env_values = self._set_env_variables()
+        sess_options = onnxruntime.SessionOptions()  # default session options
+        if self._verbose_runtime_logs:
+            sess_options.log_severity_level = 0
+            sess_options.log_verbosity_level = 1
+        LOGGER.info(
+            f"Starting inference session for onnx model providers={self._providers} sess_options={sess_options}"
+        )
+
+        self._input_names = list(self._model.inputs)
+        self._output_names = list(self._model.outputs)
+
+        model_payload = self._model.handle.SerializeToString()
+        self._session = onnxruntime.InferenceSession(
+            model_payload, providers=self._providers, sess_options=sess_options
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._input_names = None
+        self._output_names = None
+        self._session = None
+        self._recover_env_variables(self._old_env_values)
+
+    def __call__(self, x: Dict[str, object]):
+        """Run inference; *x* maps input names to arrays. Returns a dict of
+        output name -> array."""
+        feed_dict = {k: x[k] for k in self._input_names}
+        y_pred = self._session.run(self._output_names, feed_dict)
+        y_pred = dict(zip(self._output_names, y_pred))
+
+        return y_pred
+
+
+loaders.register_extension(Format.ONNX.value, OnnxLoader)
+runners.register_extension(Format.ONNX.value, OnnxRunner)
+savers.register_extension(Format.ONNX.value, OnnxSaver)

+ 232 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/tensorrt.py

@@ -0,0 +1,232 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import sys
+from pathlib import Path
+from typing import Dict, NamedTuple, Optional, Union
+
+import numpy as np
+
+# pytype: disable=import-error
+try:
+    import pycuda.autoinit
+    import pycuda.driver as cuda
+except Exception as e:
+    logging.getLogger(__name__).warning(f"Problems with importing pycuda package; {e}")
+# pytype: enable=import-error
+
+import tensorrt as trt  # pytype: disable=import-error
+
+from ..core import BaseLoader, BaseRunner, BaseRunnerSession, Format, Model, TensorSpec
+from ..extensions import loaders, runners
+
+LOGGER = logging.getLogger(__name__)
+TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+
+# documentation:
+# https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html
+# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_samples_section
+
+_NP_DTYPE2TRT_DTYPE = {
+    np.dtype("float32"): trt.DataType.FLOAT,
+    np.dtype("float16"): trt.DataType.HALF,
+    np.dtype("int8"): trt.DataType.INT8,
+    np.dtype("int32"): trt.DataType.INT32,
+    np.dtype("bool"): trt.DataType.BOOL,
+}
+
+
+class TensorRTLoader(BaseLoader):
+    """Deserialize a TensorRT engine file into the toolkit's Model wrapper."""
+
+    def load(self, model_path: Union[str, Path], **_) -> Model:
+        """Load an ICudaEngine, retrying with libnvinfer plugins if the first
+        attempt fails; extracts io TensorSpecs from engine bindings."""
+        model_path = Path(model_path)
+        LOGGER.debug(f"Loading TensorRT engine from {model_path}")
+        engine = self._load_engine(model_path)
+
+        if engine is None:
+            LOGGER.debug("Unable to load engine without plugins. Loading plugins.")
+            trt.init_libnvinfer_plugins(logger=TRT_LOGGER, namespace="")
+            LOGGER.debug(f"Loading TensorRT engine with plugins from {model_path}")
+            engine = self._load_engine(model_path)
+
+        if engine is None:
+            raise RuntimeError(f"Could not load ICudaEngine from {model_path}")
+
+        # NOTE(review): get_binding_name/binding_is_input/get_binding_* are
+        # deprecated in newer TensorRT releases -- confirm the pinned version
+        inputs = {}
+        outputs = {}
+        for binding_idx in range(engine.num_bindings):
+            name = engine.get_binding_name(binding_idx)
+            is_input = engine.binding_is_input(binding_idx)
+            dtype = np.dtype(trt.nptype(engine.get_binding_dtype(binding_idx))).name
+            shape = engine.get_binding_shape(binding_idx)
+            if is_input:
+                inputs[name] = TensorSpec(name, dtype, shape)
+            else:
+                outputs[name] = TensorSpec(name, dtype, shape)
+
+        # precision is unknown for a serialized engine, hence None
+        return Model(engine, None, inputs, outputs)
+
+    def _load_engine(self, model_path: Path):
+        """Deserialize the engine bytes; returns None when TRT cannot parse them."""
+        with model_path.open("rb") as fh, trt.Runtime(TRT_LOGGER) as runtime:
+            engine = runtime.deserialize_cuda_engine(fh.read())
+
+        return engine
+
+
+class TRTBuffers(NamedTuple):
+    # Host/device buffer sets sized for one batch shape:
+    #   x_host      - host input arrays (None until filled per call)
+    #   x_dev       - pycuda device allocations for inputs
+    #   y_pred_host - preallocated host output arrays
+    #   y_pred_dev  - pycuda device allocations for outputs
+    x_host: Optional[Dict[str, object]]
+    x_dev: Dict[str, object]
+    y_pred_host: Dict[str, object]
+    y_pred_dev: Dict[str, object]
+
+
+class TensorRTRunner(BaseRunner):
+    """Runner factory creating TensorRT execution sessions."""
+
+    def __init__(self):
+        pass
+
+    def init_inference(self, model: Model):
+        """Wrap *model* (an ICudaEngine) in a TensorRTRunnerSession."""
+        return TensorRTRunnerSession(model=model)
+
+
+class TensorRTRunnerSession(BaseRunnerSession):
+    """Context-managed TensorRT inference session.
+
+    Manages an execution context plus host/device buffers, reallocating them
+    whenever the batch size changes or the engine has dynamic shapes.
+    """
+
+    def __init__(self, model: Model):
+        super().__init__(model)
+        assert isinstance(model.handle, trt.ICudaEngine)
+        self._model = model
+        self._has_dynamic_shapes = None
+
+        self._context = None
+        self._engine: trt.ICudaEngine = self._model.handle
+        self._cuda_context = pycuda.autoinit.context
+
+        self._input_names = None
+        self._output_names = None
+        self._buffers = None
+
+    def __enter__(self):
+        self._context = self._engine.create_execution_context()
+        self._context.__enter__()
+
+        # engine[idx] yields the binding name for that index
+        self._input_names = [
+            self._engine[idx] for idx in range(self._engine.num_bindings) if self._engine.binding_is_input(idx)
+        ]
+        self._output_names = [
+            self._engine[idx] for idx in range(self._engine.num_bindings) if not self._engine.binding_is_input(idx)
+        ]
+        # all_binding_shapes_specified is True for models without dynamic shapes
+        # so initially this variable is False for models with dynamic shapes
+        self._has_dynamic_shapes = not self._context.all_binding_shapes_specified
+
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._context.__exit__(exc_type, exc_value, traceback)
+        self._input_names = None
+        self._output_names = None
+
+        # TODO: are cuda buffers dealloc automatically?
+        self._buffers = None
+
+    def __call__(self, x):
+        """Run inference on host inputs *x*: copy host->device, execute the
+        engine, copy outputs device->host, and return the host output dict."""
+        buffers = self._prepare_buffers_if_needed(x)
+        bindings = self._update_bindings(buffers)
+
+        for name in self._input_names:
+            cuda.memcpy_htod(buffers.x_dev[name], buffers.x_host[name])
+        self._cuda_context.push()
+        self._context.execute_v2(bindings=bindings)
+        self._cuda_context.pop()
+        for name in self._output_names:
+            cuda.memcpy_dtoh(buffers.y_pred_host[name], buffers.y_pred_dev[name])
+
+        return buffers.y_pred_host
+
+    def _update_bindings(self, buffers: TRTBuffers):
+        """Order device pointers by the engine's binding indices."""
+        bindings = [None] * self._engine.num_bindings
+        for name in buffers.y_pred_dev:
+            binding_idx: int = self._engine[name]
+            bindings[binding_idx] = buffers.y_pred_dev[name]
+
+        for name in buffers.x_dev:
+            binding_idx: int = self._engine[name]
+            bindings[binding_idx] = buffers.x_dev[name]
+
+        return bindings
+
+    def _set_dynamic_input_shapes(self, x_host):
+        """Resolve dynamic (-1/None) input dims from the actual host data."""
+        def _is_shape_dynamic(input_shape):
+            return any([dim is None or dim == -1 for dim in input_shape])
+
+        for name in self._input_names:
+            bindings_idx = self._engine[name]
+            data_shape = x_host[name].shape  # pytype: disable=attribute-error
+            if self._engine.is_shape_binding(bindings_idx):
+                input_shape = self._context.get_shape(bindings_idx)
+                if _is_shape_dynamic(input_shape):
+                    self._context.set_shape_input(bindings_idx, data_shape)
+            else:
+                input_shape = self._engine.get_binding_shape(bindings_idx)
+                if _is_shape_dynamic(input_shape):
+                    self._context.set_binding_shape(bindings_idx, data_shape)
+
+        assert self._context.all_binding_shapes_specified and self._context.all_shape_inputs_specified
+
+    def _prepare_buffers_if_needed(self, x_host: Dict[str, object]):
+        """(Re)allocate host/device buffers when the batch size changed or the
+        engine has dynamic shapes; returns buffers with x_host attached."""
+        # pytype: disable=attribute-error
+        new_batch_size = list(x_host.values())[0].shape[0]
+        current_batch_size = list(self._buffers.y_pred_host.values())[0].shape[0] if self._buffers else 0
+        # pytype: enable=attribute-error
+
+        if self._has_dynamic_shapes or new_batch_size != current_batch_size:
+            # TODO: are CUDA buffers dealloc automatically?
+
+            self._set_dynamic_input_shapes(x_host)
+
+            y_pred_host = {}
+            for name in self._output_names:
+                shape = self._context.get_binding_shape(self._engine[name])
+                binding_idx: int = self._engine[name]
+                dtype_from_trt_binding = np.dtype(trt.nptype(self._engine.get_binding_dtype(binding_idx)))
+                dtype_from_model_spec = np.dtype(self._model.outputs[name].dtype)
+
+                assert dtype_from_model_spec == dtype_from_trt_binding
+
+                y_pred_host[name] = np.zeros(shape, dtype=dtype_from_model_spec)
+
+            y_pred_dev = {name: cuda.mem_alloc(data.nbytes) for name, data in y_pred_host.items()}
+
+            # cast host input into binding dtype
+            def _cast_input(name, data):
+                binding_idx: int = self._engine[name]
+                np_dtype = trt.nptype(self._engine.get_binding_dtype(binding_idx))
+                return data.astype(np_dtype)
+
+            x_host = {name: _cast_input(name, host_input) for name, host_input in x_host.items()}
+
+            x_dev = {
+                name: cuda.mem_alloc(host_input.nbytes)
+                for name, host_input in x_host.items()
+                if name in self._input_names  # pytype: disable=attribute-error
+            }
+
+            self._buffers = TRTBuffers(None, x_dev, y_pred_host, y_pred_dev)
+
+        return self._buffers._replace(x_host=x_host)
+
+
+# Register TRT extensions only when pycuda imported successfully above.
+if "pycuda.driver" in sys.modules:
+    loaders.register_extension(Format.TRT.value, TensorRTLoader)
+    runners.register_extension(Format.TRT.value, TensorRTRunner)
+else:
+    LOGGER.warning("Do not register TensorRT extension due problems with importing pycuda.driver package.")

+ 462 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/tf.py

@@ -0,0 +1,462 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Union
+
+# pytype: disable=import-error
+import tensorflow as tf
+from tensorflow.python.eager import wrap_function
+from tf2onnx.shape_inference import infer_shape
+from tf2onnx.tf_loader import freeze_session, inputs_without_resource, is_function, remove_redundant_inputs, tf_optimize
+
+from ..args import filter_fn_args
+from ..core import (
+    GET_MODEL_FN_NAME,
+    GET_SERVING_INPUT_RECEIVER_FN,
+    BaseLoader,
+    BaseRunner,
+    BaseRunnerSession,
+    BaseSaver,
+    ExportFormat,
+    Format,
+    Model,
+    ModelInputType,
+    TensorSpec,
+    load_from_file,
+)
+from ..extensions import loaders, runners, savers
+
+# pytype: enable=import-error
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+def is_tf2():
+    """Return True when the imported TensorFlow is a 2.x release."""
+    return tf.__version__.startswith("2.")
+
+
+def create_session_config(*, allow_growth=False, use_xla=False, gpu_memory_fraction=1.0):
+    gpu_options = tf.compat.v1.GPUOptions(
+        per_process_gpu_memory_fraction=gpu_memory_fraction, allow_growth=allow_growth
+    )
+    config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
+    if use_xla:
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
+    LOGGER.debug(
+        f"Using gpu memory fraction: allow_growth={allow_growth} "
+        f"gpu_memory_fraction={gpu_memory_fraction} "
+        f"use_xla={use_xla}"
+    )
+    return config
+
+
+def _from_saved_model_v1(sess, model_path, tag, signatures):
+    """
+    Load tensorflow graph from saved_model.
+    NOTICE: Modified version from tf2onnx project
+
+    Appends discovered public signature keys to *signatures* (mutated in
+    place) and returns (frozen_graph, inputs, outputs) where inputs/outputs
+    map signature io names to graph tensor names.
+    """
+
+    wrn_no_tag = "'--tag' not specified for saved_model. Using --tag serve"
+    wrn_empty_tag = "'--tag' value is empty string. Using tag =[[]]"
+
+    if tag is None:
+        tag = [tf.saved_model.SERVING]
+        LOGGER.warning(wrn_no_tag)
+
+    if tag == "":
+        tag = [[]]
+        LOGGER.warning(wrn_empty_tag)
+
+    if not isinstance(tag, list):
+        tag = [tag]
+
+    imported = tf.compat.v1.saved_model.loader.load(sess, tag, model_path)
+    for k in imported.signature_def.keys():
+        if k.startswith("_"):
+            # consider signatures starting with '_' private
+            continue
+        signatures.append(k)
+    try:
+        from tensorflow.contrib.saved_model.python.saved_model import (  # pytype: disable=import-error
+            signature_def_utils,
+        )
+
+        def get_signature_def(meta_graph_def, k):
+            return signature_def_utils.get_signature_def_by_key(meta_graph_def, k)
+
+    except ImportError:
+        # TF1.12 changed the api
+        def get_signature_def(meta_graph_def, k):
+            return meta_graph_def.signature_def[k]
+
+    inputs = {}
+    outputs = {}
+    for k in signatures:
+        inputs_tensor_info = get_signature_def(imported, k).inputs
+        for name, input_tensor in inputs_tensor_info.items():
+            inputs[name] = input_tensor.name
+        outputs_tensor_info = get_signature_def(imported, k).outputs
+        for name, output_tensor in outputs_tensor_info.items():
+            outputs[name] = output_tensor.name
+    frozen_graph = freeze_session(sess, input_names=list(inputs.values()), output_names=list(outputs.values()))
+    return frozen_graph, inputs, outputs
+
+
class TFEstimatorLoader(BaseLoader):
    """Builds a TF1 Estimator from user source code and freezes it into a GraphDef.

    The module at ``model_path`` must define callables named by
    ``GET_MODEL_FN_NAME`` and ``GET_SERVING_INPUT_RECEIVER_FN``.
    """

    required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME

    def __init__(self, **kwargs):
        # Raw CLI/config arguments; filtered against each callable's signature in load().
        self._model_args = kwargs

    def load(self, model_path: Union[str, Path], **_) -> Model:
        """Restore the latest checkpoint, then freeze and optimize the inference graph.

        Returns:
            Model wrapping the frozen (tf2onnx-optimized) GraphDef plus
            input/output TensorSpecs.

        Raises:
            RuntimeError: if the model module lacks one of the required callables.
        """
        if isinstance(model_path, Path):
            model_path = model_path.as_posix()

        get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
        get_serving_input_receiver_fn = load_from_file(model_path, "model", GET_SERVING_INPUT_RECEIVER_FN)

        if get_model is None:
            raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")
        if get_serving_input_receiver_fn is None:
            raise RuntimeError(f"Could not find {GET_SERVING_INPUT_RECEIVER_FN} in {model_path}")

        # Pass only the kwargs each user callable actually declares.
        model_args = filter_fn_args(self._model_args, fn=get_model)
        serving_input_receiver_args = filter_fn_args(self._model_args, fn=get_serving_input_receiver_fn)

        session_config = create_session_config(allow_growth=True)
        tf.compat.v1.reset_default_graph()
        with tf.compat.v1.Session(config=session_config) as sess:
            estimator = get_model(**model_args)
            serving_input_receiver_fn = get_serving_input_receiver_fn(**serving_input_receiver_args)

            input_receiver = serving_input_receiver_fn()
            # Build the inference graph by invoking model_fn in PREDICT mode.
            estimator_spec = estimator.model_fn(
                features=input_receiver.features,
                labels=None,
                mode=tf.estimator.ModeKeys.PREDICT,
                config=estimator.config,
            )

            input_tensors_dict = input_receiver.receiver_tensors
            output_tensors_dict = estimator_spec.predictions
            inputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in input_tensors_dict.items()}
            outputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in output_tensors_dict.items()}

            input_tensor_names = [t.name for t in inputs_dict.values()]
            output_tensor_names = [t.name for t in outputs_dict.values()]

            # Load trained weights before freezing variables into constants.
            graph_saver = estimator_spec.scaffold.saver or tf.compat.v1.train.Saver(sharded=True)
            graph_saver.restore(sess, estimator.latest_checkpoint())

            # Resource-typed inputs cannot be fed; drop them, then drop inputs
            # the frozen graph no longer references.
            input_tensor_names = inputs_without_resource(sess, input_tensor_names)
            frozen_graph = freeze_session(sess, input_names=input_tensor_names, output_names=output_tensor_names)
            input_tensor_names = remove_redundant_inputs(frozen_graph, input_tensor_names)

        tf.compat.v1.reset_default_graph()
        with tf.compat.v1.Session(config=estimator.config.session_config):
            # tf2onnx graph-level cleanup/optimization of the frozen graph.
            frozen_graph = tf_optimize(input_tensor_names, output_tensor_names, frozen_graph)
        tf.compat.v1.reset_default_graph()

        return Model(frozen_graph, None, inputs_dict, outputs_dict)
+
+
class TFKerasLoader(BaseLoader):
    """
    Loads keras model from source code

    The tf-allow-growth flag control limiting GPU memory growth feature
    (https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). By default it is disabled.
    """

    required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME

    def __init__(self, tf_allow_growth: bool = False, **kwargs):
        self._allow_growth = tf_allow_growth
        # Remaining kwargs are forwarded (signature-filtered) to the user's get-model fn.
        self._model_args = kwargs

    def load(self, model_path: Union[str, Path], **_) -> Model:
        """Build the Keras model and derive input/output TensorSpecs.

        The user module's ``GET_MODEL_FN_NAME`` callable must return
        ``(model, call_fn)`` where ``call_fn`` supports ``get_concrete_function``.
        """
        # TODO fix: RuntimeError: Physical devices cannot be modified after being initialized
        # if self._allow_growth:
        #     physical_devices = tf.config.experimental.list_physical_devices("GPU")
        #     for device in physical_devices:
        #         tf.config.experimental.set_memory_growth(device, True)

        tf.keras.backend.clear_session()
        tf.keras.backend.set_learning_phase(False)

        if isinstance(model_path, Path):
            model_path = model_path.as_posix()

        get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
        if get_model is None:
            raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")

        model_args = filter_fn_args(self._model_args, fn=get_model)

        model, call_fn = get_model(**model_args)

        inputs_dict: Dict[str, TensorSpec] = {
            input_name: TensorSpec(t.name, t.dtype.name, tuple(t.shape.as_list()))
            for input_name, t in zip(model.input_names, model.inputs)
        }

        # Trace the call function once to obtain concrete output tensors.
        concrete_func = call_fn.get_concrete_function(
            *(tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in inputs_dict.items())
        )

        output_tensors_names = [tensor.name for tensor in concrete_func.outputs]

        outputs_dict: Dict[str, TensorSpec] = {
            output_name: TensorSpec(output_tensor_name, t.dtype.name, tuple(t.shape.as_list()))
            for output_name, output_tensor_name, t in zip(model.output_names, output_tensors_names, model.outputs)
        }

        tf.keras.backend.clear_session()
        tf.keras.backend.set_learning_phase(False)

        def _add_suffix_as_quickfix_for_tf24_func_refactor(spec):
            # NOTE(review): per the function name, TF 2.4's function-tracing refactor
            # apparently dropped the ":0" suffix from input names; restore it so
            # specs keep matching graph tensor names — confirm against TF 2.4 notes.
            if not spec.name.endswith(":0"):
                spec = spec._replace(name=spec.name + ":0")
            return spec

        inputs_dict = {name: _add_suffix_as_quickfix_for_tf24_func_refactor(spec) for name, spec in inputs_dict.items()}

        return Model(model, None, inputs_dict, outputs_dict)
+
+
class TFSavedModelLoader(BaseLoader):
    """Loads a SavedModel (TF1 or TF2 flavor) and normalizes it to a GraphDef + tensor name maps."""

    def __init__(self, tf_allow_growth: bool = False):
        # When set, enable per-GPU memory growth before touching the model.
        self._allow_growth = tf_allow_growth

    def load(self, model_path: Union[str, Path], **kwargs) -> Model:
        """Read the SavedModel at ``model_path`` and return a toolkit Model.

        TF2 path: uses tf2onnx's private loader and the concrete function
        signature to recover logical input/output names. TF1 path: loads via
        the v1 loader inside a session and freezes the graph.
        """
        if isinstance(model_path, Path):
            model_path = model_path.as_posix()
        tf.compat.v1.reset_default_graph()

        if self._allow_growth:
            physical_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)

        if is_tf2():
            from tf2onnx.tf_loader import _from_saved_model_v2  # pytype: disable=import-error

            (
                graph_def,
                input_names,
                output_names,
                concrete_func,
                imported,
                initialized_tables,
                tensors_to_rename,
            ) = _from_saved_model_v2(
                model_path=model_path,
                input_names=None,
                output_names=None,
                tag=None,
                signature_def=[],
                concrete_function_index=None,
                large_model=False,
                use_graph_names=False,
            )

            # inspired by
            # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/saved_model_cli.py#L205
            if concrete_func.structured_input_signature:
                input_args, input_kwargs = concrete_func.structured_input_signature
                input_names = list(input_kwargs)
                assert (
                    not input_args
                ), f"Not supported args in concrete function signature args={input_args}, kwargs={input_kwargs}"
            elif concrete_func._arg_keywords:  # pylint: disable=protected-access
                # For pure ConcreteFunctions we might have nothing better than _arg_keywords.
                assert concrete_func._num_positional_args in [0, 1]
                input_names = concrete_func._arg_keywords

            # Resource handles (variables, tables) are not feedable inputs.
            input_tensors = [tensor for tensor in concrete_func.inputs if tensor.dtype != tf.dtypes.resource]
            inputs = {name: tensor.name for name, tensor in zip(input_names, input_tensors)}

            # they are already flattened
            output_tensors = [tensor for tensor in concrete_func.outputs if tensor.dtype != tf.dtypes.resource]
            output_names = sorted(concrete_func.structured_outputs)  # because outputs are in flatten form
            outputs = {name: tensor.name for name, tensor in zip(output_names, output_tensors)}
        else:
            session_config = create_session_config(allow_growth=True)
            with tf.compat.v1.Session(config=session_config) as sess:
                graph_def, inputs, outputs = _from_saved_model_v1(sess, model_path, tag=None, signatures=[])

        # Resolve plain tensor names into TensorSpecs (dtype/shape).
        inputs, outputs = handle_tensor_specs(graph_def, inputs, outputs)

        return Model(graph_def, None, inputs, outputs)
+
+
class TFRunner(BaseRunner):
    """Factory runner: picks the TF1 or TF2 inference session for a loaded model."""

    def __init__(self):
        pass

    def init_inference(self, model: Model):
        # TF2 uses an eager ConcreteFunction session; TF1 uses a classic graph session.
        session_cls = TF2RunnerSession if is_tf2() else TF1RunnerSession
        return session_cls(model=model)
+
+
class TF1RunnerSession(BaseRunnerSession):
    """Feeds a frozen GraphDef through a classic TF1 session via feed_dict."""

    def __init__(self, model: Model):
        super().__init__(model)

        assert isinstance(model.handle, tf.compat.v1.GraphDef)

        self._inputs = None
        self._outputs = None
        self._session = None
        self._old_env_values = {}

    def __enter__(self):
        self._old_env_values = self._set_env_variables()

        tf.compat.v1.reset_default_graph()

        self._session = tf.compat.v1.Session(config=create_session_config(allow_growth=True))
        self._session.__enter__()

        tf.import_graph_def(self._model.handle, name="")

        def _lookup(spec_map):
            # Resolve recorded tensor names against the freshly imported graph.
            return {name: self._session.graph.get_tensor_by_name(spec.name) for name, spec in spec_map.items()}

        self._inputs = _lookup(self._model.inputs)
        self._outputs = _lookup(self._model.outputs)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._session.__exit__(exc_type, exc_value, traceback)
        tf.compat.v1.reset_default_graph()
        self._inputs = None
        self._outputs = None
        self._session = None
        self._recover_env_variables(self._old_env_values)

    def __call__(self, x: Dict[str, object]):
        """Run one inference step; ``x`` maps input names to numpy arrays."""
        feed = {}
        for name, placeholder in self._inputs.items():
            feed[placeholder] = x[name]
        return self._session.run(self._outputs, feed_dict=feed)
+
+
class TF2RunnerSession(BaseRunnerSession):
    """Runs a frozen GraphDef eagerly in TF2 through a wrapped ConcreteFunction."""

    def __init__(self, model: Model):
        super().__init__(model)
        assert isinstance(model.handle, tf.compat.v1.GraphDef)
        self._concrete_func = None

    def __enter__(self):
        tf.compat.v1.reset_default_graph()
        input_tensor_names = [spec.name for spec in self._model.inputs.values()]
        output_tensor_names = [spec.name for spec in self._model.outputs.values()]
        # Wrap the imported graph into a callable ConcreteFunction.
        self._concrete_func = wrap_function.function_from_graph_def(
            self._model.handle, input_tensor_names, output_tensor_names
        )
        # Attach a keyword signature so the function can be called with named inputs.
        self._concrete_func._signature = [
            tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in self._model.inputs.items()
        ]
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._concrete_func = None
        tf.compat.v1.reset_default_graph()

    def __call__(self, x: Dict[str, object]):
        """Run inference; returns a dict of numpy arrays keyed by logical output name."""
        x = tf.nest.map_structure(tf.convert_to_tensor, x)
        y_pred = self._concrete_func(**x)
        # Re-pack the flat output sequence under the model's logical output names.
        output_struct = {name: spec.name for name, spec in self._model.outputs.items()}
        y_pred = tf.nest.map_structure(lambda t: t.numpy(), y_pred)
        y_pred = tf.nest.pack_sequence_as(output_struct, y_pred)
        return y_pred
+
+
class TFSavedModelSaver(BaseSaver):
    """Serializes a loaded model back to the SavedModel format."""

    def save(self, model, model_path: Union[str, Path], dataloader_fn) -> None:
        """Write ``model`` to ``model_path``.

        TF2: the handle is a Keras model and is saved directly. TF1: the handle
        is a GraphDef that is re-imported into a session and exported via
        ``simple_save``.
        """
        if isinstance(model_path, Path):
            model_path = model_path.as_posix()
        if is_tf2():
            tf.keras.models.save_model(model=model.handle, filepath=model_path, overwrite=True)
        else:
            session_config = create_session_config(allow_growth=True)
            with tf.compat.v1.Session(config=session_config) as sess:
                tf.import_graph_def(model.handle, name="")

                is_func = is_function(sess.graph)
                if not is_func:
                    # Shape inference is only run for plain (non-function) graphs.
                    infer_shape(sess.graph, {})

                inputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.inputs.items()}
                outputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.outputs.items()}

                def _ensure_shape(tensors_dict, tensors_specs):
                    # Fall back to the recorded specs when the graph lost rank info.
                    for name, tensor in tensors_dict.items():
                        if tensor.shape.rank is None:
                            tensor.set_shape(tensors_specs[name].shape)
                    return tensors_dict

                inputs = _ensure_shape(inputs, model.inputs)
                outputs = _ensure_shape(outputs, model.outputs)

                LOGGER.info(inputs)
                LOGGER.info(outputs)

                tf.compat.v1.saved_model.simple_save(sess, model_path, inputs, outputs, legacy_init_op=None)
+
+
def handle_tensor_specs(
    graph_def, inputs: Dict[str, str], outputs: Dict[str, str]
) -> Tuple[Dict[str, TensorSpec], Dict[str, TensorSpec]]:
    """Resolve tensor names into TensorSpecs by importing the graph into a temp session.

    Args:
        graph_def: frozen GraphDef to inspect.
        inputs: mapping of logical input name -> tensor name in the graph.
        outputs: mapping of logical output name -> tensor name in the graph.

    Returns:
        ``(inputs, outputs)`` with TensorSpec values (name, dtype, shape).
    """
    session_config = tf.compat.v1.ConfigProto(graph_options=tf.compat.v1.GraphOptions(infer_shapes=True))
    tf.compat.v1.reset_default_graph()
    with tf.compat.v1.Session(config=session_config) as sess:
        tf.import_graph_def(graph_def, name="")

        def _specs_for(name_map):
            specs = {}
            for logical_name, tensor_name in name_map.items():
                tensor = sess.graph.get_tensor_by_name(tensor_name)
                specs[logical_name] = tensor2tensor_spec(tensor)
            return specs

        inputs = _specs_for(inputs)
        outputs = _specs_for(outputs)

    tf.compat.v1.reset_default_graph()
    return inputs, outputs
+
+
def tensor2tensor_spec(tensor):
    """Convert a TF tensor into the toolkit's TensorSpec (name, dtype, shape)."""
    dims = []
    for dim in tensor.shape:
        # TF1 yields Dimension objects (with .value); TF2 yields plain ints/None.
        dims.append(dim.value if hasattr(dim, "value") else dim)
    return TensorSpec(tensor.name, tensor.dtype.name, tuple(dims))
+
+
# Wire the TF loaders/savers/runners into the deployment-toolkit extension
# registries, keyed by model input type (source form) and serialized format.
loaders.register_extension(ModelInputType.TF_ESTIMATOR.value, TFEstimatorLoader)
loaders.register_extension(ModelInputType.TF_KERAS.value, TFKerasLoader)

# SavedModel and TF-TRT artifacts share the same loader/saver implementation.
loaders.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelLoader)
loaders.register_extension(Format.TF_TRT.value, TFSavedModelLoader)

savers.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelSaver)
savers.register_extension(Format.TF_TRT.value, TFSavedModelSaver)

runners.register_extension(ModelInputType.TF_ESTIMATOR.value, TFRunner)
runners.register_extension(ModelInputType.TF_KERAS.value, TFRunner)
runners.register_extension(Format.TF_SAVEDMODEL.value, TFRunner)
runners.register_extension(Format.TF_TRT.value, TFRunner)

+ 129 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/library/utils.py

@@ -0,0 +1,129 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import Counter
+from typing import Callable, Dict, List, Optional
+
+import networkx as nx
+
+from ..core import ShapeSpec
+
+
def infer_precision(
    nx_graph: nx.Graph,
    input_names: List[str],
    output_names: List[str],
    get_node_dtype_fn: Callable,
):
    """Guess the dominant non-integer dtype of a model graph.

    Collects the ``dtype`` attribute of every node, discards integer/bool
    dtypes, and returns the most frequent remaining entry (``None`` entries
    from nodes without a dtype are counted as-is).

    Note: ``input_names``/``output_names``/``get_node_dtype_fn`` belong to the
    shared toolkit signature but are unused here.
    """
    relevant = []
    for node_name in nx_graph.nodes:
        dtype = nx_graph.nodes[node_name].get("dtype", None)
        if dtype is None or dtype.kind not in ["i", "b"]:
            relevant.append(dtype)
    return Counter(relevant).most_common()[0][0]
+
+
def get_shapes_with_dynamic_axes(dataloader, batch_size_dim: Optional[int] = None):
    """Scan up to 100 batches and report per-tensor shapes, marking varying axes with -1.

    Args:
        dataloader: iterable of ``(ids, x, y)`` batches where ``x``/``y`` map
            tensor names to arrays.
        batch_size_dim: when given, that axis is always forced to -1 and the
            shapes are returned as tuples (otherwise lists).

    Returns:
        ``(input_shapes, output_shapes)`` dictionaries keyed by tensor name.
    """

    def _merge(observed, shapes):
        # Collapse any axis whose size differs from the recorded one to -1.
        for key, tensor in observed.items():
            for axis, size in enumerate(tensor.shape):
                if shapes[key][axis] not in (-1, size):
                    shapes[key][axis] = -1

    # Seed the shape maps from the first batch only.
    input_shapes = {}
    output_shapes = {}
    for _, x, y in dataloader:
        input_shapes = {k: list(v.shape) for k, v in x.items()}
        output_shapes = {k: list(v.shape) for k, v in y.items()}
        break

    # Sample a bounded number of batches to detect dynamic dimensions.
    max_num_iters = 100
    for idx, (_, x, y) in enumerate(dataloader):
        if idx >= max_num_iters:
            break
        _merge(x, input_shapes)
        _merge(y, output_shapes)

    if batch_size_dim is not None:

        def _with_batch_axis(shape):
            marked = list(shape)
            marked[batch_size_dim] = -1
            return tuple(marked)

        input_shapes = {name: _with_batch_axis(shape) for name, shape in input_shapes.items()}
        output_shapes = {name: _with_batch_axis(shape) for name, shape in output_shapes.items()}

    return input_shapes, output_shapes
+
+
def get_dynamic_axes(dataloader, batch_size_dim: Optional[int] = None):
    """Build an ONNX-export-style ``dynamic_axes`` mapping from observed batch shapes.

    Scans the dataloader (via ``get_shapes_with_dynamic_axes``) and names every
    axis whose size varied across batches; the ``batch_size_dim`` axis is
    always marked dynamic as well.

    Fix: the original assignment ``dynamic_axes[k] = {idx: ...}`` replaced the
    whole per-tensor dict on every match, so a tensor with several varying axes
    kept only the last one. Axes are now accumulated with ``setdefault``.

    Returns:
        Dict mapping tensor name -> {axis_index: axis_name}.
    """
    input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim=batch_size_dim)
    all_shapes = {**input_shapes, **output_shapes}
    dynamic_axes = {}

    for k, shape in all_shapes.items():
        for idx, s in enumerate(shape):
            if s == -1:
                # setdefault preserves previously recorded dynamic axes for this tensor
                dynamic_axes.setdefault(k, {})[idx] = k + "_" + str(idx)

    for k in all_shapes:
        dynamic_axes.setdefault(k, {})[batch_size_dim] = "batch_size_" + str(batch_size_dim)

    return dynamic_axes
+
+
def get_input_shapes(dataloader, max_batch_size=1) -> Dict[str, ShapeSpec]:
    """Collect min/opt/max input shapes across all batches of a dataloader.

    Per input tensor: ``min`` keeps the smallest observed size on each axis
    (batch axis forced to 1), ``max`` the largest (batch axis forced to
    ``max_batch_size``), and ``opt`` is the most frequently observed shape
    (batch axis forced to ``max_batch_size``).
    """
    counters: Dict[str, Counter] = {}
    min_shapes: Dict[str, tuple] = {}
    max_shapes: Dict[str, tuple] = {}

    for idx, (ids, x, y) in enumerate(dataloader):
        if idx == 0:
            # Lazily initialize per-input accumulators from the first batch.
            for name, tensor in x.items():
                counters[name] = Counter()
                min_shapes[name] = [float("inf")] * tensor.ndim
                max_shapes[name] = [float("-inf")] * tensor.ndim

        for name, tensor in x.items():
            shape = tensor.shape
            counters[name][shape] += 1
            min_shapes[name] = tuple(min(a, b) for a, b in zip(min_shapes[name], shape))
            max_shapes[name] = tuple(max(a, b) for a, b in zip(max_shapes[name], shape))

    opt_shapes: Dict[str, tuple] = {name: counter.most_common(1)[0][0] for name, counter in counters.items()}

    shapes = {}
    for name in opt_shapes:  # min_shapes and max_shapes share the same keys
        shapes[name] = ShapeSpec(
            min=(1,) + min_shapes[name][1:],
            max=(max_batch_size,) + max_shapes[name][1:],
            opt=(max_batch_size,) + opt_shapes[name][1:],
        )
    return shapes

+ 61 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/report.py

@@ -0,0 +1,61 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import re
+from typing import Dict, List
+
+from natsort import natsorted
+from tabulate import tabulate
+
+
def sort_results(results: List):
    """Natural-sort result rows by all of their values (taken in key order)."""
    return natsorted(results, key=lambda row: list(row.values()))
+
+
def save_results(filename: str, data: List, formatted: bool = False):
    """Append result rows to a CSV file, preceded by a header row.

    Args:
        filename: target CSV path. The file is opened in append mode, so each
            call writes another header line (apparently intended for
            accumulating benchmark runs).
        data: non-empty list of dicts sharing the keys of the first row.
        formatted: when True, prettify keys via ``format_data`` first.

    Fix: the csv module requires the file to be opened with ``newline=""``;
    without it, extra blank lines are written on Windows.
    """
    data = format_data(data=data) if formatted else data
    with open(filename, "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
+
+
def format_data(data: List[Dict]) -> List[Dict]:
    """Return a copy of ``data`` with every row's keys prettified via ``format_keys``."""
    return [format_keys(data=item) for item in data]
+
+
def format_keys(data: Dict) -> Dict:
    """Prettify all keys of ``data`` with ``format_key``, keeping values untouched."""
    formatted = {}
    for key, value in data.items():
        formatted[format_key(key=key)] = value
    return formatted
+
+
def format_key(key: str) -> str:
    """Turn a snake_case / space-separated key into capitalized words.

    e.g. ``"avg_latency ms"`` -> ``"Avg Latency Ms"``.
    """
    words = re.split("_| ", key)
    return " ".join(word.capitalize() for word in words)
+
+
def show_results(results: List[Dict]):
    """Pretty-print result rows as a table on stdout (column order = first row's keys)."""
    headers = list(results[0].keys())
    rows = [list(row.values()) for row in results]
    print(tabulate(rows, headers=headers))

+ 14 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/__init__.py

@@ -0,0 +1,14 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .runner import TritonInferenceRunner  # noqa: F401

+ 51 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/base.py

@@ -0,0 +1,51 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Optional
+
+LOGGER = logging.getLogger("triton_inference_runner.base")
+
+
class BaseRunner:
    """Shared state and health checks for Triton inference runners.

    Holds the server/model identity, the dataloader, and the timeout policy
    used by the concrete HTTP/gRPC runner implementations.
    """

    DEFAULT_MAX_RESP_WAIT_S = 120
    DEFAULT_MAX_FINISH_WAIT_S = 900  # 15min

    def __init__(
        self,
        server_url: str,
        model_name: str,
        model_version: str,
        *,
        dataloader,
        verbose=False,
        response_wait_time: Optional[float] = None,
    ):
        self._server_url = server_url
        self._model_name = model_name
        self._model_version = model_version
        self._dataloader = dataloader
        self._verbose = verbose

        if response_wait_time is None:
            response_wait_time = self.DEFAULT_MAX_RESP_WAIT_S
        self._response_wait_t = int(response_wait_time)
        # NOTE(review): despite the "_ms" suffix this is seconds * 1_000_000,
        # i.e. microseconds — presumably for a client timeout parameter;
        # confirm before renaming.
        self._response_wait_t_ms = self._response_wait_t * 1000 * 1000
        self._max_wait_time = max(self._response_wait_t, self.DEFAULT_MAX_FINISH_WAIT_S)

    def _verify_triton_state(self, triton_client):
        """Return a list of human-readable problems; empty when server and model are ready."""
        checks = (
            (lambda: triton_client.is_server_live(), f"Triton server {self._server_url} is not live"),
            (lambda: triton_client.is_server_ready(), f"Triton server {self._server_url} is not ready"),
            (
                lambda: triton_client.is_model_ready(self._model_name, self._model_version),
                f"Model {self._model_name}:{self._model_version} is not ready",
            ),
        )
        # Mirror the original if/elif chain: stop at the first failed check.
        for check, message in checks:
            if not check():
                return [message]
        return []

+ 238 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/grpc.py

@@ -0,0 +1,238 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import logging
+import queue
+import threading
+from pathlib import Path
+from typing import Optional
+
+# pytype: disable=import-error
+try:
+    from tritonclient import utils as client_utils  # noqa: F401
+except ImportError:
+    import tritonclientutils as client_utils  # noqa: F401
+
+try:
+    import tritonclient.grpc as grpc_client
+except ImportError:
+    import tritongrpcclient as grpc_client
+# pytype: enable=import-error
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from .base import BaseRunner
+
+LOGGER = logging.getLogger("triton_inference_runner.grpc")
+
+
class SyncInferenceRunner(BaseRunner):
    """Blocking gRPC inference: one request per dataloader batch, in order."""

    def __iter__(self):
        """Yield ``(ids, x, y_pred, y_real)`` for every batch of the dataloader."""
        LOGGER.debug(f"Connecting to {self._server_url}")
        client = grpc_client.InferenceServerClient(url=self._server_url, verbose=self._verbose)

        error = self._verify_triton_state(client)
        if error:
            raise RuntimeError(f"Could not communicate to Triton Server: {error}")

        LOGGER.debug(
            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} are up and ready!"
        )

        model_config = client.get_model_config(self._model_name, self._model_version)
        model_metadata = client.get_model_metadata(self._model_name, self._model_version)
        LOGGER.info(f"Model config {model_config}")
        LOGGER.info(f"Model metadata {model_metadata}")

        inputs = {tm.name: tm for tm in model_metadata.inputs}
        outputs = {tm.name: tm for tm in model_metadata.outputs}
        output_names = list(outputs)
        outputs_req = [grpc_client.InferRequestedOutput(name) for name in outputs]

        for ids, x, y_real in self._dataloader:
            infer_inputs = []
            for name, meta in inputs.items():
                # Cast the payload to the dtype the server expects before shipping.
                payload = x[name].astype(client_utils.triton_to_np_dtype(meta.datatype))
                infer_input = grpc_client.InferInput(name, payload.shape, meta.datatype)
                infer_input.set_data_from_numpy(payload)
                infer_inputs.append(infer_input)

            results = client.infer(
                model_name=self._model_name,
                model_version=self._model_version,
                inputs=infer_inputs,
                outputs=outputs_req,
                timeout=self._response_wait_t,
            )
            y_pred = {name: results.as_numpy(name) for name in output_names}
            yield ids, x, y_pred, y_real
+
+
class AsyncInferenceRunner(BaseRunner):
    """Overlapped gRPC inference.

    A daemon thread (``req_loop``) issues async requests with at most
    ``max_unresponded_requests`` in flight, while ``__iter__`` drains completed
    results from a queue. A Condition guards the in-flight counter and the
    shared error list.
    """

    DEFAULT_MAX_UNRESP_REQS = 128

    def __init__(
        self,
        server_url: str,
        model_name: str,
        model_version: str,
        *,
        dataloader,
        verbose=False,
        response_wait_time: Optional[float] = None,
        max_unresponded_requests: Optional[int] = None,
    ):
        super().__init__(
            server_url,
            model_name,
            model_version,
            dataloader=dataloader,
            verbose=verbose,
            response_wait_time=response_wait_time,
        )
        self._max_unresp_reqs = (
            self.DEFAULT_MAX_UNRESP_REQS if max_unresponded_requests is None else max_unresponded_requests
        )

        # Completed results; bounded indirectly by _max_unresp_reqs.
        self._results = queue.Queue()
        self._processed_all = False
        # May hold strings (local timeouts) or client error objects (see _on_result).
        self._errors = []
        self._num_waiting_for = 0
        self._sync = threading.Condition()
        self._req_thread = threading.Thread(target=self.req_loop, daemon=True)

    def __iter__(self):
        """Yield ``(ids, x, y_pred, y_real)`` as responses arrive (order not guaranteed)."""
        self._req_thread.start()
        timeout_s = 0.050  # check flags processed_all and error flags every 50ms
        while True:
            try:
                ids, x, y_pred, y_real = self._results.get(timeout=timeout_s)
                yield ids, x, y_pred, y_real
            except queue.Empty:
                shall_stop = self._processed_all or self._errors
                if shall_stop:
                    break

        LOGGER.debug("Waiting for request thread to stop")
        self._req_thread.join()
        if self._errors:
            error_msg = "\n".join(map(str, self._errors))
            raise RuntimeError(error_msg)

    def _on_result(self, ids, x, y_real, output_names, result, error):
        # Callback invoked by the gRPC client thread for each finished request.
        with self._sync:
            request_id = str(ids[0])
            NOT_MATCHING_REQUEST_ID_MSG = (
                "Error during processing result - request_id doesn't match. This shouldn't have happened."
            )
            if error:
                response_id = error.get_response().id
                if response_id != request_id:
                    raise RuntimeError(NOT_MATCHING_REQUEST_ID_MSG)
                self._errors.append(error)
            else:
                response_id = result.get_response().id
                if response_id != request_id:
                    raise RuntimeError(NOT_MATCHING_REQUEST_ID_MSG)
                y_pred = {name: result.as_numpy(name) for name in output_names}
                self._results.put((ids, x, y_pred, y_real))
            # Free one in-flight slot and wake the request loop.
            self._num_waiting_for -= 1
            self._sync.notify_all()

    def req_loop(self):
        """Request-producer loop run on the daemon thread."""
        LOGGER.debug(f"Connecting to {self._server_url}")
        client = grpc_client.InferenceServerClient(url=self._server_url, verbose=self._verbose)

        self._errors = self._verify_triton_state(client)
        if self._errors:
            return

        LOGGER.debug(
            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!"
        )

        model_config = client.get_model_config(self._model_name, self._model_version)
        model_metadata = client.get_model_metadata(self._model_name, self._model_version)
        LOGGER.info(f"Model config {model_config}")
        LOGGER.info(f"Model metadata {model_metadata}")

        inputs = {tm.name: tm for tm in model_metadata.inputs}
        outputs = {tm.name: tm for tm in model_metadata.outputs}
        output_names = list(outputs)

        self._num_waiting_for = 0

        for ids, x, y_real in self._dataloader:
            infer_inputs = []
            for name in inputs:
                data = x[name]
                datatype = inputs[name].datatype
                infer_input = grpc_client.InferInput(name, data.shape, datatype)

                # Cast the payload to the dtype the server expects.
                target_np_dtype = client_utils.triton_to_np_dtype(datatype)
                data = data.astype(target_np_dtype)

                infer_input.set_data_from_numpy(data)
                infer_inputs.append(infer_input)

            outputs_req = [grpc_client.InferRequestedOutput(name) for name in outputs]

            with self._sync:

                def _check_can_send():
                    return self._num_waiting_for < self._max_unresp_reqs

                # Throttle: block until an in-flight slot frees up (or time out).
                can_send = self._sync.wait_for(_check_can_send, timeout=self._response_wait_t)
                if not can_send:
                    error_msg = f"Runner could not send new requests for {self._response_wait_t}s"
                    self._errors.append(error_msg)
                    self._sync.notify_all()
                    break

                request_id = str(ids[0])
                callback = functools.partial(AsyncInferenceRunner._on_result, self, ids, x, y_real, output_names)
                client.async_infer(
                    model_name=self._model_name,
                    model_version=self._model_version,
                    inputs=infer_inputs,
                    outputs=outputs_req,
                    callback=callback,
                    request_id=request_id,
                )
                self._num_waiting_for += 1
                self._sync.notify_all()

        # wait till receive all requested data
        with self._sync:

            def _all_processed():
                LOGGER.debug(f"wait for {self._num_waiting_for} unprocessed jobs")
                return self._num_waiting_for == 0

            self._processed_all = self._sync.wait_for(_all_processed, self._max_wait_time)
            if not self._processed_all:
                error_msg = f"Runner {self._response_wait_t}s timeout received while waiting for results from server"
                self._errors.append(error_msg)

            self._sync.notify_all()

        LOGGER.debug("Finished request thread")
+ 190 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/http.py

@@ -0,0 +1,190 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+
+# pytype: disable=import-error
+try:
+    from tritonclient import utils as client_utils  # noqa: F401
+except ImportError:
+    import tritonclientutils as client_utils  # noqa: F401
+
+try:
+    import tritonclient.http as http_client
+except (ImportError, RuntimeError):
+    import tritonhttpclient as http_client
+# pytype: enable=import-error
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from .base import BaseRunner
+
+LOGGER = logging.getLogger("triton_inference_runner.http")
+
+
+class HTTPInferenceRunner(BaseRunner):
+    """Shared base for HTTP runners; adds pretty-printing of server responses."""
+
+    def _parse_content(self, response):
+        # The HTTP client returns config/metadata as plain dicts; render as indented JSON for logs.
+        return json.dumps(response, indent=4)
+
+
+class SyncInferenceRunner(HTTPInferenceRunner):
+    """Synchronous HTTP runner: one blocking inference request per dataloader batch.
+
+    Iterating yields ``(ids, x, y_pred, y_real)`` tuples where ``y_pred`` maps
+    each model output name to its numpy result.
+    """
+
+    def __iter__(self):
+        LOGGER.debug(f"Connecting to {self._server_url}")
+        # Both connection and network timeouts reuse the configured response wait time (seconds).
+        client = http_client.InferenceServerClient(
+            url=self._server_url,
+            verbose=self._verbose,
+            connection_timeout=self._response_wait_t,
+            network_timeout=self._response_wait_t,
+        )
+
+        # Fail fast if the server or model is not live/ready.
+        error = self._verify_triton_state(client)
+        if error:
+            raise RuntimeError(f"Could not communicate to Triton Server: {error}")
+
+        LOGGER.debug(
+            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!"
+        )
+
+        model_config = client.get_model_config(self._model_name, self._model_version)
+        model_metadata = client.get_model_metadata(self._model_name, self._model_version)
+        LOGGER.info(f"Model config {self._parse_content(model_config)}")
+        LOGGER.info(f"Model metadata {self._parse_content(model_metadata)}")
+
+        # Index tensor metadata by name for datatype lookup; request every declared output.
+        inputs = {tm["name"]: tm for tm in model_metadata["inputs"]}
+        outputs = {tm["name"]: tm for tm in model_metadata["outputs"]}
+        output_names = list(outputs)
+        outputs_req = [http_client.InferRequestedOutput(name) for name in outputs]
+
+        for ids, x, y_real in self._dataloader:
+            infer_inputs = []
+            for name in inputs:
+                # assumes the dataloader batch `x` is a dict keyed by model input name - TODO confirm
+                data = x[name]
+                datatype = inputs[name]["datatype"]
+                infer_input = http_client.InferInput(name, data.shape, datatype)
+
+                # Cast to the numpy dtype Triton declared for this input before serialization.
+                target_np_dtype = client_utils.triton_to_np_dtype(datatype)
+                data = data.astype(target_np_dtype)
+
+                infer_input.set_data_from_numpy(data)
+                infer_inputs.append(infer_input)
+
+            # Blocking request; the HTTP client's `timeout` is in milliseconds (see _response_wait_t_ms).
+            results = client.infer(
+                model_name=self._model_name,
+                model_version=self._model_version,
+                inputs=infer_inputs,
+                outputs=outputs_req,
+                timeout=self._response_wait_t_ms,
+            )
+            y_pred = {name: results.as_numpy(name) for name in output_names}
+            yield ids, x, y_pred, y_real
+
+
+class AsyncInferenceRunner(HTTPInferenceRunner):
+    """Asynchronous HTTP runner: keeps a bounded window of requests in flight.
+
+    Requests are submitted with ``async_infer``; once the window fills, results
+    are drained in submission order and yielded as ``(ids, x, y_pred, y_real)``.
+    """
+
+    # Cap on unresponded requests; also used as the HTTP client's connection pool size.
+    DEFAULT_MAX_UNRESP_REQS = 128
+
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        model_version: str,
+        *,
+        dataloader,
+        verbose=False,
+        response_wait_time: Optional[float] = None,
+        max_unresponded_requests: Optional[int] = None,
+    ):
+        """Configure the runner; ``max_unresponded_requests=None`` selects the class default."""
+        super().__init__(
+            server_url,
+            model_name,
+            model_version,
+            dataloader=dataloader,
+            verbose=verbose,
+            response_wait_time=response_wait_time,
+        )
+        self._max_unresp_reqs = (
+            self.DEFAULT_MAX_UNRESP_REQS if max_unresponded_requests is None else max_unresponded_requests
+        )
+
+    def __iter__(self):
+        # `concurrency` sizes the client's connection pool to match the in-flight request cap.
+        client = http_client.InferenceServerClient(
+            url=self._server_url,
+            verbose=self._verbose,
+            concurrency=self._max_unresp_reqs,
+            connection_timeout=self._response_wait_t,
+            network_timeout=self._response_wait_t,
+        )
+
+        # Unlike the sync runner, server-state problems do not raise: the generator
+        # simply yields nothing and the errors are left on self._errors for the caller.
+        self._errors = self._verify_triton_state(client)
+        if self._errors:
+            return
+
+        LOGGER.debug(
+            f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!"
+        )
+
+        model_config = client.get_model_config(self._model_name, self._model_version)
+        model_metadata = client.get_model_metadata(self._model_name, self._model_version)
+        LOGGER.info(f"Model config {self._parse_content(model_config)}")
+        LOGGER.info(f"Model metadata {self._parse_content(model_metadata)}")
+
+        inputs = {tm["name"]: tm for tm in model_metadata["inputs"]}
+        outputs = {tm["name"]: tm for tm in model_metadata["outputs"]}
+        output_names = list(outputs)
+
+        async_requests = []
+        for ids, x, y_real in self._dataloader:
+            infer_inputs = []
+            for name in inputs:
+                data = x[name]
+                datatype = inputs[name]["datatype"]
+                infer_input = http_client.InferInput(name, data.shape, datatype)
+
+                # Cast to the numpy dtype Triton declared for this input.
+                target_np_dtype = client_utils.triton_to_np_dtype(datatype)
+                data = data.astype(target_np_dtype)
+
+                infer_input.set_data_from_numpy(data)
+                infer_inputs.append(infer_input)
+
+            outputs_req = [http_client.InferRequestedOutput(name) for name in outputs]
+
+            # Request id is taken from the first sample id of the batch (for tracing/logs).
+            request_id = str(ids[0])
+            async_request = client.async_infer(
+                model_name=self._model_name,
+                model_version=self._model_version,
+                inputs=infer_inputs,
+                outputs=outputs_req,
+                request_id=request_id,
+                timeout=self._response_wait_t_ms,
+            )
+            async_requests.append((ids, x, y_real, async_request))
+
+            # NOTE(review): `>` lets the window grow to max_unresp_reqs + 1 before draining;
+            # `>=` would enforce the cap strictly - confirm intended.
+            if len(async_requests) > self._max_unresp_reqs:
+                yield from self._yield_response(async_requests, output_names)
+                async_requests = []
+
+        # Drain whatever remains after the dataloader is exhausted.
+        yield from self._yield_response(async_requests, output_names)
+
+        LOGGER.debug("Finished request thread")
+
+    def _yield_response(self, async_requests, output_names):
+        # Blocks on each pending request in submission order and converts it to numpy outputs.
+        for ids, x, y_real, async_response in async_requests:
+            result = async_response.get_result()
+            y_pred = {name: result.as_numpy(name) for name in output_names}
+
+            yield ids, x, y_pred, y_real

+ 78 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_inference_runner/runner.py

@@ -0,0 +1,78 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+from typing import Optional
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from ..utils import TritonClientProtocol, parse_server_url
+from .grpc import AsyncInferenceRunner as AsyncGRPCRunner
+from .grpc import SyncInferenceRunner as SyncGRPCRunner
+from .http import AsyncInferenceRunner as AsyncHTPPRunner
+from .http import SyncInferenceRunner as SyncHTTPRunner
+
+
+class TritonInferenceRunner:
+    """Facade that picks a sync/async GRPC/HTTP runner from the server URL scheme.
+
+    Iterating delegates to the selected concrete runner, which yields
+    ``(ids, x, y_pred, y_real)`` tuples.
+    """
+
+    # Protocol -> concrete runner class; protocol comes from parse_server_url().
+    # NOTE(review): "AsyncHTPPRunner" alias is a typo for "AsyncHTTPRunner" (internal name only).
+    async_runners = {
+        TritonClientProtocol.GRPC: AsyncGRPCRunner,
+        TritonClientProtocol.HTTP: AsyncHTPPRunner,
+    }
+
+    sync_runners = {
+        TritonClientProtocol.GRPC: SyncGRPCRunner,
+        TritonClientProtocol.HTTP: SyncHTTPRunner,
+    }
+
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        model_version: str,
+        dataloader_fn,
+        verbose: bool = False,
+        response_wait_time: Optional[float] = None,
+        max_unresponded_requests: int = 128,
+        synchronous: bool = False,
+    ):
+
+        # Strip the scheme: concrete runners expect a bare "host:port" address.
+        protocol, host, port = parse_server_url(server_url)
+        server_url = f"{host}:{port}"
+
+        if synchronous:
+            sync_runner_cls = TritonInferenceRunner.sync_runners[protocol]
+            self._runner = sync_runner_cls(
+                server_url,
+                model_name,
+                model_version,
+                dataloader=dataloader_fn(),
+                verbose=verbose,
+                response_wait_time=response_wait_time,
+            )
+        else:
+            async_runner_cls = TritonInferenceRunner.async_runners[protocol]
+            self._runner = async_runner_cls(
+                server_url,
+                model_name,
+                model_version,
+                dataloader=dataloader_fn(),
+                verbose=verbose,
+                response_wait_time=response_wait_time,
+                max_unresponded_requests=max_unresponded_requests,
+            )
+
+    def __iter__(self):
+        return self._runner.__iter__()

+ 14 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/__init__.py

@@ -0,0 +1,14 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .runner import TritonPerformanceRunner  # noqa: F401

+ 14 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/__init__.py

@@ -0,0 +1,14 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .runner import ModelAnalyzerRunner  # noqa: F401

+ 39 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/exceptions.py

@@ -0,0 +1,39 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+class ModelAnalyzerException(Exception):
+    """Raised when the Model Analyzer subprocess or its configuration fails."""
+
+    # NOTE(review): super().__init__(message) is not called, so e.args is empty
+    # and pickling/str() rely solely on the overridden __str__ - confirm intended.
+    def __init__(self, message: str):
+        self._message = message
+
+    def __str__(self):
+        """
+        Get the exception string representation.
+
+        Returns
+        -------
+        str
+            The message associated with this exception, or None if no message.
+        """
+        return self._message
+
+    @property
+    def message(self):
+        """
+        Get the exception message.
+
+        Returns
+        -------
+        str
+            The message associated with this exception, or None if no message.
+        """
+        return self._message

+ 89 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/model_analyzer.py

@@ -0,0 +1,89 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import subprocess
+from subprocess import CalledProcessError
+
+from .exceptions import ModelAnalyzerException
+
+# NOTE(review): SERVER_OUTPUT_TIMEOUT_SECS is not referenced anywhere in this module - confirm it is still needed.
+SERVER_OUTPUT_TIMEOUT_SECS = 5
+LOGGER = logging.getLogger(__name__)
+
+
+class ModelAnalyzerMode:
+    """String constants for the model-analyzer CLI subcommands."""
+
+    PROFILE = "profile"
+    ANALYZE = "analyze"
+    REPORT = "report"
+
+
+class ModelAnalyzerReportMode:
+    """String constants for the model-analyzer report mode (-m) flag."""
+
+    OFFLINE = "offline"
+    ONLINE = "online"
+
+
+class ModelAnalyzer:
+    """
+    Concrete Implementation of Model Analyzer interface that runs
+    analyzer locally as a subprocess.
+    """
+
+    # Executable name resolved via PATH.
+    _analyzer_path = "model-analyzer"
+
+    def __init__(self, config, timeout: int = None):
+        """
+        Parameters
+        ----------
+        config : AnalyzerConfig
+            the config object containing arguments for this server instance
+        timeout : int, optional
+            if set, the subprocess is wrapped in `timeout <seconds>` so it is
+            killed after that many seconds
+        """
+
+        self._analyzer_process = None
+        self._analyzer_config = config
+        self._log = None
+        self._timeout = timeout
+
+    def run(self, mode: str, verbose: bool = False, quiet: bool = False, report_mode: str = None):
+        """
+        Starts the model analyzer locally
+
+        Parameters
+        ----------
+        mode : str
+            one of the ModelAnalyzerMode constants (profile/analyze/report)
+        verbose / quiet : bool
+            forwarded as --verbose / --quiet CLI flags
+        report_mode : str, optional
+            one of the ModelAnalyzerReportMode constants, passed via -m
+
+        Raises
+        ------
+        ModelAnalyzerException
+            if the subprocess exits with a non-zero status
+        """
+
+        if self._analyzer_path:
+
+            # Optionally bound the run with the coreutils `timeout` wrapper.
+            cmd = []
+            if self._timeout:
+                cmd = ["timeout", str(self._timeout)]
+
+            cmd += [self._analyzer_path]
+            if verbose:
+                cmd += ["--verbose"]
+
+            if quiet:
+                cmd += ["--quiet"]
+
+            if report_mode:
+                cmd += ["-m"]
+                cmd += [report_mode]
+
+            # Subcommand first, then the flattened config-file arguments.
+            cmd += [mode]
+            cmd += self._analyzer_config.to_cli_string().split()
+
+            LOGGER.debug(f"Model Analyze command: {cmd}")
+            try:
+                # start_new_session detaches the child from our process group
+                # (so our signals do not kill it mid-profile).
+                subprocess.run(cmd, check=True, start_new_session=True)
+
+            except CalledProcessError as e:
+                # NOTE(review): e.output is None here because stdout is not captured - confirm intended.
+                raise ModelAnalyzerException(
+                    f"Running {self._analyzer_path} with {e.cmd} failed with"
+                    f" exit status {e.returncode} : {e.output}"
+                )

+ 113 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/model_analyzer_config.py

@@ -0,0 +1,113 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .exceptions import ModelAnalyzerException
+
+
+class ModelAnalyzerConfig:
+    """
+    A config class to set arguments to the Model Analyzer.
+    An argument set to None will use the default.
+    """
+
+    # Long-form arguments rendered as --key=value.
+    model_analyzer_args = [
+        "config-file",
+    ]
+
+    # Inputs that are mapped onto short-form options (see _input_to_options).
+    input_to_options = [
+        "config-file",
+    ]
+
+    def __init__(self):
+        # Args will be a dict with the string representation as key
+        self._args = {k: None for k in self.model_analyzer_args}
+
+        # NOTE(review): "-f" defaults to "config.yaml" while setting
+        # config["config-file"] populates _args, so to_cli_string() can emit both
+        # `-f config.yaml` and `--config-file=<value>` - confirm the CLI tolerates this.
+        self._options = {
+            "-f": "config.yaml",
+        }
+
+        self._input_to_options = {
+            "config-file": "-f",
+        }
+
+    def to_cli_string(self):
+        """
+        Utility function to convert a config into a
+        string of arguments to the server with CLI.
+        Returns
+        -------
+        str
+            the command consisting of all set arguments to
+            the model analyzer.
+            e.g. '--model-repository=/models --verbose=True'
+        """
+        # single dashed options, then verbose flags, then main args
+        args = [f"{k} {v}" for k, v in self._options.items() if v]
+        args += [f"--{k}={v}" for k, v in self._args.items() if v]
+
+        return " ".join(args)
+
+    @classmethod
+    def allowed_keys(cls):
+        """
+        Returns
+        -------
+        list of str
+            The keys that are allowed to be
+            passed into model_analyzer
+        """
+
+        return list(cls.model_analyzer_args) + list(cls.input_to_options)
+
+    def __getitem__(self, key):
+        """
+        Gets an arguments value in config
+        Parameters
+        ----------
+        key : str
+            The name of the argument to the model analyzer
+        Returns
+        -------
+            The value that the argument is set to in this config
+        Raises
+        ------
+        ModelAnalyzerException
+            If the key is not a supported argument
+        """
+
+        if key in self._args:
+            return self._args[key]
+        elif key in self._input_to_options:
+            return self._options[self._input_to_options[key]]
+        else:
+            raise ModelAnalyzerException(f"'{key}' Key not found in config")
+
+    def __setitem__(self, key, value):
+        """
+        Sets an arguments value in config
+        after checking if defined/supported.
+        Parameters
+        ----------
+        key : str
+            The name of the argument to the model analyzer
+        value : (any)
+            The value to which the argument is being set
+        Raises
+        ------
+        TritonModelAnalyzerException
+            If key is unsupported or undefined in the
+            config class
+        """
+        # _args is checked first, so a key present in both maps updates _args only.
+        if key in self._args:
+            self._args[key] = value
+        elif key in self._input_to_options:
+            self._options[self._input_to_options[key]] = value
+        else:
+            raise ModelAnalyzerException(f"The argument '{key}' to the Model Analyzer is not supported.")

+ 296 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/model_analyzer/runner.py

@@ -0,0 +1,296 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import pathlib
+import shutil
+import sys
+from distutils.version import LooseVersion
+from typing import List, Optional
+
+import yaml
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from ...core import EvaluationMode, MeasurementMode, OfflineMode
+from ...utils import log_dict, parse_server_url
+from .model_analyzer import ModelAnalyzer, ModelAnalyzerMode
+from .model_analyzer_config import ModelAnalyzerConfig
+
+# Resolve installed tritonclient / triton-model-analyzer versions; importlib.metadata
+# exists only on Python >= 3.8, older interpreters fall back to pkg_resources.
+# NOTE(review): comparing LooseVersion(sys.version) relies on the leading "X.Y.Z" of the
+# full interpreter banner string; sys.version_info would be the robust check - confirm.
+if LooseVersion(sys.version) >= LooseVersion("3.8.0"):
+    from importlib.metadata import version
+
+    TRITON_CLIENT_VERSION = LooseVersion(version("tritonclient"))
+    TRITON_MODEL_ANALYZER_VERSION = LooseVersion(version("triton-model-analyzer"))
+else:
+    import pkg_resources
+
+    TRITON_CLIENT_VERSION = LooseVersion(pkg_resources.get_distribution("tritonclient").version)
+    TRITON_MODEL_ANALYZER_VERSION = LooseVersion(pkg_resources.get_distribution("triton-model-analyzer").version)
+
+LOGGER = logging.getLogger("triton_performance_runner.model_analyzer")
+
+
+class ModelAnalyzerRunner:
+    """Drives Triton Model Analyzer end to end: writes profile/analyze YAML configs,
+    runs the `model-analyzer` CLI against a remote Triton server, and collects the
+    resulting checkpoints and CSV metrics into `result_path`."""
+
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        input_data: str,
+        input_shapes: List[str],
+        batch_sizes: List[int],
+        concurrency: List[int],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        evaluation_mode: EvaluationMode,
+        offline_mode: OfflineMode,
+        model_repository: str,
+        result_path: pathlib.Path,
+        output_shared_memory_size: int = 102400,
+        timeout: Optional[int] = None,
+        verbose: bool = False,
+    ):
+        # NOTE(review): `timeout` is accepted but never stored or forwarded to ModelAnalyzer - confirm.
+        log_dict(
+            "Selected configuration",
+            {
+                "server_url": server_url,
+                "model_name": model_name,
+                "input_data": input_data,
+                "input_shapes": input_shapes,
+                "batch_sizes": batch_sizes,
+                "concurrency": concurrency,
+                "measurement_mode": measurement_mode,
+                "measurement_interval": measurement_interval,
+                "measurement_request_count": measurement_request_count,
+                "evaluation_mode": evaluation_mode,
+                "offline_mode": offline_mode,
+                "output_shared_memory_size": output_shared_memory_size,
+                "model_repository": model_repository,
+                "result_path": result_path,
+                "verbose": verbose,
+            },
+        )
+
+        # A suffix implies a file path was given; Model Analyzer needs a directory.
+        if result_path.suffix:
+            raise ValueError(
+                "Results path for Model Analyzer is invalid. Please, provide the directory name. Example: results"
+            )
+
+        self._checkpoints = pathlib.Path("./checkpoints")
+        self._result_path = result_path
+        self._verbose = verbose
+
+        self._filename_model_inference = "metrics-model-inference.csv"
+        self._filename_model_gpu = "metrics-model-gpu.csv"
+
+        # Pre-build both CLI configs; run() executes profile first, then analyze.
+        self._profile_config = self._prepare_profile_config(
+            server_url=server_url,
+            model_name=model_name,
+            input_data=input_data,
+            input_shapes=input_shapes,
+            batch_sizes=batch_sizes,
+            concurrency=concurrency,
+            measurement_mode=measurement_mode,
+            measurement_interval=measurement_interval,
+            measurement_request_count=measurement_request_count,
+            evaluation_mode=evaluation_mode,
+            offline_mode=offline_mode,
+            model_repository=model_repository,
+            output_shared_memory_size=output_shared_memory_size,
+            checkpoints=self._checkpoints,
+            verbose=verbose,
+        )
+        self._analyze_config = self._prepare_analyze_config(
+            model_name=model_name,
+            result_path=result_path,
+            verbose=verbose,
+            filename_model_inference=self._filename_model_inference,
+            filename_model_gpu=self._filename_model_gpu,
+        )
+
+    def run(self):
+        """Run profile + analyze and move checkpoints and CSV metrics into result_path."""
+        self._result_path.mkdir(parents=True, exist_ok=True)
+
+        # Start from a clean checkpoint directory so only this run's checkpoints are collected.
+        if self._checkpoints.is_dir():
+            shutil.rmtree(self._checkpoints.as_posix())
+        self._checkpoints.mkdir(parents=True, exist_ok=True)
+
+        model_analyzer = ModelAnalyzer(config=self._profile_config)
+        model_analyzer.run(mode=ModelAnalyzerMode.PROFILE, verbose=self._verbose)
+
+        for file in self._checkpoints.iterdir():
+            if not file.is_file() or file.suffix != ".ckpt":
+                continue
+
+            LOGGER.info(f"Moving checkpoint {file.name} to {self._result_path}")
+            shutil.move(file, self._result_path / file.name)
+
+        # Analyze reads the checkpoints now located in result_path (see _prepare_analyze_config).
+        model_analyzer = ModelAnalyzer(config=self._analyze_config)
+        model_analyzer.run(mode=ModelAnalyzerMode.ANALYZE, verbose=self._verbose)
+
+        # The analyze config exports CSVs under /tmp/results (export_path="/tmp").
+        inference_metrics_file = pathlib.Path("/tmp") / "results" / self._filename_model_inference
+        gpu_metrics_file = pathlib.Path("/tmp") / "results" / self._filename_model_gpu
+
+        for file in [inference_metrics_file, gpu_metrics_file]:
+            LOGGER.info(f"Moving metrics {file.name} to {self._result_path}")
+            shutil.move(file, self._result_path / file.name)
+
+    def _prepare_profile_config(
+        self,
+        server_url: str,
+        model_name: str,
+        input_data: str,
+        input_shapes: List[str],
+        batch_sizes: List[int],
+        concurrency: List[int],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        evaluation_mode: EvaluationMode,
+        offline_mode: OfflineMode,
+        model_repository: str,
+        checkpoints: pathlib.Path,
+        output_shared_memory_size: int = 102400,
+        verbose: bool = False,
+    ):
+        # Writes config_profile.yaml in the CWD and returns a ModelAnalyzerConfig pointing at it.
+        protocol, host, port = parse_server_url(server_url)
+
+        perf_analyzer_config = self._perf_analyzer_config(
+            input_data,
+            input_shapes,
+            measurement_mode,
+            measurement_interval,
+            measurement_request_count,
+            evaluation_mode,
+            offline_mode,
+            output_shared_memory_size,
+        )
+
+        config = {
+            "model_repository": model_repository,
+            "triton_launch_mode": "remote",
+            "run_config_search_disable": True,
+            "perf_analyzer_flags": perf_analyzer_config,
+            "perf_analyzer_timeout": 3600,  # Workaround for Perf Analyzer timeout - use 1h
+            "profile_models": [model_name],
+            "batch_sizes": batch_sizes,
+            "concurrency": concurrency,
+            "verbose": verbose,
+            "checkpoint_directory": checkpoints.as_posix(),
+            "override_output_model_repository": True,
+            "client_protocol": protocol.value,
+            f"triton_{protocol.value}_endpoint": f"{host}:{port}",
+        }
+
+        if verbose:
+            log_dict("Model Analyzer profiling configuration", config)
+
+        with open("config_profile.yaml", "w") as file:
+            yaml.safe_dump(config, file)
+
+        config = ModelAnalyzerConfig()
+        config["config-file"] = "config_profile.yaml"
+
+        return config
+
+    def _prepare_analyze_config(
+        self,
+        model_name: str,
+        result_path: pathlib.Path,
+        filename_model_inference: str,
+        filename_model_gpu: str,
+        verbose: bool,
+    ):
+        # Writes config_analyze.yaml selecting which CSV columns to export for
+        # inference-level and GPU-level metrics.
+        inference_output_fields = [
+            "batch_size",
+            "concurrency",
+            "perf_throughput",
+            "perf_latency",
+            "perf_client_send_recv",
+            "perf_client_response_wait",
+            "perf_server_queue",
+            "perf_server_compute_input",
+            "perf_server_compute_infer",
+            "perf_server_compute_output",
+        ]
+        gpu_output_fields = [
+            "gpu_uuid",
+            "batch_size",
+            "concurrency",
+            "gpu_used_memory",
+            "gpu_free_memory",
+            "gpu_utilization",
+            "gpu_power_usage",
+        ]
+
+        # checkpoint_directory = result_path: analyze consumes the checkpoints
+        # that run() moved there after the profile step.
+        config = {
+            "analysis_models": model_name,
+            "checkpoint_directory": result_path.as_posix(),
+            "export_path": "/tmp",
+            "inference_output_fields": inference_output_fields,
+            "gpu_output_fields": gpu_output_fields,
+            "filename_model_inference": filename_model_inference,
+            "filename_model_gpu": filename_model_gpu,
+            "summarize": False,
+        }
+
+        if verbose:
+            log_dict("Model Analyzer analysis configuration", config)
+
+        with open("config_analyze.yaml", "w") as file:
+            yaml.safe_dump(config, file)
+
+        config = ModelAnalyzerConfig()
+        config["config-file"] = "config_analyze.yaml"
+
+        return config
+
+    def _perf_analyzer_config(
+        self,
+        input_data: str,
+        input_shapes: List[str],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        evaluation_mode: EvaluationMode,
+        offline_mode: OfflineMode,
+        output_shared_memory_size: int = 102400,
+    ):
+        # Builds the perf_analyzer_flags dict, adapting option formats to the
+        # installed tritonclient / triton-model-analyzer versions.
+        perf_analyzer_config = {
+            "measurement-interval": measurement_interval,
+        }
+
+        # Model Analyzer >= 1.8.0 expects input-data as a list of files.
+        if TRITON_MODEL_ANALYZER_VERSION >= LooseVersion("1.8.0"):
+            perf_analyzer_config["input-data"] = [input_data]
+        else:
+            perf_analyzer_config["input-data"] = input_data
+
+        # count_windows measurement mode appeared in tritonclient 2.11.0.
+        if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"):
+            perf_analyzer_config["measurement-mode"] = measurement_mode.value
+            perf_analyzer_config["measurement-request-count"] = measurement_request_count
+
+        # Offline evaluation routes tensors through shared memory.
+        if evaluation_mode == EvaluationMode.OFFLINE:
+            perf_analyzer_config["shared-memory"] = offline_mode.value
+            perf_analyzer_config["output-shared-memory-size"] = output_shared_memory_size
+
+        if input_shapes:
+            if TRITON_MODEL_ANALYZER_VERSION > LooseVersion("1.8.0"):
+                perf_analyzer_config["shape"] = input_shapes
+            else:
+                perf_analyzer_config["shape"] = input_shapes[0]
+                LOGGER.warning("Model Analyzer <= 1.8.0 support only single shape param for Perf Analyzer.")
+
+        return perf_analyzer_config

+ 15 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/__init__.py

@@ -0,0 +1,15 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .runner import PerfAnalyzerRunner  # noqa: F401
+from .warmup import PerfAnalyzerWarmupRunner  # noqa: F401

+ 41 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/exceptions.py

@@ -0,0 +1,41 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class PerfAnalyzerException(Exception):
+    def __init__(self, message: str):
+        self._message = message
+
+    def __str__(self):
+        """
+        Get the exception string representation.
+
+        Returns
+        -------
+        str
+            The message associated with this exception, or None if no message.
+        """
+        return self._message
+
+    @property
+    def message(self):
+        """
+        Get the exception message.
+
+        Returns
+        -------
+        str
+            The message associated with this exception, or None if no message.
+        """
+        return self._message

+ 159 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/perf_analyzer.py

@@ -0,0 +1,159 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import pathlib
+from subprocess import PIPE, CalledProcessError, Popen
+
+# method from PEP-366 to support relative import in executed modules
+from typing import List, Optional
+
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from .exceptions import PerfAnalyzerException
+
+MAX_INTERVAL_CHANGES = 10
+COUNT_INTERVAL_DELTA = 50
+TIME_INTERVAL_DELTA = 2000
+
+LOGGER = logging.getLogger(__name__)
+
+
+class PerfAnalyzer:
+    """
+    This class provides an interface for running workloads
+    with perf_analyzer.
+    """
+
+    def __init__(self, config, timeout: Optional[int]):
+        """
+        Parameters
+        ----------
+        config : PerfAnalyzerConfig
+            keys are names of arguments to perf_analyzer,
+            values are their values.
+        """
+        self.bin_path = "perf_analyzer"
+        self._config = config
+        self._output = ""
+        self._timeout = timeout
+
+    def run(self):
+        """
+        Runs the perf analyzer with the
+        initialized configuration
+
+        Returns
+        -------
+        None
+            The captured stdout is accumulated internally; retrieve it
+            with the output() method after this call succeeds.
+
+        Raises
+        ------
+        PerfAnalyzerException
+            If subprocess throws CalledProcessError
+        """
+        self._output = ""
+
+        for _ in range(MAX_INTERVAL_CHANGES):
+            command = [self.bin_path]
+            command += self._config.to_cli_string().replace("=", " ").split()
+
+            LOGGER.debug(f"Perf Analyze command: {command}")
+            if not self._timeout:
+                LOGGER.debug("Perf Analyze command timeout not set")
+            else:
+                LOGGER.debug(f"Perf Analyze command timeout: {self._timeout} [s]")
+
+            try:
+                self._run_with_stream(command=command)
+                return
+            except CalledProcessError as e:
+                if self._failed_with_measurement_inverval(e.output):
+                    if self._config["measurement-mode"] is None or self._config["measurement-mode"] == "count_windows":
+                        self._increase_request_count()
+                    else:
+                        self._increase_time_interval()
+                else:
+                    raise PerfAnalyzerException(
+                        f"Running perf_analyzer with {e.cmd} failed with" f" exit status {e.returncode} : {e.output}"
+                    )
+
+        raise PerfAnalyzerException(f"Ran perf_analyzer {MAX_INTERVAL_CHANGES} times, but no valid requests recorded.")
+
+    def output(self):
+        """
+        Returns
+        -------
+        The stdout output of the
+        last perf_analyzer run
+        """
+        if self._output:
+            return self._output
+        raise PerfAnalyzerException("Attempted to get perf_analyzer output " "without calling run first.")
+
+    def _run_with_stream(self, command: List[str]):
+        commands_lst = []
+
+        if self._timeout:
+            commands_lst = ["timeout", str(self._timeout)]
+
+        commands_lst.extend(command)
+        LOGGER.debug(f"Run with stream: {commands_lst}")
+        process = Popen(commands_lst, start_new_session=True, stdout=PIPE, encoding="utf-8")
+        streamed_output = ""
+        while True:
+            output = process.stdout.readline()
+            if output == "" and process.poll() is not None:
+                break
+            if output:
+                streamed_output += output
+                print(output.rstrip())
+
+        self._output += streamed_output
+        result = process.poll()
+        LOGGER.debug(f"Perf Analyzer process exited with result: {result}")
+
+        # WAR for Perf Analyzer exit code 0 when stabilization failed
+        if result == 0 and self._failed_with_measurement_inverval(streamed_output):
+            LOGGER.debug("Perf Analyzer finished with exit status 0, however measurement stabilization failed.")
+            result = 1
+
+        if result != 0:
+            raise CalledProcessError(returncode=result, cmd=commands_lst, output=streamed_output)
+
+    def _failed_with_measurement_inverval(self, output: str):
+        checks = [
+            output.find("Failed to obtain stable measurement"),
+            output.find("Please use a larger time window"),
+        ]
+        result = any([status != -1 for status in checks])
+
+        LOGGER.debug(f"Measurement stability message validation: {checks}. Result: {result}.")
+        return result
+
+    def _increase_request_count(self):
+        self._config["measurement-request-count"] += COUNT_INTERVAL_DELTA
+        LOGGER.debug(
+            "perf_analyzer's measurement request count is too small, "
+            f"increased to {self._config['measurement-request-count']}."
+        )
+
+    def _increase_time_interval(self):
+        self._config["measurement-interval"] += TIME_INTERVAL_DELTA
+        LOGGER.debug(
+            "perf_analyzer's measurement window is too small, "
+            f"increased to {self._config['measurement-interval']} ms."
+        )

+ 216 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/perf_config.py

@@ -0,0 +1,216 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+
+from .exceptions import PerfAnalyzerException
+
+
+class PerfAnalyzerConfig:
+    """
+    A config class to set arguments to the perf_analyzer.
+    An argument set to None will use the perf_analyzer's default.
+    """
+
+    perf_analyzer_args = [
+        "async",
+        "sync",
+        "measurement-interval",
+        "measurement-mode",
+        "measurement-request-count",
+        "concurrency-range",
+        "request-rate-range",
+        "request-distribution",
+        "request-intervals",
+        "binary-search",
+        "num-of-sequence",
+        "latency-threshold",
+        "max-threads",
+        "stability-percentage",
+        "max-trials",
+        "percentile",
+        "input-data",
+        "shared-memory",
+        "output-shared-memory-size",
+        "sequence-length",
+        "string-length",
+        "string-data",
+    ]
+
+    perf_analyzer_multiple_args = [
+        "shape",
+    ]
+
+    input_to_options = [
+        "model-name",
+        "model-version",
+        "batch-size",
+        "url",
+        "protocol",
+        "latency-report-file",
+        "streaming",
+    ]
+
+    input_to_verbose = ["verbose", "extra-verbose"]
+
+    def __init__(self):
+        """
+        Construct a PerfAnalyzerConfig
+        """
+
+        self._args = {k: None for k in self.perf_analyzer_args}
+        self._multiple_args = {k: [] for k in self.perf_analyzer_multiple_args}
+
+        self._options = {
+            "-m": None,
+            "-x": None,
+            "-b": None,
+            "-u": None,
+            "-i": None,
+            "-f": None,
+            "-H": None,
+            "-c": None,
+            "-t": None,
+        }
+        self._verbose = {"-v": None, "-v -v": None}
+
+        self._input_to_options = {
+            "model-name": "-m",
+            "model-version": "-x",
+            "batch-size": "-b",
+            "url": "-u",
+            "protocol": "-i",
+            "latency-report-file": "-f",
+            "streaming": "-H",
+            "concurrency": "-c",
+            "threads": "-t",
+        }
+
+        self._input_to_verbose = {"verbose": "-v", "extra-verbose": "-v -v"}
+
+    @classmethod
+    def allowed_keys(cls):
+        """
+        Returns
+        -------
+        list of str
+            The keys that are allowed to be
+            passed into perf_analyzer
+        """
+
+        return (
+            list(cls.perf_analyzer_args)
+            + list(cls.perf_analyzer_multiple_args)
+            + list(cls.input_to_options)
+            + list(cls.input_to_verbose)
+        )
+
+    def update_config(self, params=None):
+        """
+        Allows setting values from a
+        params dict
+
+        Parameters
+        ----------
+        params: dict
+            keys are allowed args to perf_analyzer
+        """
+
+        if params:
+            for key in params:
+                self[key] = params[key]
+
+    def to_cli_string(self):
+        """
+        Utility function to convert a config into a
+        string of arguments to the perf_analyzer with CLI.
+
+        Returns
+        -------
+        str
+            cli command string consisting of all arguments
+            to the perf_analyzer set in the config, without
+            the executable name.
+        """
+
+        # single dashed options, then verbose flags, then main args
+        args = [f"{k} {v}" for k, v in self._options.items() if v]
+        args += [k for k, v in self._verbose.items() if v]
+        args += [f"--{k}={v}" for k, v in self._args.items() if v]
+        for k, v in self._multiple_args.items():
+            for item in v:
+                args.append(f"--{k}={item}")
+
+        return " ".join(args)
+
+    def __getitem__(self, key: str):
+        """
+        Gets an arguments value in config
+
+        Parameters
+        ----------
+        key : str
+            The name of the argument to the perf_analyzer
+
+        Returns
+        -------
+            The value that the argument is set to in this config
+
+        Raises
+        ------
+        PerfAnalyzerException
+            If argument not found in the config
+        """
+
+        if key in self._args:
+            return self._args[key]
+        elif key in self._multiple_args:
+            return self._multiple_args[key]
+        elif key in self._input_to_options:
+            return self._options[self._input_to_options[key]]
+        elif key in self._input_to_verbose:
+            return self._verbose[self._input_to_verbose[key]]
+        else:
+            raise PerfAnalyzerException(f"'{key}' Key not found in config")
+
+    def __setitem__(self, key: str, value: Any):
+        """
+        Sets an arguments value in config
+        after checking if defined/supported.
+
+        Parameters
+        ----------
+        key : str
+            The name of the argument to the perf_analyzer
+        value : (any)
+            The value to which the argument is being set
+
+        Raises
+        ------
+        PerfAnalyzerException
+            If key is unsupported or undefined in the
+            config class
+        """
+
+        if key in self._args:
+            self._args[key] = value
+        elif key in self._multiple_args:
+            self._multiple_args[key].append(value)
+        elif key in self._input_to_options:
+            self._options[self._input_to_options[key]] = value
+        elif key in self._input_to_verbose:
+            self._verbose[self._input_to_verbose[key]] = value
+        else:
+            raise PerfAnalyzerException(
+                f"The argument '{key}' to the perf_analyzer " "is not supported by the model analyzer."
+            )

+ 183 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/runner.py

@@ -0,0 +1,183 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import csv
+import logging
+import os
+import pathlib
+import sys
+from distutils.version import LooseVersion
+from typing import Dict, List, Optional
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from ...core import EvaluationMode, MeasurementMode, OfflineMode
+from ...report import save_results, show_results, sort_results
+from ...utils import log_dict, parse_server_url
+from .perf_analyzer import PerfAnalyzer
+from .perf_config import PerfAnalyzerConfig
+
+if LooseVersion(sys.version) >= LooseVersion("3.8.0"):
+    from importlib.metadata import version
+
+    TRITON_CLIENT_VERSION = LooseVersion(version("tritonclient"))
+else:
+    import pkg_resources
+
+    TRITON_CLIENT_VERSION = LooseVersion(pkg_resources.get_distribution("tritonclient").version)
+
+LOGGER = logging.getLogger("triton_performance_runner.perf_analyzer")
+
+
+class PerfAnalyzerRunner:
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        input_data: str,
+        input_shapes: List[str],
+        batch_sizes: List[int],
+        concurrency: List[int],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        evaluation_mode: EvaluationMode,
+        offline_mode: OfflineMode,
+        result_path: pathlib.Path,
+        output_shared_memory_size: int = 102400,
+        timeout: Optional[int] = None,
+        verbose: bool = False,
+    ):
+        log_dict(
+            "Selected configuration",
+            {
+                "server_url": server_url,
+                "model_name": model_name,
+                "input_data": input_data,
+                "input_shapes": input_shapes,
+                "batch_sizes": batch_sizes,
+                "concurrency": concurrency,
+                "measurement_mode": measurement_mode,
+                "measurement_interval": measurement_interval,
+                "measurement_request_count": measurement_request_count,
+                "evaluation_mode": evaluation_mode,
+                "offline_mode": offline_mode,
+                "output_shared_memory_size": output_shared_memory_size,
+                "result_path": result_path,
+                "timeout": timeout,
+                "verbose": verbose,
+            },
+        )
+
+        if result_path.suffix != ".csv":
+            raise ValueError(
+                "Results path for Perf Analyzer is invalid. Please, provide the CSV file name. Example: results.csv"
+            )
+
+        self._server_url = server_url
+        self._model_name = model_name
+        self._input_data = input_data
+        self._input_shapes = input_shapes
+        self._batch_sizes = batch_sizes
+        self._concurrency = concurrency
+        self._measurement_mode = measurement_mode
+        self._measurement_interval = measurement_interval
+        self._measurement_request_count = measurement_request_count
+        self._evaluation_mode = evaluation_mode
+        self._offline_mode = offline_mode
+        self._result_path = result_path
+        self._output_shared_memory_size = output_shared_memory_size
+        self._timeout = timeout
+        self._verbose = verbose
+
+        self._protocol, self._host, self._port = parse_server_url(server_url)
+
+    def run(self):
+
+        results: List[Dict] = []
+        for batch_size in self._batch_sizes:
+            for concurrency in self._concurrency:
+                performance_partial_file = (
+                    f"{self._evaluation_mode.value.lower()}_partial_{batch_size}_{concurrency}.csv"
+                )
+
+                params = {
+                    "model-name": self._model_name,
+                    "model-version": 1,
+                    "batch-size": batch_size,
+                    "url": f"{self._host}:{self._port}",
+                    "protocol": self._protocol.value,
+                    "input-data": self._input_data,
+                    "measurement-interval": self._measurement_interval,
+                    "concurrency-range": f"{concurrency}:{concurrency}:1",
+                    "latency-report-file": performance_partial_file,
+                }
+
+                if self._verbose:
+                    params["extra-verbose"] = True
+
+                if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"):
+                    params["measurement-mode"] = self._measurement_mode.value
+                    params["measurement-request-count"] = self._measurement_request_count
+
+                if self._evaluation_mode == EvaluationMode.OFFLINE:
+                    params["shared-memory"] = self._offline_mode.value
+                    params["output-shared-memory-size"] = self._output_shared_memory_size
+
+                if self._verbose:
+                    log_dict(
+                        f"Perf Analyzer config for batch_size: {batch_size} and concurrency: {concurrency}", params
+                    )
+
+                config = PerfAnalyzerConfig()
+                for param, value in params.items():
+                    config[param] = value
+
+                for shape in self._input_shapes:
+                    config["shape"] = shape
+
+                perf_analyzer = PerfAnalyzer(config=config, timeout=self._timeout)
+                perf_analyzer.run()
+                self._update_performance_data(results, batch_size, performance_partial_file)
+                os.remove(performance_partial_file)
+
+        results = sort_results(results=results)
+
+        save_results(filename=self._result_path.as_posix(), data=results)
+        show_results(results=results)
+
+    def _calculate_average_latency(self, r):
+        avg_sum_fields = [
+            "Client Send",
+            "Network+Server Send/Recv",
+            "Server Queue",
+            "Server Compute",
+            "Server Compute Input",
+            "Server Compute Infer",
+            "Server Compute Output",
+            "Client Recv",
+        ]
+        avg_latency = sum(int(r.get(f, 0)) for f in avg_sum_fields)
+
+        return avg_latency
+
+    def _update_performance_data(self, results: List, batch_size: int, performance_partial_file: str):
+        row: Dict = {"Batch": batch_size}
+        with open(performance_partial_file) as csvfile:
+            reader = csv.DictReader(csvfile)
+            for r in reader:
+                avg_latency = self._calculate_average_latency(r)
+                row = {**row, **r, "avg latency": avg_latency}
+                results.append(row)

+ 99 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/perf_analyzer/warmup.py

@@ -0,0 +1,99 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import pathlib
+from distutils.version import LooseVersion
+from importlib.metadata import version
+from typing import List, Optional
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from ...core import EvaluationMode, MeasurementMode, OfflineMode
+from ...utils import parse_server_url
+from .perf_analyzer import PerfAnalyzer
+from .perf_config import PerfAnalyzerConfig
+
+LOGGER = logging.getLogger("warmup")
+
+TRITON_CLIENT_VERSION = LooseVersion(version("tritonclient"))
+
+
+class PerfAnalyzerWarmupRunner:
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        batch_sizes: List[int],
+        concurrency: List[int],
+        input_data: str,
+        input_shapes: List[str],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        offline_mode: OfflineMode,
+        evaluation_mode: EvaluationMode,
+        output_shared_memory_size: int,
+        timeout: Optional[int],
+    ):
+        self._model_name = model_name
+        self._input_data = input_data
+        self._input_shapes = input_shapes
+        self._measurement_mode = measurement_mode
+        self._offline_mode = offline_mode
+        self._evaluation_mode = evaluation_mode
+        self._output_shared_memory_size = output_shared_memory_size
+
+        self._protocol, self._host, self._port = parse_server_url(server_url)
+
+        self._measurement_interval = 2 * measurement_interval
+        self._measurement_request_count = 2 * measurement_request_count
+
+        self._batch_sizes = [min(batch_sizes)]
+        self._concurrency = [max(concurrency)]
+        self._timeout = timeout
+
+    def run(self):
+        for batch_size in self._batch_sizes:
+            for concurrency in self._concurrency:
+                params = {
+                    "model-name": self._model_name,
+                    "model-version": 1,
+                    "batch-size": batch_size,
+                    "url": f"{self._host}:{self._port}",
+                    "protocol": self._protocol.value,
+                    "input-data": self._input_data,
+                    "measurement-interval": self._measurement_interval,
+                    "concurrency-range": f"{concurrency}:{concurrency}:1",
+                    "verbose": True,
+                }
+
+                if TRITON_CLIENT_VERSION >= LooseVersion("2.11.0"):
+                    params["measurement-mode"] = self._measurement_mode.value
+                    params["measurement-request-count"] = self._measurement_request_count
+
+                if self._evaluation_mode == EvaluationMode.OFFLINE:
+                    params["shared-memory"] = self._offline_mode.value
+                    params["output-shared-memory-size"] = self._output_shared_memory_size
+
+                config = PerfAnalyzerConfig()
+                for param, value in params.items():
+                    config[param] = value
+
+                for shape in self._input_shapes:
+                    config["shape"] = shape
+
+                perf_analyzer = PerfAnalyzer(config=config, timeout=self._timeout)
+                perf_analyzer.run()

+ 117 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/triton_performance_runner/runner.py

@@ -0,0 +1,117 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# method from PEP-366 to support relative import in executed modules
+import logging
+import pathlib
+from typing import List, Optional
+
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from ..core import EvaluationMode, MeasurementMode, OfflineMode, PerformanceTool
+from .model_analyzer import ModelAnalyzerRunner
+from .perf_analyzer import PerfAnalyzerRunner, PerfAnalyzerWarmupRunner
+
+LOGGER = logging.getLogger("triton_performance_runner")
+
+
+class TritonPerformanceRunner:
+    def __init__(
+        self,
+        server_url: str,
+        model_name: str,
+        input_data: str,
+        input_shapes: List[str],
+        batch_sizes: List[int],
+        concurrency: List[int],
+        measurement_mode: MeasurementMode,
+        measurement_interval: int,
+        measurement_request_count: int,
+        evaluation_mode: EvaluationMode,
+        offline_mode: OfflineMode,
+        output_shared_memory_size: int,
+        performance_tool: PerformanceTool,
+        model_repository: str,
+        result_path: pathlib.Path,
+        warmup: bool,
+        timeout: Optional[int],
+        verbose: bool,
+    ):
+
+        self._warmup_runner = None
+        if warmup:
+            LOGGER.info("Running warmup before the main test")
+            self._warmup_runner = PerfAnalyzerWarmupRunner(
+                server_url=server_url,
+                model_name=model_name,
+                input_data=input_data,
+                input_shapes=input_shapes,
+                batch_sizes=batch_sizes,
+                concurrency=concurrency,
+                measurement_mode=measurement_mode,
+                measurement_interval=measurement_interval,
+                measurement_request_count=measurement_request_count,
+                evaluation_mode=evaluation_mode,
+                offline_mode=offline_mode,
+                output_shared_memory_size=output_shared_memory_size,
+                timeout=timeout,
+            )
+
+        if performance_tool == PerformanceTool.MODEL_ANALYZER:
+            LOGGER.info("Using Model Analyzer for performance evaluation")
+            self._runner = ModelAnalyzerRunner(
+                server_url=server_url,
+                model_name=model_name,
+                input_data=input_data,
+                input_shapes=input_shapes,
+                batch_sizes=batch_sizes,
+                concurrency=concurrency,
+                measurement_mode=measurement_mode,
+                measurement_interval=measurement_interval,
+                measurement_request_count=measurement_request_count,
+                evaluation_mode=evaluation_mode,
+                offline_mode=offline_mode,
+                output_shared_memory_size=output_shared_memory_size,
+                model_repository=model_repository,
+                result_path=result_path,
+                timeout=timeout,
+                verbose=verbose,
+            )
+        elif performance_tool == PerformanceTool.PERF_ANALYZER:
+            LOGGER.info("Using Perf Analyzer for performance evaluation")
+            self._runner = PerfAnalyzerRunner(
+                server_url=server_url,
+                model_name=model_name,
+                input_data=input_data,
+                input_shapes=input_shapes,
+                batch_sizes=batch_sizes,
+                measurement_mode=measurement_mode,
+                measurement_interval=measurement_interval,
+                measurement_request_count=measurement_request_count,
+                concurrency=concurrency,
+                evaluation_mode=evaluation_mode,
+                offline_mode=offline_mode,
+                output_shared_memory_size=output_shared_memory_size,
+                result_path=result_path,
+                timeout=timeout,
+                verbose=verbose,
+            )
+        else:
+            raise ValueError(f"Unsupported performance tool {performance_tool}")
+
+    def run(self):
+        if self._warmup_runner:
+            self._warmup_runner.run()
+
+        self._runner.run()

+ 64 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/deployment_toolkit/utils.py

@@ -0,0 +1,64 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from enum import Enum
+from typing import Any, Dict, Tuple
+
+LOGGER = logging.getLogger(__name__)
+
+
+class TritonClientProtocol(Enum):
+    """Describe protocol with which client communicates with Triton"""
+
+    GRPC = "grpc"
+    HTTP = "http"
+
+
+def parse_server_url(server_url: str) -> Tuple[TritonClientProtocol, str, int]:
+    DEFAULT_PORTS = {
+        TritonClientProtocol.HTTP: 8000,
+        TritonClientProtocol.GRPC: 8001,
+    }
+
+    # extract protocol
+    server_url_items = server_url.split("://")
+    if len(server_url_items) != 2:
+        raise ValueError("Prefix server_url with protocol ex.: grpc://127.0.0.1:8001")
+    requested_protocol, server_url = server_url_items
+    requested_protocol = TritonClientProtocol(requested_protocol.lower())
+
+    if requested_protocol not in DEFAULT_PORTS:
+        raise ValueError(f"Unsupported protocol: {requested_protocol}")
+
+    # extract host and port
+    default_port = DEFAULT_PORTS[requested_protocol]
+    server_url_items = server_url.split(":")
+    if len(server_url_items) == 1:
+        host, port = server_url, default_port
+    elif len(server_url_items) == 2:
+        host, port = server_url_items
+        port = int(port)
+        if port != default_port:
+            LOGGER.warning(
+                f"Current server URL is {server_url} while default {requested_protocol} port is {default_port}"
+            )
+    else:
+        raise ValueError(f"Could not parse {server_url}. Example of correct server URL: grpc://127.0.0.1:8001")
+    return requested_protocol, host, port
+
+
+def log_dict(title: str, dict_: Dict[str, Any]):
+    LOGGER.info(title)
+    for key, value in dict_.items():
+        LOGGER.info(f"\t{key} = {value}")

+ 156 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/export_model.py

@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+import os
+from pathlib import Path
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1"
+
+# method from PEP-366 to support relative import in executed modules
+if __name__ == "__main__" and __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from .deployment_toolkit.args import ArgParserGenerator  # noqa: E402  module level import not at top of file
+from .deployment_toolkit.core import (  # noqa: E402  module level import not at top of file
+    DATALOADER_FN_NAME,
+    BaseLoader,
+    BaseSaver,
+    ExportFormat,
+    ModelInputType,
+    TorchJit,
+    load_from_file,
+)
+from .deployment_toolkit.extensions import loaders, savers  # noqa: E402  module level import not at top of file
+
LOGGER = logging.getLogger("export_model")

# Source model formats accepted by --input-type.
INPUT_MODEL_TYPES = [
    ModelInputType.TF_ESTIMATOR,
    ModelInputType.TF_KERAS,
    ModelInputType.PYT,
]

# Target export formats accepted by --output-type.
OUTPUT_MODEL_TYPES = [
    ExportFormat.TF_SAVEDMODEL,
    ExportFormat.TORCHSCRIPT,
    ExportFormat.ONNX,
]

# TorchScript flavours accepted by --torch-jit (only meaningful for PyTorch inputs).
TORCH_JIT_TYPES = [
    TorchJit.NONE,
    TorchJit.TRACE,
    TorchJit.SCRIPT,
]
+
+
def _get_args():
    """Build the export CLI, extend it with loader/saver/dataloader options, and parse argv.

    A first pass with parse_known_args obtains the loader/saver/dataloader
    selections, whose modules may register additional CLI options before the
    final parse.
    """
    parser = argparse.ArgumentParser(
        description="Script for exporting models from supported frameworks.", allow_abbrev=False
    )
    parser.add_argument("--input-path", help="Path to input python module", required=True)
    parser.add_argument(
        "--input-type", help="Input model type", choices=[f.value for f in INPUT_MODEL_TYPES], required=True
    )
    parser.add_argument("--output-path", help="Path to output model file", required=True)
    parser.add_argument(
        "--output-type", help="Output model type", choices=[f.value for f in OUTPUT_MODEL_TYPES], required=True
    )
    parser.add_argument(
        "--torch-jit",
        help="Torch Jit",
        choices=[f.value for f in TORCH_JIT_TYPES],
        required=False,
        default=None,
    )
    parser.add_argument("--dataloader", help="Path to python module containing data loader")
    parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
    parser.add_argument(
        "--ignore-unknown-parameters",
        help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)",
        action="store_true",
        default=False,
    )

    known_args, _ = parser.parse_known_args()

    loader_cls: BaseLoader = loaders.get(known_args.input_type)
    ArgParserGenerator(loader_cls, module_path=known_args.input_path).update_argparser(parser)

    # PyTorch -> ONNX needs a dedicated saver; other combinations map 1:1 to the output type.
    if known_args.input_type == ModelInputType.PYT.value and known_args.output_type == ExportFormat.ONNX.value:
        saver_type = f"{ModelInputType.PYT.value}--{ExportFormat.ONNX.value}"
    else:
        saver_type = known_args.output_type
    saver_cls: BaseSaver = savers.get(saver_type)
    ArgParserGenerator(saver_cls).update_argparser(parser)

    if known_args.dataloader is not None:
        get_dataloader_fn = load_from_file(known_args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
        ArgParserGenerator(get_dataloader_fn).update_argparser(parser)

    if known_args.ignore_unknown_parameters:
        args, unknown_args = parser.parse_known_args()
        LOGGER.warning(f"Got additional args {unknown_args}")
    else:
        args = parser.parse_args()
    return args
+
+
def main():
    """Export a model: load it via the configured loader and persist it with the matching saver.

    Reads the CLI configuration from _get_args(), optionally builds a
    dataloader from the pointed module, loads the model in the requested
    input format and saves it in the requested output format.
    """
    args = _get_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    logging.basicConfig(level=log_level, format=log_format)

    LOGGER.info("args:")
    for key, value in vars(args).items():
        LOGGER.info(f"    {key} = {value}")

    dataloader_fn = None
    if args.dataloader is not None:
        get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
        dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)

    Loader: BaseLoader = loaders.get(args.input_type)
    loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)

    # Fixed: leftover debug print() statements replaced with structured debug logging.
    LOGGER.debug(
        "input_path=%s exists=%s output_type=%s",
        args.input_path,
        os.path.isfile(args.input_path),
        args.output_type,
    )
    model = loader.load(
        args.input_path,
        dataloader_fn=dataloader_fn,
        output_type=args.output_type,
        torch_jit=args.torch_jit,
    )

    LOGGER.info("inputs: %s", model.inputs)
    LOGGER.info("outputs: %s", model.outputs)

    # PyTorch -> ONNX needs a dedicated saver; other combinations map 1:1 to the output type.
    if args.input_type == ModelInputType.PYT.value and args.output_type == ExportFormat.ONNX.value:
        saver_type = f"{ModelInputType.PYT.value}--{ExportFormat.ONNX.value}"
    else:
        saver_type = args.output_type

    Saver: BaseSaver = savers.get(saver_type)
    saver = ArgParserGenerator(Saver).from_args(args)
    saver.save(model, args.output_path, dataloader_fn)


if __name__ == "__main__":
    main()

+ 89 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/metrics.py

@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import tensorflow as tf
+from triton.deployment_toolkit.core import BaseMetricsCalculator
+
+
class MetricsCalculator(BaseMetricsCalculator):
    """Accumulates predictions across batches and computes MAP@12 grouped by display id."""

    def __init__(self, *, output_used_for_metrics: str):
        # Name of the model output (and matching label key) the metric is computed on.
        self.output_used_for_metrics = output_used_for_metrics
        # Accumulators; lazily initialized on the first update() call.
        self._ids = None
        self._y_pred = None
        self._y_real = None

    def update(
        self,
        ids: List[Any],
        y_pred: Dict[str, np.ndarray],
        x: Optional[Dict[str, np.ndarray]],
        y_real: Optional[Dict[str, np.ndarray]],
    ):
        """Append one batch of ids, predictions and labels to the accumulators.

        `x` is accepted for interface compatibility but unused.
        """
        y_real = y_real[self.output_used_for_metrics]
        y_pred = y_pred[self.output_used_for_metrics]

        def _concat_batches(b1, b2):
            # First batch initializes the accumulator; later batches are concatenated along axis 0.
            if b1 is None:
                return b2
            else:
                return np.concatenate([b1, b2], axis=0)

        self._ids = _concat_batches(self._ids, ids)
        self._y_real = _concat_batches(self._y_real, y_real)
        self._y_pred = _concat_batches(self._y_pred, y_pred)

    @property
    def metrics(self) -> Dict[str, Any]:
        """Return the metric dict computed over everything accumulated so far."""
        metrics = {"map12": self.get_map12(self._ids, self._y_pred, self._y_real)}

        return metrics

    def get_map12(self, ids, y_pred, y_real):
        """Compute Mean Average Precision at 12 over rows grouped by display id.

        ids/y_pred/y_real are flattened to equal-length 1-D tensors; rows
        sharing a display id form one ranked candidate list.
        NOTE(review): `pad_length = 30 - max(group size)` assumes at most 30
        ads per display id - confirm against the dataset contract.
        """
        with tf.device("/cpu:0"):
            predictions = tf.reshape(y_pred, [-1])
            predictions = tf.cast(predictions, tf.float64)
            display_ids = tf.reshape(ids, [-1])
            labels = tf.reshape(y_real, [-1])
            # Sort every row by display id so each group becomes contiguous.
            sorted_ids = tf.argsort(display_ids)
            display_ids = tf.gather(display_ids, indices=sorted_ids)
            predictions = tf.gather(predictions, indices=sorted_ids)
            labels = tf.gather(labels, indices=sorted_ids)
            _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(display_ids, out_idx=tf.int64)
            pad_length = 30 - tf.reduce_max(display_ids_ads_count)
            # One row per display id, zero-padded to the largest group size.
            preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
            labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()

            # Keep only display ids that contain at least one positive label.
            labels_mask = tf.math.reduce_max(labels, 1)
            preds_masked = tf.boolean_mask(preds, labels_mask)
            labels_masked = tf.boolean_mask(labels, labels_mask)
            labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
            labels_masked = tf.reshape(labels_masked, [-1, 1])

            # Pad so top_k(12) is always valid, then locate the positive within the top 12.
            preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
            _, predictions_idx = tf.math.top_k(preds_masked, 12)
            indices = tf.math.equal(predictions_idx, labels_masked)
            indices_mask = tf.math.reduce_any(indices, 1)
            masked_indices = tf.boolean_mask(indices, indices_mask)

            # Each hit contributes 1 / (rank of the positive within the top-12).
            res = tf.argmax(masked_indices, axis=1)
            ap_matrix = tf.divide(1, tf.add(res, 1))
            ap_sum = tf.reduce_sum(ap_matrix)
            shape = tf.cast(tf.shape(indices)[0], tf.float64)

            return (ap_sum / shape).numpy()

+ 73 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/model.py

@@ -0,0 +1,73 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from types import SimpleNamespace
+from typing import List
+
+import tensorflow as tf
+from trainer.model.widedeep import wide_deep_model
+
+
def update_argparser(parser):
    """Register the Wide&Deep model-construction flags on *parser*."""
    parser.add_argument(
        "--deep-hidden-units",
        type=int,
        nargs="+",
        default=[1024, 1024, 1024, 1024, 1024],
        help='Hidden units per layer for deep model, separated by spaces',
    )
    parser.add_argument(
        "--deep-dropout",
        type=float,
        default=0.1,
        help='Dropout regularization for deep model',
    )
    parser.add_argument(
        "--combiner",
        type=str,
        choices=['mean', 'sum'],
        default='sum',
        help='Type of aggregation used for multi hot categorical features',
    )
    parser.add_argument(
        "--precision",
        type=str,
        choices=['fp32', 'fp16'],
        default="fp16",
        help='Precision of the ops. AMP will be used in case of fp16',
    )
    parser.add_argument(
        "--checkpoint-dir",
        type=str,
        required=True,
        help='Path to directory containing checkpoint',
    )
+
+
def get_model(
        *,
        deep_hidden_units: List[int],
        deep_dropout: float,
        combiner: str,
        checkpoint_dir: str,
        precision: str = "fp32",
        batch_size: int = 131072
):
    """Build the Wide&Deep Keras model and restore its weights from a checkpoint.

    Returns a ``(model, call_fn)`` pair where ``call_fn`` is a tf.function
    wrapper suitable for tracing/export.

    NOTE(review): `precision` and `batch_size` are accepted but never used in
    this function body - confirm whether the deployment toolkit requires them
    in the signature before removing.
    """
    args = {
        'deep_hidden_units': deep_hidden_units,
        'deep_dropout': deep_dropout,
        'combiner': combiner
    }

    # wide_deep_model expects an argparse-like namespace.
    args = SimpleNamespace(**args)

    model, features = wide_deep_model(args)

    # Restore model weights only; expect_partial() silences warnings about
    # checkpoint entries (e.g. optimizer slots) not restored here.
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    # Re-wrap as a functional Keras model with explicit inputs/outputs for export.
    inputs = features.values()
    outputs = model(features, training=False)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    @tf.function
    def call_fn(*model_inputs):
        # Inference-only entry point; positional inputs follow features.values() order.
        return model(model_inputs, training=False)

    return model, call_fn
+
+
if __name__ == '__main__':
    # Manual smoke test: build the model from a local checkpoint with default hyper-parameters.
    get_model(deep_hidden_units=[1024, 1024, 1024, 1024, 1024], deep_dropout=0.1, combiner='sum',
              checkpoint_dir='/tmp/wd2/checkpoint')

TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_a30_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx-1_(1x_v100_32gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_dgx_a100_(1x_a100_80gb)_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_offline_10/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_10_triton_performance_online_10/plots/latency_vs_concurrency.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/latency_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_offline_6/plots/throughput_vs_batch.png


TEMPAT SAMPAH
TensorFlow2/Recommendation/WideAndDeep/triton/reports/nvidia_t4_experiment_6_triton_performance_online_6/plots/latency_vs_concurrency.png


+ 25 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/requirements.txt

@@ -0,0 +1,25 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+model_navigator[tf] @ git+https://github.com/triton-inference-server/[email protected]#egg=model_navigator
+natsort>=7.0.0
+networkx==2.5
+numpy
+onnx>=1.8.0,<1.9.0
+onnxruntime-gpu==1.8.1
+pycuda>=2019.1.2
+PyYAML>=5.2
+tabulate>=0.8.7
+tf2onnx>=1.9.0,<1.10.0
+tqdm>=4.44.1

+ 140 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/run_inference_on_fw.py

@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+To infer the model on framework runtime, you can use `run_inference_on_fw.py` script.
+It infers data obtained from pointed data loader locally and saves received data into dump files.
+Those files are stored in directory pointed by `--output-dir` argument.
+
+Example call:
+
+```shell script
+python ./triton/run_inference_on_fw.py \
+    --input-path /models/exported/model.onnx \
+    --input-type onnx \
+    --dataloader triton/dataloader.py \
+    --data-dir /data/imagenet \
+    --batch-size 32 \
+    --output-dir /results/dump_local \
+    --dump-labels
+```
+"""
+
+import argparse
+import logging
+import os
+from pathlib import Path
+
+from tqdm import tqdm
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "0"
+
+
+from .deployment_toolkit.args import ArgParserGenerator  # noqa: E402  module level import not at top of file
+from .deployment_toolkit.core import (  # noqa: E402  module level import not at top of file
+    DATALOADER_FN_NAME,
+    BaseLoader,
+    BaseRunner,
+    load_from_file,
+)
+from .deployment_toolkit.dump import JsonDumpWriter  # noqa: E402  module level import not at top of file
+from .deployment_toolkit.extensions import loaders, runners  # noqa: E402  module level import not at top of file
+
+LOGGER = logging.getLogger("run_inference_on_fw")
+
+
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
    """Assemble the dump payload; inputs/labels are attached only when requested via CLI flags.

    Raises ValueError when --dump-labels is set but no labels were provided.
    """
    payload = {"outputs": y_pred, "ids": {"ids": ids}}
    if args.dump_inputs:
        payload["inputs"] = x
    if args.dump_labels:
        if not y_real:
            raise ValueError(
                "Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
            )
        payload["labels"] = y_real
    return payload
+
+
def _parse_and_validate_args():
    """Parse CLI args, extending the parser with dataloader/loader/runner options.

    Calls parser.error() when a model type that requires explicit I/O
    descriptions is used without --inputs/--outputs.
    """
    supported_inputs = set(runners.supported_extensions) & set(loaders.supported_extensions)

    parser = argparse.ArgumentParser(description="Dump local inference output of given model", allow_abbrev=False)
    parser.add_argument("--input-path", help="Path to input model", required=True)
    parser.add_argument("--input-type", help="Input model type", choices=supported_inputs, required=True)
    parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
    parser.add_argument("--output-dir", help="Path to dir where output files will be stored", required=True)
    parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
    parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)

    # First pass obtains the module selections, which may register extra CLI options.
    args, *_ = parser.parse_known_args()

    get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
    ArgParserGenerator(get_dataloader_fn).update_argparser(parser)

    Loader: BaseLoader = loaders.get(args.input_type)
    ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)

    Runner: BaseRunner = runners.get(args.input_type)
    ArgParserGenerator(Runner).update_argparser(parser)

    args = parser.parse_args()

    # No input types currently require explicit --inputs/--outputs; kept for future formats.
    types_requiring_io_params = []

    # Fixed: original referenced the non-existent attribute `args.outptputs` (typo),
    # which would raise AttributeError as soon as this validation was enabled.
    if args.input_type in types_requiring_io_params and not all(p for p in [args.inputs, args.outputs]):
        parser.error(f"For {args.input_type} input provide --inputs and --outputs parameters")

    return args
+
+
def main():
    """Run local framework inference over the dataloader and dump results as JSON files."""
    args = _parse_and_validate_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    logging.basicConfig(level=log_level, format=log_format)

    LOGGER.info("args:")
    for key, value in vars(args).items():
        LOGGER.info(f"    {key} = {value}")

    Loader: BaseLoader = loaders.get(args.input_type)
    Runner: BaseRunner = runners.get(args.input_type)

    loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
    runner = ArgParserGenerator(Runner).from_args(args)
    LOGGER.info(f"Loading {args.input_path}")
    model = loader.load(args.input_path)

    # Inference session and dump writer share one lifetime; both release on exit.
    with runner.init_inference(model=model) as session, JsonDumpWriter(args.output_dir) as writer:
        get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
        dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
        LOGGER.info("Data loader initialized; Running inference")
        for ids, x, y_real in tqdm(dataloader_fn(), unit="batch", mininterval=10):
            predictions = session(x)
            payload = _verify_and_format_dump(args, ids=ids, x=x, y_pred=predictions, y_real=y_real)
            writer.write(**payload)
        LOGGER.info("Inference finished")


if __name__ == "__main__":
    main()

+ 146 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/run_inference_on_triton.py

@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+To infer the model deployed on Triton, you can use `run_inference_on_triton.py` script.
+It sends a request with data obtained from pointed data loader and dumps received data into dump files.
+Those files are stored in directory pointed by `--output-dir` argument.
+
+Currently, the client communicates with the Triton server asynchronously using GRPC protocol.
+
+Example call:
+
+```shell script
+python ./triton/run_inference_on_triton.py \
+    --server-url localhost:8001 \
+    --model-name ResNet50 \
+    --model-version 1 \
+    --dump-labels \
+    --output-dir /results/dump_triton
+```
+"""
+
+import argparse
+import logging
+import time
+import traceback
+from pathlib import Path
+
+from tqdm import tqdm
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = Path(__file__).parent.name
+
+from .deployment_toolkit.args import ArgParserGenerator
+from .deployment_toolkit.core import DATALOADER_FN_NAME, load_from_file
+from .deployment_toolkit.dump import JsonDumpWriter
+from .deployment_toolkit.triton_inference_runner import TritonInferenceRunner
+
+LOGGER = logging.getLogger("run_inference_on_triton")
+
+
def _parse_args():
    """Build and parse CLI arguments, extending the parser with dataloader options."""
    parser = argparse.ArgumentParser(description="Infer model on Triton server", allow_abbrev=False)
    parser.add_argument(
        "--server-url", type=str, default="localhost:8001", help="Inference server URL (default localhost:8001)"
    )
    parser.add_argument("--model-name", help="The name of the model used for inference.", required=True)
    parser.add_argument("--model-version", help="The version of the model used for inference.", required=True)
    parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
    parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
    parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
    # Fixed: default was True, which made the store_true flag a no-op and forced
    # DEBUG logging unconditionally; now consistent with the sibling triton scripts.
    parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
    parser.add_argument("--output-dir", required=True, help="Path to directory where outputs will be saved")
    parser.add_argument(
        "--response-wait-time", required=False, help="Maximal time to wait for response", default=120, type=float
    )
    parser.add_argument(
        "--max-unresponded-requests",
        required=False,
        help="Maximal number of unresponded requests",
        default=128,
        type=int,
    )
    parser.add_argument(
        "--synchronous", help="Enable synchronous calls to Triton Server", action="store_true", default=False
    )

    # First pass obtains --dataloader, whose module may register extra CLI options.
    args, *_ = parser.parse_known_args()

    get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
    ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
    args = parser.parse_args()

    return args
+
+
def main():
    """Run inference against a Triton server and dump results as JSON files.

    Streams batches from the configured dataloader through a
    TritonInferenceRunner and writes ids/outputs (and optionally
    inputs/labels, per CLI flags) into --output-dir.
    """
    args = _parse_args()

    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    log_level = logging.INFO if not args.verbose else logging.DEBUG
    logging.basicConfig(level=log_level, format=log_format)

    LOGGER.info("args:")
    for key, value in vars(args).items():
        LOGGER.info(f"    {key} = {value}")

    get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
    dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)

    try:
        runner = TritonInferenceRunner(
            server_url=args.server_url,
            model_name=args.model_name,
            model_version=args.model_version,
            dataloader_fn=dataloader_fn,
            verbose=False,
            response_wait_time=args.response_wait_time,
            max_unresponded_requests=args.max_unresponded_requests,
            synchronous=args.synchronous,
        )

    except Exception as e:
        # Log the full traceback before propagating - runner construction failures
        # (connection refused, model not found) would otherwise be hard to diagnose.
        message = traceback.format_exc()
        LOGGER.error(f"Encountered exception \n{message}")
        raise e

    with JsonDumpWriter(output_dir=args.output_dir) as writer:
        start = time.time()
        for ids, x, y_pred, y_real in tqdm(runner, unit="batch", mininterval=10):
            data = _verify_and_format_dump(args, ids, x, y_pred, y_real)
            writer.write(**data)
        stop = time.time()

    LOGGER.info(f"\nThe inference took {stop - start:0.3f}s")
+
+
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
    """Assemble the dump payload; inputs/labels are attached only when requested via CLI flags.

    Raises ValueError when --dump-labels is set but no labels were provided.
    """
    payload = {"outputs": y_pred, "ids": {"ids": ids}}
    if args.dump_inputs:
        payload["inputs"] = x
    if args.dump_labels:
        if not y_real:
            raise ValueError(
                "Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
            )
        payload["labels"] = y_real
    return payload


if __name__ == "__main__":
    main()

+ 196 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/run_performance_on_triton.py

@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import pathlib
+
+# method from PEP-366 to support relative import in executed modules
+if __package__ is None:
+    __package__ = pathlib.Path(__file__).parent.name
+
+from .deployment_toolkit.core import EvaluationMode, MeasurementMode, OfflineMode, PerformanceTool
+from .deployment_toolkit.triton_performance_runner import TritonPerformanceRunner
+
+LOGGER = logging.getLogger("run_performance_on_triton")
+
+
def main():
    """CLI entry point: configure logging and launch a Triton performance measurement.

    All options are forwarded to TritonPerformanceRunner, which drives either
    Model Analyzer or Perf Analyzer against the given model and server.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-name",
        type=str,
        required=True,
        help="Name of the model to test",
    )
    parser.add_argument(
        "--result-path",
        type=pathlib.Path,
        required=True,
        help="Path where results files is stored.",
    )
    parser.add_argument(
        "--server-url",
        type=str,
        default="http://127.0.0.1:8000",
        help="Url to Triton server",
    )

    parser.add_argument(
        "--model-version",
        type=str,
        default=1,
        help="Version of model",
    )
    parser.add_argument(
        "--input-data",
        type=str,
        default="random",
        help="Input data to perform profiling.",
    )
    parser.add_argument(
        "--input-shapes",
        action="append",
        help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        default=[1],
        help="List of batch sizes to tests.",
        nargs="*",
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        default=[1],
        help="List of concurrency modes.",
        nargs="*",
    )
    parser.add_argument(
        "--measurement-mode",
        choices=[item.value for item in MeasurementMode],
        default=MeasurementMode.COUNT_WINDOWS.value,
        type=str,
        help="Select measurement mode "
        "'time_windows' stabilize performance on measurement window. "
        "'count_windows' stabilize performance on number of samples.",
    )
    parser.add_argument(
        "--measurement-interval",
        help="Time window perf_analyzer will wait to stabilize the measurement",
        default=5000,
        type=int,
    )
    parser.add_argument(
        "--measurement-request-count",
        help="Number of samples on which perf_analyzer will stabilize the measurement",
        default=50,
        type=int,
    )
    parser.add_argument(
        "--evaluation-mode",
        choices=[item.value for item in EvaluationMode],
        default=EvaluationMode.OFFLINE.value,
        type=str,
        help="Select evaluation mode "
        "'offline' run offline analysis and use GPU memory to pass tensors. "
        "'online' run online analysis and use HTTP protocol.",
    )
    parser.add_argument(
        "--offline-mode",
        choices=[item.value for item in OfflineMode],
        default=OfflineMode.SYSTEM.value,
        type=str,
        help="Select offline mode "
        "'system' pass tensors through CPU RAM memory. "
        "'cuda' pass tensors through GPU RAM memory.",
    )
    parser.add_argument(
        "--output-shared-memory-size",
        default=102400,
        type=int,
        help="Size of memory buffer allocated for output with dynamic shapes in bytes. "
        "Has to be equal to maximal size of output tensor.",
    )
    parser.add_argument(
        "--performance-tool",
        choices=[item.value for item in PerformanceTool],
        default=PerformanceTool.MODEL_ANALYZER.value,
        type=str,
        help="Select performance tool for measurement mode "
        "'model_analyzer' use Model Analyzer "
        "'perf_analyzer' use Perf Analyzer",
    )
    parser.add_argument(
        "--model-repository",
        default=None,
        type=str,
        help="Path to model repository. Valid when using Model Analyzer",
    )
    parser.add_argument(
        "--warmup",
        help="Enable model warmup before performance test",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--timeout",
        help="Timeout for performance analysis",
        type=int,
        default=None,
        required=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Verbose logs",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    log_level = logging.INFO if not args.verbose else logging.DEBUG
    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    logging.basicConfig(level=log_level, format=log_format)

    # String CLI choices are converted back to their enum types for the runner.
    runner = TritonPerformanceRunner(
        server_url=args.server_url,
        model_name=args.model_name,
        input_data=args.input_data,
        input_shapes=args.input_shapes or [],
        batch_sizes=args.batch_sizes,
        measurement_mode=MeasurementMode(args.measurement_mode),
        measurement_interval=args.measurement_interval,
        measurement_request_count=args.measurement_request_count,
        concurrency=args.concurrency,
        evaluation_mode=EvaluationMode(args.evaluation_mode),
        offline_mode=OfflineMode(args.offline_mode),
        output_shared_memory_size=args.output_shared_memory_size,
        performance_tool=PerformanceTool(args.performance_tool),
        model_repository=args.model_repository,
        result_path=args.result_path,
        warmup=args.warmup,
        timeout=args.timeout,
        verbose=args.verbose,
    )

    runner.run()


if __name__ == "__main__":
    main()

+ 13 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/runner/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 63 - 0
TensorFlow2/Recommendation/WideAndDeep/triton/runner/__main__.py

@@ -0,0 +1,63 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import pathlib
+from typing import List
+
# Allow this module to be executed directly as a script
# (`python triton/runner/__main__.py`) in addition to `python -m`:
# when run directly, __package__ is None and the relative imports below
# would fail, so synthesize the package name from the parent directory.
if __name__ == "__main__" and __package__ is None:
    __package__ = pathlib.Path(__file__).parent.name
+
+from .config import Config
+from .executor import Executor
+from .finalizer import ExperimentFinalizer
+from .maintainer import DockerMaintainer
+from .preparer import ExperimentPreparer
+from .runner_proxy import RunnerProxy
+from .pipeline_impl import pipeline
+
+
class ExperimentRunner(RunnerProxy):
    """
    Experiment Runner proxy for runner wrapper.

    Concrete configuration of RunnerProxy: binds the collaborator classes
    used for this experiment flow. NOTE(review): RunnerProxy is assumed to
    instantiate these ``*_cls`` attributes internally (maintainer for the
    Docker environment, executor for running steps, preparer/finalizer for
    experiment setup and teardown) — confirm against RunnerProxy.
    """

    # Manages the Docker-based execution environment.
    maintainer_cls = DockerMaintainer
    # Executes the pipeline stages.
    executor_cls = Executor
    # Prepares experiment artifacts before execution.
    preparer_cls = ExperimentPreparer
    # Collects/finalizes results after execution.
    finalizer_cls = ExperimentFinalizer
+
+
def execute(config_path: str, devices: List[str]):
    """Run the experiment pipeline described by a configuration file.

    Args:
        config_path: Path to the runner configuration file consumed by
            ``Config.from_file``.
        devices: Device identifiers to execute on. An empty list or
            ``None`` falls back to device ``"0"``.
    """
    # The --devices CLI flag is optional (default None), so this value may
    # be None as well as an empty list; a truthiness check covers both,
    # whereas the previous `len(devices) == 0` raised TypeError on None.
    if not devices:
        devices = ["0"]

    config = Config.from_file(config_path)
    runner = ExperimentRunner(config=config, pipeline=pipeline, devices=devices)
    runner.start()
+
+
if __name__ == "__main__":
    # CLI entry point: parse arguments and hand off to execute().
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-path", type=str, required=True, help="Path to configuration file with details.")
    parser.add_argument(
        "--devices",
        type=str,
        nargs="*",
        required=False,
        # Fixed copy-pasted help text (previously duplicated the
        # --config-path description).
        help="List of device identifiers to run the experiment on.",
    )

    args = parser.parse_args()

    # args.devices is None when the flag is omitted; execute() handles the
    # fallback to a default device.
    execute(args.config_path, args.devices)

Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini