Просмотр исходного кода

Merge pull request #630 from NVIDIA/gh/release

Updating GNMT/PyT/TF for Ampere, fixes in WnD/TF and Jasper/PyT
nv-kkudrynski 5 лет назад
Родитель
Commit
386dd8ebf6
79 измененных файлов с 1162 добавлено и 755 удалено
  1. 2 0
      PyTorch/SpeechRecognition/Jasper/.dockerignore
  2. 4 4
      PyTorch/SpeechRecognition/Jasper/.gitmodules
  3. 95 0
      PyTorch/SpeechRecognition/Jasper/external/Dockerfile.client.patched
  4. 1 0
      PyTorch/SpeechRecognition/Jasper/external/triton-inference-server
  5. 4 0
      PyTorch/SpeechRecognition/Jasper/inference.py
  6. 3 3
      PyTorch/SpeechRecognition/Jasper/model.py
  7. 5 5
      PyTorch/SpeechRecognition/Jasper/optimizers.py
  8. 3 3
      PyTorch/SpeechRecognition/Jasper/scripts/train.sh
  9. 5 3
      PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
  10. 2 2
      PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
  11. 1 1
      PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference_benchmark.sh
  12. 29 13
      PyTorch/SpeechRecognition/Jasper/triton/Dockerfile
  13. 3 2
      PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/build.sh
  14. 2 2
      PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/launch.sh
  15. 2 2
      PyTorch/SpeechRecognition/Jasper/triton/scripts/execute_all_perf_runs.sh
  16. 3 3
      PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model.sh
  17. 3 3
      PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model_helper.sh
  18. 3 3
      PyTorch/SpeechRecognition/Jasper/triton/scripts/run_client.sh
  19. 2 2
      PyTorch/SpeechRecognition/Jasper/triton/scripts/run_perf_client.sh
  20. 1 1
      PyTorch/SpeechRecognition/Jasper/triton/scripts/run_server.sh
  21. 5 2
      PyTorch/Translation/GNMT/Dockerfile
  22. 1 1
      PyTorch/Translation/GNMT/LICENSE
  23. 374 191
      PyTorch/Translation/GNMT/README.md
  24. BIN
      PyTorch/Translation/GNMT/img/training_accuracy.png
  25. 0 307
      PyTorch/Translation/GNMT/launch.py
  26. 1 2
      PyTorch/Translation/GNMT/requirements.txt
  27. 2 2
      PyTorch/Translation/GNMT/scripts/docker/build.sh
  28. 2 2
      PyTorch/Translation/GNMT/scripts/docker/interactive.sh
  29. 1 1
      PyTorch/Translation/GNMT/scripts/filter_dataset.py
  30. 4 4
      PyTorch/Translation/GNMT/scripts/tests/inference.sh
  31. 4 4
      PyTorch/Translation/GNMT/scripts/tests/train_1epoch.sh
  32. 5 5
      PyTorch/Translation/GNMT/scripts/tests/train_bench.sh
  33. 4 4
      PyTorch/Translation/GNMT/scripts/tests/train_full.sh
  34. 0 25
      PyTorch/Translation/GNMT/scripts/train.sh
  35. 1 1
      PyTorch/Translation/GNMT/scripts/verify_dataset.sh
  36. 1 1
      PyTorch/Translation/GNMT/scripts/wmt16_en_de.sh
  37. 1 1
      PyTorch/Translation/GNMT/seq2seq/data/config.py
  38. 32 1
      PyTorch/Translation/GNMT/seq2seq/data/dataset.py
  39. 1 1
      PyTorch/Translation/GNMT/seq2seq/data/sampler.py
  40. 1 1
      PyTorch/Translation/GNMT/seq2seq/data/tokenizer.py
  41. 2 2
      PyTorch/Translation/GNMT/seq2seq/inference/beam_search.py
  42. 5 1
      PyTorch/Translation/GNMT/seq2seq/inference/tables.py
  43. 6 5
      PyTorch/Translation/GNMT/seq2seq/inference/translator.py
  44. 1 1
      PyTorch/Translation/GNMT/seq2seq/models/attention.py
  45. 1 1
      PyTorch/Translation/GNMT/seq2seq/models/decoder.py
  46. 1 1
      PyTorch/Translation/GNMT/seq2seq/models/encoder.py
  47. 1 1
      PyTorch/Translation/GNMT/seq2seq/models/gnmt.py
  48. 1 1
      PyTorch/Translation/GNMT/seq2seq/models/seq2seq_base.py
  49. 1 1
      PyTorch/Translation/GNMT/seq2seq/train/fp_optimizers.py
  50. 1 1
      PyTorch/Translation/GNMT/seq2seq/train/lr_scheduler.py
  51. 1 1
      PyTorch/Translation/GNMT/seq2seq/train/smoothing.py
  52. 1 1
      PyTorch/Translation/GNMT/seq2seq/train/table.py
  53. 10 7
      PyTorch/Translation/GNMT/seq2seq/train/trainer.py
  54. 18 2
      PyTorch/Translation/GNMT/seq2seq/utils.py
  55. 24 11
      PyTorch/Translation/GNMT/train.py
  56. 77 25
      PyTorch/Translation/GNMT/translate.py
  57. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_amp_1gpu.sh
  58. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_amp_8gpu.sh
  59. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_fp32_1gpu.sh
  60. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_fp32_8gpu.sh
  61. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_amp_1gpu.sh
  62. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_amp_8gpu.sh
  63. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_tf32_1gpu.sh
  64. 2 2
      TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_tf32_8gpu.sh
  65. 4 4
      TensorFlow/Translation/GNMT/Dockerfile
  66. 247 59
      TensorFlow/Translation/GNMT/README.md
  67. 1 0
      TensorFlow/Translation/GNMT/examples/DGX1_AMP_1GPU.sh
  68. 1 0
      TensorFlow/Translation/GNMT/examples/DGX1_AMP_8GPU.sh
  69. 1 0
      TensorFlow/Translation/GNMT/examples/DGX1_FP32_1GPU.sh
  70. 1 0
      TensorFlow/Translation/GNMT/examples/DGX1_FP32_8GPU.sh
  71. 1 0
      TensorFlow/Translation/GNMT/examples/DGXA100_AMP_1GPU.sh
  72. 1 0
      TensorFlow/Translation/GNMT/examples/DGXA100_AMP_8GPU.sh
  73. 1 0
      TensorFlow/Translation/GNMT/examples/DGXA100_TF32_1GPU.sh
  74. 1 0
      TensorFlow/Translation/GNMT/examples/DGXA100_TF32_8GPU.sh
  75. BIN
      TensorFlow/Translation/GNMT/img/bleu_score.png
  76. 36 6
      TensorFlow/Translation/GNMT/nmt.py
  77. 43 0
      TensorFlow/Translation/GNMT/qa/L1_joc_GNMT_inferbench_fp16.sh
  78. 43 0
      TensorFlow/Translation/GNMT/qa/L1_joc_GNMT_inferbench_fp32.sh
  79. 1 0
      TensorFlow/Translation/GNMT/requirements.txt

+ 2 - 0
PyTorch/SpeechRecognition/Jasper/.dockerignore

@@ -5,3 +5,5 @@ checkpoints/
 datasets/
 external/tensorrt-inference-server/
 checkpoints/
+triton/model_repo
+triton/deploy

+ 4 - 4
PyTorch/SpeechRecognition/Jasper/.gitmodules

@@ -1,4 +1,4 @@
-[submodule "external/tensorrt-inference-server"]
-	path = external/tensorrt-inference-server
-	url = https://github.com/NVIDIA/tensorrt-inference-server.git
-	branch = r19.06
+[submodule "external/triton-inference-server"]
+	path = external/triton-inference-server
+	url = https://github.com/NVIDIA/triton-inference-server
+	branch = r19.12

+ 95 - 0
PyTorch/SpeechRecognition/Jasper/external/Dockerfile.client.patched

@@ -0,0 +1,95 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Default setting is building on nvidia/cuda:10.1-devel-ubuntu18.04
+ARG BASE_IMAGE=nvidia/cuda:10.1-devel-ubuntu18.04
+
+FROM ${BASE_IMAGE}
+
+# Default to use Python3. Allowed values are "2" and "3".
+ARG PYVER=3
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYVER=$PYVER
+
+RUN PYSFX=`[ "$PYVER" != "2" ] && echo "$PYVER" || echo ""` && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+            software-properties-common \
+            autoconf \
+            automake \
+            build-essential \
+            cmake \
+            curl \
+            git \
+            libopencv-dev \
+            libopencv-core-dev \
+            libssl-dev \
+            libtool \
+            pkg-config \
+            python${PYSFX} \
+            python${PYSFX}-pip \
+            python${PYSFX}-dev && \
+    pip${PYSFX} install --upgrade setuptools wheel
+
+RUN PYSFX=`[ "$PYVER" != "2" ] && echo "$PYVER" || echo ""` && \
+    pip${PYSFX} install --upgrade grpcio-tools
+
+# Build expects "python" executable (not python3).
+RUN rm -f /usr/bin/python && \
+    ln -s /usr/bin/python$PYVER /usr/bin/python
+
+# Build the client library and examples
+WORKDIR /workspace
+COPY VERSION .
+COPY build build
+COPY src/clients src/clients
+COPY src/core src/core
+
+RUN cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX:PATH=/workspace/install && \
+    make -j16 trtis-clients
+RUN cd install && \
+    export VERSION=`cat /workspace/VERSION` && \
+    tar zcf /workspace/v$VERSION.clients.tar.gz *
+
+# For CI testing need to install a test script.
+COPY qa/L0_client_tar/test.sh /tmp/test.sh
+
+# Install an image needed by the quickstart and other documentation.
+COPY qa/images/mug.jpg images/mug.jpg
+
+# Install the dependencies needed to run the client examples. These
+# are not needed for building but including them allows this image to
+# be used to run the client examples. The special upgrade and handling
+# of pip is needed to get numpy to install correctly with python2 on
+# ubuntu 16.04.
+RUN python -m pip install --user --upgrade pip && \
+    python -m pip install --upgrade install/python/tensorrtserver-*.whl numpy pillow
+
+ENV PATH //workspace/install/bin:${PATH}
+ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}

+ 1 - 0
PyTorch/SpeechRecognition/Jasper/external/triton-inference-server

@@ -0,0 +1 @@
+Subproject commit a1f3860ba65c0fd8f2be3adfcab2673efd039348

+ 4 - 0
PyTorch/SpeechRecognition/Jasper/inference.py

@@ -56,6 +56,10 @@ def parse_args():
     parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
     parser.add_argument("--cpu", action="store_true", help="Run inference on CPU")
     parser.add_argument("--ema", action="store_true", help="If available, load EMA model weights")
+
+    # FIXME Unused, but passed by Triton helper scripts
+    parser.add_argument("--pyt_fp16", action='store_true', help='use half precision')
+
     return parser.parse_args()
 
 def calc_wer(data_layer, audio_processor,

+ 3 - 3
PyTorch/SpeechRecognition/Jasper/model.py

@@ -100,7 +100,7 @@ class SpecAugment(nn.Module):
     def forward(self, x):
         sh = x.shape
 
-        mask = torch.zeros(x.shape).byte()
+        mask = torch.zeros(x.shape, dtype=torch.bool)
         for idx in range(sh[0]):
             for _ in range(self.cutout_x_regions):
                 cutout_x_left = int(random.uniform(0, sh[1] - self.cutout_x_width))
@@ -130,7 +130,7 @@ class SpecCutoutRegions(nn.Module):
     def forward(self, x):
         sh = x.shape
 
-        mask = torch.zeros(x.shape, dtype=torch.uint8)
+        mask = torch.zeros(x.shape, dtype=torch.bool)
 
         for idx in range(sh[0]):
             for i in range(self.cutout_rect_regions):
@@ -275,7 +275,7 @@ class MaskedConv1d(nn.Conv1d):
 
     def get_seq_len(self, lens):
         return ((lens + 2 * self.padding[0] - self.dilation[0] * (
-            self.kernel_size[0] - 1) - 1) / self.stride[0] + 1)
+            self.kernel_size[0] - 1) - 1) // self.stride[0] + 1)
 
     def forward(self, inp):
         if self.use_conv_mask:

+ 5 - 5
PyTorch/SpeechRecognition/Jasper/optimizers.py

@@ -98,7 +98,7 @@ class AdamW(Optimizer):
   
                 state['step'] += 1
                 # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
@@ -111,7 +111,7 @@ class AdamW(Optimizer):
                 bias_correction1 = 1 - beta1 ** state['step']
                 bias_correction2 = 1 - beta2 ** state['step']
                 step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
-                p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )
+                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom), alpha=-step_size)
   
         return loss
   
@@ -201,7 +201,7 @@ class Novograd(Optimizer):
                 if exp_avg_sq == 0:
                     exp_avg_sq.copy_(norm)
                 else:
-                    exp_avg_sq.mul_(beta2).add_(1 - beta2, norm)
+                    exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2)
 
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
@@ -213,11 +213,11 @@ class Novograd(Optimizer):
 
                 grad.div_(denom)
                 if group['weight_decay'] != 0:
-                    grad.add_(group['weight_decay'], p.data)
+                    grad.add_(p.data, alpha=group['weight_decay'])
                 if group['grad_averaging']:
                     grad.mul_(1 - beta1)
                 exp_avg.mul_(beta1).add_(grad)
 
-                p.data.add_(-group['lr'], exp_avg)
+                p.data.add_(exp_avg, alpha=-group['lr'])
         
         return loss

+ 3 - 3
PyTorch/SpeechRecognition/Jasper/scripts/train.sh

@@ -47,9 +47,9 @@ CMD+=" --seed=$SEED"
 CMD+=" --optimizer=novograd"
 CMD+=" --dataset_dir=$DATA_DIR"
 CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json"
-CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,"
-CMD+="$DATA_DIR/librispeech-train-clean-360-wav.json,"
-CMD+="$DATA_DIR/librispeech-train-other-500-wav.json"
+CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json"
+CMD+=",$DATA_DIR/librispeech-train-clean-360-wav.json"
+CMD+=",$DATA_DIR/librispeech-train-other-500-wav.json"
 CMD+=" --weight_decay=1e-3"
 CMD+=" --save_freq=$SAVE_FREQUENCY"
 CMD+=" --eval_freq=100"

+ 5 - 3
PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile

@@ -1,8 +1,10 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3
 FROM ${FROM_IMAGE_NAME}
 
+RUN apt-get update && apt-get install -y python3
+
 WORKDIR /tmp/onnx-trt
-COPY trt/onnx-trt.patch .
+COPY tensorrt/onnx-trt.patch .
 RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git checkout 8716c9b && git submodule update --init --recursive && \
     patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
 
@@ -11,7 +13,7 @@ RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && g
 # At the same step, also install TRT pip reqs
 WORKDIR /tmp/pipReqs
 COPY requirements.txt /tmp/pipReqs/jocRequirements.txt
-COPY trt/requirements.txt /tmp/pipReqs/trtRequirements.txt
+COPY tensorrt/requirements.txt /tmp/pipReqs/trtRequirements.txt
 RUN pip install --disable-pip-version-check -U -r jocRequirements.txt -r trtRequirements.txt
 
 

+ 2 - 2
PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh

@@ -1,5 +1,5 @@
 #!/bin/bash
 
 # Constructs a docker image containing dependencies for execution of JASPER through TRT
-echo "docker build . -f ./trt/Dockerfile -t jasper:trt6"
-docker build . -f ./trt/Dockerfile -t jasper:trt6
+echo "docker build . -f ./tensorrt/Dockerfile -t jasper:trt6"
+docker build . -f ./tensorrt/Dockerfile -t jasper:trt6

+ 1 - 1
PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference_benchmark.sh

@@ -130,7 +130,7 @@ else
    PYT_PREDICTION_PATH=" --pyt_prediction_path=${PYT_PREDICTION_PATH}"
 fi
 
-CMD="python trt/perf.py"
+CMD="python tensorrt/perf.py"
 CMD+=" --batch_size $BATCH_SIZE"
 CMD+=" --engine_batch_size $BATCH_SIZE"
 CMD+=" --model_toml configs/jasper10x5dr_nomask.toml"

+ 29 - 13
PyTorch/SpeechRecognition/Jasper/triton/Dockerfile

@@ -1,22 +1,38 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidian/pytorch:20.03-py3
-ARG TRITON_BASE_IMAGE=nvcr.io/nvidia/tritonserver:20.03.1-py3-clientsdk
-FROM ${TRITON_BASE_IMAGE} as triton
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3
+
+FROM tensorrtserver_client as trtis-client
 FROM ${FROM_IMAGE_NAME}
+RUN apt-get update && apt-get install -y python3
+ARG version=6.0.1-1+cuda10.1
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb \
+&& dpkg -i cuda-repo-*.deb \
+&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb \
+&& dpkg -i nvidia-machine-learning-repo-*.deb \
+&& apt-get update \
+&& apt-get install -y --no-install-recommends libnvinfer6=${version} libnvonnxparsers6=${version} libnvparsers6=${version} libnvinfer-plugin6=${version} libnvinfer-dev=${version} libnvonnxparsers-dev=${version} libnvparsers-dev=${version} libnvinfer-plugin-dev=${version} python-libnvinfer=${version} python3-libnvinfer=${version}
+RUN cp -r /usr/lib/python3.6/dist-packages/tensorrt /opt/conda/lib/python3.6/site-packages/tensorrt
 
-ADD requirements.txt .
-RUN pip install -r requirements.txt
-RUN pip install onnxruntime
 
-ADD triton/requirements.txt .
-RUN pip install -r requirements.txt
+ENV PATH=$PATH:/usr/src/tensorrt/bin
+WORKDIR /tmp/onnx-trt
+COPY tensorrt/onnx-trt.patch .
+RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git checkout  b677b9cbf19af803fa6f76d05ce558e657e4d8b6  && git submodule update --init --recursive && \
+    patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
 
-ADD tensorrt/requirements.txt .
-RUN pip install -r requirements.txt
 
-COPY --from=triton /opt/tritonserver/qa/pkgs/tensorrtserver-1.13.0-py3-none-linux_x86_64.whl ./tensorrtserver-1.13.0-py3-none-linux_x86_64.whl
+# Here's a good place to install pip reqs from JoC repo.
+# At the same step, also install TRT pip reqs
+WORKDIR /tmp/pipReqs
+COPY requirements.txt /tmp/pipReqs/pytRequirements.txt
+COPY tensorrt/requirements.txt /tmp/pipReqs/trtRequirements.txt
+COPY triton/requirements.txt /tmp/pipReqs/trtisRequirements.txt
+RUN apt-get update && apt-get install -y --no-install-recommends portaudio19-dev && pip install -r pytRequirements.txt && pip install -r trtRequirements.txt && pip install -r trtisRequirements.txt
 
-RUN pip install tensorrtserver-1.13.0-py3-none-linux_x86_64.whl
+#Copy the perf_client over
+COPY --from=trtis-client /workspace/install/bin/perf_client /workspace/install/bin/perf_client
+#Copy the python wheel and install with pip
+COPY --from=trtis-client /workspace/install/python/tensorrtserver*.whl /tmp/
+RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
 
 WORKDIR /workspace/jasper
 COPY . .
-RUN pip install --no-cache-dir -e .

+ 3 - 2
PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/build.sh

@@ -4,5 +4,6 @@ SCRIPT_DIR=$(cd $(dirname $0); pwd)
 PROJECT_DIR=${SCRIPT_DIR}/../../../
 docker pull nvcr.io/nvidia/tensorrtserver:19.09-py3
 git submodule update --init --recursive
-docker build -t tensorrtserver_client -f ${PROJECT_DIR}/external/triton-inference-server/Dockerfile.client ${PROJECT_DIR}/external/triton-inference-server
-docker build . --rm -f ${PROJECT_DIR}/trtis/Dockerfile -t jasper:trtis
+docker build -t tensorrtserver_client  \
+             -f ${PROJECT_DIR}/external/Dockerfile.client.patched ${PROJECT_DIR}/external/triton-inference-server
+docker build . --rm -f ${PROJECT_DIR}/triton/Dockerfile -t jasper:triton

+ 2 - 2
PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/launch.sh

@@ -29,11 +29,11 @@ fi
 
 echo $MOUNTS
 docker run -it --rm \
-  --runtime=nvidia \
+  --gpus=all \
   --shm-size=4g \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
   ${MOUNTS} \
   -v ${JASPER_REPO}:/jasper \
   ${EXTRA_JASPER_ENV} \
-  jasper:trtis bash $PROGRAM_PATH
+  jasper:triton bash $PROGRAM_PATH

+ 2 - 2
PyTorch/SpeechRecognition/Jasper/triton/scripts/execute_all_perf_runs.sh

@@ -44,7 +44,7 @@ export GPU=${GPU:-}
 
 SCRIPT_DIR=$(cd $(dirname $0); pwd)
 PROJECT_DIR=${SCRIPT_DIR}/../..
-MODEL_REPO=${MODEL_REPO:-"${PROJECT_DIR}/trtis/model_repo"}
+MODEL_REPO=${MODEL_REPO:-"${PROJECT_DIR}/triton/model_repo"}
 
 # We need to make sure TRTIS uses only one GPU, same as export does
 # for TRTIS
@@ -78,7 +78,7 @@ do
   
     if [ "${REGENERATE_ENGINES}" == "yes" ]; then
         ARCH=${ARCH} CHECKPOINT_DIR=${CHECKPOINT_DIR} CHECKPOINT=${CHECKPOINT} PRECISION=${PRECISION} MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} \
-        ${PROJECT_DIR}/trtis/scripts/export_model.sh || exit 1
+        ${PROJECT_DIR}/triton/scripts/export_model.sh || exit 1
     fi
   
     for BATCH_SIZE in 1 2 4 8 16 32 64;

+ 3 - 3
PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model.sh

@@ -25,12 +25,12 @@ GPU=${GPU:-0}
 SCRIPT_DIR=$(cd $(dirname $0); pwd)
 PROJECT_DIR=${SCRIPT_DIR}/../..
 if [ -f /.dockerenv ]; then # inside docker
-	CUDA_VISIBLE_DEVICES=${GPU} CHECKPOINT=${CHECKPOINT} CHECKPOINT_DIR=${CHECKPOINT_DIR} PRECISION=${PRECISION} ARCH=${ARCH} MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} ${PROJECT_DIR}/trtis/scripts/export_model_helper.sh || exit 1
+	CUDA_VISIBLE_DEVICES=${GPU} CHECKPOINT=${CHECKPOINT} CHECKPOINT_DIR=${CHECKPOINT_DIR} PRECISION=${PRECISION} ARCH=${ARCH} MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} ${PROJECT_DIR}/triton/scripts/export_model_helper.sh || exit 1
 else
 	set -x
-	PROGRAM_PATH="/jasper/trtis/scripts/export_model_helper.sh"  \
+	PROGRAM_PATH="/jasper/triton/scripts/export_model_helper.sh"  \
 	EXTRA_JASPER_ENV="-e PRECISION=${PRECISION} -e CHECKPOINT=${CHECKPOINT} -e CHECKPOINT_DIR=/checkpoints -e ARCH=${ARCH} -e MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} -e CUDA_VISIBLE_DEVICES=${GPU}" \
 	CHECKPOINT_DIR=${CHECKPOINT_DIR} DATA_DIR= RESULT_DIR= \
-	${PROJECT_DIR}/trtis/scripts/docker/launch.sh || exit 1
+	${PROJECT_DIR}/triton/scripts/docker/launch.sh || exit 1
 	set +x
 fi

+ 3 - 3
PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model_helper.sh

@@ -66,13 +66,13 @@ echo "export_model.sh: Exporting TRT engine, CUDA ARCH = ${ARCH} ... "
 PREC_FLAGS=""
 if [ "$PRECISION" == "fp16" ]
 then
-	PREC_FLAGS="--trt_fp16"
+ 	PREC_FLAGS="--trt_fp16"
 fi
 
 # remove targtes first
 rm -f ${MODEL_REPO}/jasper-trt/1/jasper_${ARCH}.plan ${MODEL_REPO}/jasper-onnx/1/jasper.onnx
 
-python  ${JASPER_REPO}/trt/perf.py \
+python  ${JASPER_REPO}/tensorrt/perf.py \
 	--ckpt_path ${CHECKPOINT_DIR}/${CHECKPOINT} \
 	--wav=${JASPER_REPO}/notebooks/example1.wav \
 	--model_toml=${JASPER_REPO}/configs/${MODEL_CONFIG} \
@@ -85,7 +85,7 @@ if [ "$PRECISION" == "fp16" ]
 then
 	PREC_FLAGS="--trt_fp16 --pyt_fp16"
 fi
-python  ${JASPER_REPO}/trt/perf.py \
+python  ${JASPER_REPO}/tensorrt/perf.py \
 	--ckpt_path ${CHECKPOINT_DIR}/${CHECKPOINT} \
 	--wav=${JASPER_REPO}/notebooks/example1.wav \
 	--model_toml=${JASPER_REPO}/configs/${MODEL_CONFIG} \

+ 3 - 3
PyTorch/SpeechRecognition/Jasper/triton/scripts/run_client.sh

@@ -26,13 +26,13 @@ FILE=${3} # json manifest file, OR single wav file
 JASPER_CONTAINER_TAG=${JASPER_CONTAINER_TAG:-jasper:trtis}
 
 if [ "$#" -ge 1 ] && [ "${FILE: -4}" == ".wav" ]; then 
-  CMD="python /jasper/trtis/jasper-client.py --data_dir /data --audio_filename ${FILE} --model_platform ${MODEL_TYPE}"
+  CMD="python /jasper/triton/jasper-client.py --data_dir /data --audio_filename ${FILE} --model_platform ${MODEL_TYPE}"
   ARGS=""
   ARGS="$ARGS -v $DATA_DIR:/data"
 elif [ "$#" -ge 1 ] && [ "${FILE: -4}" == "json" ]; then
   ARGS=""
   ARGS="$ARGS -v $DATA_DIR:/data"
-  CMD="python /jasper/trtis/jasper-client.py --manifest_filename ${FILE} --model_platform ${MODEL_TYPE} --data_dir /data"
+  CMD="python /jasper/triton/jasper-client.py --manifest_filename ${FILE} --model_platform ${MODEL_TYPE} --data_dir /data"
 else
   ARGS="-it"
   CMD=""
@@ -49,4 +49,4 @@ nvidia-docker run --rm -it \
    -v ${PROJECT_DIR}:/jasper \
    --name=trtis-client \
    ${ARGS} ${JASPER_CONTAINER_TAG} ${CMD}
-set +x
+set +x

+ 2 - 2
PyTorch/SpeechRecognition/Jasper/triton/scripts/run_perf_client.sh

@@ -67,13 +67,13 @@ ARGS="\
 curl -s "http://${SERVER_HOSTNAME}:8000/api/status/${MODEL_NAME}" | grep ready_state | grep SERVER_READY || (echo "Model ${MODEL_NAME} is not ready, perf_client skipped..." && exit 1)
 
 echo "=== STARTING: perf client ${ARGS} --concurrency-range 1:4:1 ==="
-docker run  -e DISPLAY=${DISPLAY}  --runtime nvidia --rm \
+docker run  -e DISPLAY=${DISPLAY}  --gpus all --rm \
 	      --privileged --net=host \
 	      -v ${RESULT_DIR_H}:/results --name jasper-perf-client \
 	      ${TRTIS_CLIENT_CONTAINER_TAG}  perf_client $ARGS -f /results/${OUTPUT_FILE_CSV}_p1 --concurrency-range 1:4:1 2>&1 | tee -a $LOGNAME
 
 echo "=== STARTING: perf client ${ARGS} --concurrency-range 8:${MAX_CONCURRENCY}:8 ==="
-docker run  -e DISPLAY=${DISPLAY}  --runtime nvidia --rm \
+docker run  -e DISPLAY=${DISPLAY}  --gpus all --rm \
 	      --privileged --net=host \
 	      -v ${RESULT_DIR_H}:/results --name jasper-perf-client \
 	      ${TRTIS_CLIENT_CONTAINER_TAG}  perf_client $ARGS -f /results/${OUTPUT_FILE_CSV}_p2 --concurrency-range 8:${MAX_CONCURRENCY}:8 2>&1 | tee -a $LOGNAME

+ 1 - 1
PyTorch/SpeechRecognition/Jasper/triton/scripts/run_server.sh

@@ -48,7 +48,7 @@ RM=${RM:-"--rm"}
 
 set -x
 docker run -p 8000:8000 -p 8001:8001 -p 8002:8002 \
-       --runtime nvidia \
+       --gpus all \
        -e NVIDIA_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES} \
        -v ${MODELS_DIR}:/models \
        -v ${TRTIS_DIR}/model_repo:/model_repo \

+ 5 - 2
PyTorch/Translation/GNMT/Dockerfile

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -18,15 +18,18 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.05-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
 
+RUN pip install --global-option="--cpp_ext" --global-option="--cuda_ext" git+git://github.com/NVIDIA/apex.git#egg=apex
+
 WORKDIR /workspace/gnmt
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
 
 ADD . /workspace/gnmt

+ 1 - 1
PyTorch/Translation/GNMT/LICENSE

@@ -1,5 +1,5 @@
 Copyright (c) 2017 Elad Hoffer
-Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

+ 374 - 191
PyTorch/Translation/GNMT/README.md

@@ -14,6 +14,7 @@ achieve state of the art accuracy, and is tested and maintained by NVIDIA.
     * [Features](#features)
   * [Mixed precision training](#mixed-precision-training)
     * [Enabling mixed precision](#enabling-mixed-precision)
+    * [Enabling TF32](#enabling-tf32)
 * [Setup](#setup)
   * [Requirements](#requirements)
 * [Quick Start Guide](#quick-start-guide)
@@ -31,18 +32,23 @@ achieve state of the art accuracy, and is tested and maintained by NVIDIA.
     * [Inference performance benchmark](#inference-performance-benchmark)
   * [Results](#results)
     * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
-      * [Training accuracy: NVIDIA DGX-2 (16x V100 32G)](#training-accuracy-nvidia-dgx-2-16x-v100-32g)
+      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
+      * [Training accuracy: NVIDIA DGX-2H (16x V100 32GB)](#training-accuracy-nvidia-dgx-2h-16x-v100-32gb)
       * [Training stability test](#training-stability-test)
     * [Training throughput results](#training-throughput-results)
-      * [Training throughput: NVIDIA DGX-1 (8x V100 16G)](#training-throughput-nvidia-dgx-1-8x-v100-16g)
-      * [Training throughput: NVIDIA DGX-2 (16x V100 32G)](#training-throughput-nvidia-dgx-2-16x-v100-32g)
+      * [Training throughput: NVIDIA DGX A100 (8x A100 40GB)](#training-throughput-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training throughput: NVIDIA DGX-1 (8x V100 16GB)](#training-throughput-nvidia-dgx-1-8x-v100-16gb)
+      * [Training throughput: NVIDIA DGX-2H (16x V100 32GB)](#training-throughput-nvidia-dgx-2h-16x-v100-32gb)
     * [Inference accuracy results](#inference-accuracy-results)
-      * [Inference accuracy: NVIDIA Tesla V100 16G](#inference-accuracy-nvidia-tesla-v100-16g)
+      * [Inference accuracy: NVIDIA A100 40GB](#inference-accuracy-nvidia-a100-40gb)
+      * [Inference accuracy: NVIDIA Tesla V100 16GB](#inference-accuracy-nvidia-tesla-v100-16gb)
       * [Inference accuracy: NVIDIA T4](#inference-accuracy-nvidia-t4)
     * [Inference throughput results](#inference-throughput-results)
+      * [Inference throughput: NVIDIA A100 40GB](#inference-throughput-nvidia-a100-40gb)
       * [Inference throughput: NVIDIA T4](#inference-throughput-nvidia-t4)
     * [Inference latency results](#inference-latency-results)
+      * [Inference latency: NVIDIA A100 40GB](#inference-latency-nvidia-a100-40gb)
       * [Inference latency: NVIDIA T4](#inference-latency-nvidia-t4)
 * [Release notes](#release-notes)
   * [Changelog](#changelog)
@@ -67,7 +73,7 @@ Tutorial](https://github.com/tensorflow/nmt) and [NVIDIA OpenSeq2Seq
 Toolkit](https://github.com/NVIDIA/OpenSeq2Seq).
 
 ### Model architecture
-![TrainingLoss](./img/diagram.png)
+![ModelArchitecture](./img/diagram.png)
 
 ### Default configuration
 
@@ -126,7 +132,7 @@ Code from this repository can be used to train a larger, 8-layer GNMT v2 model.
 Our experiments show that a 4-layer model is significantly faster to train and
 yields comparable accuracy on the public [WMT16
 English-German](http://www.statmt.org/wmt16/translation-task.html) dataset. The
-number of LSTM layers is controlled by the `--num_layers` parameter in the
+number of LSTM layers is controlled by the `--num-layers` parameter in the
 `train.py` training script.
 
 ### Feature support matrix
@@ -159,11 +165,11 @@ computational method.
 computational speedup by performing operations in half-precision format, while
 storing minimal information in single-precision to retain as much information
 as possible in critical parts of the network. Since the introduction of [Tensor
-Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing
-architectures, significant training speedups are experienced by switching to
-mixed precision -- up to 3x overall speedup on the most arithmetically intense
-model architectures. Using mixed precision training previously required two
-steps:
+Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with
+both the Turing and Ampere architectures, significant training speedups are
+experienced by switching to mixed precision -- up to 3x overall speedup on the
+most arithmetically intense model architectures. Using mixed precision training
+previously required two steps:
 
 1. Porting the model to use the FP16 data type where appropriate.
 2. Manually adding loss scaling to preserve small gradient values.
@@ -188,11 +194,6 @@ For information about:
   .
 
 #### Enabling mixed precision
-By default, the `train.py` training script will launch mixed precision training
-with Tensor Cores. You can change this behavior and execute the training in
-single precision by setting the `--math fp32` flag for the `train.py` training
-script.
-
 Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
 (AMP), library from [APEX](https://github.com/NVIDIA/apex) that casts variables
 to half-precision upon retrieval, while storing variables in single-precision
@@ -208,7 +209,7 @@ For an in-depth walk through on AMP, check out sample usage
 [here](https://nvidia.github.io/apex/amp.html#).
 [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
 utility libraries, such as AMP, which require minimal network code changes to
-leverage tensor cores performance.
+leverage Tensor Cores performance.
 
 The following steps were needed to enable mixed precision training in GNMT:
 
@@ -246,6 +247,25 @@ if self.grad_clip != float('inf'):
     clip_grad_norm_(amp.master_params(optimizer), self.grad_clip)
 ```
 
+#### Enabling TF32
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA
+A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the
+matrix math also called tensor operations. TF32 running on Tensor Cores in A100
+GPUs can provide up to 10x speedups compared to single-precision floating-point
+math (FP32) on Volta GPUs.
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of
+accuracy. It is more robust than FP16 for models which require high dynamic
+range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates
+AI Training, HPC up to
+20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/)
+blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by
+default.
+
 ## Setup
 
 The following section lists the requirements in order to start training the
@@ -254,13 +274,14 @@ GNMT v2 model.
 ### Requirements
 
 This repository contains `Dockerfile` which extends the PyTorch NGC container
-and encapsulates some dependencies.  Aside from these dependencies, ensure you
+and encapsulates some dependencies. Aside from these dependencies, ensure you
 have the following components:
-
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 19.05-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
-* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
-  or [Turing](https://www.nvidia.com/pl-pl/geforce/turing/) based GPU
+* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
+* GPU architecture:
+  * [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+  * [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
+  * [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
 For more information about how to get started with NGC containers, see the
 following sections from the NVIDIA GPU Cloud Documentation and the Deep
@@ -277,10 +298,10 @@ Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.htm
 
 
 ## Quick Start Guide
-To train your model using mixed precision with Tensor Cores or using FP32,
-perform the following steps using the default parameters of the GNMT v2 model
-on the WMT16 English German dataset. For the specifics concerning training
-and inference, see the [Advanced](#advanced) section.
+To train your model using mixed or TF32 precision with Tensor Cores or using
+FP32, perform the following steps using the default parameters of the GNMT v2
+model on the WMT16 English-German dataset. For the specifics concerning
+training and inference, see the [Advanced](#advanced) section.
 
 **1. Clone the repository.**
 
@@ -313,24 +334,28 @@ bash scripts/wmt16_en_de.sh
 
 **5. Start training.**
 
-By default, the `train.py` training script will use all available GPUs. The
-training script saves only one checkpoint with the lowest value of the loss
+The training script saves only one checkpoint with the lowest value of the loss
 function on the validation dataset. All results and logs are saved to the
-`results` directory (on the host) or to the `/workspace/gnmt/results` directory
+`gnmt` directory (on the host) or to the `/workspace/gnmt/gnmt` directory
 (in the container). By default, the `train.py` script will launch mixed
-precision training with Tensor Cores. You can change this behavior by setting
-the `--math fp32` flag for the `train.py` training script.
+precision training with Tensor Cores. You can change this behavior by setting:
+* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
+  and NVIDIA Turing architectures) or
+* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
+  Ampere architecture) 
+
+for the `train.py` training script.
 
 To launch mixed precision training on 1, 4 or 8 GPUs, run:
 
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 1024
+python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024
 ```
 
 To launch mixed precision training on 16 GPUs, run:
 
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 2048
+python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048
 ```
 
 By default, the training script will launch training with batch size 128 per
@@ -348,14 +373,14 @@ after each training epoch. Additionally, after the training is done, you can
 manually run inference on the test dataset with the checkpoint saved during the
 training.
 
-To launch mixed precision inference on the `newstest2014.en` test set, run:
+To launch FP16 inference on the `newstest2014.en` test set, run:
 
 ```
 python3 translate.py \
   --input data/wmt16_de_en/newstest2014.en \
   --reference data/wmt16_de_en/newstest2014.de \
   --output /tmp/output \
-  --model results/gnmt/model_best.pth
+  --model gnmt/model_best.pth
 ```
 
 The script will load the checkpoint specified by the `--model` option, then it
@@ -370,7 +395,7 @@ Additionally, one can pass the input text directly from the command-line:
 ```
 python3 translate.py \
   --input-text "The quick brown fox jumps over the lazy dog" \
-  --model results/gnmt/model_best.pth
+  --model gnmt/model_best.pth
 ```
 
 Translated output will be printed to the console:
@@ -381,9 +406,14 @@ Translated output will be printed to the console:
 Der schnelle braune Fuchs springt über den faulen Hund
 ```
 
-By default, the `translate.py` script will launch mixed precision inference
-with Tensor Cores. You can change this behavior by setting the `--math fp32`
-flag for the `translate.py` inference script.
+By default, the `translate.py` script will launch FP16 inference with Tensor
+Cores. You can change this behavior by setting:
+* the `--math fp32` flag to launch single precision inference (for NVIDIA Volta
+  and NVIDIA Turing architectures) or
+* the `--math tf32` flag to launch TF32 inference with Tensor Cores (for NVIDIA
+  Ampere architecture)
+
+for the `translate.py` inference script.
 
 ## Advanced
 The following sections provide greater details of the dataset, running training
@@ -475,12 +505,9 @@ dataset setup:
                         entire dataset (default: None)
 
 results setup:
-  --results-dir RESULTS_DIR
-                        path to directory with results, it will be
+  --save-dir SAVE_DIR   path to directory with results, it will be
                         automatically created if it does not exist (default:
-                        results)
-  --save-dir SAVE_DIR   defines subdirectory within RESULTS_DIR for results
-                        from this training run (default: gnmt)
+                        gnmt)
   --print-freq PRINT_FREQ
                         print log every PRINT_FREQ batches (default: 10)
 
@@ -499,13 +526,16 @@ model setup:
                         with label smoothing loss (default: 0.1)
 
 general setup:
-  --math {fp16,fp32,manual_fp16}
+  --math {fp16,fp32,tf32,manual_fp16}
                         precision (default: fp16)
   --seed SEED           master seed for random number generators, if "seed" is
                         undefined then the master seed will be sampled from
                         random.SystemRandom() (default: None)
   --prealloc-mode {off,once,always}
                         controls preallocation (default: always)
+  --dllog-file DLLOG_FILE
+                        Name of the DLLogger output file (default:
+                        train_log.json)
   --eval                run validation and test after every epoch (use '--no-
                         eval' to disable) (default: True)
   --env                 print info about execution env (use '--no-env' to
@@ -554,7 +584,7 @@ training setup:
 
 optimizer setup:
   --optimizer OPTIMIZER
-                        training optimizer (default: SparseAdam)
+                        training optimizer (default: Adam)
   --lr LR               learning rate (default: 0.002)
   --optimizer-extra OPTIMIZER_EXTRA
                         extra options for the optimizer (default: {})
@@ -654,6 +684,13 @@ data setup:
                         sacrebleu, raw text) (default: None)
   -m MODEL, --model MODEL
                         full path to the model checkpoint file (default: None)
+  --synthetic           use synthetic dataset (default: False)
+  --synthetic-batches SYNTHETIC_BATCHES
+                        number of synthetic batches to generate (default: 64)
+  --synthetic-vocab SYNTHETIC_VOCAB
+                        size of synthetic vocabulary (default: 32320)
+  --synthetic-len SYNTHETIC_LEN
+                        sequence length of synthetic samples (default: 50)
   -i INPUT, --input INPUT
                         full path to the input file (raw text) (default: None)
   -t INPUT_TEXT [INPUT_TEXT ...], --input-text INPUT_TEXT [INPUT_TEXT ...]
@@ -676,8 +713,8 @@ inference setup:
                         length normalization constant (default: 5.0)
 
 general setup:
-  --math {fp16,fp32} [{fp16,fp32} ...]
-                        arithmetic type (default: ['fp16'])
+  --math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]
+                        precision (default: ['fp16'])
   --env                 print info about execution env (use '--no-env' to
                         disable) (default: False)
   --bleu                compares with reference translation and computes BLEU
@@ -690,6 +727,12 @@ general setup:
                         (default: True)
   --seq-first           uses (seq, batch, feature) data format for RNNs
                         (default: True)
+  --save-dir SAVE_DIR   path to directory with results, it will be
+                        automatically created if it does not exist (default:
+                        gnmt)
+  --dllog-file DLLOG_FILE
+                        Name of the DLLogger output file (default:
+                        eval_log.json)
   --print-freq PRINT_FREQ, -p PRINT_FREQ
                         print log every PRINT_FREQ batches (default: 1)
 
@@ -707,8 +750,7 @@ benchmark setup:
                         0)
   --percentiles PERCENTILES [PERCENTILES ...]
                         Percentiles for confidence intervals for
-                        throughput/latency benchmarks (default: (50, 90, 95,
-                        99, 100))
+                        throughput/latency benchmarks (default: (90, 95, 99))
   --tables              print accuracy, throughput and latency results in
                         tables (use '--no-tables' to disable) (default: False)
 ```
@@ -724,13 +766,13 @@ usage: train.py [-h] [--dataset-dir DATASET_DIR] [--src-lang SRC_LANG]
                 [--tgt-lang TGT_LANG] [--vocab VOCAB] [-bpe BPE_CODES]
                 [--train-src TRAIN_SRC] [--train-tgt TRAIN_TGT]
                 [--val-src VAL_SRC] [--val-tgt VAL_TGT] [--test-src TEST_SRC]
-                [--test-tgt TEST_TGT] [--results-dir RESULTS_DIR]
-                [--save-dir SAVE_DIR] [--print-freq PRINT_FREQ]
-                [--hidden-size HIDDEN_SIZE] [--num-layers NUM_LAYERS]
-                [--dropout DROPOUT] [--share-embedding]
-                [--smoothing SMOOTHING] [--math {fp16,fp32,manual_fp16}]
-                [--seed SEED] [--prealloc-mode {off,once,always}] [--eval]
-                [--env] [--cuda] [--cudnn] [--log-all-ranks]
+                [--test-tgt TEST_TGT] [--save-dir SAVE_DIR]
+                [--print-freq PRINT_FREQ] [--hidden-size HIDDEN_SIZE]
+                [--num-layers NUM_LAYERS] [--dropout DROPOUT]
+                [--share-embedding] [--smoothing SMOOTHING]
+                [--math {fp16,fp32,tf32,manual_fp16}] [--seed SEED]
+                [--prealloc-mode {off,once,always}] [--dllog-file DLLOG_FILE]
+                [--eval] [--env] [--cuda] [--cudnn] [--log-all-ranks]
                 [--train-max-size TRAIN_MAX_SIZE]
                 [--train-batch-size TRAIN_BATCH_SIZE]
                 [--train-global-batch-size TRAIN_GLOBAL_BATCH_SIZE]
@@ -760,28 +802,32 @@ usage: train.py [-h] [--dataset-dir DATASET_DIR] [--src-lang SRC_LANG]
                 [--start-epoch START_EPOCH] [--resume PATH] [--save-all]
                 [--save-freq SAVE_FREQ] [--keep-checkpoints KEEP_CHECKPOINTS]
                 [--target-perf TARGET_PERF] [--target-bleu TARGET_BLEU]
-                [--rank RANK] [--local_rank LOCAL_RANK]
+                [--local_rank LOCAL_RANK]
 ```
 For example, for inference:
 
 ```
 python3 translate.py --help
 
-usage: translate.py [-h] [-o OUTPUT] [-r REFERENCE] -m MODEL
-                    (-i INPUT | -t INPUT_TEXT [INPUT_TEXT ...]) [--sort]
+usage: translate.py [-h] [-o OUTPUT] [-r REFERENCE] [-m MODEL] [--synthetic]
+                    [--synthetic-batches SYNTHETIC_BATCHES]
+                    [--synthetic-vocab SYNTHETIC_VOCAB]
+                    [--synthetic-len SYNTHETIC_LEN]
+                    [-i INPUT | -t INPUT_TEXT [INPUT_TEXT ...]] [--sort]
                     [--batch-size BATCH_SIZE [BATCH_SIZE ...]]
                     [--beam-size BEAM_SIZE [BEAM_SIZE ...]]
                     [--max-seq-len MAX_SEQ_LEN]
                     [--len-norm-factor LEN_NORM_FACTOR]
                     [--cov-penalty-factor COV_PENALTY_FACTOR]
                     [--len-norm-const LEN_NORM_CONST]
-                    [--math {fp16,fp32} [{fp16,fp32} ...]] [--env] [--bleu]
-                    [--cuda] [--cudnn] [--batch-first | --seq-first]
+                    [--math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]] [--env]
+                    [--bleu] [--cuda] [--cudnn] [--batch-first | --seq-first]
+                    [--save-dir SAVE_DIR] [--dllog-file DLLOG_FILE]
                     [--print-freq PRINT_FREQ] [--target-perf TARGET_PERF]
                     [--target-bleu TARGET_BLEU] [--repeat REPEAT [REPEAT ...]]
                     [--warmup WARMUP]
                     [--percentiles PERCENTILES [PERCENTILES ...]] [--tables]
-                    [--rank RANK] [--local_rank LOCAL_RANK]
+                    [--local_rank LOCAL_RANK]
 ```
 
 ### Getting the data
@@ -838,7 +884,7 @@ The default training configuration can be launched by running the `train.py`
 training script. By default, the training script saves only one checkpoint with
 the lowest value of the loss function on the validation dataset. An evaluation
 is then performed after each training epoch. Results are stored in the
-`results/gnmt` directory.
+`gnmt` directory.
 
 The training script launches data-parallel training with batch size 128 per GPU
 on all available GPUs. We have tested reliance on up to 16 GPUs on a single
@@ -846,7 +892,7 @@ node.
 After each training epoch, the script runs an evaluation on the validation
 dataset and outputs a BLEU score on the test dataset (newstest2014). BLEU is
 computed by the [SacreBLEU](https://github.com/mjpost/sacreBLEU) package. Logs
-from the training and evaluation are saved to the `results` directory.
+from the training and evaluation are saved to the `gnmt` directory.
 
 The summary after each training epoch is printed in the following format:
 
@@ -861,15 +907,14 @@ the test dataset. Performance is reported in total tokens per second. The
 result is averaged over an entire training epoch and summed over all GPUs
 participating in the training.
 
-Even though the training script uses all available GPUs, you can change this
-behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your environment or
-by setting the `NV_GPU` variable at the Docker container launch ([see section
-"GPU
-isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).
-
 By default, the `train.py` script will launch mixed precision training with
-Tensor Cores. You can change this behavior by setting the `--math fp32` flag
-for the `train.py` script.
+Tensor Cores. You can change this behavior by setting:
+* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
+  and NVIDIA Turing architectures) or
+* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
+  Ampere architecture)
+
+for the `train.py` training script.
 
 To view all available options for training, run `python3 train.py --help`.
 
@@ -904,15 +949,17 @@ The following commands will launch one epoch of training:
 
 To launch mixed precision training on 1, 4 or 8 GPUs, run:
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 1024 --epochs 1 --math fp16
+python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --epochs 1 --math fp16
 ```
 
 To launch mixed precision training on 16 GPUs, run:
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 2048 --epochs 1 --math fp16
+python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --epochs 1 --math fp16
 ```
 
-Change `--math fp16` to `--math fp32` to launch a single precision training.
+Change `--math fp16` to `--math fp32` to launch single precision training (for
+NVIDIA Volta and NVIDIA Turing architectures) or to `--math tf32` to launch
+TF32 training with Tensor Cores (for NVIDIA Ampere architecture).
 
 After the training is completed, the `train.py` script prints a summary to
 standard output. Performance results are printed in the following format:
@@ -930,11 +977,12 @@ training epoch and summed over all GPUs participating in the training.
 The inference performance and accuracy benchmarks require a checkpoint from a
 fully trained model.
 
-Command to launch the inference accuracy benchmark:
+Command to launch the inference accuracy benchmark on NVIDIA Volta or NVIDIA
+Turing architectures:
 
 ```
 python3 translate.py \
-  --model results/gnmt/model_best.pth \
+  --model gnmt/model_best.pth \
   --input data/wmt16_de_en/newstest2014.en \
   --reference data/wmt16_de_en/newstest2014.de \
   --output /tmp/output \
@@ -944,11 +992,26 @@ python3 translate.py \
   --tables
 ```
 
-Command to launch the inference throughput and latency benchmarks:
+Command to launch the inference accuracy benchmark on NVIDIA Ampere architecture:
+
+```
+python3 translate.py \
+  --model gnmt/model_best.pth \
+  --input data/wmt16_de_en/newstest2014.en \
+  --reference data/wmt16_de_en/newstest2014.de \
+  --output /tmp/output \
+  --math fp16 tf32 \
+  --batch-size 128 \
+  --beam-size 1 2 5 \
+  --tables
+```
+
+Command to launch the inference throughput and latency benchmarks on NVIDIA
+Volta or NVIDIA Turing architectures:
 
 ```
 python3 translate.py \
-  --model results/gnmt/model_best.pth \
+  --model gnmt/model_best.pth \
   --input data/wmt16_de_en/newstest2014.en \
   --reference data/wmt16_de_en/newstest2014.de \
   --output /tmp/output \
@@ -960,59 +1023,86 @@ python3 translate.py \
   --tables
 ```
 
+Command to launch the inference throughput and latency benchmarks on NVIDIA
+Ampere architecture:
+
+```
+python3 translate.py \
+  --model gnmt/model_best.pth \
+  --input data/wmt16_de_en/newstest2014.en \
+  --reference data/wmt16_de_en/newstest2014.de \
+  --output /tmp/output \
+  --math fp16 tf32 \
+  --batch-size 1 2 4 8 32 128 512 \
+  --repeat 1 1 1 1 2 8 16 \
+  --beam-size 1 2 5 \
+  --warmup 5 \
+  --tables
+```
+
 ### Results
 The following sections provide details on how we achieved our performance and
 accuracy in training and inference.
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
 Our results were obtained by running the `train.py` script with the default
-batch size = 128 per GPU in the pytorch-19.05-py3 NGC container on NVIDIA DGX-1
-with (8x V100 16G) GPUs.
+batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX
+A100 with 8x A100 40GB GPUs.
 
 Command to launch the training:
 
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 1024 --math fp16
+python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
 ```
 
-Change `--math fp16` to `--math fp32` to launch a single precision training.
+Change `--math fp16` to `--math tf32` to launch TF32 training with Tensor Cores.
 
-| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
+| **GPUs** | **Batch Size / GPU** | **Accuracy - TF32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - TF32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (TF32 to Mixed precision)** |
 | --- | --- | ----- | ----- | ----- | ------ | ---- |
-|  1  | 128 | 24.41 | 24.41 | 821.2 | 256.0  | 3.21 |
-|  4  | 128 | 24.43 | 24.51 | 232.3 | 79.0   | 2.94 |
-|  8  | 128 | 24.45 | 24.48 | 118.1 | 42.5   | 2.78 |
+| 8   | 128 | 24.46 | 24.60 | 34.7  | 22.7   | 1.53 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
 
-##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 Our results were obtained by running the `train.py` script with the default
-batch size = 128 per GPU in the pytorch-19.05-py3 NGC container on NVIDIA DGX-2
-with (16x V100 32G) GPUs.
+batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-1
+with 8x V100 16GB GPUs.
 
-Commands to launch the training:
+Command to launch the training:
 
-To launch mixed precision training on 1, 4 or 8 GPUs, run:
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 1024 --math fp16
+python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
 ```
 
+Change `--math fp16` to `--math fp32` to launch single precision training.
+
+| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
+| --- | --- | ----- | ----- | ----- | ------ | ---- |
+| 1   | 128 | 24.41 | 24.42 | 810.0 | 224.0  | 3.62 |
+| 4   | 128 | 24.40 | 24.33 | 218.2 | 69.5   | 3.14 |
+| 8   | 128 | 24.45 | 24.38 | 112.0 | 38.6   | 2.90 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
+outlined above.
+
+##### Training accuracy: NVIDIA DGX-2H (16x V100 32GB)
+Our results were obtained by running the `train.py` script with the default
+batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-2H
+with 16x V100 32GB GPUs.
+
 To launch mixed precision training on 16 GPUs, run:
 ```
-python3 -m launch train.py --seed 2 --train-global-batch-size 2048 --math fp16
+python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --math fp16
 ```
 
-Change `--math fp16` to `--math fp32` to launch a single precision training.
+Change `--math fp16` to `--math fp32` to launch single precision training.
 
 | **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
 | --- | --- | ----- | ----- | ------ | ----- | ---- |
-| 1   | 128 | 24.41 | 24.41 | 831.4  | 240.8 | 3.45 |
-| 4   | 128 | 24.43 | 24.45 | 219.2  | 74.5  | 2.94 |
-| 8   | 128 | 24.72 | 24.56 | 114.6  | 42.4  | 2.70 |
-| 16  | 128 | 23.98 | 24.08 | 59.1   | 23.3  | 2.54 |
+| 16  | 128 | 24.41 | 24.38 | 52.1   | 19.4  | 2.69 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
@@ -1020,66 +1110,89 @@ outlined above.
 ![TrainingLoss](./img/training_loss.png)
 
 ##### Training stability test
-The GNMT v2 model was trained for 6 epochs, starting from 50 different initial
+The GNMT v2 model was trained for 6 epochs, starting from 32 different initial
 random seeds. After each training epoch, the model was evaluated on the test
 dataset and the BLEU score was recorded. The training was performed in the
-pytorch-19.05-py3 Docker container on NVIDIA DGX-1 with 8 Tesla V100 16G GPUs.
+pytorch-20.06-py3 Docker container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
 The following table summarizes the results of the stability test.
 
-![TrainingAccuracy](./img/training_accuracy.png)
-
 In the following table, the BLEU scores after each training epoch for different
 initial random seeds are displayed.
 
 | **Epoch** | **Average** | **Standard deviation** | **Minimum** | **Maximum** | **Median** |
 | --- | ------ | ----- | ------ | ------ | ------ |
-|  1  | 19.960 | 0.347 | 18.460 | 20.510 | 19.975 |
-|  2  | 21.778 | 0.248 | 21.190 | 22.170 | 21.790 |
-|  3  | 22.501 | 0.210 | 21.890 | 22.870 | 22.475 |
-|  4  | 23.148 | 0.169 | 22.660 | 23.480 | 23.165 |
-|  5  | 24.158 | 0.140 | 23.910 | 24.460 | 24.155 |
-|  6  | 24.378 | 0.165 | 24.010 | 24.690 | 24.395 |
+| 1   | 19.959 | 0.238 | 19.410 | 20.390 | 19.970 |
+| 2   | 21.772 | 0.293 | 20.960 | 22.280 | 21.820 |
+| 3   | 22.435 | 0.264 | 21.740 | 22.870 | 22.465 |
+| 4   | 23.167 | 0.166 | 22.870 | 23.620 | 23.195 |
+| 5   | 24.233 | 0.149 | 23.820 | 24.530 | 24.235 |
+| 6   | 24.416 | 0.131 | 24.140 | 24.660 | 24.390 |
 
+#### Training throughput results
 
+##### Training throughput: NVIDIA DGX A100 (8x A100 40GB)
+Our results were obtained by running the `train.py` training script in the
+pytorch-20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
+Throughput performance numbers (in tokens per second) were averaged over an
+entire training epoch.
 
-#### Training throughput results
+| **GPUs** | **Batch size / GPU** | **Throughput - TF32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (TF32 to Mixed precision)** | **Strong Scaling - TF32** | **Strong Scaling - Mixed precision** |
+| --- | --- | ------ | ------ | ----- | ----- | ----- |
+| 1   | 128 | 83214  | 140909 | 1.693 | 1.000 | 1.000 |
+| 4   | 128 | 278576 | 463144 | 1.663 | 3.348 | 3.287 |
+| 8   | 128 | 519952 | 822024 | 1.581 | 6.248 | 5.834 |
 
-##### Training throughput: NVIDIA DGX-1 (8x V100 16G)
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
+outlined above.
+
+##### Training throughput: NVIDIA DGX-1 (8x V100 16GB)
 Our results were obtained by running the `train.py` training script in the
-pytorch-19.05-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
+pytorch-20.06-py3 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs.
 Throughput performance numbers (in tokens per second) were averaged over an
 entire training epoch.
 
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
 | --- | --- | ------ | ------ | ----- | ----- | ----- |
-|  1  | 128 | 21424  | 68312  | 3.189 | 1.000 | 1.000 |
-|  4  | 128 | 75658  | 221308 | 2.925 | 3.531 | 3.240 |
-|  8  | 128 | 149552 | 419075 | 2.802 | 6.981 | 6.135 |
+| 1   | 128 | 21860  | 76438  | 3.497 | 1.000 | 1.000 |
+| 4   | 128 | 80224  | 249168 | 3.106 | 3.670 | 3.260 |
+| 8   | 128 | 154168 | 447832 | 2.905 | 7.053 | 5.859 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
 
-##### Training throughput: NVIDIA DGX-2 (16x V100 32G)
+##### Training throughput: NVIDIA DGX-2H (16x V100 32GB)
 Our results were obtained by running the `train.py` training script in the
-pytorch-19.05-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs.
+pytorch-20.06-py3 NGC container on NVIDIA DGX-2H with 16x V100 32GB GPUs.
 Throughput performance numbers (in tokens per second) were averaged over an
 entire training epoch.
 
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
 | --- | --- | ------ | ------ | ----- | ------ | ------ |
-|  1  | 128 | 22742  | 72684  | 3.196 | 1.000  | 1.000  |
-|  4  | 128 | 80395  | 237616 | 2.956 | 3.535  | 3.269  |
-|  8  | 128 | 155297 | 430377 | 2.771 | 6.829  | 5.921  |
-| 16  | 128 | 312426 | 852550 | 2.729 | 13.738 | 11.730 |
+| 1  | 128 | 25583  | 87829   | 3.433 | 1.000  | 1.000  |
+| 4  | 128 | 91400  | 290640  | 3.180 | 3.573  | 3.309  |
+| 8  | 128 | 176616 | 522008  | 2.956 | 6.904  | 5.943  |
+| 16 | 128 | 351792 | 1010880 | 2.874 | 13.751 | 11.510 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
 
 #### Inference accuracy results
 
-##### Inference accuracy: NVIDIA Tesla V100 16G
+##### Inference accuracy: NVIDIA A100 40GB
+Our results were obtained by running the `translate.py` script in the
+pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB GPU. Full
+command to launch the inference accuracy benchmark was provided in the
+[Inference performance benchmark](#inference-performance-benchmark) section.
+
+| **Batch Size** | **Beam Size** | **Accuracy - TF32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
+| -------------: | ------------: | -------------------------: | -------------------------: |
+| 128            | 1             | 23.07                      | 23.07                      |
+| 128            | 2             | 23.81                      | 23.81                      |
+| 128            | 5             | 24.41                      | 24.43                      |
+
+##### Inference accuracy: NVIDIA Tesla V100 16GB
 Our results were obtained by running the `translate.py` script in the
-pytorch-19.05-py3 NGC Docker container with NVIDIA Tesla V100 16G GPUs. Full
+pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla V100 16GB GPU. Full
 command to launch the inference accuracy benchmark was provided in the
 [Inference performance benchmark](#inference-performance-benchmark) section.
 
@@ -1091,7 +1204,7 @@ command to launch the inference accuracy benchmark was provided in the
 
 ##### Inference accuracy: NVIDIA T4
 Our results were obtained by running the `translate.py` script in the
-pytorch-19.05-py3 NGC Docker container with NVIDIA Tesla T4. Full command to
+pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla T4. Full command to
 launch the inference accuracy benchmark was provided in the [Inference
 performance benchmark](#inference-performance-benchmark) section.
 
@@ -1109,39 +1222,72 @@ Tables presented in this section show the average inference throughput (columns
 **Avg (tok/s)**) and inference throughput for various confidence intervals
(columns **N% (tok/s)**, where `N` denotes the confidence interval). Inference
 throughput is measured in tokens per second. Speedups reported in FP16
-subsections are relative to FP32 numbers for corresponding configuration.
+subsections are relative to FP32 (for NVIDIA Volta and NVIDIA Turing) and
+relative to TF32 (for NVIDIA Ampere) numbers for corresponding configuration.
+
+##### Inference throughput: NVIDIA A100 40GB
+Our results were obtained by running the `translate.py` script in the
+pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
+Full command to launch the inference throughput benchmark was provided in the
+[Inference performance benchmark](#inference-performance-benchmark) section.
+
+**FP16**
+
+|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
+|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
+|             1|            1|         1291.6|      1.031|         1195.7|      1.029|         1165.8|      1.029|         1104.7|      1.030|
+|             1|            2|          882.7|      1.019|          803.4|      1.015|          769.2|      1.015|          696.7|      1.017|
+|             1|            5|          848.3|      1.042|          753.0|      1.037|          715.0|      1.043|          636.4|      1.033|
+|             2|            1|         2060.5|      1.034|         1700.8|      1.032|         1621.8|      1.032|         1487.4|      1.022|
+|             2|            2|         1445.7|      1.026|         1197.6|      1.024|         1132.5|      1.023|         1043.7|      1.033|
+|             2|            5|         1402.3|      1.063|         1152.4|      1.056|         1100.5|      1.053|          992.9|      1.053|
+|             4|            1|         3465.6|      1.046|         2838.3|      1.040|         2672.7|      1.043|         2392.8|      1.043|
+|             4|            2|         2425.4|      1.041|         2002.5|      1.028|         1898.3|      1.033|         1690.2|      1.028|
+|             4|            5|         2364.4|      1.075|         1930.0|      1.067|         1822.0|      1.065|         1626.1|      1.058|
+|             8|            1|         6151.1|      1.099|         5078.0|      1.087|         4786.5|      1.096|         4206.9|      1.090|
+|             8|            2|         4241.9|      1.075|         3494.1|      1.066|         3293.6|      1.066|         2970.9|      1.064|
+|             8|            5|         4117.7|      1.118|         3430.9|      1.103|         3224.5|      1.104|         2833.5|      1.110|
+|            32|            1|        18830.4|      1.147|        16210.0|      1.152|        15563.9|      1.138|        13973.2|      1.135|
+|            32|            2|        12698.2|      1.133|        10812.3|      1.114|        10256.1|      1.145|         9330.2|      1.101|
+|            32|            5|        11802.6|      1.355|         9998.8|      1.318|         9671.6|      1.329|         9058.4|      1.335|
+|           128|            1|        53394.5|      1.350|        48867.6|      1.342|        46898.5|      1.414|        40670.6|      1.305|
+|           128|            2|        34876.4|      1.483|        31687.4|      1.491|        30025.4|      1.505|        27677.1|      1.421|
+|           128|            5|        28201.3|      1.986|        25660.5|      1.997|        24306.0|      1.967|        23326.2|      2.007|
+|           512|            1|       119675.3|      1.904|       112400.5|      1.971|       109694.8|      1.927|       108781.3|      1.919|
+|           512|            2|        74514.7|      2.126|        69578.9|      2.209|        69348.1|      2.210|        69253.7|      2.212|
+|           512|            5|        47003.2|      2.760|        43348.2|      2.893|        43080.3|      2.884|        42878.4|      2.881|
 
 ##### Inference throughput: NVIDIA T4
 Our results were obtained by running the `translate.py` script in the
-pytorch-19.05-py3 NGC Docker container with NVIDIA T4.
+pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
 Full command to launch the inference throughput benchmark was provided in the
 [Inference performance benchmark](#inference-performance-benchmark) section.
 
 **FP16**
 
-| **Batch Size** | **Beam Size** | **Avg (tok/s)** | **Speedup** | **50% (tok/s)** | **Speedup** | **90% (tok/s)** | **Speedup** | **95% (tok/s)** | **Speedup** | **99% (tok/s)** | **Speedup** | **100% (tok/s)** | **Speedup** |
-| -------------: | ------------: | --------------: | ----------: | --------------: | ----------: | --------------: | ----------: | --------------: | ----------: | --------------: | ----------: | ---------------: | ----------: |
-| 1              | 1             | 987.6           | 1.221       | 985.5           | 1.221       | 921.6           | 1.208       | 898.6           | 1.203       | 855.8           | 1.195       | 665.2            | 1.127       |
-| 1              | 2             | 664.4           | 1.239       | 667.5           | 1.239       | 608.4           | 1.234       | 582.4           | 1.233       | 529.8           | 1.225       | 412.1            | 1.218       |
-| 1              | 5             | 633.1           | 1.373       | 639.7           | 1.371       | 566.0           | 1.369       | 537.0           | 1.371       | 481.0           | 1.356       | 292.2            | 1.344       |
-| 2              | 1             | 1530.2          | 1.301       | 1538.6          | 1.304       | 1281.3          | 1.288       | 1225.7          | 1.285       | 1127.6          | 1.261       | 1032.9           | 1.241       |
-| 2              | 2             | 1085.3          | 1.325       | 1090.3          | 1.323       | 898.5           | 1.286       | 852.2           | 1.279       | 780.1           | 1.260       | 692.1            | 1.277       |
-| 2              | 5             | 1041.4          | 1.381       | 1041.8          | 1.380       | 855.2           | 1.382       | 819.1           | 1.375       | 760.0           | 1.402       | 636.4            | 1.364       |
-| 4              | 1             | 2545.2          | 1.392       | 2538.4          | 1.387       | 2104.4          | 1.358       | 1985.7          | 1.347       | 1801.2          | 1.332       | 1607.0           | 1.304       |
-| 4              | 2             | 1820.8          | 1.348       | 1815.3          | 1.347       | 1508.7          | 1.328       | 1421.1          | 1.308       | 1278.3          | 1.309       | 1052.9           | 1.273       |
-| 4              | 5             | 1702.1          | 1.339       | 1694.6          | 1.336       | 1395.9          | 1.347       | 1314.0          | 1.349       | 1181.0          | 1.333       | 1000.0           | 1.346       |
-| 8              | 1             | 4361.0          | 1.453       | 4372.8          | 1.460       | 3636.9          | 1.425       | 3388.2          | 1.401       | 2945.2          | 1.342       | 2650.7           | 1.351       |
-| 8              | 2             | 3087.6          | 1.337       | 3094.6          | 1.339       | 2555.4          | 1.322       | 2411.7          | 1.333       | 2173.2          | 1.348       | 1928.6           | 1.329       |
-| 8              | 5             | 2927.4          | 1.623       | 2934.8          | 1.619       | 2456.4          | 1.588       | 2304.5          | 1.578       | 2018.4          | 1.512       | 1976.1           | 1.577       |
-| 32             | 1             | 12564.5         | 1.621       | 12615.6         | 1.632       | 10924.6         | 1.632       | 10252.5         | 1.602       | 9577.1          | 1.653       | 8987.6           | 1.594       |
-| 32             | 2             | 8652.0          | 1.753       | 8765.0          | 1.761       | 7460.1          | 1.762       | 6903.1          | 1.690       | 6531.2          | 1.739       | 6413.8           | 1.746       |
-| 32             | 5             | 6750.6          | 2.455       | 6774.2          | 2.455       | 5842.7          | 2.347       | 5640.9          | 2.342       | 5239.2          | 2.325       | 5185.2           | 2.401       |
-| 128            | 1             | 29255.3         | 2.602       | 29157.9         | 2.578       | 26514.6         | 2.622       | 23953.9         | 2.540       | 23105.5         | 2.541       | 22825.4          | 2.543       |
-| 128            | 2             | 17823.4         | 2.640       | 17788.7         | 2.633       | 16089.4         | 2.641       | 14960.7         | 2.521       | 14573.0         | 2.677       | 14440.7          | 2.671       |
-| 128            | 5             | 10106.9         | 3.128       | 10116.9         | 3.109       | 9111.7          | 3.087       | 8798.0          | 3.014       | 8273.3          | 3.133       | 8207.6           | 3.141       |
-| 512            | 1             | 40817.8         | 3.381       | 41080.7         | 3.391       | 36490.2         | 3.418       | 36296.2         | 3.416       | 36133.0         | 3.416       | 36066.3          | 3.412       |
-| 512            | 2             | 23112.0         | 3.238       | 23174.9         | 3.240       | 20655.0         | 3.262       | 20540.2         | 3.250       | 20430.7         | 3.243       | 20429.4          | 3.245       |
-| 512            | 5             | 10836.4         | 3.460       | 10888.2         | 3.467       | 9598.3          | 3.432       | 9573.4          | 3.434       | 9527.9          | 3.424       | 9498.1           | 3.416       |
+|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
+|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
+|             1|            1|         1133.8|      1.266|         1059.1|      1.253|         1036.6|      1.251|          989.5|      1.242|
+|             1|            2|          793.9|      1.169|          728.3|      1.165|          698.1|      1.163|          637.1|      1.157|
+|             1|            5|          766.8|      1.343|          685.6|      1.335|          649.3|      1.335|          584.1|      1.318|
+|             2|            1|         1759.8|      1.233|         1461.6|      1.239|         1402.3|      1.242|         1302.1|      1.242|
+|             2|            2|         1313.3|      1.186|         1088.7|      1.185|         1031.6|      1.180|          953.2|      1.178|
+|             2|            5|         1257.2|      1.301|         1034.1|      1.316|          990.3|      1.313|          886.3|      1.265|
+|             4|            1|         2974.0|      1.261|         2440.3|      1.255|         2294.6|      1.257|         2087.7|      1.261|
+|             4|            2|         2204.7|      1.320|         1826.3|      1.283|         1718.9|      1.260|         1548.4|      1.260|
+|             4|            5|         2106.1|      1.340|         1727.8|      1.345|         1625.7|      1.353|         1467.7|      1.346|
+|             8|            1|         5076.6|      1.423|         4207.9|      1.367|         3904.4|      1.360|         3475.3|      1.355|
+|             8|            2|         3761.7|      1.311|         3108.1|      1.285|         2931.6|      1.300|         2628.7|      1.300|
+|             8|            5|         3578.2|      1.660|         2998.2|      1.614|         2812.1|      1.609|         2447.6|      1.523|
+|            32|            1|        14637.8|      1.636|        12702.5|      1.644|        12070.3|      1.634|        11036.9|      1.647|
+|            32|            2|        10627.3|      1.818|         9198.3|      1.818|         8431.6|      1.725|         8000.0|      1.773|
+|            32|            5|         8205.7|      2.598|         7117.6|      2.476|         6825.2|      2.497|         6293.2|      2.437|
+|           128|            1|        33800.5|      2.755|        30824.5|      2.816|        27685.2|      2.661|        26580.9|      2.694|
+|           128|            2|        20829.4|      2.795|        18665.2|      2.778|        17372.1|      2.639|        16820.5|      2.821|
+|           128|            5|        11753.9|      3.309|        10658.1|      3.273|        10308.7|      3.205|         9630.7|      3.328|
+|           512|            1|        44474.6|      3.327|        40108.1|      3.394|        39816.6|      3.378|        39708.0|      3.381|
+|           512|            2|        26057.9|      3.295|        23197.3|      3.294|        23019.8|      3.284|        22951.4|      3.284|
+|           512|            5|        12161.5|      3.428|        10777.5|      3.418|        10733.1|      3.414|        10710.5|      3.420|
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
@@ -1151,64 +1297,101 @@ Tables presented in this section show the average inference latency (columns **A
 (ms)**) and inference latency for various confidence intervals (columns **N%
 (ms)**, where `N` denotes the confidence interval). Inference latency is
 measured in milliseconds. Speedups reported in FP16 subsections are relative to
-FP32 numbers for corresponding configuration.
+FP32 (for NVIDIA Volta and NVIDIA Turing) and relative to TF32 (for NVIDIA
+Ampere) numbers for corresponding configuration.
+
+##### Inference latency: NVIDIA A100 40GB
+Our results were obtained by running the `translate.py` script in the
+pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
+Full command to launch the inference latency benchmark was provided in the
+[Inference performance benchmark](#inference-performance-benchmark) section.
+
+**FP16**
+
+|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
+|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
+|             1|            1|       44.69|      1.032|       74.04|      1.035|       84.61|      1.034|       99.14|      1.042|
+|             1|            2|       64.76|      1.020|      105.18|      1.018|      118.92|      1.019|      139.42|      1.023|
+|             1|            5|       67.06|      1.043|      107.56|      1.049|      121.82|      1.054|      143.85|      1.054|
+|             2|            1|       56.57|      1.034|       85.59|      1.037|       92.55|      1.038|      107.59|      1.046|
+|             2|            2|       80.22|      1.027|      119.22|      1.027|      128.43|      1.030|      150.06|      1.028|
+|             2|            5|       82.54|      1.063|      121.37|      1.067|      132.35|      1.069|      156.34|      1.059|
+|             4|            1|       67.29|      1.047|       92.69|      1.048|      100.08|      1.056|      112.63|      1.064|
+|             4|            2|       95.86|      1.041|      129.83|      1.040|      139.48|      1.044|      162.34|      1.045|
+|             4|            5|       98.34|      1.075|      133.83|      1.076|      142.70|      1.068|      168.30|      1.075|
+|             8|            1|       75.60|      1.099|       97.87|      1.103|      104.13|      1.099|      117.40|      1.102|
+|             8|            2|      109.38|      1.074|      137.71|      1.079|      147.69|      1.069|      168.79|      1.065|
+|             8|            5|      112.71|      1.116|      143.50|      1.104|      153.17|      1.118|      172.60|      1.113|
+|            32|            1|       98.40|      1.146|      117.02|      1.153|      123.42|      1.150|      129.01|      1.128|
+|            32|            2|      145.87|      1.133|      171.71|      1.159|      184.01|      1.127|      188.64|      1.141|
+|            32|            5|      156.82|      1.357|      189.10|      1.374|      194.95|      1.392|      196.65|      1.419|
+|           128|            1|      137.97|      1.350|      150.04|      1.348|      151.52|      1.349|      154.52|      1.434|
+|           128|            2|      211.58|      1.484|      232.96|      1.490|      237.46|      1.505|      239.86|      1.567|
+|           128|            5|      261.44|      1.990|      288.54|      2.017|      291.63|      2.052|      298.73|      2.136|
+|           512|            1|      245.93|      1.906|      262.51|      1.998|      264.24|      1.999|      265.23|      2.000|
+|           512|            2|      395.61|      2.129|      428.54|      2.219|      431.58|      2.224|      433.86|      2.227|
+|           512|            5|      627.21|      2.767|      691.72|      2.878|      696.01|      2.895|      702.13|      2.887|
 
 ##### Inference latency: NVIDIA T4
 Our results were obtained by running the `translate.py` script in the
-pytorch-19.05-py3 NGC Docker container with NVIDIA T4.
+pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
 Full command to launch the inference latency benchmark was provided in the
 [Inference performance benchmark](#inference-performance-benchmark) section.
 
 **FP16**
 
-| **Batch Size** | **Beam Size** | **Avg (ms)** | **Speedup** | **50% (ms)** | **Speedup** | **90% (ms)** | **Speedup** | **95% (ms)** | **Speedup** | **99% (ms)** | **Speedup** | **100% (ms)** | **Speedup** |
-| -------------: | ------------: | -----------: | ----------: | -----------: | ----------: | -----------: | ----------: | -----------: | ----------: | -----------: | ----------: | ------------: | ----------: |
-| 1              | 1             | 58.35        | 1.217       | 53.92        | 1.214       | 96.43        | 1.208       | 110.4        | 1.202       | 129.7        | 1.211       | 161.2         | 1.227       |
-| 1              | 2             | 86.04        | 1.238       | 79.70        | 1.232       | 139.58       | 1.241       | 158.2        | 1.241       | 187.3        | 1.242       | 231.9         | 1.252       |
-| 1              | 5             | 89.92        | 1.373       | 83.20        | 1.369       | 144.67       | 1.379       | 165.1        | 1.372       | 193.9        | 1.387       | 249.0         | 1.307       |
-| 2              | 1             | 76.07        | 1.298       | 72.06        | 1.299       | 115.35       | 1.292       | 124.8        | 1.287       | 146.7        | 1.284       | 169.7         | 1.305       |
-| 2              | 2             | 107.00       | 1.319       | 101.46       | 1.323       | 159.65       | 1.312       | 171.4        | 1.314       | 199.8        | 1.310       | 236.5         | 1.278       |
-| 2              | 5             | 111.24       | 1.383       | 105.79       | 1.384       | 165.88       | 1.379       | 178.8        | 1.392       | 210.3        | 1.410       | 235.0         | 1.465       |
-| 4              | 1             | 91.62        | 1.385       | 89.44        | 1.387       | 126.05       | 1.375       | 136.9        | 1.358       | 155.5        | 1.395       | 173.4         | 1.393       |
-| 4              | 2             | 127.74       | 1.346       | 125.35       | 1.349       | 173.20       | 1.348       | 186.6        | 1.344       | 216.4        | 1.350       | 237.0         | 1.419       |
-| 4              | 5             | 136.62       | 1.349       | 134.64       | 1.335       | 185.34       | 1.386       | 198.6        | 1.396       | 237.0        | 1.405       | 250.0         | 1.492       |
-| 8              | 1             | 106.57       | 1.450       | 106.08       | 1.452       | 137.45       | 1.440       | 147.0        | 1.452       | 166.0        | 1.463       | 175.8         | 1.455       |
-| 8              | 2             | 150.30       | 1.341       | 150.59       | 1.340       | 190.34       | 1.347       | 203.0        | 1.361       | 232.7        | 1.386       | 245.5         | 1.417       |
-| 8              | 5             | 158.51       | 1.628       | 157.91       | 1.614       | 200.90       | 1.665       | 217.3        | 1.683       | 244.2        | 1.706       | 269.5         | 1.633       |
-| 32             | 1             | 147.38       | 1.626       | 148.19       | 1.597       | 177.61       | 1.686       | 184.7        | 1.685       | 192.5        | 1.694       | 197.6         | 1.725       |
-| 32             | 2             | 214.38       | 1.756       | 211.63       | 1.773       | 259.40       | 1.816       | 273.0        | 1.780       | 286.3        | 1.787       | 293.8         | 1.825       |
-| 32             | 5             | 274.72       | 2.455       | 273.77       | 2.461       | 337.34       | 2.443       | 351.2        | 2.498       | 363.2        | 2.518       | 375.0         | 2.530       |
-| 128            | 1             | 252.24       | 2.601       | 253.45       | 2.609       | 276.06       | 2.663       | 281.8        | 2.661       | 309.7        | 2.647       | 312.4         | 2.658       |
-| 128            | 2             | 414.53       | 2.642       | 415.38       | 2.648       | 458.75       | 2.675       | 474.4        | 2.665       | 501.5        | 2.719       | 509.1         | 2.721       |
-| 128            | 5             | 730.79       | 3.129       | 738.87       | 3.118       | 820.11       | 3.141       | 843.1        | 3.137       | 895.4        | 3.150       | 915.7         | 3.141       |
-| 512            | 1             | 722.77       | 3.382       | 710.66       | 3.377       | 823.14       | 3.414       | 826.9        | 3.419       | 831.8        | 3.412       | 840.0         | 3.395       |
-| 512            | 2             | 1278.33      | 3.239       | 1264.85      | 3.227       | 1453.04      | 3.252       | 1467.6       | 3.251       | 1478.1       | 3.250       | 1485.9        | 3.245       |
-| 512            | 5             | 2726.34      | 3.458       | 2700.51      | 3.467       | 3107.75      | 3.432       | 3146.0       | 3.433       | 3172.9       | 3.422       | 3180.1        | 3.443       |
+|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
+|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
+|             1|            1|       51.08|      1.261|       84.82|      1.254|       97.45|      1.251|       114.6|      1.257|
+|             1|            2|       72.05|      1.168|      117.41|      1.165|      132.33|      1.170|       155.8|      1.174|
+|             1|            5|       74.20|      1.345|      119.45|      1.352|      135.07|      1.354|       160.3|      1.354|
+|             2|            1|       66.31|      1.232|      100.90|      1.232|      108.52|      1.235|       126.9|      1.238|
+|             2|            2|       88.35|      1.185|      131.47|      1.188|      141.46|      1.185|       164.7|      1.191|
+|             2|            5|       92.12|      1.305|      136.30|      1.310|      148.66|      1.309|       174.8|      1.320|
+|             4|            1|       78.54|      1.260|      108.53|      1.256|      117.19|      1.259|       133.7|      1.259|
+|             4|            2|      105.54|      1.315|      142.74|      1.317|      154.36|      1.307|       178.7|      1.303|
+|             4|            5|      110.43|      1.351|      150.62|      1.388|      161.61|      1.397|       191.2|      1.427|
+|             8|            1|       91.65|      1.418|      117.92|      1.421|      126.60|      1.405|       144.0|      1.411|
+|             8|            2|      123.39|      1.315|      156.00|      1.337|      167.34|      1.347|       193.4|      1.340|
+|             8|            5|      129.69|      1.666|      165.01|      1.705|      178.18|      1.723|       200.3|      1.765|
+|            32|            1|      126.53|      1.641|      153.23|      1.689|      159.58|      1.692|       167.0|      1.700|
+|            32|            2|      174.37|      1.822|      209.04|      1.899|      219.59|      1.877|       228.6|      1.878|
+|            32|            5|      226.15|      2.598|      277.38|      2.636|      290.27|      2.648|       299.4|      2.664|
+|           128|            1|      218.29|      2.755|      238.94|      2.826|      243.18|      2.843|       267.1|      2.828|
+|           128|            2|      354.83|      2.796|      396.63|      2.832|      410.53|      2.803|       433.2|      2.866|
+|           128|            5|      628.32|      3.311|      699.57|      3.353|      723.98|      3.323|       771.0|      3.337|
+|           512|            1|      663.07|      3.330|      748.62|      3.388|      753.20|      3.388|       758.0|      3.378|
+|           512|            2|     1134.04|      3.295|     1297.85|      3.283|     1302.25|      3.304|      1306.9|      3.308|
+|           512|            5|     2428.82|      3.428|     2771.72|      3.415|     2801.32|      3.427|      2817.6|      3.422|
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
 
 ## Release notes
 ### Changelog
-1. Aug 7, 2018
-  * Initial release
-2. Dec 4, 2018
-  * Added exponential warm-up and step learning rate decay
-  * Multi-GPU (distributed) inference and validation
-  * Default container updated to NGC PyTorch 18.11-py3
-  * General performance improvements
-3. Feb 14, 2019
+* July 2020
+  * Added support for NVIDIA DGX A100
+  * Default container updated to NGC PyTorch 20.06-py3
+* June 2019
+  * Default container updated to NGC PyTorch 19.05-py3
+  * Mixed precision training implemented using APEX AMP
+  * Added inference throughput and latency results on NVIDIA T4 and NVIDIA
+    Tesla V100 16GB
+  * Added option to run inference on user-provided raw input text from command
+    line
+* February 2019
   * Different batching algorithm (bucketing with 5 equal-width buckets)
   * Additional dropouts before first LSTM layer in encoder and in decoder
   * Weight initialization changed to uniform (-0.1,0.1)
   * Switched order of dropout and concatenation with attention in decoder
   * Default container updated to NGC PyTorch 19.01-py3
-4. Jun 25, 2019
-  * Default container updated to NGC PyTorch 19.05-py3
-  * Mixed precision training implemented using APEX AMP
-  * Added inference throughput and latency results on NVIDIA T4 and NVIDIA Tesla V100 16G
-  * Added option to run inference on user-provided raw input text from command
-    line
+* December 2018
+  * Added exponential warm-up and step learning rate decay
+  * Multi-GPU (distributed) inference and validation
+  * Default container updated to NGC PyTorch 18.11-py3
+  * General performance improvements
+* August 2018
+  * Initial release
 
 ### Known issues
 There are no known issues in this release.

BIN
PyTorch/Translation/GNMT/img/training_accuracy.png


+ 0 - 307
PyTorch/Translation/GNMT/launch.py

@@ -1,307 +0,0 @@
-# From PyTorch:
-#
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-# Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-# Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-# Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
-# Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
-# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
-#
-# From Caffe2:
-#
-# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
-#
-# All contributions by Facebook:
-# Copyright (c) 2016 Facebook Inc.
-#
-# All contributions by Google:
-# Copyright (c) 2015 Google Inc.
-# All rights reserved.
-#
-# All contributions by Yangqing Jia:
-# Copyright (c) 2015 Yangqing Jia
-# All rights reserved.
-#
-# All contributions from Caffe:
-# Copyright(c) 2013, 2014, 2015, the respective contributors
-# All rights reserved.
-#
-# All other contributions:
-# Copyright(c) 2015, 2016 the respective contributors
-# All rights reserved.
-#
-# Caffe2 uses a copyright model similar to Caffe: each contributor holds
-# copyright over their contributions to Caffe2. The project versioning records
-# all such contribution and copyright details. If a contributor wants to further
-# mark their specific copyright on a particular contribution, they should
-# indicate their copyright solely in the commit message of the change when it is
-# committed.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#
-# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-#    and IDIAP Research Institute nor the names of its contributors may be
-#    used to endorse or promote products derived from this software without
-#    specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-
-r"""
-`torch.distributed.launch` is a module that spawns up multiple distributed
-training processes on each of the training nodes.
-
-The utility can be used for single-node distributed training, in which one or
-more processes per node will be spawned. The utility can be used for either
-CPU training or GPU training. If the utility is used for GPU training,
-each distributed process will be operating on a single GPU. This can achieve
-well-improved single-node training performance. It can also be used in
-multi-node distributed training, by spawning up multiple processes on each node
-for well-improved multi-node distributed training performance as well.
-This will especially be benefitial for systems with multiple Infiniband
-interfaces that have direct-GPU support, since all of them can be utilized for
-aggregated communication bandwidth.
-
-In both cases of single-node distributed training or multi-node distributed
-training, this utility will launch the given number of processes per node
-(``--nproc_per_node``). If used for GPU training, this number needs to be less
-or euqal to the number of GPUs on the current system (``nproc_per_node``),
-and each process will be operating on a single GPU from *GPU 0 to
-GPU (nproc_per_node - 1)*.
-
-**How to use this module:**
-
-1. Single-Node multi-process distributed training
-
-::
-
-    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
-               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
-               arguments of your training script)
-
-2. Multi-Node multi-process distributed training: (e.g. two nodes)
-
-
-Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
-
-::
-
-    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
-               and all other arguments of your training script)
-
-Node 2:
-
-::
-
-    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
-               and all other arguments of your training script)
-
-3. To look up what optional arguments this module offers:
-
-::
-
-    >>> python -m torch.distributed.launch --help
-
-
-**Important Notices:**
-
-1. This utilty and multi-process distributed (single-node or
-multi-node) GPU training currently only achieves the best performance using
-the NCCL distributed backend. Thus NCCL backend is the recommended backend to
-use for GPU training.
-
-2. In your training program, you must parse the command-line argument:
-``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
-If your training program uses GPUs, you should ensure that your code only
-runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
-
-Parsing the local_rank argument
-
-::
-
-    >>> import argparse
-    >>> parser = argparse.ArgumentParser()
-    >>> parser.add_argument("--local_rank", type=int)
-    >>> args = parser.parse_args()
-
-Set your device to local rank using either
-
-::
-
-    >>> torch.cuda.set_device(arg.local_rank)  # before your code runs
-
-or
-
-::
-
-    >>> with torch.cuda.device(arg.local_rank):
-    >>>    # your code to run
-
-3. In your training program, you are supposed to call the following function
-at the beginning to start the distributed backend. You need to make sure that
-the init_method uses ``env://``, which is the only supported ``init_method``
-by this module.
-
-::
-
-    torch.distributed.init_process_group(backend='YOUR BACKEND',
-                                         init_method='env://')
-
-4. In your training program, you can either use regular distributed functions
-or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
-training program uses GPUs for training and you would like to use
-:func:`torch.nn.parallel.DistributedDataParallel` module,
-here is how to configure it.
-
-::
-
-    model = torch.nn.parallel.DistributedDataParallel(model,
-                                                      device_ids=[arg.local_rank],
-                                                      output_device=arg.local_rank)
-
-Please ensure that ``device_ids`` argument is set to be the only GPU device id
-that your code will be operating on. This is generally the local rank of the
-process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``,
-and ``output_device`` needs to be ``args.local_rank`` in order to use this
-utility
-
-.. warning::
-
-    ``local_rank`` is NOT globally unique: it is only unique per process
-    on a machine.  Thus, don't use it to decide if you should, e.g.,
-    write to a networked filesystem.  See
-    https://github.com/pytorch/pytorch/issues/12042 for an example of
-    how things can go wrong if you don't do this correctly.
-
-"""
-
-
-import sys
-import subprocess
-import os
-import socket
-from argparse import ArgumentParser, REMAINDER
-
-import torch
-
-
-def parse_args():
-    """
-    Helper function parsing the command line options
-    @retval ArgumentParser
-    """
-    parser = ArgumentParser(description="PyTorch distributed training launch "
-                                        "helper utilty that will spawn up "
-                                        "multiple distributed processes")
-
-    # Optional arguments for the launch helper
-    parser.add_argument("--nnodes", type=int, default=1,
-                        help="The number of nodes to use for distributed "
-                             "training")
-    parser.add_argument("--node_rank", type=int, default=0,
-                        help="The rank of the node for multi-node distributed "
-                             "training")
-    parser.add_argument("--nproc_per_node", type=int, default=None,
-                        help="The number of processes to launch on each node, "
-                             "for GPU training, this is recommended to be set "
-                             "to the number of GPUs in your system so that "
-                             "each process can be bound to a single GPU.")
-    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
-                        help="Master node (rank 0)'s address, should be either "
-                             "the IP address or the hostname of node 0, for "
-                             "single node multi-proc training, the "
-                             "--master_addr can simply be 127.0.0.1")
-    parser.add_argument("--master_port", default=29500, type=int,
-                        help="Master node (rank 0)'s free port that needs to "
-                             "be used for communciation during distributed "
-                             "training")
-
-    # positional
-    parser.add_argument("training_script", type=str,
-                        help="The full path to the single GPU training "
-                             "program/script to be launched in parallel, "
-                             "followed by all the arguments for the "
-                             "training script")
-
-    # rest from the training program
-    parser.add_argument('training_script_args', nargs=REMAINDER)
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-
-    if args.nproc_per_node is None:
-        args.nproc_per_node = torch.cuda.device_count()
-
-    # world size in terms of number of processes
-    dist_world_size = args.nproc_per_node * args.nnodes
-
-    # set PyTorch distributed related environmental variables
-    current_env = os.environ.copy()
-    current_env["MASTER_ADDR"] = args.master_addr
-    current_env["MASTER_PORT"] = str(args.master_port)
-    current_env["WORLD_SIZE"] = str(dist_world_size)
-
-    processes = []
-
-    for local_rank in range(0, args.nproc_per_node):
-        # each process's rank
-        dist_rank = args.nproc_per_node * args.node_rank + local_rank
-        current_env["RANK"] = str(dist_rank)
-
-        # spawn the processes
-        cmd = [sys.executable,
-               "-u",
-               args.training_script,
-               "--local_rank={}".format(local_rank)] + args.training_script_args
-
-        process = subprocess.Popen(cmd, env=current_env)
-        processes.append(process)
-
-    returncode = 0
-    try:
-        for process in processes:
-            process_returncode = process.wait()
-            if process_returncode != 0:
-                returncode = 1
-    except KeyboardInterrupt:
-        print('CTRL-C, TERMINATING WORKERS ...')
-        for process in processes:
-            process.terminate()
-        for process in processes:
-            process.wait()
-        raise
-
-    sys.exit(returncode)
-
-
-if __name__ == "__main__":
-    main()

+ 1 - 2
PyTorch/Translation/GNMT/requirements.txt

@@ -1,5 +1,4 @@
-pytablewriter
+pytablewriter==0.47.0
 sacrebleu==1.2.10
 sacremoses==0.0.19
 git+git://github.com/rsennrich/subword-nmt.git@48ba99e657591c329e0003f0c6e32e493fa959ef
-git+git://github.com/NVIDIA/apex.git#egg=apex

+ 2 - 2
PyTorch/Translation/GNMT/scripts/docker/build.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,4 +20,4 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-docker build . --rm -t gnmt:latest
+docker build . --network=host --rm -t gnmt:latest

+ 2 - 2
PyTorch/Translation/GNMT/scripts/docker/interactive.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,4 +20,4 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-nvidia-docker run --init -it --rm --ipc=host -v $PWD:/workspace/gnmt/ gnmt bash
+docker run --gpus all --init -it --rm --network=host --ipc=host -v $PWD:/workspace/gnmt/ gnmt bash

+ 1 - 1
PyTorch/Translation/GNMT/scripts/filter_dataset.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 4 - 4
PyTorch/Translation/GNMT/scripts/tests/inference.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@ REPO_DIR='/workspace/gnmt'
 REFERENCE_FILE=$REPO_DIR/scripts/tests/reference_inference_performance
 
 MATH=$1
-if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
-   echo "Unsupported option for MATH, use either 'fp16' or 'fp32'"
+if [[ ${MATH} != "fp16" && ${MATH} != "fp32" && ${MATH} != "tf32" ]]; then
+   echo "Unsupported option for MATH, use either 'fp16' or 'fp32' or 'tf32'"
    exit 1
 fi
 
@@ -57,7 +57,7 @@ python3 translate.py \
    --input ${DATASET_DIR}/newstest2014.en \
    --reference ${DATASET_DIR}/newstest2014.de \
    --output /tmp/output \
-   --model results/gnmt/model_best.pth \
+   --model gnmt/model_best.pth \
    --batch-size ${BATCH_SIZE} \
    --beam-size ${BEAM_SIZE} \
    --math ${MATH} \

+ 4 - 4
PyTorch/Translation/GNMT/scripts/tests/train_1epoch.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@ REPO_DIR='/workspace/gnmt'
 REFERENCE_FILE=$REPO_DIR/scripts/tests/reference_training_performance
 
 MATH=$1
-if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
-   echo "Unsupported option for MATH, use either 'fp16' or 'fp32'"
+if [[ ${MATH} != "fp16" && ${MATH} != "fp32" && ${MATH} != "tf32" ]]; then
+   echo "Unsupported option for MATH, use either 'fp16' or 'fp32' or 'tf32'"
    exit 1
 fi
 
@@ -62,7 +62,7 @@ fi
 
 cd $REPO_DIR
 
-python3 -m launch train.py \
+python3 -m torch.distributed.launch --nproc_per_node=${GPU_COUNT} train.py \
    --dataset-dir $DATASET_DIR \
    --seed 2 \
    --epochs 1 \

+ 5 - 5
PyTorch/Translation/GNMT/scripts/tests/train_bench.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@ REPO_DIR='/workspace/gnmt'
 REFERENCE_FILE=$REPO_DIR/scripts/tests/reference_training_performance
 
 MATH=$1
-if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
-   echo "Unsupported option for MATH, use either 'fp16' or 'fp32'"
+if [[ ${MATH} != "fp16" && ${MATH} != "fp32" && ${MATH} != "tf32" ]]; then
+   echo "Unsupported option for MATH, use either 'fp16' or 'fp32' or 'tf32'"
    exit 1
 fi
 
@@ -62,9 +62,9 @@ fi
 
 cd $REPO_DIR
 
-python3 -m launch train.py \
+python3 -m torch.distributed.launch --nproc_per_node=${GPU_COUNT} train.py \
    --dataset-dir $DATASET_DIR \
-   --seed 1 \
+   --seed 2 \
    --epochs 1 \
    --remain-steps 1.0 \
    --no-eval \

+ 4 - 4
PyTorch/Translation/GNMT/scripts/tests/train_full.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@ REPO_DIR='/workspace/gnmt'
 REFERENCE_FILE=$REPO_DIR/scripts/tests/reference_training_performance
 
 MATH=$1
-if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
-   echo "Unsupported option for MATH, use either 'fp16' or 'fp32'"
+if [[ ${MATH} != "fp16" && ${MATH} != "fp32" && ${MATH} != "tf32" ]]; then
+   echo "Unsupported option for MATH, use either 'fp16' or 'fp32' or 'tf32'"
    exit 1
 fi
 
@@ -62,7 +62,7 @@ fi
 
 cd $REPO_DIR
 
-python3 -m launch train.py \
+python3 -m torch.distributed.launch --nproc_per_node=${GPU_COUNT} train.py \
    --dataset-dir $DATASET_DIR \
    --seed 2 \
    --epochs 6 \

+ 0 - 25
PyTorch/Translation/GNMT/scripts/train.sh

@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-set -e
-
-python3 -m launch train.py

+ 1 - 1
PyTorch/Translation/GNMT/scripts/verify_dataset.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/scripts/wmt16_en_de.sh

@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/data/config.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 32 - 1
PyTorch/Translation/GNMT/seq2seq/data/dataset.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -102,6 +102,37 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
         return single_collate
 
 
+class SyntheticDataset(Dataset):
+    def __init__(self, vocab_size, seq_len, nsamples):
+        self.vocab_size = vocab_size
+        self.nsamples = nsamples
+        self.seq_len = seq_len
+
+    def __getitem__(self, idx):
+        rand = torch.randint(0, self.vocab_size, size=(self.seq_len,))
+        return rand
+
+    def unsort(self, array):
+        return array
+
+    def get_loader(self, batch_size=1, num_workers=0, batch_first=False,
+                   pad=False, repeat=1):
+
+        collate_fn = build_collate_fn(batch_first, parallel=False,
+                                      sort=True)
+        sampler = StaticDistributedSampler(self, batch_size, pad, repeat)
+
+        return DataLoader(self,
+                          batch_size=batch_size,
+                          collate_fn=collate_fn,
+                          sampler=sampler,
+                          num_workers=num_workers,
+                          pin_memory=True,
+                          drop_last=False)
+
+    def __len__(self):
+        return self.nsamples
+
 class RawTextDataset(Dataset):
     def __init__(self, raw_data=None, raw_datafile=None, tokenizer=None,
                  sort=False, max_size=None):

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/data/sampler.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/data/tokenizer.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 2 - 2
PyTorch/Translation/GNMT/seq2seq/inference/beam_search.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -238,7 +238,7 @@ class SequenceGenerator:
             new_scores = new_scores.view(-1, beam_size * beam_size)
             # index: (batch, beam)
             _, index = new_scores.topk(beam_size, dim=1)
-            source_beam = index / beam_size
+            source_beam = index // beam_size
 
             new_scores = new_scores.view(-1, beam_size * beam_size)
             best_scores = torch.gather(new_scores, 1, index)

+ 5 - 1
PyTorch/Translation/GNMT/seq2seq/inference/tables.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -44,6 +44,8 @@ class AccuracyTable:
         data_header = []
         if 'fp32' in write_math:
             data_header += [f'**Accuracy - FP32 ({self.unit})**']
+        if 'tf32' in write_math:
+            data_header += [f'**Accuracy - TF32 ({self.unit})**']
         if 'fp16' in write_math:
             data_header += [f'**Accuracy - FP16 ({self.unit})**']
         writer.headers = main_header + data_header
@@ -54,6 +56,8 @@ class AccuracyTable:
             row = [batch_size, beam_size]
             if 'fp32' in write_math:
                 row.append(v['fp32'])
+            if 'tf32' in write_math:
+                row.append(v['tf32'])
             if 'fp16' in write_math:
                 row.append(v['fp16'])
             writer.value_matrix.append(row)

+ 6 - 5
PyTorch/Translation/GNMT/seq2seq/inference/translator.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -201,10 +201,11 @@ class Translator:
             preds = preds.scatter(0, indices.unsqueeze(1).expand_as(preds), preds)
             preds = gather_predictions(preds).cpu()
 
-            for pred in preds:
-                pred = pred.tolist()
-                detok = self.tokenizer.detokenize(pred)
-                output.append(detok)
+            if self.tokenizer:
+                for pred in preds:
+                    pred = pred.tolist()
+                    detok = self.tokenizer.detokenize(pred)
+                    output.append(detok)
 
             elapsed = time.time() - translate_timer
             batch_time.update(elapsed, batch_size)

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/models/attention.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/models/decoder.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/models/encoder.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/models/gnmt.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/models/seq2seq_base.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/train/fp_optimizers.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/train/lr_scheduler.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/train/smoothing.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 1 - 1
PyTorch/Translation/GNMT/seq2seq/train/table.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

+ 10 - 7
PyTorch/Translation/GNMT/seq2seq/train/trainer.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -60,6 +60,7 @@ class Seq2SeqTrainer:
                  loss_scaling={},
                  intra_epoch_eval=0,
                  prealloc_mode='always',
+                 warmup=0,
                  iter_size=1,
                  translator=None,
                  verbose=False):
@@ -85,6 +86,7 @@ class Seq2SeqTrainer:
             training epoch
         :param prealloc_mode: controls preallocation,
             choices=['off', 'once', 'always']
+        :param warmup: number of warmup iterations for performance counters
         :param iter_size: number of iterations between weight updates
         :param translator: instance of Translator, runs inference on test set
         :param verbose: enables verbose logging
@@ -106,6 +108,7 @@ class Seq2SeqTrainer:
         self.loss = None
         self.translator = translator
         self.intra_epoch_eval = intra_epoch_eval
+        self.warmup = warmup
         self.iter_size = iter_size
         self.prealloc_mode = prealloc_mode
         self.preallocated = False
@@ -122,7 +125,7 @@ class Seq2SeqTrainer:
                 dls_upscale_interval=loss_scaling['upscale_interval']
                 )
             params = self.fp_optimizer.fp32_params
-        elif math == 'fp32':
+        elif math == 'fp32' or math == 'tf32':
             self.fp_optimizer = FP32Optimizer(self.model, grad_clip)
 
         opt_name = opt_config.pop('optimizer')
@@ -208,14 +211,14 @@ class Seq2SeqTrainer:
             eval_iters = eval_iters * self.iter_size
             eval_iters = set(eval_iters)
 
-        batch_time = AverageMeter()
-        data_time = AverageMeter()
+        batch_time = AverageMeter(self.warmup)
+        data_time = AverageMeter(self.warmup)
         losses_per_token = AverageMeter()
         losses_per_sentence = AverageMeter()
 
-        tot_tok_time = AverageMeter()
-        src_tok_time = AverageMeter()
-        tgt_tok_time = AverageMeter()
+        tot_tok_time = AverageMeter(self.warmup)
+        src_tok_time = AverageMeter(self.warmup)
+        tgt_tok_time = AverageMeter(self.warmup)
 
         batch_size = data_loader.batch_size
 

+ 18 - 2
PyTorch/Translation/GNMT/seq2seq/utils.py

@@ -1,5 +1,5 @@
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ import sys
 import time
 from contextlib import contextmanager
 
+import dllogger
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -220,6 +221,21 @@ def setup_logging(log_all_ranks=True, log_file=os.devnull):
     logging.getLogger('').addFilter(rank_filter)
 
 
+def setup_dllogger(enabled=True, filename=os.devnull):
+    rank = get_rank()
+
+    if enabled and rank == 0:
+        backends = [
+            dllogger.JSONStreamBackend(
+                dllogger.Verbosity.VERBOSE,
+                filename,
+                ),
+            ]
+        dllogger.init(backends)
+    else:
+        dllogger.init([])
+
+
 def set_device(cuda, local_rank):
     """
     Sets device based on local_rank and returns instance of torch.device.
@@ -262,7 +278,7 @@ def log_env_info():
 
 
 def pad_vocabulary(math):
-    if math == 'fp16' or math == 'manual_fp16':
+    if math == 'tf32' or math == 'fp16' or math == 'manual_fp16':
         pad_vocab = 8
     elif math == 'fp32':
         pad_vocab = 1

+ 24 - 11
PyTorch/Translation/GNMT/train.py

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,7 @@ import sys
 import time
 from ast import literal_eval
 
+import dllogger
 import torch.nn as nn
 import torch.nn.parallel
 import torch.optim
@@ -113,14 +114,14 @@ def parse_args():
 
     # results
     results = parser.add_argument_group('results setup')
-    results.add_argument('--results-dir', default='results',
+    results.add_argument('--save-dir', default='gnmt',
                          help='path to directory with results, it will be \
                          automatically created if it does not exist')
-    results.add_argument('--save-dir', default='gnmt',
-                         help='defines subdirectory within RESULTS_DIR for \
-                         results from this training run')
     results.add_argument('--print-freq', default=10, type=int,
                          help='print log every PRINT_FREQ batches')
+    results.add_argument('--warmup', default=1, type=int,
+                         help='number of warmup iterations for performance \
+                         counters')
 
     # model
     model = parser.add_argument_group('model setup')
@@ -142,7 +143,7 @@ def parse_args():
     # setup
     general = parser.add_argument_group('general setup')
     general.add_argument('--math', default='fp16',
-                         choices=['fp16', 'fp32', 'manual_fp16'],
+                         choices=['fp16', 'fp32', 'tf32', 'manual_fp16'],
                          help='precision')
     general.add_argument('--seed', default=None, type=int,
                          help='master seed for random number generators, if \
@@ -151,6 +152,8 @@ def parse_args():
     general.add_argument('--prealloc-mode', default='always', type=str,
                          choices=['off', 'once', 'always'],
                          help='controls preallocation')
+    general.add_argument('--dllog-file', type=str, default='train_log.json',
+                         help='Name of the DLLogger output file')
 
     exclusive_group(group=general, name='eval', default=True,
                     help='run validation and test after every epoch')
@@ -304,16 +307,14 @@ def parse_args():
 
     # distributed
     distributed = parser.add_argument_group('distributed setup')
-    distributed.add_argument('--rank', default=0, type=int,
-                             help='global rank of the process, do not set!')
-    distributed.add_argument('--local_rank', default=0, type=int,
-                             help='local rank of the process, do not set!')
+    distributed.add_argument('--local_rank',  type=int,
+                             default=os.getenv('LOCAL_RANK', 0),
+                             help='Used for multi-process training.')
 
     args = parser.parse_args()
 
     args.lang = {'src': args.src_lang, 'tgt': args.tgt_lang}
 
-    args.save_dir = os.path.join(args.results_dir, args.save_dir)
     args.vocab = os.path.join(args.dataset_dir, args.vocab)
     args.bpe_codes = os.path.join(args.dataset_dir, args.bpe_codes)
     args.train_src = os.path.join(args.dataset_dir, args.train_src)
@@ -381,11 +382,15 @@ def main():
     utils.setup_logging(args.log_all_ranks,
                         os.path.join(args.save_dir, log_filename))
 
+    dllog_file = os.path.join(args.save_dir, args.dllog_file)
+    utils.setup_dllogger(enabled=True, filename=dllog_file)
+
     if args.env:
         utils.log_env_info()
 
     logging.info(f'Saving results to: {args.save_dir}')
     logging.info(f'Run arguments: {args}')
+    dllogger.log(step='PARAMETER', data=vars(args))
 
     args.train_iter_size = set_iter_size(args.train_iter_size,
                                          args.train_global_batch_size,
@@ -527,6 +532,7 @@ def main():
         intra_epoch_eval=args.intra_epoch_eval,
         translator=translator,
         prealloc_mode=args.prealloc_mode,
+        warmup=args.warmup,
         )
 
     trainer = trainers.Seq2SeqTrainer(**trainer_options)
@@ -613,6 +619,13 @@ def main():
     if utils.get_rank() == 0:
         table.write('Training Summary', args.math)
 
+    summary = {
+        'train_throughput': avg_training_perf,
+        'train_elapsed': training_time,
+        'test_bleu': test_bleu,
+        }
+    dllogger.log(step=tuple(), data=summary)
+
     passed = utils.benchmark(test_bleu, args.target_bleu,
                              train_perf, args.target_perf)
     if not passed:

+ 77 - 25
PyTorch/Translation/GNMT/translate.py

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 # Copyright (c) 2017 Elad Hoffer
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -22,20 +22,24 @@
 # SOFTWARE.
 
 import argparse
-import logging
 import itertools
+import logging
+import os
 import sys
 import warnings
 from itertools import product
 
+import dllogger
+import numpy as np
 import torch
 
 import seq2seq.utils as utils
 from seq2seq.data.dataset import RawTextDataset
+from seq2seq.data.dataset import SyntheticDataset
 from seq2seq.data.tokenizer import Tokenizer
+from seq2seq.inference import tables
 from seq2seq.inference.translator import Translator
 from seq2seq.models.gnmt import GNMT
-from seq2seq.inference import tables
 
 
 def parse_args():
@@ -64,10 +68,19 @@ def parse_args():
     dataset.add_argument('-r', '--reference', default=None,
                          help='full path to the file with reference \
                          translations (for sacrebleu, raw text)')
-    dataset.add_argument('-m', '--model', required=True,
+    dataset.add_argument('-m', '--model', type=str, default=None,
                          help='full path to the model checkpoint file')
 
-    source = dataset.add_mutually_exclusive_group(required=True)
+    dataset.add_argument('--synthetic', action='store_true',
+                         help='use synthetic dataset')
+    dataset.add_argument('--synthetic-batches', type=int, default=64,
+                         help='number of synthetic batches to generate')
+    dataset.add_argument('--synthetic-vocab', type=int, default=32320,
+                         help='size of synthetic vocabulary')
+    dataset.add_argument('--synthetic-len', type=int, default=50,
+                         help='sequence length of synthetic samples')
+
+    source = dataset.add_mutually_exclusive_group(required=False)
     source.add_argument('-i', '--input', required=False,
                         help='full path to the input file (raw text)')
     source.add_argument('-t', '--input-text', nargs='+', required=False,
@@ -93,7 +106,7 @@ def parse_args():
     # general setup
     general = parser.add_argument_group('general setup')
     general.add_argument('--math', nargs='+', default=['fp16'],
-                         choices=['fp16', 'fp32'], help='precision')
+                         choices=['fp16', 'fp32', 'tf32'], help='precision')
 
     exclusive_group(group=general, name='env', default=False,
                     help='print info about execution env')
@@ -116,6 +129,11 @@ def parse_args():
                                     format for RNNs')
     batch_first_parser.set_defaults(batch_first=True)
 
+    general.add_argument('--save-dir', default='gnmt',
+                         help='path to directory with results, it will be \
+                         automatically created if it does not exist')
+    general.add_argument('--dllog-file', type=str, default='eval_log.json',
+                         help='Name of the DLLogger output file')
     general.add_argument('--print-freq', '-p', default=1, type=int,
                          help='print log every PRINT_FREQ batches')
 
@@ -134,7 +152,7 @@ def parse_args():
     benchmark.add_argument('--warmup', default=0, type=int,
                            help='warmup iterations for performance counters')
     benchmark.add_argument('--percentiles', nargs='+', type=int,
-                           default=(50, 90, 95, 99, 100),
+                           default=(90, 95, 99),
                            help='Percentiles for confidence intervals for \
                            throughput/latency benchmarks')
     exclusive_group(group=benchmark, name='tables', default=False,
@@ -143,10 +161,9 @@ def parse_args():
 
     # distributed
     distributed = parser.add_argument_group('distributed setup')
-    distributed.add_argument('--rank', default=0, type=int,
-                             help='global rank of the process, do not set!')
-    distributed.add_argument('--local_rank', default=0, type=int,
-                             help='local rank of the process, do not set!')
+    distributed.add_argument('--local_rank',  type=int,
+                             default=os.getenv('LOCAL_RANK', 0),
+                             help='Used for multi-process training.')
 
     args = parser.parse_args()
 
@@ -156,8 +173,8 @@ def parse_args():
     if args.bleu and args.reference is None:
         parser.error('--bleu requires --reference')
 
-    if 'fp16' in args.math and not args.cuda:
-        parser.error('--math fp16 requires --cuda')
+    if ('fp16' in args.math or 'tf32' in args.math) and not args.cuda:
+        parser.error(f'--math {args.math} requires --cuda')
 
     if len(list(product(args.math, args.batch_size, args.beam_size))) > 1:
         args.target_bleu = None
@@ -180,12 +197,17 @@ def main():
     device = utils.set_device(args.cuda, args.local_rank)
     utils.init_distributed(args.cuda)
     args.rank = utils.get_rank()
+    os.makedirs(args.save_dir, exist_ok=True)
     utils.setup_logging()
 
+    dllog_file = os.path.join(args.save_dir, args.dllog_file)
+    utils.setup_dllogger(enabled=True, filename=dllog_file)
+
     if args.env:
         utils.log_env_info()
 
     logging.info(f'Run arguments: {args}')
+    dllogger.log(step='PARAMETER', data=vars(args))
 
     if not args.cuda and torch.cuda.is_available():
         warnings.warn('cuda is available but not enabled')
@@ -193,16 +215,22 @@ def main():
         torch.backends.cudnn.enabled = False
 
     # load checkpoint and deserialize to CPU (to save GPU memory)
-    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})
-
-    # build GNMT model
-    tokenizer = Tokenizer()
-    tokenizer.set_state(checkpoint['tokenizer'])
-    model_config = checkpoint['model_config']
-    model_config['batch_first'] = args.batch_first
-    model_config['vocab_size'] = tokenizer.vocab_size
-    model = GNMT(**model_config)
-    model.load_state_dict(checkpoint['state_dict'])
+    if args.model:
+        checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})
+
+        # build GNMT model
+        tokenizer = Tokenizer()
+        tokenizer.set_state(checkpoint['tokenizer'])
+        model_config = checkpoint['model_config']
+        model_config['batch_first'] = args.batch_first
+        model_config['vocab_size'] = tokenizer.vocab_size
+        model = GNMT(**model_config)
+        model.load_state_dict(checkpoint['state_dict'])
+    elif args.synthetic:
+        model = GNMT(args.synthetic_vocab, batch_first=args.batch_first)
+        tokenizer = None
+    else:
+        raise RuntimeError('Specify model either with --synthetic or with --model flag')
 
     # construct the dataset
     if args.input:
@@ -215,12 +243,18 @@ def main():
                               tokenizer=tokenizer,
                               sort=args.sort,
                               )
+    elif args.synthetic:
+        data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len, args.batch_size[0] * args.synthetic_batches)
 
     latency_table = tables.LatencyTable(args.percentiles)
     throughput_table = tables.ThroughputTable(args.percentiles)
     accuracy_table = tables.AccuracyTable('BLEU')
 
-    dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}
+    dtype = {
+        'fp32': torch.FloatTensor,
+        'tf32': torch.FloatTensor,
+        'fp16': torch.HalfTensor
+    }
 
     for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                  args.beam_size):
@@ -263,7 +297,7 @@ def main():
             )
 
         # print translated outputs
-        if not args.output and args.rank == 0:
+        if not args.synthetic and (not args.output and args.rank == 0):
             logging.info(f'Translated output:')
             for out in output:
                 print(out)
@@ -278,21 +312,39 @@ def main():
 
         if 'fp16' in args.math and 'fp32' in args.math:
             relative = 'fp32'
+        elif 'fp16' in args.math and 'tf32' in args.math:
+            relative = 'tf32'
         else:
             relative = None
 
         if 'fp32' in args.math:
             throughput_table.write('Inference throughput', 'fp32')
+        if 'tf32' in args.math:
+            throughput_table.write('Inference throughput', 'tf32')
         if 'fp16' in args.math:
             throughput_table.write('Inference throughput', 'fp16',
                                    relative=relative)
 
         if 'fp32' in args.math:
             latency_table.write('Inference latency', 'fp32')
+        if 'tf32' in args.math:
+            latency_table.write('Inference latency', 'tf32')
         if 'fp16' in args.math:
             latency_table.write('Inference latency', 'fp16',
                                 relative=relative, reverse_speedup=True)
 
+    avg_throughput = np.array(stats['throughputs']).mean()
+    avg_latency = np.array(stats['runtimes']).mean()
+    summary = {
+        'eval_throughput': avg_throughput,
+        'eval_bleu': stats['bleu'],
+        'eval_avg_latency': avg_latency,
+        }
+    for p in args.percentiles:
+        summary[f'eval_{p}%_latency'] = 1000 * np.percentile(stats['runtimes'], p)
+
+    dllogger.log(step=tuple(), data=summary)
+
     passed = utils.benchmark(stats['bleu'], args.target_bleu,
                              stats['tokens_per_sec'], args.target_perf)
     return passed

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_amp_1gpu.sh

@@ -18,8 +18,8 @@ set -x
 set -e
 
 python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark \
   --amp

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_amp_8gpu.sh

@@ -19,8 +19,8 @@ set -e
 
 mpiexec --allow-run-as-root --bind-to socket -np 8 \
   python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark \
   --amp \

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_fp32_1gpu.sh

@@ -18,7 +18,7 @@ set -x
 set -e
 
 python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGX1_benchmark_training_fp32_8gpu.sh

@@ -20,7 +20,7 @@ set -e
 mpiexec --allow-run-as-root --bind-to socket -np 8 \
   python -m trainer.task \
   --hvd \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_amp_1gpu.sh

@@ -18,8 +18,8 @@ set -x
 set -e
 
 python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark \
   --amp

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_amp_8gpu.sh

@@ -19,8 +19,8 @@ set -e
 
 mpiexec --allow-run-as-root --bind-to socket -np 8 \
   python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark \
   --amp \

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_tf32_1gpu.sh

@@ -18,7 +18,7 @@ set -x
 set -e
 
 python -m trainer.task \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark

+ 2 - 2
TensorFlow/Recommendation/WideAndDeep/scripts/DGXA100_benchmark_training_tf32_8gpu.sh

@@ -20,7 +20,7 @@ set -e
 mpiexec --allow-run-as-root --bind-to socket -np 8 \
   python -m trainer.task \
   --hvd \
-  --benchmark_warmup_steps 50 \
-  --benchmark_steps 200 \
+  --benchmark_warmup_steps 500 \
+  --benchmark_steps 1000 \
   --gpu \
   --benchmark

+ 4 - 4
TensorFlow/Translation/GNMT/Dockerfile

@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvcr.io/nvidia/tensorflow:19.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
+FROM ${FROM_IMAGE_NAME}
 
-COPY . /workspace/gnmt
 WORKDIR /workspace/gnmt
 
+COPY requirements.txt .
 RUN pip install -r requirements.txt
 
-ENTRYPOINT ["/bin/sh", "-c"]
-CMD ["bash"]
+COPY . .

+ 247 - 59
TensorFlow/Translation/GNMT/README.md

@@ -1,6 +1,6 @@
 # GNMT v2 For TensorFlow
 
-This repository provides a script and recipe to train the GNMT v2 model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
+This repository provides a script and recipe to train the GNMT v2 model to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA.
 
 ## Table Of Contents
 - [Model overview](#model-overview)
@@ -10,6 +10,7 @@ This repository provides a script and recipe to train the GNMT v2 model to achie
         * [Features](#features)
     * [Mixed precision training](#mixed-precision-training)
         * [Enabling mixed precision](#enabling-mixed-precision)
+        * [Enabling TF32](#enabling-tf32)
 - [Setup](#setup)
     * [Requirements](#requirements)
 - [Quick Start Guide](#quick-start-guide)
@@ -29,13 +30,17 @@ This repository provides a script and recipe to train the GNMT v2 model to achie
         * [Inference performance benchmark](#inference-performance-benchmark)
     * [Results](#results)
         * [Training accuracy results](#training-accuracy-results)
-            * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
+            * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+            * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
             * [Training stability test](#training-stability-test)
         * [Inference accuracy results](#inference-accuracy-results)
-            * [Inference accuracy: NVIDIA DGX-1 (8x V100 16G)](#inference-accuracy-nvidia-dgx-1-(8x-v100-16G))
+            * [Inference accuracy: NVIDIA DGX-1 (8x V100 16GB)](#inference-accuracy-nvidia-dgx-1-8x-v100-16gb)
         * [Training performance results](#training-performance-results)
-            * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
+            * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+            * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
         * [Inference performance results](#inference-performance-results)
+            * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
+            * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
             * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
 - [Release notes](#release-notes)
     * [Changelog](#changelog)
@@ -60,7 +65,7 @@ GNMT-like models from
 and
 [NVIDIA OpenSeq2Seq Toolkit](https://github.com/NVIDIA/OpenSeq2Seq).
 
-This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
 
 ### Model architecture
 
@@ -139,47 +144,64 @@ The following features are supported by this model.
 
 The following features are supported by this model.
 
-* Automatic Mixed Precision (TF-AMP) - enables mixed precision training without any changes to the code-base by performing automatic graph rewrites and loss scaling controlled by an environmental variable.
+* Automatic Mixed Precision (AMP) - Computation graphs can be modified by TensorFlow at runtime to support mixed precision training. A detailed explanation of mixed precision can be found in the next section.
+
 
 
 ### Mixed precision training
 
-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
 1.  Porting the model to use the FP16 data type where appropriate.
 2.  Adding loss scaling to preserve small gradient values.
 
-The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
+This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code.  AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally.
+
+In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
 
 For information about:
--   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
+-   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
 -   Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
 -   How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
+-   APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
 
 #### Enabling mixed precision
 
-To enable this feature inside the container, simply set a single environment variable:
+Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
 
-```bash
-export TF_ENABLE_AUTO_MIXED_PRECISION=1
-```
+To enable mixed precision, you can simply set the following environment variables inside your training script:
+- Enable TF-AMP graph rewrite:
+  ```
+  os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+  ```
 
-As an alternative, the environment variable can be set inside the TensorFlow Python script:
+- Enable Automated Mixed Precision:
+  ```
+  os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+  ```
 
-```python
-os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
-```
+#### Enabling TF32
+
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
 
 ## Setup
 
-The following section lists the requirements in order to start training the GNMT
-v2 model.
+The following section lists the requirements that you need to meet in order to start training the GNMT v2 model.
 
 ### Requirements
 
 This repository contains Dockerfile which extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 -   [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
--   [TensorFlow 19.07-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
--   [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+-   [TensorFlow 20.06-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
+-   Supported GPUs:
+    - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+    - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
+    - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
 -   [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
@@ -191,9 +213,10 @@ For those unable to use the TensorFlow NGC container, to set up the required env
 
 ## Quick Start Guide
 
-To train your model using mixed precision with Tensor Cores or using FP32,
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32,
 perform the following steps using the default parameters of the GNMT v2 model
 on the WMT16 English German dataset.
+For the specifics concerning training and inference, see the [Advanced](#advanced) section.
 
 **1. Clone the repository.**
 ```
@@ -232,25 +255,25 @@ argument.
 To launch mixed precision training on 1 GPU, run:
 
 ```
-python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4 --amp
 ```
 
 To launch mixed precision training on 8 GPUs, run:
 
 ```
-python nmt.py --output_dir=results --batch_size=1024 --num_gpus=8 --learning_rate=2e-3
+python nmt.py --output_dir=results --batch_size=1024 --num_gpus=8 --learning_rate=2e-3 --amp
 ```
 
-To launch FP32 training on 1 GPU, run:
+To launch FP32 (TF32 on NVIDIA Ampere GPUs) training on 1 GPU, run:
 
 ```
-python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4 --use_amp=false
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4
 ```
 
-To launch FP32 training on 8 GPUs, run:
+To launch FP32 (TF32 on NVIDIA Ampere GPUs) training on 8 GPUs, run:
 
 ```
-python nmt.py --output_dir=results --batch_size=1024 --num_gpus=8 --learning_rate=2e-3 --use_amp=false
+python nmt.py --output_dir=results --batch_size=1024 --num_gpus=8 --learning_rate=2e-3
 ```
 
 **6. Start evaluation.**
@@ -263,13 +286,13 @@ training.
 To launch mixed precision inference on 1 GPU, run:
 
 ```
-python nmt.py --output_dir=results --infer_batch_size=128 --mode=infer
+python nmt.py --output_dir=results --infer_batch_size=128 --mode=infer --amp
 ```
 
-To launch FP32 inference on 1 GPU, run:
+To launch FP32 (TF32 on NVIDIA Ampere GPUs) inference on 1 GPU, run:
 
 ```
-python nmt.py --output_dir=results --infer_batch_size=128 --use_amp=false --mode=infer
+python nmt.py --output_dir=results --infer_batch_size=128 --mode=infer
 ```
 
 **7. Start translation.**
@@ -341,7 +364,7 @@ The most useful arguments are as follows:
   --beam_width BEAM_WIDTH
                         beam width when using beam search decoder. If 0, use
                         standard decoder with greedy helper.
-  --use_amp USE_AMP     use_amp for training and inference
+  --amp                 use amp for training and inference
   --mode {train_and_eval,infer,translate}
 ```
 
@@ -449,7 +472,7 @@ pre-trained model checkpoint and tokenized input (for validation) and  non-token
 
 #### Validation process
 
-The `nmt.py` script, supports batched validation (`--mode=infer` flag). By
+The `nmt.py` script supports batched validation (`--mode=infer` flag). By
 default, it launches beam search with beam size of 5, coverage penalty term and
 length normalization term. Greedy decoding can be enabled by setting the
 `--beam_width=1` flag for the `nmt.py` inference script. To control the
@@ -459,7 +482,7 @@ To view all available options for validation, run `python nmt.py --help`.
 
 #### Translation process
 
-The `nmt.py` script, supports batched translation (`--mode=translate` flag). By
+The `nmt.py` script supports batched translation (`--mode=translate` flag). By
 default, it launches beam search with beam size of 5, coverage penalty term and
 length normalization term. Greedy decoding can be enabled by setting the
 `--beam_width=1` flag for the `nmt.py` prediction script. To control the
@@ -482,8 +505,8 @@ The following section shows how to run benchmarks measuring the model performanc
 
 To benchmark training performance, run:
 
-* `python nmt.py --output_dir=results --max_train_epochs=1 --num_gpus <num GPUs> --batch_size <total batch size>` for mixed precision
-* `python nmt.py --output_dir=results --max_train_epochs=1 --num_gpus <num GPUs> --batch_size <total batch size> --use_amp=false` for FP32
+* `python nmt.py --output_dir=results --max_train_epochs=1 --num_gpus <num GPUs> --batch_size <total batch size> --amp` for mixed precision
+* `python nmt.py --output_dir=results --max_train_epochs=1 --num_gpus <num GPUs> --batch_size <total batch size>` for FP32/TF32
 
 
 The log file will contain training performance in the following format:
@@ -496,11 +519,11 @@ training time for epoch 1: 25.75 mins (3625.19 sent/sec, 173461.27 tokens/sec)
 
 To benchmark inference performance, run the `scripts/translate.py` script:
 
-* For FP32:
-    `python scripts/translate.py --output_dir=/path/to/trained/model --use_amp=false --beam_width <comma separated beam widths> --infer_batch_size <comma separated batch sizes>`
+* For FP32/TF32:
+    `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width <comma separated beam widths> --infer_batch_size <comma separated batch sizes>`
 
 * For mixed precision
-    `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width <comma separated beam widths> --infer_batch_size <comma separated batch sizes>`
+    `python scripts/translate.py --output_dir=/path/to/trained/model --amp --beam_width <comma separated beam widths> --infer_batch_size <comma separated batch sizes>`
 
 The benchmark requires a checkpoint from a fully trained model.
 
@@ -512,10 +535,20 @@ accuracy in training and inference.
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+
+Our results were obtained by running the `examples/DGXA100_{TF32,AMP}_8GPU.sh`
+training script in the tensorflow-20.06-tf1-py3 NGC container
+on NVIDIA DGX A100 (8x A100 40GB) GPUs.
+
+| **GPUs** | **Batch size / GPU** |**Accuracy - mixed precision (BLEU)** | **Accuracy - TF32 (BLEU)** | **Time to train - mixed precision** | **Time to train - TF32** | **Time to train speedup (TF32 to mixed precision)** |
+| --- | --- | ----- | ----- | -------- | -------- | ---- |
+|  8  | 128 | 25.1 | 24.31 | 96 min  | 139 min  | 1.45 |
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
 Our results were obtained by running the `nmt.py` script in the
-tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G)  GPUs.
+tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB)  GPUs.
 
 | **GPUs** | **Batch size / GPU** |**Accuracy - mixed precision (BLEU)** | **Accuracy - FP32 (BLEU)** | **Time to train - mixed precision** | **Time to train - FP32** | **Time to train speedup (FP32 to mixed precision)** |
 | --- | --- | ----- | ----- | -------- | -------- | ---- |
@@ -534,30 +567,42 @@ configurations are displayed.
 The GNMT v2 model was trained for 6 epochs, starting from 6 different initial
 random seeds. After each training epoch, the model was evaluated on the test
 dataset and the BLEU score was recorded. The training was performed in the
-tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 with 8 Tesla V100 16G GPUs.
+tensorflow-20.06-tf1-py3 NGC container.
 
-In the following table, the BLEU scores after each training epoch for different
+In the following tables, the BLEU scores after each training epoch for different
 initial random seeds are displayed.
 
-| **Epoch** | **Average** | **Standard deviation** | **Minimum** | **Maximum** | **Median** |
-| --- | ------ | ----- | ------ | ------ | ------ |
-|  1  | 20.365 | 0.096 | 20.200 | 20.480 | 20.385 |
-|  2  | 22.002 | 0.080 | 21.900 | 22.110 | 22.000 |
-|  3  | 22.793 | 0.078 | 22.690 | 22.890 | 22.790 |
-|  4  | 23.220 | 0.160 | 22.890 | 23.360 | 23.260 |
-|  5  | 24.007 | 0.153 | 23.870 | 24.220 | 23.925 |
-|  6  | 24.362 | 0.167 | 24.210 | 24.710 | 24.310 |
+###### NVIDIA DGX A100 with 8 Ampere A100 40GB GPUs with TF32.
+
+| Epoch | Average | Standard deviation | Minimum | Median | Maximum |
+| ----- | ------- | ------------------ | ------- | ------ | ------- |
+| 1     | 20.272  | 0.165              | 19.760  | 20.295 | 20.480  |
+| 2     | 21.911  | 0.145              | 21.650  | 21.910 | 22.230  |
+| 3     | 22.731  | 0.140              | 22.490  | 22.725 | 23.020  |
+| 4     | 23.142  | 0.164              | 22.930  | 23.090 | 23.440  |
+| 5     | 23.967  | 0.137              | 23.760  | 23.940 | 24.260  |
+| 6     | 24.358  | 0.143              | 24.120  | 24.360 | 24.610  |
 
+###### NVIDIA DGX-1 with 8 Tesla V100 16GB GPUs with FP32.
+
+| Epoch | Average | Standard deviation | Minimum | Median | Maximum |
+| ----- | ------- | ------------------ | ------- | ------ | ------- |
+| 1     | 20.259  | 0.225              | 19.820  | 20.300 | 20.590  |
+| 2     | 21.954  | 0.194              | 21.540  | 21.955 | 22.370  |
+| 3     | 22.729  | 0.150              | 22.480  | 22.695 | 23.110  |
+| 4     | 23.218  | 0.210              | 22.820  | 23.225 | 23.470  |
+| 5     | 23.921  | 0.114              | 23.680  | 23.910 | 24.080  |
+| 6     | 24.381  | 0.131              | 24.160  | 24.375 | 24.590  |
 
 #### Inference accuracy results
 
-##### Inference accuracy: NVIDIA DGX-1 (8x V100 16G)
+##### Inference accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
-Our results were obtained by running the `scripts/translate.py` script in the tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 8x V100 16G GPUs.
+Our results were obtained by running the `scripts/translate.py` script in the tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 8x V100 16GB GPUs.
 
-* For mixed precision: `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 128`
+* For mixed precision: `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 128 --amp`
 
-* For FP32: `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 128 --use_amp=false`
+* For FP32: `python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 128`
 
 | **Batch size** | **Beam size** | **Mixed precision BLEU** | **FP32 BLEU** |
 |:---:|:---:|:---:|:---:|
@@ -567,7 +612,25 @@ Our results were obtained by running the `scripts/translate.py` script in the te
 
 #### Training performance results
 
-##### Training performance: NVIDIA DGX-1 (8x V100 16G)
+##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+
+Our results were obtained by running the `examples/DGXA100_{TF32,AMP}_{1,8}GPU.sh`
+training script in the tensorflow-20.06-tf1-py3 NGC container
+on NVIDIA DGX A100 (8x A100 40GB) GPUs.
+Performance numbers (in tokens per second)
+were averaged over an entire training epoch.
+
+| **GPUs** | **Batch size / GPU** | **Throughput - mixed precision (tokens/s)** | **Throughput - TF32 (tokens/s)** | **Throughput speedup (TF32 - mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - TF32** |
+| --- | --- | ------- | ------- | ---- | ---- | ---- |
+|  1  | 128 |  29 911 |  31 110 | 0.96 | 1.00 | 1.00 |
+|  8  | 128 | 181 384 | 175 292 | 1.03 | 6.06 | 5.63 |
+
+
+
+To achieve these same results, follow the steps in the
+[Quick Start Guide](#quick-start-guide).
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
 
 Our results were obtained by running the `nmt.py` script in the tensorflow-19.07-py3 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs.
 Performance numbers (in tokens per second) were averaged over an entire
@@ -588,18 +651,141 @@ The benchmark requires a checkpoint from a fully trained model.
 To launch the inference benchmark in mixed precision on 1 GPU, run:
 
 ```
-python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 1,2,4,8,32,128,512
+python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 1,2,4,8,32,128,512 --amp
 ```
 
-To launch the inference benchmark in FP32 on 1 GPU, run:
+To launch the inference benchmark in FP32/TF32 on 1 GPU, run:
 
 ```
-python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 1,2,4,8,32,128,512 --use_amp=false
+python scripts/translate.py --output_dir=/path/to/trained/model --beam_width 1,2,5 --infer_batch_size 1,2,4,8,32,128,512
 ```
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
 outlined above.
 
+##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
+
+Our results were obtained by running the
+`python scripts/translate.py --infer_batch_size 1,2,4,8,32,128,512 --beam_width 1,2,5 {--amp}`
+inferencing benchmarking script in the tensorflow-20.06-tf1-py3 NGC container
+on NVIDIA DGX A100 (1x A100 40GB) GPU.
+
+FP16
+
+| **Batch size**     | **Beam width**     | **Bleu**           | **Sentences/sec**  | **Tokens/sec**     | **Latency Avg**    | **Latency 50%**     | **Latency 90%**     | **Latency 95%**     | **Latency 99%**     | **Latency 100%**    |
+|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1              | 1              | 23.80          | 13.67          | 737.89         | 73.15          | 67.69          | 121.98         | 137.20         | 162.74         | 201.06         |
+| 1              | 2              | 24.58          | 13.40          | 721.18         | 74.65          | 69.12          | 123.99         | 138.82         | 169.58         | 198.49         |
+| 1              | 5              | 25.10          | 12.12          | 647.78         | 82.53          | 76.53          | 136.35         | 152.59         | 196.09         | 216.55         |
+| 2              | 1              | 23.80          | 21.55          | 1163.16        | 92.82          | 88.15          | 139.88         | 152.49         | 185.18         | 208.35         |
+| 2              | 2              | 24.58          | 21.07          | 1134.42        | 94.91          | 89.62          | 142.08         | 158.12         | 188.00         | 205.08         |
+| 2              | 5              | 25.10          | 19.59          | 1047.21        | 102.10         | 96.20          | 152.36         | 172.46         | 211.96         | 219.87         |
+| 4              | 1              | 23.80          | 36.98          | 1996.27        | 108.16         | 105.07         | 150.42         | 161.56         | 200.99         | 205.87         |
+| 4              | 2              | 24.57          | 34.92          | 1880.48        | 114.53         | 111.42         | 160.29         | 177.14         | 205.32         | 211.80         |
+| 4              | 5              | 25.10          | 31.56          | 1687.34        | 126.74         | 122.06         | 179.68         | 201.38         | 225.08         | 229.14         |
+| 8              | 1              | 23.80          | 64.52          | 3482.81        | 123.99         | 122.89         | 159.89         | 174.66         | 201.12         | 205.59         |
+| 8              | 2              | 24.57          | 59.04          | 3178.17        | 135.50         | 135.23         | 180.50         | 191.66         | 214.95         | 216.84         |
+| 8              | 5              | 25.09          | 55.51          | 2967.82        | 144.11         | 141.98         | 198.39         | 218.88         | 223.55         | 225.61         |
+| 32             | 1              | 23.80          | 193.54         | 10447.04       | 165.34         | 163.56         | 211.67         | 215.37         | 221.07         | 221.14         |
+| 32             | 2              | 24.57          | 182.00         | 9798.09        | 175.82         | 176.04         | 220.33         | 224.25         | 226.45         | 227.05         |
+| 32             | 5              | 25.10          | 141.63         | 7572.02        | 225.94         | 225.59         | 278.38         | 279.56         | 281.61         | 282.13         |
+| 128            | 1              | 23.80          | 556.57         | 30042.59       | 229.98         | 226.81         | 259.05         | 260.26         | 260.74         | 260.85         |
+| 128            | 2              | 24.57          | 400.02         | 21535.38       | 319.98         | 328.23         | 351.31         | 352.82         | 353.01         | 353.06         |
+| 128            | 5              | 25.10          | 235.14         | 12570.95       | 544.35         | 576.62         | 581.95         | 582.64         | 583.61         | 583.85         |
+| 512            | 1              | 23.80          | 903.83         | 48786.58       | 566.48         | 570.44         | 579.74         | 580.66         | 581.39         | 581.57         |
+| 512            | 2              | 24.58          | 588.63         | 31689.07       | 869.81         | 894.90         | 902.65         | 902.85         | 903.00         | 903.04         |
+| 512            | 5              | 25.10          | 285.86         | 15283.40       | 1791.06        | 1835.19        | 1844.29        | 1845.59        | 1846.63        | 1846.89        |
+
+TF32
+
+| **Batch size**     | **Beam width**     | **Bleu**           | **Sentences/sec**  | **Tokens/sec**     | **Latency Avg**    | **Latency 50%**     | **Latency 90%**     | **Latency 95%**     | **Latency 99%**     | **Latency 100%**    |
+|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1              | 1              | 23.82          | 13.25          | 715.47         | 75.45          | 69.81          | 125.63         | 141.89         | 169.70         | 209.78         |
+| 1              | 2              | 24.59          | 13.21          | 711.16         | 75.72          | 70.06          | 124.75         | 140.20         | 173.23         | 201.39         |
+| 1              | 5              | 25.08          | 12.38          | 661.99         | 80.76          | 74.90          | 131.93         | 148.91         | 187.05         | 208.39         |
+| 2              | 1              | 23.82          | 21.61          | 1166.56        | 92.55          | 87.25          | 139.54         | 151.77         | 180.24         | 209.05         |
+| 2              | 2              | 24.59          | 21.24          | 1143.63        | 94.17          | 88.78          | 139.70         | 156.61         | 189.09         | 205.06         |
+| 2              | 5              | 25.10          | 19.49          | 1042.17        | 102.62         | 96.14          | 153.38         | 172.89         | 213.99         | 219.54         |
+| 4              | 1              | 23.81          | 35.84          | 1934.49        | 111.62         | 108.73         | 154.52         | 165.42         | 207.88         | 211.29         |
+| 4              | 2              | 24.58          | 34.71          | 1869.20        | 115.24         | 111.24         | 161.24         | 177.73         | 208.12         | 212.74         |
+| 4              | 5              | 25.09          | 32.24          | 1723.86        | 124.07         | 119.35         | 177.54         | 196.69         | 221.10         | 223.52         |
+| 8              | 1              | 23.80          | 64.08          | 3459.74        | 124.84         | 123.61         | 161.92         | 177.06         | 205.47         | 206.47         |
+| 8              | 2              | 24.61          | 59.31          | 3193.52        | 134.89         | 133.44         | 182.92         | 192.71         | 216.04         | 218.78         |
+| 8              | 5              | 25.10          | 56.60          | 3026.29        | 141.35         | 138.61         | 194.52         | 213.65         | 220.24         | 221.45         |
+| 32             | 1              | 23.80          | 195.31         | 10544.22       | 163.85         | 162.80         | 212.71         | 215.41         | 216.92         | 217.34         |
+| 32             | 2              | 24.61          | 185.66         | 9996.59        | 172.36         | 171.07         | 216.46         | 221.64         | 223.68         | 225.25         |
+| 32             | 5              | 25.11          | 147.24         | 7872.61        | 217.34         | 214.97         | 269.75         | 270.71         | 271.44         | 272.87         |
+| 128            | 1              | 23.81          | 576.54         | 31123.19       | 222.02         | 219.25         | 249.44         | 249.75         | 249.88         | 249.91         |
+| 128            | 2              | 24.57          | 419.87         | 22609.82       | 304.86         | 314.47         | 332.18         | 334.13         | 336.22         | 336.74         |
+| 128            | 5              | 25.10          | 245.76         | 13138.84       | 520.83         | 552.68         | 558.89         | 559.09         | 559.13         | 559.13         |
+| 512            | 1              | 23.80          | 966.24         | 52156.34       | 529.89         | 534.82         | 558.30         | 559.33         | 560.16         | 560.36         |
+| 512            | 2              | 24.58          | 642.41         | 34590.81       | 797.00         | 812.40         | 824.23         | 825.92         | 827.27         | 827.61         |
+| 512            | 5              | 25.10          | 289.33         | 15468.09       | 1769.61        | 1817.19        | 1849.83        | 1855.17        | 1859.45        | 1860.51        |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
+
+Our results were obtained by running the
+`python scripts/translate.py --infer_batch_size 1,2,4,8,32,128,512 --beam_width 1,2,5 {--amp}`
+inferencing benchmarking script in the tensorflow-20.06-tf1-py3 NGC container
+on NVIDIA DGX-1 with (1x V100 16GB) GPU.
+
+FP16
+
+| **Batch size**     | **Beam width**     | **Bleu**           | **Sentences/sec**  | **Tokens/sec**     | **Latency Avg**    | **Latency 50%**     | **Latency 90%**     | **Latency 95%**     | **Latency 99%**     | **Latency 100%**    |
+|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1              | 1              | 23.78          | 9.06           | 489.00         | 110.41         | 102.80         | 183.54         | 206.33         | 242.44         | 306.21         |
+| 1              | 2              | 24.58          | 8.68           | 467.35         | 115.22         | 107.17         | 188.75         | 212.36         | 258.15         | 306.15         |
+| 1              | 5              | 25.09          | 8.39           | 448.32         | 119.25         | 109.79         | 195.68         | 220.56         | 276.41         | 325.65         |
+| 2              | 1              | 23.82          | 14.59          | 787.70         | 137.04         | 129.38         | 206.35         | 224.94         | 267.30         | 318.60         |
+| 2              | 2              | 24.57          | 14.44          | 777.60         | 138.51         | 131.07         | 206.67         | 228.95         | 275.56         | 311.23         |
+| 2              | 5              | 25.11          | 13.78          | 736.99         | 145.11         | 136.76         | 216.01         | 243.24         | 299.28         | 315.88         |
+| 4              | 1              | 23.82          | 23.79          | 1284.24        | 168.14         | 164.13         | 234.70         | 248.42         | 308.38         | 325.46         |
+| 4              | 2              | 24.59          | 22.67          | 1220.66        | 176.45         | 171.40         | 243.76         | 271.92         | 314.79         | 330.19         |
+| 4              | 5              | 25.08          | 22.33          | 1194.00        | 179.12         | 174.04         | 253.36         | 281.88         | 318.76         | 340.01         |
+| 8              | 1              | 23.81          | 43.33          | 2338.68        | 184.63         | 183.25         | 237.66         | 266.73         | 305.89         | 315.03         |
+| 8              | 2              | 24.60          | 39.12          | 2106.44        | 204.49         | 200.96         | 276.05         | 294.53         | 327.61         | 335.50         |
+| 8              | 5              | 25.10          | 37.16          | 1987.05        | 215.26         | 210.92         | 295.65         | 323.83         | 337.09         | 343.03         |
+| 32             | 1              | 23.82          | 129.52         | 6992.15        | 247.06         | 245.81         | 317.71         | 325.54         | 330.09         | 335.04         |
+| 32             | 2              | 24.55          | 123.28         | 6637.86        | 259.57         | 261.07         | 319.13         | 333.45         | 338.75         | 342.57         |
+| 32             | 5              | 25.05          | 88.74          | 4744.33        | 360.61         | 359.27         | 446.65         | 448.40         | 455.93         | 461.86         |
+| 128            | 1              | 23.80          | 332.81         | 17964.83       | 384.60         | 382.14         | 434.46         | 436.71         | 439.64         | 440.37         |
+| 128            | 2              | 24.59          | 262.87         | 14153.59       | 486.93         | 506.45         | 528.87         | 530.90         | 533.09         | 533.64         |
+| 128            | 5              | 25.08          | 143.91         | 7695.36        | 889.42         | 932.93         | 965.67         | 966.26         | 966.53         | 966.59         |
+| 512            | 1              | 23.80          | 613.57         | 33126.42       | 834.46         | 848.06         | 868.21         | 869.04         | 869.70         | 869.86         |
+| 512            | 2              | 24.59          | 387.72         | 20879.62       | 1320.54        | 1343.05        | 1354.40        | 1356.50        | 1358.19        | 1358.61        |
+| 512            | 5              | 25.10          | 199.48         | 10664.34       | 2566.67        | 2628.50        | 2642.59        | 2644.73        | 2646.44        | 2646.86        |
+
+
+FP32
+
+| **Batch size**     | **Beam width**     | **Bleu**           | **Sentences/sec**  | **Tokens/sec**     | **Latency Avg**    | **Latency 50%**     | **Latency 90%**     | **Latency 95%**     | **Latency 99%**     | **Latency 100%**    |
+|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1              | 1              | 23.80          | 8.37           | 451.86         | 119.46         | 111.26         | 199.36         | 224.49         | 269.03         | 330.72         |
+| 1              | 2              | 24.59          | 8.83           | 475.11         | 113.31         | 104.54         | 187.79         | 210.64         | 260.42         | 317.45         |
+| 1              | 5              | 25.09          | 7.74           | 413.92         | 129.15         | 119.44         | 212.84         | 239.52         | 305.47         | 349.09         |
+| 2              | 1              | 23.80          | 13.96          | 753.79         | 143.22         | 135.73         | 213.96         | 235.89         | 284.62         | 330.71         |
+| 2              | 2              | 24.59          | 12.96          | 697.63         | 154.33         | 145.01         | 230.88         | 255.31         | 306.71         | 340.36         |
+| 2              | 5              | 25.09          | 12.67          | 677.23         | 157.88         | 148.24         | 236.50         | 266.91         | 322.94         | 349.55         |
+| 4              | 1              | 23.80          | 22.42          | 1209.97        | 178.44         | 172.70         | 247.51         | 266.07         | 326.95         | 343.86         |
+| 4              | 2              | 24.59          | 20.55          | 1106.07        | 194.68         | 188.83         | 271.75         | 295.08         | 345.76         | 364.00         |
+| 4              | 5              | 25.09          | 21.19          | 1132.58        | 188.81         | 182.77         | 268.18         | 298.53         | 331.96         | 357.36         |
+| 8              | 1              | 23.80          | 39.32          | 2122.26        | 203.48         | 201.89         | 263.28         | 286.71         | 332.70         | 348.93         |
+| 8              | 2              | 24.59          | 37.51          | 2019.43        | 213.26         | 211.55         | 283.67         | 302.28         | 338.47         | 356.51         |
+| 8              | 5              | 25.09          | 31.69          | 1694.02        | 252.46         | 245.33         | 348.95         | 378.16         | 392.72         | 401.73         |
+| 32             | 1              | 23.80          | 118.51         | 6396.93        | 270.02         | 269.22         | 337.17         | 352.12         | 361.36         | 361.40         |
+| 32             | 2              | 24.59          | 100.23         | 5395.33        | 319.28         | 318.89         | 399.80         | 403.12         | 414.51         | 423.41         |
+| 32             | 5              | 25.09          | 68.59          | 3666.77        | 466.55         | 466.84         | 581.77         | 586.42         | 589.04         | 593.41         |
+| 128            | 1              | 23.80          | 256.49         | 13845.09       | 499.04         | 492.36         | 562.12         | 567.20         | 571.18         | 572.18         |
+| 128            | 2              | 24.59          | 176.83         | 9519.12        | 723.86         | 754.89         | 792.12         | 793.86         | 796.44         | 797.09         |
+| 128            | 5              | 25.09          | 96.21          | 5143.17        | 1330.48        | 1420.94        | 1427.91        | 1431.02        | 1435.23        | 1436.28        |
+| 512            | 1              | 23.80          | 366.07         | 19759.97       | 1398.63        | 1421.81        | 1457.81        | 1461.04        | 1463.63        | 1464.27        |
+| 512            | 2              | 24.59          | 225.48         | 12137.77       | 2270.75        | 2323.62        | 2338.62        | 2340.94        | 2342.80        | 2343.27        |
+| 512            | 5              | 25.09          | 106.02         | 5667.78        | 4829.31        | 4946.65        | 4956.15        | 4957.85        | 4959.21        | 4959.55        |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
 ##### Inference performance: NVIDIA T4
 
 Our results were obtained by running the `scripts/translate.py` script in the tensorflow-19.07-py3 NGC container on NVIDIA T4.
@@ -638,6 +824,8 @@ Reported mixed precision speedups are relative to FP32 numbers for corresponding
   * Initial release
 2. June, 2019
   * Performance improvements
+3. June, 2020
+  * Updated performance tables to include A100 results
 
 ### Known issues
 There are no known issues in this release.

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGX1_AMP_1GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4 --amp

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGX1_AMP_8GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=1024 --learning_rate=2e-3 --num_gpus=8 --amp

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGX1_FP32_1GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGX1_FP32_8GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=1024 --learning_rate=2e-3 --num_gpus=8

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGXA100_AMP_1GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4 --amp

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGXA100_AMP_8GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=1024 --learning_rate=2e-3 --num_gpus=8 --amp

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGXA100_TF32_1GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=128 --learning_rate=5e-4

+ 1 - 0
TensorFlow/Translation/GNMT/examples/DGXA100_TF32_8GPU.sh

@@ -0,0 +1 @@
+python nmt.py --output_dir=results --batch_size=1024 --learning_rate=2e-3 --num_gpus=8

BIN
TensorFlow/Translation/GNMT/img/bleu_score.png


+ 36 - 6
TensorFlow/Translation/GNMT/nmt.py

@@ -41,6 +41,8 @@ import numpy as np
 import time
 import tensorflow as tf
 
+import dllogger
+
 import estimator
 from utils import evaluation_utils
 from utils import iterator_utils
@@ -351,8 +353,8 @@ def add_arguments(parser):
   parser.add_argument("--num_workers", type=int, default=1,
                       help="Number of workers (inference only).")
 
-  parser.add_argument("--use_amp", type="bool", default=True,
-                      help="use_amp for training and inference")
+  parser.add_argument("--amp", action='store_true',
+                      help="use amp for training and inference")
   parser.add_argument("--use_fastmath", type="bool", default=False,
                       help="use_fastmath for training and inference")
   parser.add_argument("--use_fp16", type="bool", default=False,
@@ -653,7 +655,7 @@ def create_hparams(flags):
       random_seed=flags.random_seed,
       language_model=flags.language_model,
 
-      use_amp=flags.use_amp,
+      amp=flags.amp,
       use_fastmath=flags.use_fastmath,
       use_fp16=flags.use_fp16,
       fp16_loss_scale=flags.fp16_loss_scale,
@@ -906,10 +908,10 @@ def main(unused_argv):
   if FLAGS.use_fp16 and FLAGS.use_dist_strategy:
     raise ValueError("use_fp16 and use_dist_strategy aren't compatible")
 
-  if FLAGS.use_fp16 + FLAGS.use_amp + FLAGS.use_fastmath > 1:
-    raise ValueError("Only one of use_fp16, use_amp, use_fastmath can be set")
+  if FLAGS.use_fp16 + FLAGS.amp + FLAGS.use_fastmath > 1:
+    raise ValueError("Only one of use_fp16, amp, use_fastmath can be set")
 
-  if FLAGS.use_amp:
+  if FLAGS.amp:
     utils.print_out('Enabling TF-AMP')
 
     os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
@@ -969,6 +971,12 @@ def main(unused_argv):
       utils.print_out("# Creating output directory %s ..." % output_dir)
       tf.gfile.MakeDirs(output_dir)
 
+    dllogger.init(backends=[
+        dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT),
+        dllogger.JSONStreamBackend(dllogger.Verbosity.VERBOSE, os.path.join(FLAGS.output_dir, FLAGS.mode + '-report.json')),
+    ])
+    dllogger.log('PARAMETER', vars(FLAGS))
+
     # Load hparams.
     default_hparams = create_hparams(FLAGS)
     default_hparams.num_buckets = 1
@@ -1015,6 +1023,12 @@ def main(unused_argv):
       utils.print_out("# Creating output directory %s ..." % output_dir)
       tf.gfile.MakeDirs(output_dir)
 
+    dllogger.init(backends=[
+        dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT),
+        dllogger.JSONStreamBackend(dllogger.Verbosity.VERBOSE, os.path.join(FLAGS.output_dir, FLAGS.mode + '-report.json')),
+    ])
+    dllogger.log('PARAMETER', vars(FLAGS))
+
     # Load hparams.
     default_hparams = create_hparams(FLAGS)
 
@@ -1051,6 +1065,10 @@ def main(unused_argv):
       train_delta = train_end - train_start
       utils.print_out("training time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
                       (epochs + 1, train_delta / 60., train_speed, train_speed * (train_src_tokens + train_tgt_tokens) / train_sentences), f=sys.stderr)
+      logging_data = {
+        'train_speed_sent': train_speed,
+        'train_speed_toks': train_speed * (train_src_tokens + train_tgt_tokens) / train_sentences,
+      }
 
       # This is probably sub-optimal, doing eval per-epoch
       eval_start = time.time()
@@ -1059,8 +1077,17 @@ def main(unused_argv):
       eval_delta = eval_end - eval_start
       utils.print_out("eval time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" %
                       (epochs + 1, eval_delta / 60., eval_speed, eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences), f=sys.stderr)
+      logging_data.update({
+        'bleu': bleu_score,
+        'eval_speed_sent': eval_speed,
+        'eval_speed_toks': eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences,
+      })
       for lat in sorted(eval_latencies):
         utils.print_out("eval latency_%s for epoch %d: %.2f ms" % (lat, epochs + 1, eval_latencies[lat] * 1000))
+        logging_data['eval_latency_{}'.format(lat)] = eval_latencies[lat] * 1000
+
+      dllogger.log((epochs,), logging_data)
+      dllogger.flush()
 
 
       if FLAGS.debug or (FLAGS.target_bleu is not None and bleu_score > FLAGS.target_bleu):
@@ -1075,6 +1102,9 @@ def main(unused_argv):
         utils.print_out("Stop job since max_train_epochs is reached.",
                         f=sys.stderr)
 
+    dllogger.log((), logging_data)
+    dllogger.flush()
+
   experiment_end = time.time()
   utils.print_out('Experiment took {} min'.format((experiment_end - experiment_start) / 60))
 

+ 43 - 0
TensorFlow/Translation/GNMT/qa/L1_joc_GNMT_inferbench_fp16.sh

@@ -0,0 +1,43 @@
+set -o nounset
+set -o errexit
+set -o pipefail
+
+cd ..
+cp -r /data/joc/gnmt_tf/19.08 output_dir
+
+# hack to work with pytorch dataset
+sed -ie 's/    src_vocab_file = hparams.vocab_prefix + "." + hparams.src/    src_vocab_file = hparams.vocab_prefix/g' nmt.py
+sed -ie 's/    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt/    tgt_vocab_file = hparams.vocab_prefix/g' nmt.py
+
+( python nmt.py --amp --data_dir=/data/pytorch/wmt16_de_en --output_dir=output_dir --mode=infer --infer_batch_size=512 2>&1 ) | tee log.log
+python scripts/parse_log.py log.log | tee log.json
+
+python << END
+import json
+import numpy as np
+from pathlib import Path
+
+baseline = 10254
+bleu_baseline = 25.1
+
+log = json.loads(Path('log.json').read_text())
+speed = np.mean(log['eval_tokens_per_sec'])
+bleu = log['bleu'][0]
+
+print('Eval speed    :', speed)
+print('Baseline      :', baseline)
+
+print('Bleu          :', bleu)
+print('Bleu baseline :', bleu_baseline)
+
+if speed < baseline * 0.9:
+    print("FAILED: speed ({}) doesn't match the baseline ({})".format(speed, baseline))
+    exit(1)
+
+if bleu < bleu_baseline - 0.2:
+    print("FAILED: bleu ({}) doesn't match the baseline ({})".format(bleu, bleu_baseline))
+    exit(1)
+
+print('SUCCESS')
+END
+

+ 43 - 0
TensorFlow/Translation/GNMT/qa/L1_joc_GNMT_inferbench_fp32.sh

@@ -0,0 +1,43 @@
+set -o nounset
+set -o errexit
+set -o pipefail
+
+cd ..
+cp -r /data/joc/gnmt_tf/19.08 output_dir
+
+# hack to work with pytorch dataset
+sed -ie 's/    src_vocab_file = hparams.vocab_prefix + "." + hparams.src/    src_vocab_file = hparams.vocab_prefix/g' nmt.py
+sed -ie 's/    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt/    tgt_vocab_file = hparams.vocab_prefix/g' nmt.py
+
+( python nmt.py --data_dir=/data/pytorch/wmt16_de_en --output_dir=output_dir --mode=infer --infer_batch_size=512 2>&1 ) | tee log.log
+python scripts/parse_log.py log.log | tee log.json
+
+python << END
+import json
+import numpy as np
+from pathlib import Path
+
+baseline = 5374
+bleu_baseline = 25.1
+
+log = json.loads(Path('log.json').read_text())
+speed = np.mean(log['eval_tokens_per_sec'])
+bleu = log['bleu'][0]
+
+print('Eval speed    :', speed)
+print('Baseline      :', baseline)
+
+print('Bleu          :', bleu)
+print('Bleu baseline :', bleu_baseline)
+
+if speed < baseline * 0.9:
+    print("FAILED: speed ({}) doesn't match the baseline ({})".format(speed, baseline))
+    exit(1)
+
+if bleu < bleu_baseline - 0.2:
+    print("FAILED: bleu ({}) doesn't match the baseline ({})".format(bleu, bleu_baseline))
+    exit(1)
+
+print('SUCCESS')
+END
+

+ 1 - 0
TensorFlow/Translation/GNMT/requirements.txt

@@ -1 +1,2 @@
 sacrebleu==1.2.10
+git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger