
[BERT/PyT] Updating for Ampere and 20.06 container

Przemek Strzelczyk, 5 years ago · parent commit f8b3a63f81

+ 3 - 2
PyTorch/LanguageModeling/BERT/Dockerfile

@@ -12,7 +12,7 @@
 # limitations under the License.
 
 ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
-FROM nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk as trt
+FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt
 FROM ${FROM_IMAGE_NAME}
 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
 
@@ -27,7 +27,8 @@ COPY --from=trt /workspace/install/ /workspace/install/
 ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
 
 # Install trt python api
-RUN pip install /workspace/install/python/tensorrtserver-1.*-py3-none-linux_x86_64.whl
+RUN apt-get install -y libb64-0d
+RUN pip install /workspace/install/python/tensorrtserver*.whl
 
 WORKDIR /workspace/bert
 RUN pip install --upgrade --no-cache-dir pip \

+ 1 - 0
PyTorch/LanguageModeling/BERT/LICENSE

@@ -176,6 +176,7 @@
 
    END OF TERMS AND CONDITIONS
 
+   Copyright 2019 NVIDIA CORPORATION. All rights reserved.
    APPENDIX: How to apply the Apache License to your work.
 
       To apply the Apache License to your work, attach the following

+ 62 - 69
PyTorch/LanguageModeling/BERT/README.md

@@ -31,7 +31,6 @@ This repository provides a script and recipe to train the BERT model for PyTorch
         * [Pre-training](#pre-training)
         * [Fine-tuning](#fine-tuning)   
     * [Inference process](#inference-process)
-        * [Pre-training inference](#pre-training-inference)
         * [Fine-tuning inference](#fine-tuning-inference)
     * [Deploying BERT using NVIDIA Triton Inference Server](#deploying-bert-using-nvidia-triton-inference-server)
 - [Performance](#performance)
@@ -40,10 +39,12 @@ This repository provides a script and recipe to train the BERT model for PyTorch
         * [Inference performance benchmark](#inference-performance-benchmark)
     * [Results](#results)
         * [Training accuracy results](#training-accuracy-results)
-            * [Pre-training loss results: NVIDIA DGX A100 (8x A100 40GB)](#pre-training-loss-results-nvidia-dgx-a100-8x-a100-40gb)  
+            * [Pre-training loss results: NVIDIA DGX A100 (8x A100 40GB)](#pre-training-loss-results-nvidia-dgx-a100-8x-a100-40gb)
+            * [Pre-training loss results: NVIDIA DGX-2H V100 (16x V100 32GB)](#pre-training-loss-results-nvidia-dgx-2h-v100-16x-v100-32gb)  
             * [Pre-training loss results](#pre-training-loss-results)
+            * [Pre-training loss curves](#pre-training-loss-curves)
             * [Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-results-nvidia-dgx-a100-8x-a100-40gb)
-            * [Fine-tuning accuracy results](#fine-tuning-accuracy-results)
+            * [Fine-tuning accuracy results: NVIDIA DGX-1 (8x V100 16G)](#fine-tuning-accuracy-results-nvidia-dgx-1-8x-v100-16g)
             * [Training stability test](#training-stability-test)
                 * [Pre-training stability test](#pre-training-stability-test)
                 * [Fine-tuning stability test](#fine-tuning-stability-test) 
@@ -79,7 +80,7 @@ This repository provides a script and recipe to train the BERT model for PyTorch
  
 ## Model overview
  
-BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on Volta V100 GPUs for faster training times while maintaining target accuracy.
+BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on Volta V100 and Ampere A100 GPUs for faster training times while maintaining target accuracy.
  
 This repository contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pre-training and fine-tuning for tasks such as question answering. The major differences between the original implementation of the paper and this version of BERT are as follows:
  
@@ -146,7 +147,7 @@ NVLAMB adds the necessary tweaks to [LAMB version 1](https://arxiv.org/abs/1904.
  
 ### Mixed precision training
  
-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta architecture, and continuing with the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
  
 1.  Porting the model to use the FP16 data type where appropriate.
 2.  Adding loss scaling to preserve small gradient values.
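The effect of loss scaling in step 2 can be illustrated outside the repository with a minimal stdlib-only sketch (not part of this commit): Python's `struct` module exposes the IEEE 754 half-precision format via the `'e'` format character, which makes FP16 underflow easy to demonstrate.

```python
import struct

def round_trip_fp16(x: float) -> float:
    """Round a float to the nearest IEEE 754 half-precision (FP16) value."""
    return struct.unpack('e', struct.pack('e', x))[0]

# A small gradient underflows to zero when stored in FP16...
grad = 1e-8
print(round_trip_fp16(grad))            # 0.0 -- the gradient is lost

# ...but survives when the loss (and hence the gradient) is scaled first.
loss_scale = 2 ** 16                    # a positive power of 2, as suggested for --loss_scale
scaled = round_trip_fp16(grad * loss_scale)
print(scaled != 0.0)                    # True -- the value is representable now

# Unscaling in FP32 recovers a close approximation of the original gradient.
recovered = scaled / loss_scale
print(recovered)                        # approximately 1e-8
```

This is why `--loss_scale` accepts a scaling factor: multiplying the loss before the backward pass shifts small gradients into FP16's representable range, and the optimizer divides them back out in FP32 before the weight update.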
@@ -217,7 +218,10 @@ This repository contains Dockerfile which extends the PyTorch NGC container and
  
 -   [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
 -   [PyTorch 20.06-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-pytorch)
--   [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+-   Supported GPUs:
+    -   [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+    -   [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
+    -   [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
  
 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
 -   [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
@@ -232,7 +236,7 @@ More information on how to set up and launch can be found in the [Multi-node Doc
  
 ## Quick Start Guide
  
-To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8x V100 32G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. Training configurations to run on 8 x A100 40G, 8 x V100 16G, 16 x V100 32G cards and examples of usage are provided at the end of this section. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
  
  
 1. Clone the repository.
@@ -242,7 +246,7 @@ To train your model using mixed precision with Tensor Cores or using FP32, perfo
  
 2. Download the NVIDIA pretrained checkpoint.
  
-If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Ensure you place the downloaded checkpoint in the `checkpoints/` folder.
+If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_pretraining_lamb). This downloaded checkpoint is used to fine-tune on SQuAD. Ensure you unzip the downloaded file and place the checkpoint in the `checkpoints/` folder. For a checkpoint already fine-tuned for question answering on SQuAD v1.1, visit [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1).
  
 3. Build BERT on top of the  NGC container.
 `bash scripts/docker/build.sh`
@@ -265,6 +269,7 @@ This repository provides scripts to download, verify, and extract the following
 To download, verify, extract the datasets, and create the shards in `.hdf5` format, run:  
 `/workspace/bert/data/create_datasets_from_start.sh`
  
+Note: For fine-tuning only, the Wikipedia and BookCorpus dataset downloads can be skipped by commenting them out. The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server can become overloaded and may contain broken links, resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry the download at a later time. Expired dataset links are ignored during data download.
  
 6. Start pretraining.
  
@@ -283,11 +288,25 @@ The above pretrained BERT representations can be fine tuned with just one additi
  
 9. Start validation/evaluation.
  
-Validation can be performed with the `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `eval`.
+Validation can be performed with `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `eval` in `scripts/run_squad.sh` as follows:
+
+```
+mode=${11:-"eval"}
+```
  
 10. Start inference/predictions.
  
-Inference can be performed with the `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `prediction`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
+Inference can be performed with `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `prediction` in `scripts/run_squad.sh` as follows:
+
+```
+mode=${11:-"prediction"}
+```
+
+Inference predictions are saved to `<OUT_DIR>/predictions.json`, set in `scripts/run_squad.sh` as follows:
+
+```
+OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
+```
 
 This repository contains a number of predefined configurations to run the SQuAD and pretraining on NVIDIA DGX-1, NVIDIA DGX-2H or NVIDIA DGX A100 nodes in `scripts/configs/squad_config.sh` and `scripts/configs/pretrain_config.sh`. For example, to use the default DGX A100 8 gpu config, run:
 
@@ -378,7 +397,7 @@ The complete list of the available parameters for the `run_pretraining.py` scrip
   
   --allreduce_post_accumulation_fp16 -  If set to true, performs allreduce after gradient accumulation steps in FP16.
  
-  --fp16                      - If set, will perform computations using
+  --amp or --fp16                      - If set, will perform computations using
                                 automatic mixed precision.
  
   --loss_scale LOSS_SCALE        - Sets the loss scaling value to use when
@@ -404,13 +423,13 @@ The complete list of the available parameters for the `run_pretraining.py` scrip
 
 #### Fine tuning parameters
  
-Default arguments are listed below in the order the scripts expects:
+Default arguments are listed below in the order `scripts/run_squad.sh` expects:
  
 -   Initial checkpoint - The default is `/workspace/checkpoints/bert_uncased.pt`.
 -   Number of training Epochs - The default is `2`.
 -   Batch size - The default is `3`.
 -   Learning rate - The default is `3e-5`.
--   Precision (either `fp16` or `fp32`) - The default is `fp16`.
+-   Precision (either `fp16`, `tf32` or `fp32`) - The default is `fp16`.
 -   Number of GPUs - The default is `8`.
 -   Seed - The default is `1`.
 -   SQuAD directory -  The default is `/workspace/bert/data/v1.1`.
@@ -603,7 +622,7 @@ Where:
 - `<allreduce_post_accumulation>` - If set to `true`, performs `allreduce` only after the defined number of gradient accumulation steps.
 - `<allreduce_post_accumulation_fp16>` -  If set to `true`, performs `allreduce` after gradient accumulation steps in FP16.
  
-    Note: The above two options need to be set to false when running on FP32. 
+    Note: The above two options need to be set to false when running either TF32 or FP32.
     
 -  `<training_batch_size_phase2>` is per-GPU batch size used for training in phase 2. Larger batch sizes run more efficiently, but require more memory.
 -   `<learning_rate_phase2>` is the base learning rate for training phase 2.
@@ -652,44 +671,8 @@ Note: The first positional argument (the path to the checkpoint to load) is requ
 Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory or separate path can be a command-line input to `run_squad.sh`.
  
 ### Inference process
- 
-#### Pre-training inference
- 
-Inference on a pretrained model is performed using the `run_pretraining_inference.py` script along with parameters defined in `scripts/run_pretraining_inference.sh`. Inference is supported both for single and multi-GPU. By setting either the `--eval` or `--prediction` flag, you can choose between running evaluation on a given dataset or doing prediction (on both masked language model and next sentence prediction).
- 
-Prediction mode can be used to measure the inference turnaround time.
- 
-The `run_pretraining_inference.sh` script takes a model and a dataset and performs inference/evaluation on it. By default, the inferencing script:
- 
--   Has FP16 precision enabled
--   Runs on 8 GPUs
--   Evaluates the latest checkpoint present in `/results/checkpoints` with a batch size of 14
--   Runs inference on the entire Wikipedia dataset
- 
-This script outputs a prediction file to `/results/pyt_bert_pretraining_inference_<precision>_<global_batchsize>.<datestamp>.log`. The output log contains information about:
- 
--   Inference performance
--   Loss (masked language model loss and next sentence prediction loss) of the specified dataset if ground truths exist with the `--eval` flag.
- 
-For example:
- 
-`bash scripts/run_pretraining_inference.sh <evaluation_batch_size> <precision> <num_gpus> <inference_mode><model_checkpoint><inference_steps><create_logfile>`
- 
-Where:
- 
--   `<evaluation_batch_size>` is per-GPU batch size used for inference. Larger batch sizes run more efficiently, but require more memory.
--   `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
-    -   `fp32`: 32-bit IEEE single precision floats
-    -   `fp16`: 16-bit floats for 3.2x faster inference
--   `<num_gpus>` is the number of GPUs to use for inference. Must be equal to or smaller than the number of GPUs attached to your node.
--   `<inference_mode>` is either `--eval` for evaluation or `--prediction` for inference
--   `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the `checkpoints` folder.
--   `<inference_steps>` is the total number of inference steps per process. Default is `-1`, which iterates over the entire dataset.
--   `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a logfile.)
- 
-For example:
- 
-`bash scripts/run_pretraining_inference.sh 14 fp16 8 eval -1 -1 true`
+
+Fine-tuning inference can be run to obtain predictions on fine-tuning tasks, for example question answering on SQuAD.
  
 #### Fine-tuning inference
  
@@ -754,8 +737,15 @@ Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run
 
 | DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - TF32 | Final Loss - mixed precision | Time to train(hours) - TF32 | Time to train(hours) - mixed precision | Time to train speedup (TF32 to mixed precision)
 |---|---|---|---|---|---|---|---|---
-|32 x DGX A100 with 40G |8|256 and 128|4 and 8|---|1.3415|---|2.3|---  
-|32 x DGX A100 with 40G |8|256 and 128|4 and 16|1.3415|---|3.7|---|---  
+|32 x DGX A100 |8|256 and 128|4 and 8|---|1.3415|---|2.3|---  
+|32 x DGX A100 |8|256 and 128|4 and 16|1.3415|---|3.7|---|--- 
+
+##### Pre-training loss results: NVIDIA DGX-2H V100 (16x V100 32GB)
+
+| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
+|---|---|---|---|---|---|---|---|---
+|32 x DGX-2H |16|128 and 64|2 and 8|---|1.3223|---|2.07|---  
+|32 x DGX-2H |16|128 and 64|4 and 16|1.3305|---|7.9|---|---  
 
 ##### Pre-training loss results
 
@@ -763,20 +753,24 @@ Following results were obtained by running on pytorch:19.07-py3 NGC container.
 
 | DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
 |---|---|---|---|---|---|---|---|---
-| 1 x NVIDIA DGX-1 With 16G|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
-| 1 x NVIDIA DGX-2H With 32G|16|4096 and 2048 |64 and 256|-|1.35|-|58.4|-
-| 4 x NVIDIA DGX-1 With 16G|8|2048 and 1024 |128 and 256|-|1.34|-|39.27|-
-| 4 x NVIDIA DGX-2H With 32G|16|1024 and 512 |16 and 64|-|1.33|-|15.35|-
-| 16 x NVIDIA DGX-1 With 16G|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
-| 16 x NVIDIA DGX-2H With 32G|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
-| 64 x NVIDIA DGX-2H With 32G|16|64 and 32 |(1 and 4)FP16 and (2 and 8)FP32|1.33|1.331|4.338|1.124|3.85
+| 1 x NVIDIA DGX-1|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
+| 1 x NVIDIA DGX-2H|16|4096 and 2048 |64 and 256|-|1.35|-|58.4|-
+| 4 x NVIDIA DGX-1|8|2048 and 1024 |128 and 256|-|1.34|-|39.27|-
+| 4 x NVIDIA DGX-2H|16|1024 and 512 |16 and 64|-|1.33|-|15.35|-
+| 16 x NVIDIA DGX-1|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
+| 16 x NVIDIA DGX-2H|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
+| 64 x NVIDIA DGX-2H|16|64 and 32 |FP16: (1 and 4), FP32: (2 and 8)|1.33|1.331|4.338|1.124|3.85
+ 
+##### Pre-training loss curves
+![Pretraining Loss Curves](images/loss_curves.png)
 
 ##### Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)
 
 | GPUs | Batch size / GPU (TF32 and FP16) | Accuracy - TF32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - TF32 | Time to train(hours) - mixed precision | Time to train speedup (TF32 to mixed precision)
+|---|------------|---------|--------|-------|--------|-----
 |8|16 and 32|91.344|91.34|0.174|0.065|2.68
 
-##### Fine-tuning accuracy results
+##### Fine-tuning accuracy results: NVIDIA DGX-1 (8x V100 16G)
  
 | GPUs | Batch size / GPU | Accuracy - FP32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
 |---|---|---|---|---|---|---
@@ -819,7 +813,7 @@ Our results were obtained by running the `scripts run_pretraining.sh` training s
 
 ###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB)
   
-| GPUs | Batch size / GPU (TF32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+| GPUs | Batch size / GPU (TF32 and FP16) | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
 |------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
 |1 | 16 and 32|44 |116 | 2.63| 1.00| 1.00
 |4 | 16 and 32|165 |441 | 2.67| 3.75| 3.80
@@ -939,7 +933,7 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ##### Inference performance: NVIDIA DGX A100 (1x A100 40GB) 
  
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX A100 with (1x A100 40GB) GPUs.
  
 ###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)
  
@@ -950,7 +944,7 @@ Our results were obtained by running the `scripts/run_pretraining_inference.sh`
 
 ##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
  
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
  
 ###### Fine-tuning inference on NVIDIA DGX-1 with 16G
  
@@ -960,7 +954,7 @@ Our results were obtained by running the `scripts/run_pretraining_inference.sh`
  
 ##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
  
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
   
 ###### Fine-tuning inference on NVIDIA DGX-1 with 32G
  
@@ -970,7 +964,7 @@ Our results were obtained by running the `scripts/run_pretraining_inference.sh`
  
 ##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
  
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
  
 ###### Fine-tuning inference on NVIDIA DGX-2 with 32G
 
@@ -987,7 +981,7 @@ The inference performance metrics used were items/second.
 ### Changelog
  
 July 2020
-- Ampere support
+- Updated accuracy and performance tables to include A100 results
  
 March 2020
 - TRITON Inference Server support.
@@ -1014,4 +1008,3 @@ July 2019
 ### Known issues
  
 There are no known issues with this model.
-

BIN
PyTorch/LanguageModeling/BERT/images/loss_curves.png


+ 8 - 2
PyTorch/LanguageModeling/BERT/run_glue.py

@@ -458,8 +458,13 @@ def main():
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument('--fp16',
+                        default=False,
                         action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
+                        help="Mixed precision training")
+    parser.add_argument('--amp',
+                        default=False,
+                        action='store_true',
+                        help="Mixed precision training")
     parser.add_argument('--loss_scale',
                         type=float, default=0,
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
@@ -478,7 +483,8 @@ def main():
                         help="The BERT model config")
 
     args = parser.parse_args()
-
+    args.fp16 = args.fp16 or args.amp
+
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
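The `--amp`/`--fp16` aliasing this hunk introduces can be sketched in isolation (a minimal standalone sketch with a hypothetical `parse_flags` helper, not the repository's own code):

```python
import argparse

def parse_flags(argv):
    """Sketch of the flag aliasing pattern: --amp behaves exactly like --fp16."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Mixed precision training")
    parser.add_argument('--amp', default=False, action='store_true',
                        help="Mixed precision training")
    args = parser.parse_args(argv)
    # Fold the new alias into the old flag so downstream code
    # only ever needs to check args.fp16.
    args.fp16 = args.fp16 or args.amp
    return args

print(parse_flags(['--amp']).fp16)   # True: --amp enables the fp16 code path
print(parse_flags([]).fp16)          # False: neither flag given
```

Normalizing the alias immediately after `parse_args()` keeps the rest of the training script untouched while still accepting the newer `--amp` spelling.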

+ 6 - 1
PyTorch/LanguageModeling/BERT/run_pretraining.py

@@ -211,7 +211,11 @@ def parse_arguments():
     parser.add_argument('--fp16',
                         default=False,
                         action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
+                        help="Mixed precision training")
+    parser.add_argument('--amp',
+                        default=False,
+                        action='store_true',
+                        help="Mixed precision training")
     parser.add_argument('--loss_scale',
                         type=float, default=0.0,
                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
@@ -276,6 +280,7 @@ def parse_arguments():
                         help='If provided, only run this many steps before exiting')
 
     args = parser.parse_args()
+    args.fp16 = args.fp16 or args.amp
 
     if args.steps_this_run < 0:
         args.steps_this_run = args.max_steps

+ 0 - 320
PyTorch/LanguageModeling/BERT/run_pretraining_inference.py

@@ -1,320 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
-# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-#==================
-import csv
-import os
-import logging
-import argparse
-import random
-import h5py
-from tqdm import tqdm, trange
-import os
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
-from torch.utils.data.distributed import DistributedSampler
-from utils import is_main_process, format_step
-import math
-import time
-
-from tokenization import BertTokenizer
-from modeling import BertForPreTraining, BertConfig
-
-# from fused_adam_local import FusedAdamBert
-from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-
-from apex.parallel import DistributedDataParallel as DDP
-import torch.distributed as dist
-import dllogger
-
-
-class pretraining_dataset(Dataset):
-
-    def __init__(self, input_file, max_pred_length):
-        self.input_file = input_file
-        self.max_pred_length = max_pred_length
-        f = h5py.File(input_file, "r")
-        self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)#[num_instances x max_seq_length])
-        self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64) #[num_instances x max_seq_length]
-        self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64) #[num_instances x max_seq_length]
-        self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64) #[num_instances x max_pred_length]
-        self.masked_lm_ids= np.asarray(f["masked_lm_ids"][:]).astype(np.int64) #[num_instances x max_pred_length]
-        self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64) # [num_instances]
-        f.close()
-
-    def __len__(self):
-        'Denotes the total number of samples'
-        return len(self.input_ids)
-
-    def __getitem__(self, index):
-        
-        input_ids= torch.from_numpy(self.input_ids[index]) # [max_seq_length]
-        input_mask = torch.from_numpy(self.input_masks[index]) #[max_seq_length]
-        segment_ids = torch.from_numpy(self.segment_ids[index])# [max_seq_length]
-        masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index]) #[max_pred_length]
-        masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index]) #[max_pred_length]
-        next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index])) #[1]
-         
-        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
-        index = self.max_pred_length
-        # store number of  masked tokens in index
-        if len((masked_lm_positions == 0).nonzero()) != 0:
-          index = (masked_lm_positions == 0).nonzero()[0].item()
-        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
-
-        return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
-
-def main():    
-
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--input_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain .hdf5 files  for the task.")
-    parser.add_argument("--config_file",
-                        default="bert_config.json",
-                        type=str,
-                        required=False,
-                        help="The BERT model config")
-    ckpt_group = parser.add_mutually_exclusive_group(required=True)
-    ckpt_group.add_argument("--ckpt_dir",
-                        default=None,
-                        type=str,
-                        help="The ckpt directory, e.g. /results")
-    ckpt_group.add_argument("--ckpt_path",
-                            default=None,
-                            type=str,
-                            help="Path to the specific checkpoint")
-
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--eval', dest='do_eval', action='store_true')
-    group.add_argument('--prediction', dest='do_eval', action='store_false')
-    ## Other parameters
-    parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-    parser.add_argument("--max_seq_length",
-                        default=512,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--max_predictions_per_seq",
-                        default=80,
-                        type=int,
-                        help="The maximum total of masked tokens in input sequence")
-    parser.add_argument("--ckpt_step",
-                        default=-1,
-                        type=int,
-                        required=False,
-                        help="The model checkpoint iteration, e.g. 1000")
-                       
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--max_steps",
-                        default=-1,
-                        type=int,
-                        help="Total number of eval  steps to perform, otherwise use full dataset")
-    parser.add_argument("--no_cuda",
-                        default=False,
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--fp16',
-                        default=False,
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument("--log_path",
-                        help="Out file for DLLogger",
-                        default="/workspace/dllogger_inference.out",
-                        type=str)
-
-    args = parser.parse_args()
-
-    if 'LOCAL_RANK' in os.environ:
-        args.local_rank = int(os.environ['LOCAL_RANK'])
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl', init_method='env://')
-
-    if is_main_process():
-        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
-                                                           filename=args.log_path),
-                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
-    else:
-        dllogger.init(backends=[])
-
-    n_gpu = torch.cuda.device_count()
-    if n_gpu > 1:
-        assert(args.local_rank != -1) # only use torch.distributed for multi-gpu
-
-    dllogger.log(step="device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16), data={})
-
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-    
-
-    # Prepare model
-    config = BertConfig.from_json_file(args.config_file)
-    # Padding for divisibility by 8
-    if config.vocab_size % 8 != 0:
-        config.vocab_size += 8 - (config.vocab_size % 8)
-    model = BertForPreTraining(config)
-
-    if args.ckpt_dir:
-        if args.ckpt_step == -1:
-            #retrieve latest model
-            model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".pt")]
-            args.ckpt_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
-            dllogger.log(step="load model saved at iteration", data={"number": args.ckpt_step})
-        model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".pt")
-    else:
-        model_file = args.ckpt_path
-    state_dict = torch.load(model_file, map_location="cpu")["model"]
-    model.load_state_dict(state_dict, strict=False)
-
-    if args.fp16:
-        model.half() # all parameters and buffers are converted to half precision
-    model.to(device)
-
-    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
-    if multi_gpu_training:
-        model = DDP(model)
-   
-    files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and 'test' in f]
-    files.sort()
-
-    dllogger.log(step="***** Running Inference *****", data={})
-    dllogger.log(step="  Inference batch", data={"size":args.eval_batch_size})
-
-    model.eval()
-
-    nb_instances = 0
-    max_steps = args.max_steps if args.max_steps > 0  else np.inf
-    global_step = 0
-    total_samples = 0
-
-    begin_infer = time.time()
-    with torch.no_grad():
-        if args.do_eval:
-            final_loss = 0.0 # 
-            for data_file in files:
-                dllogger.log(step="Opening ", data={"file": data_file})
-                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
-                if not multi_gpu_training:
-                    train_sampler = RandomSampler(dataset)
-                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
-                else:
-                    train_sampler = DistributedSampler(dataset)
-                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
-                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
-                    if global_step > max_steps:
-                        break
-                    batch = [t.to(device) for t in batch]
-                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch#\
-                    loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels)
-                    final_loss += loss.item()
-
-                    global_step += 1
-
-                total_samples += len(datasetloader)
-                torch.cuda.empty_cache()
-                if global_step > max_steps:
-                    break
-            final_loss /= global_step
-            if multi_gpu_training:
-                final_loss = torch.tensor(final_loss, device=device)
-                dist.all_reduce(final_loss)
-                final_loss /= torch.distributed.get_world_size()
-            if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):       
-                dllogger.log(step="Inference Loss", data={"final_loss": final_loss.item()})
-
-
-        else: # inference
-            # if multi_gpu_training:
-            #     torch.distributed.barrier()
-            # start_t0 = time.time()
-            for data_file in files:
-                dllogger.log(step="Opening ", data={"file": data_file})
-                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
-                if not multi_gpu_training:
-                    train_sampler = RandomSampler(dataset)
-                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
-                else:
-                    train_sampler = DistributedSampler(dataset)
-                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
-
-                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
-                    if global_step > max_steps:
-                        break
-
-                    batch = [t.to(device) for t in batch]
-                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch#\
-                    
-                    lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=None, next_sentence_label=None)
-
-                    nb_instances += input_ids.size(0)
-                    global_step += 1
-
-                total_samples += len(datasetloader)
-                torch.cuda.empty_cache()
-                if global_step > max_steps:
-                    break
-            # if multi_gpu_training:
-            #     torch.distributed.barrier()
-            if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):       
-                dllogger.log(step="Done Inferring on samples", data={})
-
-
-    end_infer = time.time()
-    dllogger.log(step="Inference perf", data={"inference_sequences_per_second": total_samples * args.eval_batch_size / (end_infer - begin_infer)})
-
-
-if __name__ == "__main__":
-    main()
-    dllogger.flush()

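The removed `run_pretraining_inference.py` contained two small utilities worth noting before they disappear: padding the vocabulary size up to a multiple of 8 (so the embedding GEMMs stay aligned with Tensor Core tile sizes) and resolving the newest `ckpt_<step>.pt` file in a checkpoint directory. A standalone sketch of both (function names here are illustrative, not from the repo):

```python
import os

def pad_vocab_to_multiple(vocab_size, multiple=8):
    # Round up so embedding matrices stay Tensor Core friendly.
    remainder = vocab_size % multiple
    return vocab_size if remainder == 0 else vocab_size + (multiple - remainder)

def latest_checkpoint_step(ckpt_dir):
    # Checkpoints are named ckpt_<step>.pt; return the highest <step>.
    names = [f for f in os.listdir(ckpt_dir) if f.endswith(".pt")]
    return max(int(n.split(".pt")[0].split("_")[1]) for n in names)
```

For BERT's uncased vocabulary of 30522 tokens this pads to 30528, which is the behavior the deleted script applied to `config.vocab_size`.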
+ 7 - 6
PyTorch/LanguageModeling/BERT/run_squad.py

@@ -472,11 +472,6 @@ def get_answers(examples, features, results, args):
                 preds,
                 key=lambda x: (x.start_logit + x.end_logit),
                 reverse=True)[:args.n_best_size]
-        
-        # In very rare edge cases we could only have single null prediction.
-	      # So we just create a nonce prediction in this case to avoid failure.
-        if not nbest:                                                    
-	          nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         # In very rare edge cases we could only have single null prediction.
         # So we just create a nonce prediction in this case to avoid failure.
@@ -796,8 +791,13 @@ def main():
                         default=os.getenv('LOCAL_RANK', -1),
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--fp16',
+                        default=False,
+                        action='store_true',
+                        help="Mixed precision training")
+    parser.add_argument('--amp',
+                        default=False,
                         action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
+                        help="Mixed precision training")
     parser.add_argument('--loss_scale',
                         type=float, default=0,
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
@@ -851,6 +851,7 @@ def main():
                         help="Location to cache train feaures. Will default to the dataset directory")
 
     args = parser.parse_args()
+    args.fp16 = args.fp16 or args.amp    
 
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

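The first `run_squad.py` hunk deletes a duplicated (and badly indented) copy of the empty-n-best guard. The surviving logic — rank candidate spans by summed boundary logits, truncate to `n_best_size`, and fall back to a nonce prediction when nothing survives — can be sketched on its own; the `Prediction` tuple below mirrors the one used in `get_answers`:

```python
from collections import namedtuple

Prediction = namedtuple("Prediction", ["text", "start_logit", "end_logit"])

def top_nbest(preds, n_best_size):
    # Rank candidate answer spans by the sum of their boundary logits.
    nbest = sorted(preds,
                   key=lambda p: p.start_logit + p.end_logit,
                   reverse=True)[:n_best_size]
    # Rare edge case: no candidates at all, so emit a nonce prediction
    # instead of failing downstream.
    if not nbest:
        nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
    return nbest
```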
+ 8 - 2
PyTorch/LanguageModeling/BERT/run_swag.py

@@ -322,8 +322,13 @@ def main():
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument('--fp16',
+                        default=False,
                         action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
+                        help="Mixed precision training")
+    parser.add_argument('--amp',
+                        default=False,
+                        action='store_true',
+                        help="Mixed precision training")
     parser.add_argument('--loss_scale',
                         type=float, default=0,
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
@@ -331,7 +336,8 @@ def main():
                              "Positive power of 2: static loss scaling value.\n")
 
     args = parser.parse_args()
-
+    args.fp16 = args.fp16 or args.amp
+    
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()

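The `--amp` flag added to both `run_squad.py` and `run_swag.py` is a pure alias for `--fp16`: after parsing, either flag enables mixed precision via the `args.fp16 = args.fp16 or args.amp` line. A minimal reproduction of that pattern:

```python
import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Mixed precision training")
    parser.add_argument('--amp', default=False, action='store_true',
                        help="Mixed precision training (alias for --fp16)")
    return parser

def parse(argv):
    args = build_parser().parse_args(argv)
    args.fp16 = args.fp16 or args.amp  # either flag switches AMP on
    return args
```

Downstream code then only ever checks `args.fp16`, so older scripts passing `--fp16` keep working while new ones can use the `--amp` spelling.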
+ 0 - 109
PyTorch/LanguageModeling/BERT/scripts/run_pretraining_inference.sh

@@ -1,109 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
-
-DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
-DATA_DIR=${22:-$BERT_PREP_WORKING_DIR/${DATASET}/}
-
-BERT_CONFIG=bert_config.json
-RESULTS_DIR=/results
-CHECKPOINTS_DIR=/results/checkpoints
-
-if [ ! -d "$DATA_DIR" ] ; then
-   echo "Warning! $DATA_DIR directory missing. Inference cannot start"
-fi
-if [ ! -d "$RESULTS_DIR" ] ; then
-   echo "Error! $RESULTS_DIR directory missing."
-   exit -1
-fi
-if [ ! -d "$CHECKPOINTS_DIR" ] ; then
-   echo "Warning! $CHECKPOINTS_DIR directory missing."
-   echo "Checkpoints will be loaded from $RESULTS_DIR instead."
-   CHECKPOINTS_DIR=$RESULTS_DIR
-fi
-if [ ! -f "$BERT_CONFIG" ] ; then
-   echo "Error! BERT large configuration file not found at $BERT_CONFIG"
-   exit -1
-fi
-
-eval_batch_size=${1:-14}
-precision=${2:-"fp16"}
-num_gpus=${3:-8}
-inference_mode=${4:-"eval"}
-model_checkpoint=${5:-"-1"}
-inference_steps=${6:-"-1"}
-create_logfile=${7:-"true"}
-seed=${8:-42}
-
-PREC=""
-if [ "$precision" = "fp16" ] ; then
-   PREC="--fp16"
-elif [ "$precision" = "fp32" ] ; then
-   PREC=""
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
-
-
-MODE=""
-if [ "$inference_mode" = "eval" ] ; then
-   MODE="--eval"
-elif [ "$inference_mode" = "prediction" ] ; then
-   MODE="--prediction"
-else
-   echo "Unknown <inference_mode> argument"
-   exit -2
-fi
-
-echo $DATA_DIR
-CMD=" /workspace/bert/run_pretraining_inference.py"
-CMD+=" --input_dir=$DATA_DIR"
-CMD+=" --ckpt_dir=$CHECKPOINTS_DIR"
-CMD+=" --config_file=$BERT_CONFIG"
-CMD+=" --bert_model=bert-large-uncased"
-CMD+=" --eval_batch_size=$eval_batch_size"
-CMD+=" --max_seq_length=512"
-CMD+=" --max_predictions_per_seq=80"
-CMD+=" --max_steps=$inference_steps"
-CMD+=" --ckpt_step=$model_checkpoint"
-CMD+=" --seed=$seed"
-CMD+=" $PREC"
-CMD+=" $MODE"
-
-if [ "$num_gpus" -gt 1 ] ; then
-   CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
-else
-   CMD="python3  $CMD"
-fi
-
-if [ "$create_logfile" = "true" ] ; then
-  export GBS=$((eval_batch_size * num_gpus))
-  printf -v TAG "pyt_bert_pretraining_inference_%s_gbs%d" "$precision" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
-  printf "Logs written to %s\n" "$LOGFILE"
-fi
-
-set -x
-if [ -z "$LOGFILE" ] ; then
-   $CMD
-else
-   (
-     $CMD
-   ) |& tee $LOGFILE
-fi
-set +x

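The deleted launcher leaned on two shell idioms that recur throughout the remaining scripts in this repo: positional parameters with `${N:-default}` fallbacks, and mapping a `precision` string onto the Python flag. In isolation (a sketch, not the full script):

```shell
#!/bin/bash
# Positional arguments with defaults, as in the removed script.
eval_batch_size=${1:-14}
precision=${2:-"fp16"}

# Translate the precision string into the flag the Python entry point expected.
PREC=""
if [ "$precision" = "fp16" ] ; then
   PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
   PREC=""
else
   echo "Unknown <precision> argument" >&2
   exit 2
fi

echo "batch=$eval_batch_size prec=${PREC:-fp32}"
```

With no arguments this prints the defaults (`batch=14 prec=--fp16`); callers override by position, e.g. `bash script.sh 8 fp32`.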
+ 1 - 1
PyTorch/LanguageModeling/BERT/triton/README.md

@@ -38,7 +38,7 @@ Moreover, you may set `precision` to either `fp32` or `fp16`.
 
 To launch the Triton server, execute the following command. 
 
-`docker run --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/results/triton_models:/models nvcr.io/nvidia/tritonserver:20.03-py3 trtserver --model-store=/models --log-verbose=1`
+`docker run --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/results/triton_models:/models nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1`
 
 Here `--gpus device=0,1,2,3` would expose GPUs `0`, `1`, `2`, and `3` to the server, and `device=all` exposes all available GPUs; the command above exposes only GPU `0`. 
 
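Once the container is up, the server can be probed over its HTTP port. The endpoint paths below are from the Triton v1 (`trtserver`) HTTP API, and the host/port are assumed from the `docker run` command above:

```shell
#!/bin/bash
# Health and status endpoints of the Triton v1 HTTP API (port 8000 above).
TRITON_HTTP="http://localhost:8000"
READY_URL="${TRITON_HTTP}/api/health/ready"    # 200 once the server is ready
STATUS_URL="${TRITON_HTTP}/api/status"         # per-model status report

# Run these once the server container is actually up:
# curl -sf "$READY_URL" && echo "server ready"
# curl -s  "$STATUS_URL"
echo "$READY_URL"
```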

+ 1 - 1
PyTorch/LanguageModeling/BERT/triton/launch_triton_server.sh

@@ -28,4 +28,4 @@ docker run -d --rm \
    -p 8002:8002 \
    --name trt_server_cont \
    -v $PWD/results/triton_models:/models \
-   nvcr.io/nvidia/tritonserver:20.03-py3 trtserver --model-store=/models --log-verbose=1
+   nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1