Przemek Strzelczyk 5 years ago
Parent
Commit
96138d5087
51 changed files with 1797 additions and 875 deletions
  1. +2 -2  PyTorch/LanguageModeling/BERT/Dockerfile
  2. +200 -129  PyTorch/LanguageModeling/BERT/README.md
  3. +211 -0  PyTorch/LanguageModeling/BERT/bind.sh
  4. +7 -0  PyTorch/LanguageModeling/BERT/modeling.py
  5. +4 -1  PyTorch/LanguageModeling/BERT/requirements.txt
  6. +5 -3  PyTorch/LanguageModeling/BERT/run.sub
  7. +4 -3  PyTorch/LanguageModeling/BERT/run_glue.py
  8. +13 -9  PyTorch/LanguageModeling/BERT/run_pretraining.py
  9. +13 -10  PyTorch/LanguageModeling/BERT/run_squad.py
  10. +252 -0  PyTorch/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
  11. +120 -0  PyTorch/LanguageModeling/BERT/scripts/configs/squad_config.sh
  12. +14 -11  PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh
  13. +1 -1  PyTorch/LanguageModeling/BERT/scripts/run_squad.sh
  14. +3 -0  README.md
  15. +1 -1  TensorFlow/LanguageModeling/BERT/.dockerignore
  16. +1 -1  TensorFlow/LanguageModeling/BERT/Dockerfile
  17. +479 -450  TensorFlow/LanguageModeling/BERT/README.md
  18. +1 -1  TensorFlow/LanguageModeling/BERT/biobert/README.md
  19. +2 -4  TensorFlow/LanguageModeling/BERT/biobert/conlleval.py
  20. +12 -40  TensorFlow/LanguageModeling/BERT/biobert/scripts/biobert_finetune_inference_benchmark.sh
  21. +12 -42  TensorFlow/LanguageModeling/BERT/biobert/scripts/biobert_finetune_train_benchmark.sh
  22. +6 -3  TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-chem.sh
  23. +6 -3  TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-disease.sh
  24. +6 -3  TensorFlow/LanguageModeling/BERT/biobert/scripts/rel_chemprot.sh
  25. +1 -1  TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert.sub
  26. +6 -4  TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert_finetuning_inference.sh
  27. +6 -3  TensorFlow/LanguageModeling/BERT/biobert/scripts/run_pretraining_pubmed_base_phase_1.sh
  28. +6 -3  TensorFlow/LanguageModeling/BERT/biobert/scripts/run_pretraining_pubmed_base_phase_2.sh
  29. +1 -1  TensorFlow/LanguageModeling/BERT/run.sub
  30. +42 -11  TensorFlow/LanguageModeling/BERT/run_classifier.py
  31. +23 -7  TensorFlow/LanguageModeling/BERT/run_ner.py
  32. +13 -13  TensorFlow/LanguageModeling/BERT/run_pretraining.py
  33. +23 -7  TensorFlow/LanguageModeling/BERT/run_re.py
  34. +22 -9  TensorFlow/LanguageModeling/BERT/run_squad.py
  35. +0 -0  TensorFlow/LanguageModeling/BERT/scripts/configs/configurations.yml
  36. +85 -0  TensorFlow/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
  37. +85 -0  TensorFlow/LanguageModeling/BERT/scripts/configs/squad_config.sh
  38. +1 -1  TensorFlow/LanguageModeling/BERT/scripts/docker/build.sh
  39. +1 -1  TensorFlow/LanguageModeling/BERT/scripts/docker/launch.sh
  40. +26 -36  TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh
  41. +4 -12  TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh
  42. +6 -3  TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh
  43. +6 -3  TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh
  44. +9 -5  TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh
  45. +8 -4  TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh
  46. +8 -4  TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh
  47. +7 -4  TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh
  48. +6 -3  TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh
  49. +6 -1  TensorFlow/LanguageModeling/BERT/triton/scripts/export_model.sh
  50. +14 -13  TensorFlow/LanguageModeling/BERT/trt/README.md
  51. +7 -9  TensorFlow/LanguageModeling/BERT/utils/utils.py

+ 2 - 2
PyTorch/LanguageModeling/BERT/Dockerfile

@@ -11,8 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
-FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk as trt
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
+FROM nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk as trt
 FROM ${FROM_IMAGE_NAME}
 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
 

+ 200 - 129
PyTorch/LanguageModeling/BERT/README.md

@@ -11,7 +11,8 @@ This repository provides a script and recipe to train the BERT model for PyTorch
         * [Features](#features)
     * [Mixed precision training](#mixed-precision-training)
         * [Enabling mixed precision](#enabling-mixed-precision)
-        * [Glossary](#glossary)
+        * [Enabling TF32](#enabling-tf32)
+    * [Glossary](#glossary)
 - [Setup](#setup)
     * [Requirements](#requirements)
 - [Quick Start Guide](#quick-start-guide)
@@ -39,12 +40,17 @@ This repository provides a script and recipe to train the BERT model for PyTorch
         * [Inference performance benchmark](#inference-performance-benchmark)
     * [Results](#results)
         * [Training accuracy results](#training-accuracy-results)
+            * [Pre-training loss results: NVIDIA DGX A100 (8x A100 40GB)](#pre-training-loss-results-nvidia-dgx-a100-8x-a100-40gb)
             * [Pre-training loss results](#pre-training-loss-results)
-            * [Fine-tuning accuracy results](#fine-tuning-accuracy-results) 
+            * [Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-results-nvidia-dgx-a100-8x-a100-40gb)
+            * [Fine-tuning accuracy results](#fine-tuning-accuracy-results)
             * [Training stability test](#training-stability-test)
                 * [Pre-training stability test](#pre-training-stability-test)
                 * [Fine-tuning stability test](#fine-tuning-stability-test) 
           * [Training performance results](#training-performance-results)
+              * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+                  * [Pre-training NVIDIA DGX A100 (8x A100 40GB)](#pre-training-nvidia-dgx-a100-8x-a100-40gb)
+                  * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb)
               * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
                   * [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
                   * [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
@@ -57,19 +63,20 @@ This repository provides a script and recipe to train the BERT model for PyTorch
                   * [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
                   * [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)   
           * [Inference performance results](#inference-performance-results)
+              * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
+                  * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb)
               * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
-                  * [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
                   * [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
               * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
-                  * [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
                   * [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
               * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
-                  * [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
                   * [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
 - [Release notes](#release-notes)
     * [Changelog](#changelog)
     * [Known issues](#known-issues)
-
+ 
+ 
+ 
 ## Model overview
  
 BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on Volta V100 GPUs for faster training times while maintaining target accuracy.
@@ -92,7 +99,7 @@ Other publicly available implementations of BERT include:
 5. [Google's implementation](https://github.com/google-research/bert)
     
 This model trains with mixed precision Tensor Cores on Volta and provides a push-button solution to pretraining on a corpus of choice. As a result, researchers can get results 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
-
+ 
 ### Model architecture
  
 The BERT model uses the same architecture as the encoder of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
@@ -111,7 +118,9 @@ The BERT paper reports the results for two configurations of BERT, each correspo
 |:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
 |BERTBASE |12 encoder| 768| 12|4 x  768|512|110M|
 |BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
-
+ 
+ 
+ 
 ### Feature support matrix
  
 The following features are supported by this model.  
@@ -128,11 +137,11 @@ The following features are supported by this model.
 [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training, whereas [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
  
 [DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training.
-
-[LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer, is a large batch optimization technique that helps accelerate training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512 respectively, compared to a batch size of 256 for Adam. The optimized implementation accumulates 1024 gradients batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs resulting in training speedups of up to 72x in comparison to [Adam](https://arxiv.org/pdf/1412.6980.pdf). Adam has limitations on the learning rate that can be used since it is applied globally on all parameters whereas LAMB follows a layerwise learning rate strategy.
-
-NVLAMB adds necessary tweaks to [LAMB version 1](https://arxiv.org/abs/1904.00962v1), to ensure correct convergence. A guide to implementating the LAMB optimizer can be found in our [article](https://medium.com/@NvidiaAI/a-guide-to-optimizer-implementation-for-bert-at-scale-8338cc7f45fd) on Medium.com. The algorithm is as follows:
-
+ 
+[LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer, is a large batch optimization technique that helps accelerate training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512 respectively, compared to a batch size of 256 for [Adam](https://arxiv.org/pdf/1412.6980.pdf). The optimized implementation accumulates 1024 gradient batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs resulting in training speedups of up to 72x in comparison to Adam. Adam has limitations on the learning rate that can be used since it is applied globally on all parameters whereas LAMB follows a layerwise learning rate strategy.
+ 
+NVLAMB adds the necessary tweaks to [LAMB version 1](https://arxiv.org/abs/1904.00962v1), to ensure correct convergence. The algorithm is as follows:
+ 
   ![NVLAMB](images/nvlamb.png)
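
For reference, the layerwise trust-ratio idea behind the update pictured above can be sketched in a few lines of PyTorch. This is illustrative pseudocode only, not the optimized NVLAMB implementation this repository uses from APEX; the function name and hyperparameter defaults below are hypothetical:

```python
import torch

def lamb_step(param, grad, exp_avg, exp_avg_sq, lr, step,
              beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.01):
    # Adam-style first and second moment estimates.
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

    # Bias-corrected update, plus decoupled weight decay.
    update = (exp_avg / (1 - beta1 ** step)) / \
             ((exp_avg_sq / (1 - beta2 ** step)).sqrt() + eps)
    update = update + weight_decay * param

    # Layerwise trust ratio: rescale the step by ||w|| / ||update||.
    # This per-layer scaling is what keeps huge global batches stable.
    w_norm, u_norm = param.norm(), update.norm()
    trust = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
    param.add_(update, alpha=-lr * float(trust))

# Toy usage on one weight tensor.
w = torch.randn(4, 4)
m, v = torch.zeros_like(w), torch.zeros_like(w)
lamb_step(w, torch.randn(4, 4), m, v, lr=6e-3, step=1)
```
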
  
  
 ### Mixed precision training
@@ -146,7 +155,7 @@ For information about:
 -   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
 -   Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
 -   APEX tools for mixed precision training, see the [NVIDIA APEX: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
-
+ 
 #### Enabling mixed precision
  
 In this repository, mixed precision training is enabled by NVIDIA’s APEX library. The APEX library has an automatic mixed precision module that allows mixed precision to be enabled with minimal code changes.
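Concretely, those changes amount to wrapping the model and optimizer once and scaling the loss. A minimal sketch, assuming the APEX library and a CUDA device from the NGC container (the toy model and loss below are placeholders, not code from this repository):

```python
import torch
from apex import amp  # shipped inside the PyTorch NGC container

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# "O2" is the optimization level the pretraining scripts pass to AMP.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

inputs = torch.randn(8, 1024, device="cuda")
loss = model(inputs).float().pow(2).mean()

# Scale the loss so FP16 gradients do not underflow, then step as usual.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```
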
@@ -166,6 +175,16 @@ if fp16:
  
 Where `<opt_level>` is the optimization level. In the pretraining, `O2` is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to the `run_pretraining.py` and `run_squad.py`. All shell scripts have a positional argument available to enable mixed precision training.
 
+#### Enabling TF32
+
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. 
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
+
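For completeness: recent PyTorch builds also expose explicit switches for this mode. A small sketch using the standard `torch.backends` flags (a PyTorch API, not something this commit adds), which can be useful when validating numerics against strict FP32:

```python
import torch

# TF32 is enabled by default on Ampere GPUs in recent NGC containers.
# Setting the flags makes the choice explicit; set them to False to
# force strict FP32 matrix math when checking numerical parity.
torch.backends.cuda.matmul.allow_tf32 = True   # matmuls on Tensor Cores
torch.backends.cudnn.allow_tf32 = True         # cuDNN convolutions

print(torch.backends.cuda.matmul.allow_tf32,
      torch.backends.cudnn.allow_tf32)
```
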
 ### Glossary
  
 **Fine-tuning**  
@@ -185,17 +206,17 @@ Pretraining on samples of sequence length 128 and 20 masked predictions per sequ
  
 **Phase 2**  
 Pretraining on samples of sequence length 512 and 80 masked predictions per sequence.
-
+ 
 ## Setup
  
 The following section lists the requirements that you need to meet in order to start training the BERT model. 
-
+ 
 ### Requirements
  
 This repository contains Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
  
 -   [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
--   [PyTorch 19.07-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-pytorch)
+-   [PyTorch 20.06-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-pytorch)
 -   [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
  
 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
@@ -203,7 +224,6 @@ For more information about how to get started with NGC containers, see the follo
 -   [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
 -   [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
  
-
 For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
  
 For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
@@ -213,7 +233,7 @@ More information on how to set up and launch can be found in the [Multi-node Doc
 ## Quick Start Guide
  
 To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8x V100 32G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
-
+ 
  
 1. Clone the repository.
 `git clone https://github.com/NVIDIA/DeepLearningExamples.git`
@@ -269,10 +289,17 @@ Validation can be performed with the `bash scripts/run_squad.sh /workspace/check
  
 Inference can be performed with the `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `prediction`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
 
+This repository contains a number of predefined configurations to run the SQuAD and pretraining on NVIDIA DGX-1, NVIDIA DGX-2H or NVIDIA DGX A100 nodes in `scripts/configs/squad_config.sh` and `scripts/configs/pretrain_config.sh`. For example, to use the default DGX A100 8 gpu config, run:
+
+```
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_fp16)
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_fp16)
+```
+
 ## Advanced
  
 The following sections provide greater details of the dataset, running training and inference, and the training results.
-
+ 
 ### Scripts and sample code
  
 Descriptions of the key scripts and folders are provided below.
@@ -288,7 +315,7 @@ Descriptions of the key scripts and folders are provided below.
 -   `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
 -   `run_pretraining.py` - Implements BERT pre-training.
 -   `run_pretraining_inference.py` - Implements evaluation of a BERT pre-trained model.
-
+ 
 ### Parameters
  
 #### Pre-training parameters
@@ -394,6 +421,7 @@ Default arguments are listed below in the order the scripts expects:
  
 The script saves the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
 
+
 #### Multi-node
  
 Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script with the following command for a 4-node DGX-1 example for both phase 1 and phase 2:
@@ -412,7 +440,8 @@ The batch variables `BATCHSIZE`, `LR`, `GRADIENT_STEPS`,`PHASE` refer to the Pyt
 Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase. 
  
 Refer to the files contents to see the full list of variables to adjust for your system.
-
+ 
+ 
 #### Fine-tuning parameters
  
 The `run_squad.py` script contains many of the same arguments as `run_pretraining.py`.
@@ -472,7 +501,7 @@ The main script specific parameters are:
                               - A null answer will be predicted if null_score if
                                 best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
 ```
-
+ 
 ### Command-line options
  
 To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
@@ -482,7 +511,7 @@ To see the full list of available options and their descriptions, use the `-h` o
 `python run_squad.py --help`
  
 Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
-
+ 
 ### Getting the data
  
 For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
@@ -506,7 +535,7 @@ For fine-tuning a pre-trained BERT model for specific tasks, by default this rep
 -   [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
  
 Depending on the speed of your internet connection, this process takes about a day to complete. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time.
-
+ 
 #### Dataset guidelines
  
 The procedure to prepare a text corpus for pre-training is described in the above section. This section will provide additional insight into how exactly raw text is processed so that it is ready for pre-training.
@@ -520,15 +549,15 @@ BERT pre-training optimizes for two unsupervised classification tasks. The first
 The second task is next sentence prediction. One training instance of BERT pre-training is two sentences (a sentence pair). A sentence pair may be constructed by simply taking two adjacent sentences from a single document, or by pairing up two random sentences with equal probability. The goal of this task is to predict whether or not the second sentence followed the first in the original document.
  
 The `create_pretraining_data.py` script takes in raw text and creates training instances for both pre-training tasks.
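
As a rough illustration of the 50/50 pairing described above (a toy sketch only; the actual logic, including tokenization and masking, lives in `create_pretraining_data.py`, and the helper below is hypothetical):

```python
import random

def make_nsp_pair(doc, all_docs, i):
    """Build one next-sentence-prediction instance from sentence i of doc."""
    sent_a = doc[i]
    if random.random() < 0.5 and i + 1 < len(doc):
        return sent_a, doc[i + 1], 1          # true next sentence
    other = random.choice([d for d in all_docs if d is not doc])
    return sent_a, random.choice(other), 0    # random sentence, label 0

docs = [["Paris is in France.", "It is the capital."],
        ["BERT has two pre-training tasks.", "One of them is NSP."]]
print(make_nsp_pair(docs[0], docs, 0))
```
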
-
+ 
 #### Multi-dataset
  
 This repository provides functionality to combine multiple datasets into a single dataset for pre-training on a diverse text corpus at the shard level in `data/create_datasets_from_start.sh`.
-
+ 
 ### Training process
  
 The training process consists of two steps: pre-training and fine-tuning.
-
+ 
 #### Pre-training
  
 Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
@@ -542,7 +571,7 @@ Phase 1: (Maximum sequence length of 128)
 -   Runs for 7038 steps, where the first 28.43% (2000) are warm-up steps
 -   Saves a checkpoint every 200 iterations (keeps only the latest 3 checkpoints) and at the end of training. All checkpoints, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
 -   Creates a log file containing all the output
-
+ 
 Phase 2: (Maximum sequence length of 512)
 -   Runs on 8 GPUs with training batch size of 8 per GPU
 -   Uses a learning rate of 4e-3
@@ -550,7 +579,7 @@ Phase 2: (Maximum sequence length of 512)
 -   Runs for 1563 steps, where the first 12.8% are warm-up steps
 -   Saves a checkpoint every 200 iterations (keeps only the latest 3 checkpoints) and at the end of training. All checkpoints, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
 -   Creates a log file containing all the output
-
+ 
 These parameters will train on Wikipedia and BookCorpus to state-of-the-art accuracy on a DGX-1 with 32GB V100 cards.
  
 `bash run_pretraining.sh <training_batch_size> <learning-rate> <precision> <num_gpus> <warmup_proportion> <training_steps> <save_checkpoint_steps> <resume_training> <create_logfile> <accumulate_gradients> <gradient_accumulation_steps> <seed> <job_name> <allreduce_post_accumulation> <allreduce_post_accumulation_fp16> <accumulate_into_fp16> <train_bath_size_phase2> <learning_rate_phase2> <warmup_proportion_phase2> <train_steps_phase2> <gradient_accumulation_steps_phase2> `
@@ -586,7 +615,7 @@ Where:
 For example:
  
 `bash scripts/run_pretraining.sh`
-
+ 
 Trains BERT-large from scratch on a DGX-1 32G using FP16 arithmetic. 90% of the training steps are done with sequence length 128 (phase 1 of training) and 10% of the training steps are done with sequence length 512 (phase 2 of training).
  
 To train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`.
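
The `gradient_accumulation_steps` knob follows the usual accumulation pattern: gradients from several micro-batches are summed before a single optimizer step, so a smaller per-GPU batch still behaves like one large batch. A minimal sketch of that pattern (illustrative only; `run_pretraining.py` layers AMP and post-accumulation allreduce on top of it):

```python
import torch

model = torch.nn.Linear(128, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
accumulation_steps = 4  # e.g. 512 on a DGX-1 16G, per the text above

optimizer.zero_grad()
for step in range(16):
    x = torch.randn(8, 128)
    loss = model(x).pow(2).mean()
    # Average over the accumulation window so the summed gradients
    # match what one large batch would produce.
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()          # one weight update per window
        optimizer.zero_grad()
```
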
@@ -597,19 +626,19 @@ In order to run pre-training routine on an initial checkpoint, do the following
 -   point the `init_checkpoint` variable to location of the checkpoint
 -   set `resume_training` to `true`
 -   Note: The parameter value assigned to `BERT_CONFIG` during training should remain unchanged. Also to resume pretraining on your corpus of choice, the training dataset should be created using the same vocabulary file used in `data/create_datasets_from_start.sh`.
-
+ 
 #### Fine-tuning
  
 Fine-tuning is provided for a variety of tasks. The following tasks are included with this repository through the following scripts:
  
 -   Question Answering (`scripts/run_squad.sh`)
-
+ 
 By default, each Python script implements fine-tuning a pre-trained BERT model for a specified number of training epochs as well as evaluation of the fine-tuned model. Each shell script invokes the associated Python script with the following default parameters:
  
 -   Uses 8 GPUs
 -   Has FP16 precision enabled
 -   Saves a checkpoint at the end of training to the `/results/<dataset_name>` folder
-
+ 
 Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA’s [APEX](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
  
 All fine-tuning shell scripts have the same positional arguments, outlined below:
@@ -621,8 +650,7 @@ By default, the mode positional argument is set to train eval. See the [Quick St
 Note: The first positional argument (the path to the checkpoint to load) is required.
  
 Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory or separate path can be a command-line input to `run_squad.sh`.
-
-
+ 
 ### Inference process
  
 #### Pre-training inference
@@ -637,12 +665,12 @@ The `run_pretraining_inference.sh` script takes a model and a dataset and perfor
 -   Runs on 8 GPUs
 -   Evaluates the latest checkpoint present in `/results/checkpoints` with a batch size of 14
 -   Runs inference on the entire Wikipedia dataset
-
+ 
 This script outputs a prediction file to `/results/pyt_bert_pretraining_inference_<precision>_<global_batchsize>.<datestamp>.log`. The output log contains information about:
  
 -   Inference performance
 -   Loss (masked language model loss and next sentence prediction loss) of the specified dataset if ground truths exist with the `--eval` flag.
-
+ 
 For example:
  
 `bash scripts/run_pretraining_inference.sh <evaluation_batch_size> <precision> <num_gpus> <inference_mode><model_checkpoint><inference_steps><create_logfile>`
@@ -658,23 +686,28 @@ Where:
 -   `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the `checkpoints` folder.
 -   `<inference_steps>` is the total number of inference steps per process. Default is `-1`, which iterates over the entire dataset.
 -   `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a logfile.)
-
+ 
 For example:
  
 `bash scripts/run_pretraining_inference.sh 14 fp16 8 eval -1 -1 true`
-
+ 
 #### Fine-tuning inference
  
 Evaluation fine-tuning is enabled by the same scripts as training:
  
 -   Question Answering (`scripts/run_squad.sh`)
-
+ 
 The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned BERT model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
  
 Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction` flag, you can choose between running predictions and evaluating them on a given dataset or just the former.
  
 `bash scripts/run_squad.sh <path to fine-tuned model checkpoint>`
+
+To run inference interactively on question-context pairs, use the script `inference.py` as follows:
 
+`python inference.py --bert_model "bert-large-uncased" --init_checkpoint=<fine_tuned_checkpoint> --config_file="bert_config.json" --vocab_file=<path to vocab file>  --question="What food does Harry like?" --context="My name is Harry and I grew up in Canada. I love apples."`
+
+
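Under the hood, SQuAD-style prediction reduces the model's per-token start/end logits to the best answer span. A condensed sketch of that selection step (hypothetical tensors; the full logic in `run_squad.py` also tracks n-best candidates and null answers):

```python
import torch

def best_span(start_logits, end_logits, max_len=30):
    """Pick the highest-scoring (start, end) pair with start <= end."""
    # Score every span as start_logit[i] + end_logit[j].
    scores = start_logits[:, None] + end_logits[None, :]
    # Keep only valid spans: end after start, length under max_len.
    mask = torch.triu(torch.ones_like(scores, dtype=torch.bool))
    mask &= ~torch.triu(torch.ones_like(mask), diagonal=max_len)
    scores = scores.masked_fill(~mask, float("-inf"))
    idx = int(scores.argmax())
    n = scores.size(1)
    return idx // n, idx % n

start = torch.randn(24)   # per-token start logits for one passage
end = torch.randn(24)     # per-token end logits
print(best_span(start, end))
```
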
 ### Deploying BERT using NVIDIA Triton Inference Server
  
 The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. More information on how to perform inference using NVIDIA Triton Inference Server can be found in [triton/README.md](./triton/README.md).
@@ -695,6 +728,8 @@ To benchmark the training performance on a specific batch size, run:
 An example call used to generate throughput numbers:
 `bash scripts/run_squad.sh /workspace/bert/bert_large_uncased_wiki+books.pt.model 2.0 4 3e-5 fp16 8 42 /workspace/bert/squad_data /workspace/bert/scripts/vocab/vocab /results/SQuAD train /workspace/bert/bert_config.json -1`
  
+ 
+ 
 #### Inference performance benchmark
  
 Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Inference process](#inference-process).
@@ -705,16 +740,27 @@ To benchmark the inference performance on a specific batch size, run:
 An example call used to generate throughput numbers:
 `bash scripts/run_squad.sh /workspace/bert/bert_large_uncased_wiki+books.pt.model 2.0 4 3e-5 fp16 8 42 /workspace/bert/squad_data /workspace/bert/scripts/vocab/vocab /results/SQuAD eval /workspace/bert/bert_config.json -1`
  
+ 
+ 
 ### Results
  
 The following sections provide details on how we achieved our performance and accuracy in training and inference. 
  
 #### Training accuracy results
  
-Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
+Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:20.06-py3 NGC container unless otherwise specified.
  
+##### Pre-training loss results: NVIDIA DGX A100 (8x A100 40GB)
+
+| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - TF32 | Final Loss - mixed precision | Time to train(hours) - TF32 | Time to train(hours) - mixed precision | Time to train speedup (TF32 to mixed precision)
+|---|---|---|---|---|---|---|---|---
+|32 x DGX A100 with 40G |8|256 and 128|4 and 8|---|1.3415|---|2.3|---
+|32 x DGX A100 with 40G |8|256 and 128|4 and 16|1.3415|---|3.7|---|---
+
 ##### Pre-training loss results
- 
+
+Following results were obtained by running on pytorch:19.07-py3 NGC container.
+
 | DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
 |---|---|---|---|---|---|---|---|---
 | 1 x NVIDIA DGX-1 With 16G|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
@@ -724,7 +770,12 @@ Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run
 | 16 x NVIDIA DGX-1 With 16G|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
 | 16 x NVIDIA DGX-2H With 32G|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
 | 64 x NVIDIA DGX-2H With 32G|16|64 and 32 |(1 and 4)FP16 and (2 and 8)FP32|1.33|1.331|4.338|1.124|3.85
- 
+
+##### Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)
+
+| GPUs | Batch size / GPU (TF32 and FP16) | Accuracy - TF32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - TF32 | Time to train(hours) - mixed precision | Time to train speedup (TF32 to mixed precision)
+|8|16 and 32|91.344|91.34|0.174|0.065|2.68
+
 ##### Fine-tuning accuracy results
  
 | GPUs | Batch size / GPU | Accuracy - FP32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
@@ -748,25 +799,53 @@ Training stability with 8 GPUs, FP16 computations, batch size of 4:
 |Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | .200
 |Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | .200
 | f1 % | 91.29 | 91.01 | 91.14 |  91.10 | 90.85 | 91.08 | 0.162
 | f1 % | 91.29 | 91.01 | 91.14 |  91.10 | 90.85 | 91.08 | 0.162
  
  
-#### Training performance results
  
  
+#### Training performance results
+
+##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+
+Our results were obtained by running the `scripts run_pretraining.sh` training script in the pytorch:20.06-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in items/images per second) were averaged over a few training iterations.
+
+###### Pre-training NVIDIA DGX A100 (8x A100 40GB)
+
+| GPUs | Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
+|1 | 65232 and 65536 | 1208 and 512| 128| 234 |415 |1.77 |1.00 | 1.00
+|4 | 16308 and 16384 | 302 and 128| 128| 910 |1618 | 1.77| 3.89| 3.90
+|8 | 8154 and 8192 | 151 and 64| 128| 1777 |3231 | 1.81| 7.59| 7.79
+|1 | 32768 and 32768| 4096 and 2048| 512| 41 |78 |1.90 |1.00 | 1.00
+|4 | 8192 and 8192| 1024 and 512| 512| 159 |308 | 1.93| 3.88| 3.95
+| 8| 4096 and 4096| 512 and 256| 512| 318 |620 | 1.94| 7.95| 7.76
+
+###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB)
+  
+| GPUs | Batch size / GPU (TF32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
+|1 | 16 and 32|44 |116 | 2.63| 1.00| 1.00
+|4 | 16 and 32|165 |441 | 2.67| 3.75| 3.80
+| 8| 16 and 32|324 |861 | 2.65| 7.42| 7.36
+
+
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 
-Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
 
 ###### Pre-training NVIDIA DGX-1 With 16G
 
-| GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 8 | 16| 128| 33.36 |125.44 |3.76 |1.00 | 1.00
-|4 | 8 | 16| 128| 121.92 |458.24 | 3.75| 3.65| 3.65
-|8 | 8 | 16| 128| 245.12 |919.04 | 3.74| 7.34| 7.32
-|1 | 2| 4| 512| 7.56 |26.64 |3.52 |1.00 | 1.00
-|4 | 2| 4| 512| 28 |98.24 | 3.50| 3.70| 3.69
-| 8| 2| 4| 512| 56.16 |194.56 | 3.46| 7.43| 7.30
+|1 | 65536 and 65536  | 8192 and 4096| 128| 40 |164 |4.1 |1.00 | 1.00
+|4 | 16384 and 16384  | 2048 and 1024| 128| 155 |615 | 3.96| 3.88| 3.75
+|8 | 8192 and 8192  | 1024 and 512| 128| 313 |1236 | 3.94| 7.83| 7.54
+|1 | 32768 and 32768 | 16384 and 8192| 512| 9 |34 |3.77 |1.00 | 1.00
+|4 | 8192 and 8192 | 4096 and 2048| 512| 35 |131 | 3.74| 3.89| 3.85
+| 8| 4096 and 4096 | 2048 and 1024| 512| 71 |263 | 3.70| 7.89| 7.74
 
-###### Pre-training on multiple NVIDIA DGX-1 With 16G
 
+###### Pre-training on multiple NVIDIA DGX-1 With 16G
+
+The following numbers were obtained with the pytorch:19.07-py3 NGC container.
+
 | Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
 |1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
@@ -776,64 +855,64 @@ Our results were obtained by running the `scripts/run_pretraining.sh` and `scrip
 |4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
 |16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02
 
+ 
 ###### Fine-tuning NVIDIA DGX-1 With 16G
-
-| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+ 
+ 
+| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 4|8.96 |35.88 | 3.99| 1.00| 1.00
-|4 | 4|31.04 |120.00 | 3.86| 3.46| 3.34
-| 8| 4|64.64 |227.84 | 3.52| 7.20| 6.35
-|1 | 10|N/A |45.2| N/A| N/A| 1.0
-|4 | 10|N/A |163.6 | N/A| N/A| 3.62
-| 8| 10|N/A |327.2| N/A| N/A| 7.24
+|1 | 4 and 10|9 |50 | 5.55| 1.00| 1.00
+|4 | 4 and 10|32 |183 | 5.71| 3.56| 3.66
+| 8| 4 and 10|61 |359 | 5.88| 6.78| 7.18
+ 
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 32G)
 
-Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
 
 ###### Pre-training NVIDIA DGX-1 With 32G
-
-| GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+ 
+| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 |32 | 64| 128| 40.32 |171.52| 4.25| 1.0| 1.0
-|4 |32 | 64| 128| 154.88 |655.36 | 4.23| 3.84| 3.82
-|8 |32 | 64| 128|309.76 |1305.6| 4.21| 7.68 | 7.62
-|1 | 4| 8| 512|8.36 |30.08 | 3.68| 1.00| 1.00
-|4 | 4| 8| 512|31.52 |116.80 | 3.70| 3.84| 3.82
-| 8| 4| 8| 512|62.72 |231.68 | 3.69| 7.68| 7.61
+|1 | 65536 and 65536  | 8192 and 4096| 128| 40 |158 |3.95 |1.00 | 1.00
+|4 | 16384 and 16384  | 2048 and 1024| 128| 157 |625 | 3.93| 3.96| 3.65
+|8 | 8192 and 8192  | 1024 and 512| 128| 317 |1203 | 3.79| 7.93| 7.61
+|1 | 32768 and 32768 | 16384 and 8192| 512| 9 |33 |3.66 |1.00 | 1.00
+|4 | 8192 and 8192 | 4096 and 2048| 512| 35 |130 | 3.71| 3.89| 3.94
+| 8| 4096 and 4096 | 2048 and 1024| 512| 72 |262 | 3.63| 8.0| 7.94
+ 
 
 ###### Fine-tuning NVIDIA DGX-1 With 32G
-
-| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+ 
+| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 8|8.64 |36.04 | 4.171| 1.00| 1.00
-|4 | 8|31.52 |116.80 | 3.71| 3.64| 3.24
-| 8| 8|64.32 |231.04 | 3.59| 7.44| 6.41
-|1 | 10|N/A |46.00| N/A| N/A| 1.0
-|4 | 10|N/A |164.00 | N/A| N/A| 3.57
-| 8| 10|N/A |325.60| N/A| N/A| 7.08
+|1 | 8 and 10|12 |49 | 4.08| 1.00| 1.00
+|4 | 8 and 10|42 |178 | 4.23| 3.5| 3.63
+| 8| 8 and 10|67 |351 | 5.23| 5.58| 7.16 
 
 ##### Training performance: NVIDIA DGX-2 (16x V100 32G)
 
-Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
 
 ###### Pre-training NVIDIA DGX-2 With 32G
-
-| GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 |32 | 64 | 128|43.52 | 181.76 | 4.17| 1.00| 1.00
-|4 |32 | 64 | 128| 168.96| 704| 4.16| 3.88| 3.87
-|8 |32 | 64| 128| 335.36| 1402.88| 4.18| 7.70| 7.72
-|16 |32 | 64| 128| 665.6| 2775.04| 4.16| 15.29| 15.26
-|1 | 4 | 8 | 512|9.0| 32.32| 3.59| 1.00| 1.00
-|4 | 4 |8 | 512| 34.4| 124.16| 3.60| 3.82| 3.84
-|8 | 4 | 8| 512| 68.16| 247.04| 3.62| 7.57| 7.64
-|16 | 4 | 8| 512| 135.68| 488.96| 3.60| 15.08| 15.13
 
-###### Pre-training on multiple NVIDIA DGX-2H With 32G
+| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
+|1 | 65536 and 65536  | 8192 and 4096| 128| 42 |173 |4.11 |1.00 | 1.00
+|4 | 16384 and 16384  | 2048 and 1024| 128| 166 |669 | 4.03| 3.95| 3.87
+|8 | 8192 and 8192  | 1024 and 512| 128| 330 |1324 | 4.01| 7.86| 7.65
+|16 | 4096 and 4096  | 512 and 256| 128| 658 |2557 | 3.88| 15.67| 14.78
+|1 | 32768 and 32768 | 16384 and 8192| 512| 10 |36 |3.6 |1.00 | 1.00
+|4 | 8192 and 8192 | 4096 and 2048| 512| 37 |137 | 3.70| 3.70| 3.81
+| 8| 4096 and 4096 | 2048 and 1024| 512| 75 |273 | 3.64| 7.50| 7.58
+| 16| 2048 and 2048 | 1024 and 512| 512| 150 |551 | 3.67| 15.00| 15.31
 
+###### Pre-training on multiple NVIDIA DGX-2H With 32G
+ 
 Note: Multi-node performance numbers below are on DGX-2H whereas the single node performance numbers above are on DGX-2.
 
+The following numbers were obtained with the pytorch:19.07-py3 NGC container.
+ 
 | Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
 |1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
@@ -846,69 +925,58 @@ Note: Multi-node performance numbers below are on DGX-2H whereas the single node
 |64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
 
 ###### Fine-tuning NVIDIA DGX-2 With 32G
-
-| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+ 
+| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
 |------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 |4 |9.92| 38.16| 3.84| 1.00| 1.00
-|4 |4 | 35.52| 122.08| 3.43| 3.58| 3.20
-|8 | 4| 71.36| 241.28| 3.38| 7.19| 6.32
-|16 | 4| 141.40| 462.08| 3.27| 14.25| 12.11
-|1 |10 |N/A | 47.40| N/A| N/A| 1.00
-|4 |10 | N/A| 165.60| N/A| N/A| 3.49
-|8 | 10| N/A| 325.60| N/A| N/A| 6.87
-|16 | 10| N/A| 648.00| N/A| N/A| 13.67
+|1 |8 and 10 |12| 53| 4.41| 1.00| 1.00
+|4 |8 and 10 | 47| 188| 4| 3.92| 3.55
+|8 | 8 and 10| 92| 369| 4.01| 7.67| 6.96
+|16 | 8 and 10| 178| 700| 3.93| 14.83| 13.21
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
 #### Inference performance results
+
+##### Inference performance: NVIDIA DGX A100 (1x A100 40GB) 
 
-##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:20.06-py3 NGC container on NVIDIA DGX A100 (1x A100 40GB) GPUs.
 
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
+###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)
 
-###### Pre-training inference on NVIDIA DGX-1 with 16G
-
-| GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
+| GPUs |  Batch Size \(TF32/FP16\) | Sequence Length | Throughput \- TF32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
 |------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1    | 2/4                       | 512             |     28\.32        | 94\.36                                         |
- 
-###### Fine-tuning inference on NVIDIA DGX-1 with 16G
+| 1    | 8/8  | 384             |      188       | 283    |
 
-| GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
-|------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1    | 4/4                       | 384             |      37\.64       | 119\.76                                        |
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
 
-##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
 
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
+###### Fine-tuning inference on NVIDIA DGX-1 with 16G
 
-###### Pre-training inference on NVIDIA DGX-1 with 32G
-
 | GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
 |------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1    | 4/8                       | 512             | 27\.58            | 90\.16                                         |
+| 1    | 8/8                       | 384             |      42       | 153                                        |
 
+##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
+ 
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
+  
 ###### Fine-tuning inference on NVIDIA DGX-1 with 32G
-
+ 
 | GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
 |------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1    | 4/4                       | 384             |37\.64             | 119\.76                                        |
+| 1    | 8/8                       | 384             |48             | 143                                        |
 
 ##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
 
-Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
- 
-###### Pre-training inference on NVIDIA DGX-2 with 32G
-
-| GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
-|------|---------------------------|-----------------|--------------------|------------------------------------------------|
-| 1    | 4/8                       | 512             | 30\.24             | 97\.72                                         |
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
 
 ###### Fine-tuning inference on NVIDIA DGX-2 with 32G
 
 | GPUs |  Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
-|------|---------------------------|-----------------|--------------------|------------------------------------------------|
-| 1    | 4/4                       | 384             | 35\.76             | 112\.60                                        |
+|------|---------------------------|-----------------|-------------------|------------------------------------------------|
+| 1    | 8/8                       | 384             |43             | 148                                        |
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
@@ -918,6 +986,9 @@ The inference performance metrics used were items/second.
 
 ### Changelog
 
+July 2020
+- Ampere support
+ 
 March 2020
 - TRITON Inference Server support.
 

+ 211 - 0
PyTorch/LanguageModeling/BERT/bind.sh

@@ -0,0 +1,211 @@
+#! /bin/bash
+set -euo pipefail
+
+print_usage() {
+    cat << EOF
+${0} [options] [--] COMMAND [ARG...]
+
+Control binding policy for each task. Assumes one rank will be launched for each GPU.
+
+Options:
+    --cpu=MODE
+        * exclusive -- bind each rank to an exclusive set of cores near its GPU
+        * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
+        * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
+        * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
+        * off -- don't bind
+    --mem=MODE
+        * node -- bind each rank to the nearest NUMA node [default]
+        * *.sh -- bind each rank using the bash associative array bind_mem from a file
+        * off -- don't bind
+    --ib=MODE
+        * single -- bind each rank to a single IB device near its GPU
+        * off -- do not bind [default]
+    --cluster=CLUSTER
+        Select which cluster is being used. May be required if system params cannot be detected.
+EOF
+}
+
+################################################################################
+# Argument parsing
+################################################################################
+
+cpu_mode='node'
+mem_mode='node'
+ib_mode='off'
+cluster=''
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -h|--help) print_usage ; exit 0 ;;
+        --cpu=*) cpu_mode="${1/*=/}"; shift ;;
+        --cpu)   cpu_mode="$2"; shift 2 ;;
+        --mem=*) mem_mode="${1/*=/}"; shift ;;
+        --mem)   mem_mode="$2"; shift 2 ;;
+        --ib=*) ib_mode="${1/*=/}"; shift ;;
+        --ib)   ib_mode="$2"; shift 2 ;;
+        --cluster=*) cluster="${1/*=/}"; shift ;;
+        --cluster)   cluster="$2"; shift 2 ;;
+        --) shift; break ;;
+        *) break ;;
+    esac
+done
+if [ $# -lt 1 ]; then
+    echo 'ERROR: no command given' >&2
+    print_usage
+    exit 1
+fi
+
+################################################################################
+# Get system params
+################################################################################
+
+# LOCAL_RANK is set with an enroot hook for Pytorch containers
+# SLURM_LOCALID is set by Slurm
+# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
+readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
+if [ -z "${local_rank}" ]; then
+    echo 'ERROR: cannot read LOCAL_RANK from env' >&2
+    exit 1
+fi
+
+num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
+if [ "${local_rank}" -ge "${num_gpus}" ]; then
+    echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
+    exit 1
+fi
+
+get_lscpu_value() {
+    awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
+}
+lscpu_out=$(lscpu)
+num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
+num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
+cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
+
+echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
+
+readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
+if [ ${num_gpus} -gt 1 ]; then
+    readonly gpus_per_node=$(( num_gpus / num_nodes ))
+else
+    readonly gpus_per_node=1
+fi
+readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
+readonly local_node=$(( local_rank / gpus_per_node ))
+
+
+declare -a ibdevs=()
+case "${cluster}" in
+    circe)
+        # Need to specialize for circe because IB detection is hard
+        ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
+        ;;
+    selene)
+        # Need to specialize for selene because IB detection is hard
+        ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
+        ;;
+    '')
+        if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
+            mapfile -t ibdevs <<< "${ibstat_out}"
+        fi
+        ;;
+    *)
+        echo "ERROR: Unknown cluster '${cluster}'" >&2
+        exit 1
+        ;;
+esac
+readonly num_ibdevs="${#ibdevs[@]}"
+
+################################################################################
+# Setup for exec
+################################################################################
+
+declare -a numactl_args=()
+
+case "${cpu_mode}" in
+    exclusive)
+        numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
+            $(( local_rank * cores_per_gpu )) \
+            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
+            $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
+            $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
+        )" )
+        ;;
+    exclusive,nosmt)
+        numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
+            $(( local_rank * cores_per_gpu )) \
+            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
+        )" )
+        ;;
+    node)
+        numactl_args+=( "--cpunodebind=${local_node}" )
+        ;;
+    *.sh)
+	source "${cpu_mode}"
+	if [ -n "${bind_cpu_cores:-}" ]; then
+	    numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
+	elif [ -n "${bind_cpu_nodes:-}" ]; then
+	    numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
+	else
+	    echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
+	    exit 1
+	fi
+	;;
+    off|'')
+        ;;
+    *)
+        echo "ERROR: invalid cpu mode '${cpu_mode}'" >&2
+        print_usage
+        exit 1
+        ;;
+esac
+
+case "${mem_mode}" in
+    node)
+        numactl_args+=( "--membind=${local_node}" )
+        ;;
+    *.sh)
+	source "${mem_mode}"
+	if [ -z "${bind_mem:-}" ]; then
+	    echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
+	    exit 1
+	fi
+	numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
+	;;
+    off|'')
+        ;;
+    *)
+        echo "ERROR: invalid mem mode '${mem_mode}'" >&2
+        print_usage
+        exit 1
+        ;;
+esac
+
+case "${ib_mode}" in
+    single)
+        if [ "${num_ibdevs}" -eq 0 ]; then
+            echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." >&2
+        else
+            readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
+            export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
+        fi
+        ;;
+    off|'')
+        ;;
+    *)
+        echo "ERROR: invalid ib mode '${ib_mode}'" >&2
+        print_usage
+        exit 1
+        ;;
+esac
+
+################################################################################
+# Exec
+################################################################################
+
+if [ "${#numactl_args[@]}" -gt 0 ] ; then
+    set -x
+    exec numactl "${numactl_args[@]}" -- "${@}"
+else
+    exec "${@}"
+fi

+ 7 - 0
PyTorch/LanguageModeling/BERT/modeling.py

@@ -119,10 +119,16 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
 def gelu(x):
     return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
 
+# used only for Triton inference
 def bias_gelu(bias, y):
     x = bias + y
     return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
 
+# used specifically for training since torch.nn.functional.gelu breaks ONNX export
+def bias_gelu_training(bias, y):
+    x = bias + y
+    return torch.nn.functional.gelu(x) # Breaks ONNX export
+
 def bias_tanh(bias, y):
     x = bias + y
     return torch.tanh(x)
@@ -130,6 +136,7 @@ def bias_tanh(bias, y):
 def swish(x):
     return x * torch.sigmoid(x)
 
+#torch.nn.functional.gelu(x) # Breaks ONNX export
 ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish}
 
 class LinearActivation(Module):

+ 4 - 1
PyTorch/LanguageModeling/BERT/requirements.txt

@@ -10,4 +10,7 @@ ipdb
 h5py
 html2text
 nltk
-progressbar
+progressbar
+#Others
+onnxruntime
+git+https://github.com/NVIDIA/dllogger

+ 5 - 3
PyTorch/LanguageModeling/BERT/run.sub

@@ -19,8 +19,8 @@
 set -eux
 
 # The following variables need to be set
-# Base container to be used  
-readonly docker_image="nvcr.io/nvidia/pytorch:19.10-py3"
+# Base container to be used - the container built in step 1 of the quick start guide
+readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
 # Location of dataset for phase 1
 readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
 # Location of dataset for phase 2
@@ -30,6 +30,8 @@ readonly checkpointdir="$PWD/checkpoints"
 
 readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
 
+BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
+
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
 
 PHASE1="\
@@ -59,7 +61,7 @@ PHASES=( "$PHASE1" "$PHASE2" )
 PHASE=${PHASE:-1}
 
 BERT_CMD="\
-    python -u /workspace/bert/run_pretraining.py \
+    ${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
     --seed=42 \
     ${PHASES[$((PHASE-1))]} \
     --do_train \

+ 4 - 3
PyTorch/LanguageModeling/BERT/run_glue.py

@@ -33,7 +33,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+import modeling
 from tokenization import BertTokenizer
 from optimization import BertAdam, warmup_linear
 from schedulers import LinearWarmUpScheduler
@@ -552,12 +552,13 @@ def main():
             num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
 
     # Prepare model
-    config = BertConfig.from_json_file(args.config_file)
+    config = modeling.BertConfig.from_json_file(args.config_file)
     # Padding for divisibility by 8
     if config.vocab_size % 8 != 0:
         config.vocab_size += 8 - (config.vocab_size % 8)
 
-    model = BertForSequenceClassification(config, num_labels=num_labels)
+    modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
+    model = modeling.BertForSequenceClassification(config, num_labels=num_labels)
     print("USING CHECKPOINT from", args.init_checkpoint)
     model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
     print("USED CHECKPOINT from", args.init_checkpoint)

+ 13 - 9
PyTorch/LanguageModeling/BERT/run_pretraining.py

@@ -198,7 +198,7 @@ def parse_arguments():
                              "E.g., 0.1 = 10%% of training.")
                              "E.g., 0.1 = 10%% of training.")
     parser.add_argument("--local_rank",
     parser.add_argument("--local_rank",
                         type=int,
                         type=int,
-                        default=-1,
+                        default=os.getenv('LOCAL_RANK', -1),
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--seed',
                         type=int,
@@ -272,7 +272,13 @@ def parse_arguments():
                         default=False,
                         action='store_true',
                         help='Disable tqdm progress bar')
+    parser.add_argument('--steps_this_run', type=int, default=-1,
+                        help='If provided, only run this many steps before exiting')
+
     args = parser.parse_args()
+
+    if args.steps_this_run < 0:
+        args.steps_this_run = args.max_steps
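The new `--steps_this_run` flag caps how many optimizer steps a single job executes, while the warmup/decay schedule still spans `--max_steps`. A hypothetical smoke-test invocation (paths and the config file name are placeholders):

```bash
# Stop after 100 optimizer steps; the LR schedule still follows --max_steps.
python run_pretraining.py \
    --input_dir=/workspace/data --output_dir=/results \
    --config_file=bert_config.json --bert_model=bert-large-uncased \
    --do_train --max_steps=7038 --steps_this_run=100
```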
 
     return args
 
@@ -291,7 +297,7 @@ def setup_training(args):
         # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl', init_method='env://')
         args.n_gpu = 1
-
+        
     if args.gradient_accumulation_steps == 1:
         args.allreduce_post_accumulation = False
         args.allreduce_post_accumulation_fp16 = False
@@ -336,7 +342,7 @@ def prepare_model_and_optimizer(args, device):
     if config.vocab_size % 8 != 0:
         config.vocab_size += 8 - (config.vocab_size % 8)
 
-    modeling.ACT2FN["bias_gelu"] = torch.jit.script(modeling.ACT2FN["bias_gelu"])
+    modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
     model = modeling.BertForPreTraining(config)
 
     checkpoint = None
@@ -481,9 +487,6 @@ def main():
     global timeout_sent
 
     args = parse_arguments()
-
-    if args.use_env and 'LOCAL_RANK' in os.environ:
-        args.local_rank = int(os.environ['LOCAL_RANK'])
 
     random.seed(args.seed + args.local_rank)
     np.random.seed(args.seed + args.local_rank)
@@ -604,7 +607,7 @@ def main():
                         lr_scheduler.step()  # learning rate warmup
                         global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
 
-                    if global_step >= args.max_steps:
+                    if global_step >= args.steps_this_run or timeout_sent:
                         train_time_raw = time.time() - raw_train_start
                         last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                         last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
@@ -623,7 +626,8 @@ def main():
                                                                             "learning_rate": optimizer.param_groups[0]['lr']})
                                                                             "learning_rate": optimizer.param_groups[0]['lr']})
                         average_loss = 0
                         average_loss = 0
 
 
-                    if global_step >= args.max_steps or training_steps % (
+
+                    if global_step >= args.steps_this_run or training_steps % (
                             args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
                         if is_main_process() and not args.skip_checkpoint:
                             # Save a trained model
@@ -649,7 +653,7 @@ def main():
 
                         # Exiting the training due to hitting max steps, or being sent a
                         # timeout from the cluster scheduler
-                        if global_step >= args.max_steps or timeout_sent:
+                        if global_step >= args.steps_this_run or timeout_sent:
                             del train_dataloader
                             # thread.join()
                             return args, final_loss, train_time_raw, global_step

+ 13 - 10
PyTorch/LanguageModeling/BERT/run_squad.py

@@ -37,7 +37,7 @@ from tqdm import tqdm, trange
 from apex import amp
 from schedulers import LinearWarmUpScheduler
 from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+import modeling
 from optimization import BertAdam, warmup_linear
 from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
 from utils import is_main_process, format_step
@@ -478,6 +478,11 @@ def get_answers(examples, features, results, args):
         if not nbest:
             nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
 
+        # In very rare edge cases we could only have single null prediction.
+        # So we just create a nonce prediction in this case to avoid failure.
+        if not nbest:                                                    
+            nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
+
         total_scores = []
         best_non_null_entry = None
         for entry in nbest:
@@ -788,7 +793,7 @@ def main():
                         help="Whether to lower case the input text. True for uncased models, False for cased models.")
     parser.add_argument("--local_rank",
                         type=int,
-                        default=-1,
+                        default=os.getenv('LOCAL_RANK', -1),
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--fp16',
                         action='store_true',
@@ -847,9 +852,6 @@ def main():
 
     args = parser.parse_args()
 
-    if args.use_env and 'LOCAL_RANK' in os.environ:
-        args.local_rank = int(os.environ['LOCAL_RANK'])
-
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
@@ -917,13 +919,14 @@ def main():
             num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
 
     # Prepare model
-    config = BertConfig.from_json_file(args.config_file)
+    config = modeling.BertConfig.from_json_file(args.config_file)
     # Padding for divisibility by 8
     if config.vocab_size % 8 != 0:
         config.vocab_size += 8 - (config.vocab_size % 8)
 
-    model = BertForQuestionAnswering(config)
-    # model = BertForQuestionAnswering.from_pretrained(args.bert_model,
+    modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
+    model = modeling.BertForQuestionAnswering(config)
+    # model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model,
                 # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
     dllogger.log(step="PARAMETER", data={"loading_checkpoint": True})
     model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
@@ -1089,9 +1092,9 @@ def main():
     if args.do_train and is_main_process() and not args.skip_checkpoint:
         # Save a trained model and the associated configuration
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME)
         torch.save({"model":model_to_save.state_dict()}, output_model_file)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME)
         with open(output_config_file, 'w') as f:
             f.write(model_to_save.config.to_json_string())
 

+ 252 - 0
PyTorch/LanguageModeling/BERT/scripts/configs/pretrain_config.sh

@@ -0,0 +1,252 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dgxa100_8gpu_fp16 ()
+{
+    train_batch_size="8192"
+    learning_rate="6e-3"
+    precision="fp16"
+    num_gpus=8
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=128
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="true"
+    train_batch_size_phase2=4096
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=256
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
+
+dgxa100_8gpu_tf32 ()
+{
+    train_batch_size="8192"
+    learning_rate="6e-3"
+    precision="tf32"
+    num_gpus=8
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=128
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="false"
+    train_batch_size_phase2=4096
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=512
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
+
+# Full pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_fp16 ()
+{
+    train_batch_size="4096"
+    learning_rate="6e-3"
+    precision="fp16"
+    num_gpus=16
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=64
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="true"
+    train_batch_size_phase2=2048
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=128
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
+
+dgx2_16gpu_fp32 ()
+{
+    train_batch_size="4096"
+    learning_rate="6e-3"
+    precision="fp32"
+    num_gpus=16
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=128
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="false"
+    train_batch_size_phase2=2048
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=256
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
+
+# Full pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_fp16 ()
+{
+    train_batch_size="8192"
+    learning_rate="6e-3"
+    precision="fp16"
+    num_gpus=8
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=512
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="true"
+    train_batch_size_phase2=4096
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=512
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
+
+dgx1_8gpu_fp32 ()
+{
+    train_batch_size="8192"
+    learning_rate="6e-3"
+    precision="fp32"
+    num_gpus=8
+    warmup_proportion="0.2843"
+    train_steps=7038
+    save_checkpoint_steps=200
+    resume_training="false"
+    create_logfile="true"
+    accumulate_gradients="true"
+    gradient_accumulation_steps=1024
+    seed=42
+    job_name="bert_lamb_pretraining"
+    allreduce_post_accumulation="true"
+    allreduce_post_accumulation_fp16="false"
+    train_batch_size_phase2=4096
+    learning_rate_phase2="4e-3"
+    warmup_proportion_phase2="0.128"
+    train_steps_phase2=1563
+    gradient_accumulation_steps_phase2=1024
+    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
+    BERT_CONFIG=bert_config.json
+    CODEDIR="/workspace/bert"
+    init_checkpoint="None"
+    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+    DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
+    echo $train_batch_size $learning_rate $precision $num_gpus \
+         $warmup_proportion $train_steps $save_checkpoint_steps \
+         $resume_training $create_logfile $accumulate_gradients  \
+         $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
+         $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
+         $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
+         $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR
+
+}
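
Each config function above prints its settings in the exact positional order consumed by scripts/run_pretraining.sh (whose positional indices are corrected later in this commit), so a configuration can be expanded straight into the launcher's argument list. A minimal usage sketch, assuming the container layout used throughout this commit:

    # Sketch: launch full pretraining with one of the named configs.
    source scripts/configs/pretrain_config.sh
    bash scripts/run_pretraining.sh $(dgx1_8gpu_fp16)   # word-splitting supplies args 1-23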

+ 120 - 0
PyTorch/LanguageModeling/BERT/scripts/configs/squad_config.sh

@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Full SQuAD training configs for NVIDIA DGX A100 (8x NVIDIA A100 40GB GPU)
+
+dgxa100_8gpu_fp16 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="32"
+    learning_rate="3e-5"
+    precision="fp16"
+    num_gpu="8"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
+
+dgxa100_8gpu_tf32 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="16"
+    learning_rate="3e-5"
+    precision="tf32"
+    num_gpu="8"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
+
+# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_fp16 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="16"
+    learning_rate="3e-5"
+    precision="fp16"
+    num_gpu="16"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
+
+dgx2_16gpu_fp32 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="8"
+    learning_rate="3e-5"
+    precision="fp16"
+    num_gpu="16"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
+
+# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_fp16 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="10"
+    learning_rate="3e-5"
+    precision="fp16"
+    num_gpu="8"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
+
+dgx1_8gpu_fp32 ()
+{
+    init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt"
+    epochs="2.0"
+    batch_size="4"
+    learning_rate="3e-5"
+    precision="fp32"
+    num_gpu="8"
+    seed="1"
+    squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1"
+    vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+    OUT_DIR="/workspace/bert/results/SQuAD"
+    echo $init_checkpoint $epochs $batch_size $learning_rate \
+     $precision $num_gpu $seed $squad_dir $vocab_file \
+     $OUT_DIR
+}
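
As with the pretraining configs, each function echoes ten values matching positional parameters 1-10 of scripts/run_squad.sh; later parameters such as mode keep their defaults unless passed explicitly. A sketch (quoting "train eval" keeps mode a single argument):

    # Sketch: fine-tune and evaluate on SQuAD v1.1 with a named config.
    source scripts/configs/squad_config.sh
    bash scripts/run_squad.sh $(dgx1_8gpu_fp16) "train eval"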

+ 14 - 11
PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh

@@ -29,16 +29,18 @@ seed=${12:-42}
 job_name=${13:-"bert_lamb_pretraining"}
 allreduce_post_accumulation=${14:-"true"}
 allreduce_post_accumulation_fp16=${15:-"true"}
-train_batch_size_phase2=${17:-4096}
-learning_rate_phase2=${18:-"4e-3"}
-warmup_proportion_phase2=${19:-"0.128"}
-train_steps_phase2=${20:-1563}
-gradient_accumulation_steps_phase2=${21:-512}
+train_batch_size_phase2=${16:-4096}
+learning_rate_phase2=${17:-"4e-3"}
+warmup_proportion_phase2=${18:-"0.128"}
+train_steps_phase2=${19:-1563}
+gradient_accumulation_steps_phase2=${20:-512}
 DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
-DATA_DIR_PHASE1=${22:-$BERT_PREP_WORKING_DIR/${DATASET}/}
+DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
 BERT_CONFIG=bert_config.json
-CODEDIR=${24:-"/workspace/bert"}
-init_checkpoint=${25:-"None"}
+DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
+DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
+CODEDIR=${23:-"/workspace/bert"}
+init_checkpoint=${24:-"None"}
 RESULTS_DIR=$CODEDIR/results
 CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
 
@@ -67,6 +69,8 @@ if [ "$precision" = "fp16" ] ; then
    PREC="--fp16"
    PREC="--fp16"
 elif [ "$precision" = "fp32" ] ; then
 elif [ "$precision" = "fp32" ] ; then
    PREC=""
    PREC=""
+elif [ "$precision" = "tf32" ] ; then
+   PREC=""
 else
 else
    echo "Unknown <precision> argument"
    echo "Unknown <precision> argument"
    exit -2
    exit -2
@@ -147,14 +151,13 @@ echo "finished pretraining"
 
 #Start Phase2
 
-DATASET=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
-DATA_DIR_PHASE2=${23:-$BERT_PREP_WORKING_DIR/${DATASET}/}
-
 PREC=""
 if [ "$precision" = "fp16" ] ; then
    PREC="--fp16"
 elif [ "$precision" = "fp32" ] ; then
    PREC=""
+elif [ "$precision" = "tf32" ] ; then
+   PREC=""
 else
    echo "Unknown <precision> argument"
    exit -2
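
With the off-by-one indices fixed, the launcher's positional interface is contiguous again: arguments 1-15 are phase-1 parameters, 16-20 phase-2 parameters, 21-22 the data directories, 23 the code directory, and 24 the initial checkpoint. An explicit call matching the new numbering (values are the dgx1_8gpu_fp16 defaults above):

    bash scripts/run_pretraining.sh \
        8192 6e-3 fp16 8 0.2843 7038 200 false true \
        true 512 42 bert_lamb_pretraining true true \
        4096 4e-3 0.128 1563 512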

+ 1 - 1
PyTorch/LanguageModeling/BERT/scripts/run_squad.sh

@@ -27,7 +27,7 @@ vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncas
 OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
 mode=${11:-"train eval"}
 CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
-max_steps=${13:-"-1"}
+max_steps=${13:-"-1"} 
 
 echo "out dir is $OUT_DIR"
 mkdir -p $OUT_DIR

+ 3 - 0
README.md

@@ -79,5 +79,8 @@ We're posting these examples on GitHub to better support the community, facilita
 ## Known issues
 In each of the network READMEs, we indicate any known issues and encourage the community to provide feedback.
 

+ 1 - 1
TensorFlow/LanguageModeling/BERT/.dockerignore

@@ -10,7 +10,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+data_dl/
 .idea/
 .git/
 __pycache__/

+ 1 - 1
TensorFlow/LanguageModeling/BERT/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.03-tf1-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
 
 FROM ${FROM_IMAGE_NAME}
 

File diff not shown because it is too large
+ 479 - 450
TensorFlow/LanguageModeling/BERT/README.md


+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/README.md

@@ -355,7 +355,7 @@ mpi_command="mpirun -np 16 -H localhost:16 \
     -x NCCL_DEBUG=INFO \
     -x LD_LIBRARY_PATH \
     -x PATH -mca pml ob1 -mca btl ^openib" \
-     python run_ner.py --horovod --use_fp16 --use_xla \
+     python run_ner.py --horovod --amp --use_xla \
       --vocab_file=$BERT_DIR/vocab.txt \
      --bert_config_file=$BERT_DIR/bert_config.json \
      --output_dir=/results --data_dir=$DATA_DIR"

+ 2 - 4
TensorFlow/LanguageModeling/BERT/biobert/conlleval.py

@@ -22,7 +22,7 @@ ANY_SPACE = '<SPACE>'
 class FormatError(Exception):
     pass
 
-Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
+Metrics = namedtuple('Metrics', 'tp fp fn precision recall f1')
 
 
 class EvalCounts(object):
@@ -197,9 +197,7 @@ def report(counts, out=None):
         out.write('FB1: %6.2f  %d\n' % (100.*m.fscore, c.t_found_guessed[i]))
 
 
-def report_notprint(counts, out=None):
-    if out is None:
-        out = sys.stdout
+def report_notprint(counts):
 
     overall, by_type = metrics(counts)
 

+ 12 - 40
TensorFlow/LanguageModeling/BERT/biobert/scripts/biobert_finetune_inference_benchmark.sh

@@ -49,21 +49,11 @@ if [ "$task" = "ner_bc5cdr-chem" ] ; then
 
     for seq_length in 128 512; do
         for batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_len}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${use_fp16}_bs_${batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
-
                 python /workspace/bert/run_ner.py \
                 --do_prepare=true \
                 --do_eval=true \
@@ -77,10 +67,10 @@ if [ "$task" = "ner_bc5cdr-chem" ] ; then
                 --eval_batch_size=$batch_size \
                 --predict_batch_size=$batch_size \
                 --max_seq_length=$seq_length \
-                $use_fp16 $use_xla_tag $case_flag  |& tee $tmp_file
+                $use_fp16 --use_xla $case_flag  |& tee $tmp_file
 
                   perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision $seq_len  $batch_size $perf" >> $LOGFILE
+                echo "$use_fp16 $seq_length  $batch_size $perf" >> $LOGFILE
 
             done
         done
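
Looping directly over the flag strings means the literal --amp/--noamp also lands in directory names and the log (for example prec_--amp). If cleaner labels are wanted, one possible sketch; tag is a hypothetical variable, not part of this commit:

    for use_fp16 in "--amp" "--noamp"; do
        tag=${use_fp16#--}    # strips the dashes: "amp" / "noamp"
        res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${tag}_bs_${batch_size}
        mkdir -p "${res_dir}"
    done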
@@ -97,20 +87,11 @@ elif [ "$task" = "ner_bc5cdr-disease" ] ; then
 
     for seq_length in 128 512; do
         for batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_len}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${use_fp16}_bs_${batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
                 python3 /workspace/bert/run_ner.py \
                 --do_prepare=true \
                 --do_eval=true \
@@ -124,10 +105,10 @@ elif [ "$task" = "ner_bc5cdr-disease" ] ; then
                 --eval_batch_size=$batch_size \
                 --predict_batch_size=$batch_size \
                 --max_seq_length=$seq_length \
-                "$use_fp16" $use_xla_tag $case_flag  |& tee $tmp_file
+                "$use_fp16" --use_xla $case_flag  |& tee $tmp_file
 
                   perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision $seq_len  $batch_size $perf" >> $LOGFILE
+                echo "$use_fp16 $seq_length  $batch_size $perf" >> $LOGFILE
 
             done
         done
@@ -144,20 +125,11 @@ elif [ "$task" = "rel_chemprot" ] ; then
 
     for seq_length in 128 512; do
         for batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_len}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${use_fp16}_bs_${batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
                 python3 /workspace/bert/run_re.py \
                 --do_prepare=true \
                 --do_eval=true \
@@ -171,10 +143,10 @@ elif [ "$task" = "rel_chemprot" ] ; then
                 --eval_batch_size=$batch_size \
                 --predict_batch_size=$batch_size \
                 --max_seq_length=$seq_length \
-                "$use_fp16" $use_xla_tag $case_flag  |& tee $tmp_file
+                "$use_fp16" --use_xla $case_flag  |& tee $tmp_file
 
                   perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision $seq_len  $batch_size $perf" >> $LOGFILE
+                echo "$use_fp16 $seq_length  $batch_size $perf" >> $LOGFILE
 
             done
         done

+ 12 - 42
TensorFlow/LanguageModeling/BERT/biobert/scripts/biobert_finetune_train_benchmark.sh

@@ -64,21 +64,11 @@ if [ "$task" = "ner_bc5cdr-chem" ] ; then
 
     for seq_length in 128 512; do
         for train_batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${use_fp16}_bs_${train_batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
-
                 $mpi_command python /workspace/bert/run_ner.py \
                   --do_prepare=true \
                   --do_train=true \
@@ -93,10 +83,10 @@ if [ "$task" = "ner_bc5cdr-chem" ] ; then
                   --output_dir=$res_dir \
                   --train_batch_size=$train_batch_size \
                   --max_seq_length=$seq_length \
-                  $use_hvd $use_fp16 $use_xla_tag $case_flag |& tee $tmp_file
+                  $use_hvd $use_fp16 --use_xla $case_flag |& tee $tmp_file
 
                 perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision  $seq_length  $train_batch_size $perf" >> $LOGFILE
+                echo "${use_fp16}  $seq_length  $train_batch_size $perf" >> $LOGFILE
 
             done
         done
@@ -111,21 +101,11 @@ elif [ "$task" = "ner_bc5cdr-disease" ] ; then
 
     for seq_length in 128 512; do
         for train_batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${use_fp16}_bs_${train_batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
-
                 $mpi_command python3 /workspace/bert/run_ner.py \
                 --do_prepare=true \
                 --do_train=true \
@@ -140,10 +120,10 @@ elif [ "$task" = "ner_bc5cdr-disease" ] ; then
                 --output_dir=$res_dir \
                 --train_batch_size=$train_batch_size \
                 --max_seq_length=$seq_length \
-                "$use_hvd" "$use_fp16" $use_xla_tag $case_flag  |& tee $tmp_file
+                "$use_hvd" "$use_fp16" --use_xla $case_flag  |& tee $tmp_file
 
                   perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision  $seq_length  $train_batch_size $perf" >> $LOGFILE
+                echo "${use_fp16}  $seq_length  $train_batch_size $perf" >> $LOGFILE
 
             done
         done
@@ -158,21 +138,11 @@ elif [ "$task" = "rel_chemprot" ] ; then
 
     for seq_length in 128 512; do
         for train_batch_size in 8 32 64; do
-            for precision in fp16 fp32; do
-                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${use_fp16}_bs_${train_batch_size}
                 mkdir -p ${res_dir}
                 tmp_file="${res_dir}/${task}_training_benchmark.log"
 
-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                    use_xla_tag="--use_xla"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                    use_xla_tag=""
-                fi
-
                 $mpi_command python3 /workspace/bert/run_re.py \
                 --do_prepare=true \
                 --do_train=true \
@@ -187,10 +157,10 @@ elif [ "$task" = "rel_chemprot" ] ; then
                 --output_dir=$res_dir \
                 --train_batch_size=$train_batch_size \
                 --max_seq_length=$seq_length \
-                "$use_hvd" "$use_fp16" $use_xla_tag $case_flag |& tee $tmp_file
+                "$use_hvd" "$use_fp16" --use_xla $case_flag |& tee $tmp_file
 
                 perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision  $seq_length  $train_batch_size $perf" >> $LOGFILE
+                echo "${use_fp16}  $seq_length  $train_batch_size $perf" >> $LOGFILE
 
             done
         done

+ 6 - 3
TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-chem.sh

@@ -42,15 +42,18 @@ mkdir -p ${OUTPUT_DIR}
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
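
The mapping from $precision to --amp/--noamp and from $use_xla to --use_xla/--nouse_xla recurs verbatim across the next few scripts; a possible factoring, sketched with hypothetical helpers (amp_flag, xla_flag) that are not part of this commit:

    amp_flag () { [ "$1" = "fp16" ] && echo "--amp" || echo "--noamp"; }
    xla_flag () { [ "$1" = "true" ] && echo "--use_xla" || echo "--nouse_xla"; }
    use_fp16=$(amp_flag "$precision")
    use_xla_tag=$(xla_flag "$use_xla")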
 
 
 
 

+ 6 - 3
TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-disease.sh

@@ -42,15 +42,18 @@ mkdir -p ${OUTPUT_DIR}
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
 
 if [ $num_gpu -gt 1 ] ; then

+ 6 - 3
TensorFlow/LanguageModeling/BERT/biobert/scripts/rel_chemprot.sh

@@ -41,15 +41,18 @@ mkdir -p ${OUTPUT_DIR}
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
 
 if [ $num_gpu -gt 1 ] ; then

+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert.sub

@@ -80,7 +80,7 @@ BERT_CMD="\
      --do_train=True \
      --do_eval=True \
      --save_checkpoints_steps=5000 \
-     --horovod --use_fp16 --use_xla \
+     --horovod --amp --use_xla \
      --allreduce_post_accumulation=True \
      --eval_batch_size=8"
 

+ 6 - 4
TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert_finetuning_inference.sh

@@ -36,17 +36,19 @@ else
     export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
 fi
 
-use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
 
 DATESTAMP=`date +'%y%m%d%H%M%S'`

+ 6 - 3
TensorFlow/LanguageModeling/BERT/biobert/scripts/run_pretraining_pubmed_base_phase_1.sh

@@ -16,15 +16,18 @@ eval_batch_size=${11:-80}
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
 
 if [ "$cased" = "true" ] ; then

+ 6 - 3
TensorFlow/LanguageModeling/BERT/biobert/scripts/run_pretraining_pubmed_base_phase_2.sh

@@ -18,15 +18,18 @@ eval_batch_size=${12:-26}
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi
 
 if [ "$cased" = "true" ] ; then

+ 1 - 1
TensorFlow/LanguageModeling/BERT/run.sub

@@ -66,7 +66,7 @@ BERT_CMD="\
      --do_train=True \
      --do_eval=True \
      --save_checkpoints_steps=100 \
-     --horovod --use_fp16 --use_xla \
+     --horovod --amp --use_xla \
      --allreduce_post_accumulation=True \
      --eval_batch_size=8"
 

+ 42 - 11
TensorFlow/LanguageModeling/BERT/run_classifier.py

@@ -34,6 +34,7 @@ import utils.dllogger_class
 from dllogger import Verbosity
 from utils.create_glue_data import *
 import numpy as np
+import tf_metrics
 
 flags = tf.flags
 
@@ -64,6 +65,10 @@ flags.DEFINE_string(
     "dllog_path", "/results/bert_dllog.json",
     "dllog_path", "/results/bert_dllog.json",
     "filename where dllogger writes to")
     "filename where dllogger writes to")
 
 
+flags.DEFINE_string(
+    "optimizer_type", "lamb",
+    "Optimizer type : adam or lamb")
+
 flags.DEFINE_string(
 flags.DEFINE_string(
     "init_checkpoint", None,
     "init_checkpoint", None,
     "Initial checkpoint (usually from a pre-trained BERT model).")
     "Initial checkpoint (usually from a pre-trained BERT model).")
@@ -107,15 +112,16 @@ flags.DEFINE_float(
 
 flags.DEFINE_integer("save_checkpoints_steps", 1000,
                      "How often to save the model checkpoint.")
+flags.DEFINE_integer("display_loss_steps", 10,
+                     "How often to print loss from estimator")
 
 flags.DEFINE_integer("iterations_per_loop", 1000,
                      "How many steps to make in each estimator call.")
 flags.DEFINE_integer("num_accumulation_steps", 1,
                      "Number of accumulation steps before gradient update" 
                       "Global batch size = num_accumulation_steps * train_batch_size")
-flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
-
-flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.")
+flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.")
 flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
 
 flags.DEFINE_bool(
@@ -181,7 +187,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
       input_mask=input_mask,
       token_type_ids=segment_ids,
       use_one_hot_embeddings=use_one_hot_embeddings,
-      compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32)
+      compute_type=tf.float32)
 
   # In the demo, we are doing a simple classification task on the entire
   # segment.
@@ -254,7 +260,7 @@ def get_frozen_tftrt_model(bert_config, shape, num_labels, use_one_hot_embedding
         input_graph_def=frozen_graph,
         nodes_blacklist=output_node_names,
         max_workspace_size_bytes=(4096 << 20) - 1000,
-        precision_mode = "FP16" if FLAGS.use_fp16 else "FP32",
+        precision_mode = "FP16" if FLAGS.amp else "FP32",
         minimum_segment_size=4,
         is_dynamic_op=True,
         maximum_cached_engines=1000
@@ -292,6 +298,16 @@ def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learni
             MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5
             MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op, tf.identity(MCC, name="MCC"))
             return {"MCC": (MCC, MCC_op)}
+        elif task_name == "mrpc":
+            accuracy = tf.metrics.accuracy(
+                labels=label_ids, predictions=predictions)
+            loss = tf.metrics.mean(values=per_example_loss)
+            f1 = tf_metrics.f1(labels=label_ids, predictions=predictions, num_classes=2, pos_indices=[1])
+            return {
+                "eval_accuracy": accuracy,
+                "eval_f1": f1,
+                "eval_loss": loss,
+            }
         else:
             accuracy = tf.metrics.accuracy(
                 labels=label_ids, predictions=predictions)
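
With the new flags and the MRPC metric branch in place, a GLUE fine-tuning call could look like the sketch below; the data and checkpoint paths are illustrative, not prescribed by this commit:

    python run_classifier.py --task_name=mrpc --do_train=true --do_eval=true \
        --data_dir=$GLUE_DIR/MRPC --vocab_file=$BERT_DIR/vocab.txt \
        --bert_config_file=$BERT_DIR/bert_config.json \
        --init_checkpoint=$BERT_DIR/bert_model.ckpt \
        --output_dir=/results --amp --use_xla --optimizer_type=lamb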
@@ -354,19 +370,28 @@ def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learni
 
       train_op = optimization.create_optimizer(
           total_loss, learning_rate, num_train_steps, num_warmup_steps,
-          hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps)
-
+          hvd, False, FLAGS.amp, FLAGS.num_accumulation_steps, FLAGS.optimizer_type)
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
           loss=total_loss,
           train_op=train_op)
     elif mode == tf.estimator.ModeKeys.EVAL:
+      dummy_op = tf.no_op()
+      # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+      if FLAGS.amp:
+        dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+            optimization.LAMBOptimizer(learning_rate=0.0))
       eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
           loss=total_loss,
           eval_metric_ops=eval_metric_ops)
     else:
+      dummy_op = tf.no_op()
+      # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+      if FLAGS.amp:
+        dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+            optimization.LAMBOptimizer(learning_rate=0.0))
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode, predictions=probabilities)
     return output_spec
@@ -429,7 +454,11 @@ def input_fn_builder(features, batch_size, seq_length, is_training, drop_remaind
 
 
 def main(_):
-  os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM
+  # causes memory fragmentation for bert leading to OOM
+  if os.environ.get("TF_XLA_FLAGS", None) is not None:
+    os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation=false"
+  else:
+    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"
 
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
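
TF_XLA_FLAGS is parsed as a whitespace-separated flag list, so the append branch above needs the leading space in the string it concatenates; without a separator, a value already exported by the user would fuse with the new flag:

    # e.g. with a pre-set value
    export TF_XLA_FLAGS="--tf_xla_auto_jit=2"
    # appending without a separator would produce the unparseable
    #   "--tf_xla_auto_jit=2--tf_xla_enable_lazy_compilation=false"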
@@ -494,6 +523,8 @@ def main(_):
       model_dir=FLAGS.output_dir if master_process else None,
       session_config=config,
       save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
+      save_summary_steps=FLAGS.save_checkpoints_steps if master_process else None,
+      log_step_count_steps=FLAGS.display_loss_steps,
       keep_checkpoint_max=1)
 
   if master_process:
@@ -505,7 +536,7 @@ def main(_):
   train_examples = None
   num_train_steps = None
   num_warmup_steps = None
-  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))
+  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=10))
 
   if FLAGS.do_train:
     train_examples = processor.get_train_examples(FLAGS.data_dir)
@@ -623,7 +654,7 @@ def main(_):
     tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
     tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
     tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
@@ -698,7 +729,7 @@ def main(_):
     tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
     tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
     tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)

+ 23 - 7
TensorFlow/LanguageModeling/BERT/run_ner.py

@@ -124,8 +124,8 @@ flags.DEFINE_integer(
 tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
 tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
 
 
 flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
 flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
-flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
-flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.")
+flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.")
 
 
 class InputExample(object):
 class InputExample(object):
     """A single training/test example for simple sequence classification."""
     """A single training/test example for simple sequence classification."""
@@ -501,7 +501,7 @@ def create_model(bert_config, is_training, input_ids, input_mask,
 
 def model_fn_builder(bert_config, num_labels, init_checkpoint=None, learning_rate=None,
                      num_train_steps=None, num_warmup_steps=None,
-                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
+                     use_one_hot_embeddings=False, hvd=None, amp=False):
     def model_fn(features, labels, mode, params):
         tf.compat.v1.logging.info("*** Features ***")
         for name in sorted(features.keys()):
@@ -536,12 +536,17 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint=None, learning_rat
         output_spec = None
         if mode == tf.estimator.ModeKeys.TRAIN:
             train_op = optimization.create_optimizer(
-                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16)
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, amp)
             output_spec = tf.estimator.EstimatorSpec(
               mode=mode,
               loss=total_loss,
               train_op=train_op)
         elif mode == tf.estimator.ModeKeys.EVAL:
+            dummy_op = tf.no_op()
+            # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+            if amp:
+                dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+                    optimization.LAMBOptimizer(learning_rate=0.0))
 
             def metric_fn(per_example_loss, label_ids, logits):
                 # def metric_fn(label_ids, logits):
@@ -562,6 +567,13 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint=None, learning_rat
               loss=total_loss,
               eval_metric_ops=eval_metric_ops)
         else:
+
+            dummy_op = tf.no_op()
+            # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+            if amp:
+                dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+                    optimization.LAMBOptimizer(learning_rate=0.0))
+
             output_spec = tf.estimator.EstimatorSpec(
               mode=mode, predictions=predicts)#probabilities)
         return output_spec
@@ -613,7 +625,11 @@ def result_to_pair(predict_line, pred_ids, id2label, writer, err_writer):
 
 
 def main(_):
-    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM
+    # causes memory fragmentation for bert leading to OOM
+    if os.environ.get("TF_XLA_FLAGS", None) is not None:
+        os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation=false"
+    else:
+        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"
 
     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
     dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -716,7 +732,7 @@ def main(_):
         num_warmup_steps=num_warmup_steps,
         use_one_hot_embeddings=False,
         hvd=None if not FLAGS.horovod else hvd,
-        use_fp16=FLAGS.use_fp16)
+        amp=FLAGS.amp)
 
     estimator = tf.estimator.Estimator(
       model_fn=model_fn,
@@ -852,7 +868,7 @@ def main(_):
         tf.compat.v1.logging.info("Summary Inference Statistics")
         tf.compat.v1.logging.info("Summary Inference Statistics")
         tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
         tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
         tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
         tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
         tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)

+ 13 - 13
TensorFlow/LanguageModeling/BERT/run_pretraining.py

@@ -119,9 +119,8 @@ flags.DEFINE_bool("report_loss", True, "Whether to report total loss during trai
 flags.DEFINE_bool("manual_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU. "
 flags.DEFINE_bool("manual_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU. "
                                         "Manual casting is done instead of using AMP")
                                         "Manual casting is done instead of using AMP")
 
 
-flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
-
-flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
+flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.")
+flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.")
 flags.DEFINE_integer("init_loss_scale", 2**32, "Initial value of loss scale if mixed precision training")
 flags.DEFINE_integer("init_loss_scale", 2**32, "Initial value of loss scale if mixed precision training")
 
 
 # report samples/sec, total loss and learning rate during training
 # report samples/sec, total loss and learning rate during training
@@ -150,7 +149,7 @@ class _LogSessionRunHook(tf.estimator.SessionRunHook):
   def before_run(self, run_context):
     self.t0 = time.time()
     if self.num_accumulation_steps <= 1:
-        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+        if FLAGS.manual_fp16 or FLAGS.amp:
             return tf.estimator.SessionRunArgs(
                 fetches=['step_update:0', 'total_loss:0',
                          'learning_rate:0', 'nsp_loss:0',
@@ -161,7 +160,7 @@ class _LogSessionRunHook(tf.estimator.SessionRunHook):
                          'learning_rate:0', 'nsp_loss:0',
                          'mlm_loss:0'])
     else:
-        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+        if FLAGS.manual_fp16 or FLAGS.amp:
             return tf.estimator.SessionRunArgs(
                 fetches=['step_update:0', 'update_step:0', 'total_loss:0',
                          'learning_rate:0', 'nsp_loss:0',
@@ -175,14 +174,14 @@ class _LogSessionRunHook(tf.estimator.SessionRunHook):
     run_time = time.time() - self.t0
 
     if self.num_accumulation_steps <=1:
-        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+        if FLAGS.manual_fp16 or FLAGS.amp:
             self.global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
         else:
             self.global_step, total_loss, lr, nsp_loss, mlm_loss = run_values. \
                 results
         update_step = True
     else:
-        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+        if FLAGS.manual_fp16 or FLAGS.amp:
           self.global_step, update_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
         else:
           self.global_step, update_step, total_loss, lr, nsp_loss, mlm_loss = run_values.\
@@ -212,7 +211,7 @@ class _LogSessionRunHook(tf.estimator.SessionRunHook):
             sent_per_sec = self.global_batch_size / dt
             avg_loss_step = self.loss / self.all_count
             if self.hvd_rank >= 0 and FLAGS.report_loss:
-              if FLAGS.manual_fp16 or FLAGS.use_fp16:
+              if FLAGS.manual_fp16 or FLAGS.amp:
                 self.dllogging.logger.log(step=(print_step),
                                      data={"Rank": int(self.hvd_rank), "throughput_train": float(sent_per_sec),
                                            "mlm_loss":float(mlm_loss), "nsp_loss":float(nsp_loss),
@@ -227,7 +226,7 @@ class _LogSessionRunHook(tf.estimator.SessionRunHook):
                                            "learning_rate": str(lr)},
                                            "learning_rate": str(lr)},
                                      verbosity=Verbosity.DEFAULT)
                                      verbosity=Verbosity.DEFAULT)
             else:
             else:
-              if FLAGS.manual_fp16 or FLAGS.use_fp16:
+              if FLAGS.manual_fp16 or FLAGS.amp:
                 self.dllogging.logger.log(step=int(print_step),
                 self.dllogging.logger.log(step=int(print_step),
                                      data={"throughput_train": float(sent_per_sec),
                                      data={"throughput_train": float(sent_per_sec),
                                            "mlm_loss":float(mlm_loss), "nsp_loss":float(nsp_loss),
                                            "mlm_loss":float(mlm_loss), "nsp_loss":float(nsp_loss),
@@ -316,7 +315,7 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
     if mode == tf.estimator.ModeKeys.TRAIN:
     if mode == tf.estimator.ModeKeys.TRAIN:
       train_op = optimization.create_optimizer(
       train_op = optimization.create_optimizer(
           total_loss, learning_rate, num_train_steps, num_warmup_steps,
           total_loss, learning_rate, num_train_steps, num_warmup_steps,
-          hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation, FLAGS.init_loss_scale)
+          hvd, FLAGS.manual_fp16, FLAGS.amp, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation, FLAGS.init_loss_scale)
 
 
       output_spec = tf.estimator.EstimatorSpec(
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
           mode=mode,
@@ -567,7 +566,7 @@ def main(_):
 
 
   if FLAGS.horovod and len(input_files) < hvd.size():
   if FLAGS.horovod and len(input_files) < hvd.size():
       raise ValueError("Input Files must be sharded")
       raise ValueError("Input Files must be sharded")
-  if FLAGS.use_fp16 and FLAGS.manual_fp16:
+  if FLAGS.amp and FLAGS.manual_fp16:
       raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
       raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
 
 
   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
@@ -584,7 +583,8 @@ def main(_):
   if FLAGS.use_xla: 
   if FLAGS.use_xla: 
       config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
       config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
       config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
       config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
-      tf.enable_resource_variables()
+      if FLAGS.amp:
+        tf.enable_resource_variables()
 
 
   run_config = tf.estimator.RunConfig(
   run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir,
       model_dir=FLAGS.output_dir,
@@ -687,7 +687,7 @@ def main(_):
     tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
     tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
     tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
     tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
     dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
     dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
     tf.compat.v1.logging.info("-----------------------------")
     tf.compat.v1.logging.info("-----------------------------")

+ 23 - 7
TensorFlow/LanguageModeling/BERT/run_re.py

@@ -116,8 +116,8 @@ flags.DEFINE_integer("iterations_per_loop", 1000,
 tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")

 flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
-flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
-flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUs.")
+flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.")

 class InputExample(object):
     """A single training/test example for simple sequence classification."""
@@ -569,7 +569,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,

 def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=None,
                      num_train_steps=None, num_warmup_steps=None,
-                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
+                     use_one_hot_embeddings=False, hvd=None, amp=False):
     """Returns `model_fn` closure for TPUEstimator."""

     def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
@@ -615,7 +615,7 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=Non
         if mode == tf.estimator.ModeKeys.TRAIN:

             train_op = optimization.create_optimizer(
-                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16)
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, amp)

             output_spec = tf.estimator.EstimatorSpec(
               mode=mode,
@@ -623,6 +623,12 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=Non
               train_op=train_op)
         elif mode == tf.estimator.ModeKeys.EVAL:

+            dummy_op = tf.no_op()
+            # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+            if amp:
+                dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+                    optimization.LAMBOptimizer(learning_rate=0.0))
+
             def metric_fn(per_example_loss, label_ids, logits, is_real_example):
                 predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                 accuracy = tf.metrics.accuracy(
@@ -639,6 +645,12 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=Non
               loss=total_loss,
               eval_metric_ops=eval_metric_ops)
         else:
+            dummy_op = tf.no_op()
+            # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+            if amp:
+                dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+                    optimization.LAMBOptimizer(learning_rate=0.0))
+
             output_spec = tf.estimator.EstimatorSpec(
                     mode=mode, predictions={"probabilities": probabilities})#predicts)#probabilities)
         return output_spec
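
The `dummy_op` blocks look odd but are deliberate: `enable_mixed_precision_graph_rewrite` flips process-wide rewriter state, and in EVAL/PREDICT mode no training optimizer is ever built, so the rewrite has to be triggered once by hand or the forward graph stays in fp32. A condensed sketch of the idea, using a throwaway TF1 optimizer:

    import tensorflow.compat.v1 as tf

    def enable_amp_for_inference():
        # The returned optimizer is never used for training; calling the
        # rewrite is enough to make subsequently built graphs run in fp16.
        tf.train.experimental.enable_mixed_precision_graph_rewrite(
            tf.train.GradientDescentOptimizer(learning_rate=0.0))
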
@@ -719,7 +731,11 @@ def convert_examples_to_features(examples, label_list, max_seq_length,


 def main(_):
-    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM
+    # causes memory fragmentation for bert leading to OOM
+    if os.environ.get("TF_XLA_FLAGS", None) is not None:
+        os.environ["TF_XLA_FLAGS"] += "--tf_xla_enable_lazy_compilation=false"
+    else:
+        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
     dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
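
One caveat with the `+=` branch above: it concatenates directly onto whatever is already in `TF_XLA_FLAGS`, so if the existing value has no trailing space the two flags fuse into one token. A defensive variant (a sketch, not the repo's code) joins with a space:

    import os

    def append_tf_xla_flag(flag):
        # Space-join so an existing value such as "--tf_xla_cpu_global_jit"
        # does not merge with the new flag.
        existing = os.environ.get("TF_XLA_FLAGS", "")
        os.environ["TF_XLA_FLAGS"] = (existing + " " + flag).strip()

    append_tf_xla_flag("--tf_xla_enable_lazy_compilation=false")
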
@@ -829,7 +845,7 @@ def main(_):
         num_warmup_steps=num_warmup_steps,
         use_one_hot_embeddings=False,
         hvd=None if not FLAGS.horovod else hvd,
-        use_fp16=FLAGS.use_fp16)
+        amp=FLAGS.amp)

     estimator = tf.estimator.Estimator(
       model_fn=model_fn,
@@ -970,7 +986,7 @@ def main(_):
         tf.compat.v1.logging.info("Summary Inference Statistics")
         tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
         tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
         tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
         tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)

+ 22 - 9
TensorFlow/LanguageModeling/BERT/run_squad.py

@@ -157,8 +157,8 @@ def extract_run_squad_flags():
       "null_score_diff_threshold", 0.0,
       "If null_score - best_non_null is greater than the threshold predict null.")

-  flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
-  flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+  flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUs.")
+  flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.")
   flags.DEFINE_integer("num_eval_iterations", None,
                        "How many eval iterations to run - performs inference on subset")

@@ -259,7 +259,7 @@ def get_frozen_tftrt_model(bert_config, shape, use_one_hot_embeddings, init_chec
         input_graph_def=frozen_graph,
         nodes_blacklist=output_node_names,
         max_workspace_size_bytes=(4096 << 20) - 1000,
-        precision_mode = "FP16" if FLAGS.use_fp16 else "FP32",
+        precision_mode = "FP16" if FLAGS.amp else "FP32",
         minimum_segment_size=4,
         is_dynamic_op=True,
         maximum_cached_engines=1000
@@ -279,7 +279,7 @@ def get_frozen_tftrt_model(bert_config, shape, use_one_hot_embeddings, init_chec

 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps,
-                     hvd=None, use_fp16=False, use_one_hot_embeddings=False):
+                     hvd=None, amp=False, use_one_hot_embeddings=False):
   """Returns `model_fn` closure for Estimator."""

   def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
@@ -359,13 +359,20 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
       total_loss = (start_loss + end_loss) / 2.0

       train_op = optimization.create_optimizer(
-          total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16, FLAGS.num_accumulation_steps)
+          total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, amp, FLAGS.num_accumulation_steps)

       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
           loss=total_loss,
           train_op=train_op)
     elif mode == tf.estimator.ModeKeys.PREDICT:
+
+      dummy_op = tf.no_op()
+      # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
+      if amp:
+        dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+            optimization.LAMBOptimizer(learning_rate=0.0))
+
       predictions = {
           "unique_ids": unique_ids,
           "start_logits": start_logits,
@@ -928,7 +935,11 @@ dynamic_batching {{
         file.write(final_config_str)

 def main(_):
-  os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM
+  # causes memory fragmentation for bert leading to OOM
+  if os.environ.get("TF_XLA_FLAGS", None) is not None:
+    os.environ["TF_XLA_FLAGS"] += "--tf_xla_enable_lazy_compilation=false"
+  else:
+    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -965,7 +976,9 @@ def main(_):
           training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
   if FLAGS.use_xla:
     config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
-    tf.enable_resource_variables()
+    if FLAGS.amp:
+        tf.enable_resource_variables()
+
   run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir if master_process else None,
       session_config=config,
@@ -1022,7 +1035,7 @@ def main(_):
       num_train_steps=num_train_steps,
       num_warmup_steps=num_warmup_steps,
       hvd=None if not FLAGS.horovod else hvd,
-      use_fp16=FLAGS.use_fp16)
+      amp=FLAGS.amp)

   estimator = tf.estimator.Estimator(
       model_fn=model_fn,
@@ -1165,7 +1178,7 @@ def main(_):
     tf.compat.v1.logging.info("Summary Inference Statistics")
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
-    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
     tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
     tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)

+ 0 - 0
TensorFlow/LanguageModeling/BERT/configurations.yml → TensorFlow/LanguageModeling/BERT/scripts/configs/configurations.yml


+ 85 - 0
TensorFlow/LanguageModeling/BERT/scripts/configs/pretrain_config.sh

@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Full LAMB pretraining configs for NVIDIA DGX A100 (8x NVIDIA A100 40GB GPU)
+
+dgxa100_8gpu_fp16 ()
+{
+  train_batch_size_phase1=64
+  train_batch_size_phase2=16
+  eval_batch_size=8
+  learning_rate_phase1="7.5e-4"
+  learning_rate_phase2="5e-4"
+  precision="fp16"
+  use_xla="true"
+  num_gpus=8
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
+
+dgxa100_8gpu_tf32 ()
+{
+  train_batch_size_phase1=64
+  train_batch_size_phase2=8
+  eval_batch_size=8
+  learning_rate_phase1="7.5e-4"
+  learning_rate_phase2="5e-4"
+  precision="tf32"
+  use_xla="true"
+  num_gpus=8
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
+
+# Full LAMB pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_fp16 ()
+{
+  train_batch_size_phase1=64
+  train_batch_size_phase2=8
+  eval_batch_size=8
+  learning_rate_phase1="3.75e-4"
+  learning_rate_phase2="2.5e-4"
+  precision="fp16"
+  use_xla="true"
+  num_gpus=16
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
+
+dgx2_16gpu_fp32 ()
+{
+  train_batch_size_phase1=32
+  train_batch_size_phase2=8
+  eval_batch_size=8
+  learning_rate_phase1="3.75e-4"
+  learning_rate_phase2="2.5e-4"
+  precision="fp32"
+  use_xla="true"
+  num_gpus=16
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
+
+# Full LAMB pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_fp16 ()
+{
+  train_batch_size_phase1=16
+  train_batch_size_phase2=4
+  eval_batch_size=8
+  learning_rate_phase1="7.5e-4"
+  learning_rate_phase2="5e-4"
+  precision="fp16"
+  use_xla="true"
+  num_gpus=8
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
+
+dgx1_8gpu_fp32 ()
+{
+  train_batch_size_phase1=8
+  train_batch_size_phase2=2
+  eval_batch_size=8
+  learning_rate_phase1="7.5e-4"
+  learning_rate_phase2="5e-4"
+  precision="fp32"
+  use_xla="true"
+  num_gpus=8
+  echo $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 $learning_rate_phase2 $precision $use_xla $num_gpus
+}
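
Each config function prints its fields on one line, so a caller can source the file and split the output positionally. A hypothetical consumer (the launcher script's exact wiring may differ):

    import subprocess

    out = subprocess.check_output(
        ["bash", "-c",
         "source scripts/configs/pretrain_config.sh && dgx1_8gpu_fp16"],
        text=True)
    # Field order matches the echo line above.
    (bs_phase1, bs_phase2, eval_bs, lr_phase1, lr_phase2,
     precision, use_xla, num_gpus) = out.split()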

+ 85 - 0
TensorFlow/LanguageModeling/BERT/scripts/configs/squad_config.sh

@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Full SQuAD training configs for NVIDIA DGX A100 (8x NVIDIA A100 40GB GPU)
+
+dgxa100_8gpu_fp16 ()
+{
+  batch_size=32
+  learning_rate=5e-6
+  precision=fp16
+  use_xla=true
+  num_gpu=8
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}
+
+dgxa100_8gpu_tf32 ()
+{
+  batch_size=16
+  learning_rate=5e-6
+  precision=tf32
+  use_xla=true
+  num_gpu=8
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}
+
+# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_fp16 ()
+{
+  batch_size=24
+  learning_rate=2.5e-6
+  precision=fp16
+  use_xla=true
+  num_gpu=16
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}
+
+dgx2_16gpu_fp32 ()
+{
+  batch_size=8
+  learning_rate=2.5e-6
+  precision=fp32
+  use_xla=true
+  num_gpu=16
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}
+
+# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_fp16 ()
+{
+  batch_size=4
+  learning_rate=5e-6
+  precision=fp16
+  use_xla=true
+  num_gpu=8
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}
+
+dgx1_8gpu_fp32 ()
+{
+  batch_size=2
+  learning_rate=5e-6
+  precision=fp32
+  use_xla=true
+  num_gpu=8
+  seq_length=384
+  doc_stride=128
+  bert_model="large"
+  echo $batch_size $learning_rate $precision $use_xla $num_gpu $seq_length $doc_stride $bert_model
+}

+ 1 - 1
TensorFlow/LanguageModeling/BERT/scripts/docker/build.sh

@@ -1,5 +1,5 @@
 #!/bin/bash

-docker pull nvcr.io/nvidia/tensorrtserver:19.08-py3
+docker pull nvcr.io/nvidia/tritonserver:20.03-py3

 docker build . --rm -t bert

+ 1 - 1
TensorFlow/LanguageModeling/BERT/scripts/docker/launch.sh

@@ -4,7 +4,7 @@ CMD=${@:-/bin/bash}
 NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"all"}


-nvidia-docker run --rm -it \
+docker run --gpus $NV_VISIBLE_DEVICES --rm -it \
     --net=host \
     --shm-size=1g \
     --ulimit memlock=-1 \
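
The switch from `nvidia-docker` to `docker run --gpus` is not a pure alias: `--gpus` accepts `all` directly, but an index list like `0,1` has to be wrapped as `device=0,1`. A hypothetical helper that builds the argument from an `NVIDIA_VISIBLE_DEVICES`-style value:

    def gpus_argument(visible_devices):
        # "all" passes through unchanged; device indices/UUIDs need the
        # "device=" prefix expected by `docker run --gpus`.
        if visible_devices == "all":
            return "all"
        return "device=" + visible_devices

    # e.g. docker run --gpus "device=0,1" ... when NVIDIA_VISIBLE_DEVICES=0,1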

+ 26 - 36
TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh

@@ -13,17 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-bert_model=${1:-"large"}
-task=${2:-"squad"}
-
-if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
-else
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
-fi
-echo  "BERT directory set as " $BERT_DIR
-
-init_checkpoint="$BERT_DIR/bert_model.ckpt"
+task=${1:-"squad"}

 #Edit to save logs & checkpoints in a different directory
 RESULTS_DIR=/results
@@ -41,24 +31,24 @@ if [ "$task" = "squad" ] ; then
     echo "Squad directory set as " $SQUAD_DIR

     echo "Inference performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
-    echo "Precision Sequence-Length Batch-size Precision Throughput-Average(sent/sec) Latency-Average(ms) Latency-50%(ms) Latency-90%(ms) Latency-95%(ms) Latency-99%(ms) Latency-100%(ms)" >> $LOGFILE
+    for bert_model in "base" "large"; do
+      echo "Model Sequence-Length Batch-size Precision Throughput-Average(sent/sec) Latency-Average(ms) Latency-50%(ms) Latency-90%(ms) Latency-95%(ms) Latency-99%(ms) Latency-100%(ms)" >> $LOGFILE
+

-    for seq_len in 128 384; do
+      if [ "$bert_model" = "large" ] ; then
+        export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+      else
+          export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+      fi
+      echo  "BERT directory set as " $BERT_DIR

-    for bs in 1 2 4 8; do
+      init_checkpoint="$BERT_DIR/bert_model.ckpt"

-    for precision in fp16 fp32; do
+      for seq_len in 128 384; do

+      for bs in 1 2 4 8; do

-        if [ "$precision" = "fp16" ] ; then
-            echo "fp16 and XLA activated!"
-            use_fp16="--use_fp16"
-            use_xla_tag="--use_xla"
-        else
-            echo "fp32 activated!"
-            use_fp16=""
-            use_xla_tag=""
-        fi
+      for use_fp16 in "--amp" "--noamp"; do

         python run_squad.py \
         --vocab_file=$BERT_DIR/vocab.txt \
@@ -71,21 +61,21 @@ if [ "$task" = "squad" ] ; then
         --doc_stride=128 \
         --output_dir=${RESULTS_DIR} \
         "$use_fp16" \
-        $use_xla_tag --num_eval_iterations=1024 |& tee $tmp_file
+        --use_xla --num_eval_iterations=1024 |& tee $tmp_file

-        perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}'`
-        la=`cat $tmp_file | grep -F 'Latency Average (ms)' | awk -F'= ' '{print $2}'`
-        l50=`cat $tmp_file | grep -F 'Latency Confidence Level 50 (ms)' | awk -F'= ' '{print $2}'`
-        l90=`cat $tmp_file | grep -F 'Latency Confidence Level 90 (ms)' | awk -F'= ' '{print $2}'`
-        l95=`cat $tmp_file | grep -F 'Latency Confidence Level 95 (ms)' | awk -F'= ' '{print $2}'`
-        l99=`cat $tmp_file | grep -F 'Latency Confidence Level 99 (ms)' | awk -F'= ' '{print $2}'`
-        l100=`cat $tmp_file | grep -F 'Latency Confidence Level 100 (ms)' | awk -F'= ' '{print $2}'`
+        perf=`cat $tmp_file | grep -F 'INFO:tensorflow:Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}'`
+        la=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Average (ms)' | awk -F'= ' '{print $2}'`
+        l50=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Confidence Level 50 (ms)' | awk -F'= ' '{print $2}'`
+        l90=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Confidence Level 90 (ms)' | awk -F'= ' '{print $2}'`
+        l95=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Confidence Level 95 (ms)' | awk -F'= ' '{print $2}'`
+        l99=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Confidence Level 99 (ms)' | awk -F'= ' '{print $2}'`
+        l100=`cat $tmp_file | grep -F 'INFO:tensorflow:Latency Confidence Level 100 (ms)' | awk -F'= ' '{print $2}'`

-        echo "$precision $seq_len $bs $precision $perf $la $l50 $l90 $l95 $l99 $l100" >> $LOGFILE
+        echo "$bert_model $seq_len $bs $use_fp16 $perf $la $l50 $l90 $l95 $l99 $l100" >> $LOGFILE

-     done
-     done
-     done
+       done
+       done
+       done

 else

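The grep pipeline now has to match the `INFO:tensorflow:` prefix because the metrics are emitted through `tf.compat.v1.logging`. A rough Python equivalent of that extraction (log format assumed from the logging calls above):

    import re

    def last_metric(log_text, label):
        # e.g. label = "Throughput Average (sentences/sec)"
        pattern = re.escape("INFO:tensorflow:" + label) + r"\s*=\s*([0-9.]+)"
        matches = re.findall(pattern, log_text)
        return float(matches[-1]) if matches else None

    log = "INFO:tensorflow:Throughput Average (sentences/sec) = 312.50"
    print(last_metric(log, "Throughput Average (sentences/sec)"))  # 312.5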

+ 4 - 12
TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh

@@ -41,7 +41,7 @@ echo "Results directory set as " $RESULTS_DIR
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi

 if [ $num_gpu -gt 1 ] ; then
@@ -75,19 +75,11 @@ if [ "$task" = "squad" ] ; then
         fi

         for batch_size in 1 2 4; do
-            for precision in fp16 fp32; do
-                res_dir=${RESULTS_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_len}_prec_${precision}_bs_${batch_size}
+            for use_fp16 in "--amp" "--noamp"; do
+                res_dir=${RESULTS_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_len}_prec_${use_fp16}_bs_${batch_size}
                 mkdir -p $res_dir
                 tmp_file="${res_dir}/${task}_training_benchmark.log"

-                if [ "$precision" = "fp16" ] ; then
-                    echo "fp16 activated!"
-                    use_fp16="--use_fp16"
-                else
-                    echo "fp32 activated!"
-                    use_fp16=""
-                fi
-
                 $mpi_command python run_squad.py \
                 --vocab_file=$BERT_DIR/vocab.txt \
                 --bert_config_file=$BERT_DIR/bert_config.json \
@@ -105,7 +97,7 @@ if [ "$task" = "squad" ] ; then
                 $use_xla_tag |& tee $tmp_file

                 perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
-                echo "$precision $seq_len  $batch_size $perf" >> $LOGFILE
+                echo "$use_fp16 $seq_len  $batch_size $perf" >> $LOGFILE

             done
         done
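
Because the loop variable now holds the literal flag, the results directory name comes out as `prec_--amp`; if tidier paths matter, the flag can be stripped of its dashes before interpolation (a sketch, not part of the commit):

    def precision_tag(flag):
        # "--amp" -> "amp", "--noamp" -> "noamp"
        return flag.lstrip("-")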

+ 6 - 3
TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh

@@ -41,15 +41,18 @@ echo "GLUE directory set as " $GLUE_DIR " BERT directory set as " $BERT_DIR

 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi

 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi

 if [ $num_gpu -gt 1 ] ; then

+ 6 - 3
TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh

@@ -34,15 +34,18 @@ echo "GLUE directory set as " $GLUE_DIR " BERT directory set as " $BERT_DIR

 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi

 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi

 num_gpu=1

+ 9 - 5
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh

@@ -15,10 +15,10 @@

 echo "Container nvidia build = " $NVIDIA_BUILD_ID

-train_batch_size=${1:-14}
+train_batch_size=${1:-16}
 eval_batch_size=${2:-8}
 learning_rate=${3:-"1e-4"}
-precision=${4:-"manual_fp16"}
+precision=${4:-"fp16"}
 use_xla=${5:-"true"}
 num_gpus=${6:-8}
 warmup_steps=${7:-"10000"}
@@ -39,11 +39,13 @@ fi

 PREC=""
 if [ "$precision" = "fp16" ] ; then
-   PREC="--use_fp16"
+   PREC="--amp"
 elif [ "$precision" = "fp32" ] ; then
-   PREC=""
+   PREC="--noamp"
+elif [ "$precision" = "tf32" ] ; then
+   PREC="--noamp"
 elif [ "$precision" = "manual_fp16" ] ; then
-   PREC="--manual_fp16"
+   PREC="--noamp --manual_fp16"
 else
    echo "Unknown <precision> argument"
    exit -2
@@ -52,6 +54,8 @@ fi
 if [ "$use_xla" = "true" ] ; then
     PREC="$PREC --use_xla"
     echo "XLA activated"
+else
+    PREC="$PREC --nouse_xla"
 fi

 export GBS=$(expr $train_batch_size \* $num_gpus \* $num_accumulation_steps)
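
The same precision-to-flag mapping recurs across the pretraining scripts; written out as a table, it makes clear that `tf32` needs no extra flag of its own and maps to the same arguments as `fp32` (a summary sketch):

    PRECISION_FLAGS = {
        "fp16": "--amp",                    # automatic mixed precision
        "fp32": "--noamp",
        "tf32": "--noamp",                  # TF32 is implicit on A100-class GPUs
        "manual_fp16": "--noamp --manual_fp16",
    }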

+ 8 - 4
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh

@@ -22,7 +22,7 @@ learning_rate_phase1=${4:-"7.5e-4"}
 learning_rate_phase2=${5:-"5e-4"}
 precision=${6:-"fp16"}
 use_xla=${7:-"true"}
-num_gpus=${8:-2}
+num_gpus=${8:-8}
 warmup_steps_phase1=${9:-"2000"}
 warmup_steps_phase2=${10:-"200"}
 train_steps=${11:-7820}
@@ -43,11 +43,13 @@ fi

 PREC=""
 if [ "$precision" = "fp16" ] ; then
-   PREC="--use_fp16"
+   PREC="--amp"
 elif [ "$precision" = "fp32" ] ; then
-   PREC=""
+   PREC="--noamp"
+elif [ "$precision" = "tf32" ] ; then
+   PREC="--noamp"
 elif [ "$precision" = "manual_fp16" ] ; then
-   PREC="--manual_fp16"
+   PREC="--noamp --manual_fp16"
 else
    echo "Unknown <precision> argument"
    exit -2
@@ -56,6 +58,8 @@ fi
 if [ "$use_xla" = "true" ] ; then
     PREC="$PREC --use_xla"
     echo "XLA activated"
+else
+    PREC="$PREC --nouse_xla"
 fi

 mpi=""

+ 8 - 4
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh

@@ -22,7 +22,7 @@ learning_rate_phase1=${4:-"7.5e-4"}
 learning_rate_phase2=${5:-"5e-4"}
 precision=${6:-"fp16"}
 use_xla=${7:-"true"}
-num_gpus=${8:-2}
+num_gpus=${8:-8}
 warmup_steps_phase1=${9:-"2000"}
 warmup_steps_phase2=${10:-"200"}
 train_steps=${11:-7820}
@@ -45,11 +45,13 @@ echo "Container nvidia build = " $NVIDIA_BUILD_ID

 PREC=""
 if [ "$precision" = "fp16" ] ; then
-   PREC="--use_fp16"
+   PREC="--amp"
 elif [ "$precision" = "fp32" ] ; then
-   PREC=""
+   PREC="--noamp"
+elif [ "$precision" = "tf32" ] ; then
+   PREC="--noamp"
 elif [ "$precision" = "manual_fp16" ] ; then
-   PREC="--manual_fp16"
+   PREC="--noamp --manual_fp16"
 else
    echo "Unknown <precision> argument"
    exit -2
@@ -58,6 +60,8 @@ fi
 if [ "$use_xla" = "true" ] ; then
     PREC="$PREC --use_xla"
     echo "XLA activated"
+else
+    PREC="$PREC --nouse_xla"
 fi

 mpi=""

+ 7 - 4
TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh

@@ -46,15 +46,18 @@ echo "Squad directory set as " $SQUAD_DIR " BERT directory set as " $BERT_DIR

 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi

 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi

 if [ $num_gpu -gt 1 ] ; then
@@ -94,6 +97,7 @@ $mpi_command python run_squad.py \
 --train_file=$SQUAD_DIR/train-v${squad_version}.json \
 --do_predict=True \
 --predict_file=$SQUAD_DIR/dev-v${squad_version}.json \
+--eval_script=$SQUAD_DIR/evaluate-v${squad_version}.py \
 --train_batch_size=$batch_size \
 --learning_rate=$learning_rate \
 --num_train_epochs=$epochs \
@@ -104,4 +108,3 @@ $mpi_command python run_squad.py \
 --horovod "$use_fp16" \
 $use_xla_tag --version_2_with_negative=${version_2_with_negative} |& tee $LOGFILE

-python $SQUAD_DIR/evaluate-v${squad_version}.py $SQUAD_DIR/dev-v${squad_version}.json ${RESULTS_DIR}/predictions.json |& tee -a $LOGFILE

+ 6 - 3
TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh

@@ -43,15 +43,18 @@ echo "Results directory set as " $RESULTS_DIR

 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
-        echo "fp16 activated!"
-        use_fp16="--use_fp16"
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
 fi

 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
     echo "XLA activated"
 else
-    use_xla_tag=""
+    use_xla_tag="--nouse_xla"
 fi

 ckpt_str=${init_checkpoint//\//-}

+ 6 - 1
TensorFlow/LanguageModeling/BERT/triton/scripts/export_model.sh

@@ -32,12 +32,17 @@ additional_args="--triton_model_version=$triton_model_version --triton_model_nam

 if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
-   additional_args="$additional_args --use_fp16"
+   additional_args="$additional_args --amp"
+else
+   echo "fp32/tf32 activated!"
+   additional_args="$additional_args --noamp"
 fi

 if [ "$use_xla" = "true" ] ; then
     echo "XLA activated"
     additional_args="$additional_args --use_xla"
+else
+    additional_args="$additional_args --nouse_xla"
 fi

 echo "Additional args: $additional_args"

+ 14 - 13
TensorFlow/LanguageModeling/BERT/trt/README.md

@@ -12,21 +12,22 @@ This subfolder of the BERT TensorFlow repository, tested and maintained by NVIDI
 - [Setup](#setup)
    * [Requirements](#requirements)
 - [Quick Start Guide](#quick-start-guide)
-  * [(Optional) Trying a different configuration](#optional-trying-a-different-configuration)
+   * [(Optional) Trying a different configuration](#optional-trying-a-different-configuration)
 - [Advanced](#advanced)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Command-line options](#command-line-options)
    * [TensorRT inference process](#tensorrt-inference-process)
 - [Performance](#performance)
-  * [Benchmarking](#benchmarking)
-       * [TensorRT inference benchmark](#tensorrt-inference-benchmark)
+   * [Benchmarking](#benchmarking)
+      * [TensorRT inference benchmark](#tensorrt-inference-benchmark)
    * [Results](#results)
-      * [Inference performance: NVIDIA T4 (16GB)](#inference-performance-nvidia-t4-16gb)
-        * [BERT Base](#bert-base)
-        * [BERT Large](#bert-large)
-     * [Inference performance: NVIDIA V100 (32GB)](#inference-performance-nvidia-v100-32gb)
-       * [BERT Base](#bert-base)
-       * [BERT Large](#bert-large)
+      * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
+      * [BERT Base](#bert-base)
+      * [BERT Large](#bert-large)
+   * [Inference performance: NVIDIA V100 (32GB)](#inference-performance-nvidia-v100-(32gb))
+      * [BERT Base](#bert-base)
+      * [BERT Large](#bert-large)
+


 ## Model overview
@@ -237,7 +238,7 @@ Our results were obtained by running the `scripts/inference_benchmark.sh` traini

 ##### BERT Base

-| Sequence Length | Batch Size | TRT Mixed Precision Latency (ms) ||         | TRT FP32 Latency (ms) |           |         |
+| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
 | 128 | 1 | 1.97 | 1.97 | 1.93 | 6.47 | 6.51 | 6.12 |
@@ -265,7 +266,7 @@ Our results were obtained by running the `scripts/inference_benchmark.sh` traini

 ##### BERT Large

-| Sequence Length | Batch Size | TRT Mixed Precision Latency (ms) ||         | TRT FP32 Latency (ms) |           |         |
+| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
 | 128 | 1 | 5.63 | 5.66 | 5.39 | 21.53 | 22.16 | 20.74 |
@@ -298,7 +299,7 @@ Our results were obtained by running the `scripts/inference_benchmark.sh` traini

 ##### BERT Base

-| Sequence Length | Batch Size | TRT Mixed Precision Latency (ms) ||         | TRT FP32 Latency (ms) |           |         |
+| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
 | 128 | 1 | 1.39 | 1.45 | 1.37 | 2.93 | 2.95 | 2.91 |
@@ -326,7 +327,7 @@ Our results were obtained by running the `scripts/inference_benchmark.sh` traini

 ##### BERT Large

-| Sequence Length | Batch Size | TRT Mixed Precision Latency (ms) ||         | TRT FP32 Latency (ms) |           |         |
+| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
 | 128 | 1 | 3.4 | 3.46 | 3.38 | 8.83 | 8.85 | 8.76 |

+ 7 - 9
TensorFlow/LanguageModeling/BERT/utils/utils.py

@@ -32,13 +32,16 @@ class LogEvalRunHook(tf.estimator.SessionRunHook):

 # report throughput during training
 class LogTrainRunHook(tf.estimator.SessionRunHook):
-  def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
+  def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000, num_steps_ignore_xla=100):
     self.global_batch_size = global_batch_size
     self.hvd_rank = hvd_rank
     self.save_checkpoints_steps = save_checkpoints_steps

     self.total_time = 0.0
     self.count = 0 # Holds number of iterations, including skipped iterations for fp16 loss scaling
+    self.skipped = 0
+    self.num_steps_ignore_xla = num_steps_ignore_xla
+    # initial steps while XLA is still compiling need to be ignored from throughput computation

   def after_create_session(self, session, coord):
     self.init_global_step = session.run(tf.train.get_global_step())
@@ -53,14 +56,9 @@ class LogTrainRunHook(tf.estimator.SessionRunHook):
     self.global_step = run_values.results[0]
     self.count += 1

-    # Removing first step + first two steps after every checkpoint save
-    if (self.global_step - self.init_global_step) % self.save_checkpoints_steps <= 1:
+    # Removing first 100 steps + first five steps after every checkpoint save
+    if (self.global_step - self.init_global_step) <= self.num_steps_ignore_xla or (self.global_step - self.init_global_step) % self.save_checkpoints_steps < 5:
       print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")
+      self.skipped += 1
     else:
       self.total_time += elapsed_secs
-
-  def end(self, session):
-    num_global_steps = self.global_step - self.init_global_step
-
-    self.skipped = (num_global_steps // self.save_checkpoints_steps) * 2 + \
-                   min(2, num_global_steps % self.save_checkpoints_steps) - 1
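
With `skipped` now counted online instead of being reconstructed in `end()`, a final throughput figure can be derived directly from the hook's counters. A hedged sketch of that arithmetic (attribute names as in the class above):

    def training_throughput(hook):
        # Steps excluded for XLA warm-up and checkpoint saves never add to
        # total_time, so they are removed from the step count as well.
        timed_steps = hook.count - hook.skipped
        return timed_steps * hook.global_batch_size / hook.total_time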

Some files were not shown because too many files changed in this diff