|
|
@@ -20,7 +20,7 @@ This repository provides a script and recipe to train the BERT model for PyTorch
|
|
|
* [Scripts and sample code](#scripts-and-sample-code)
|
|
|
* [Parameters](#parameters)
|
|
|
* [Pre-training parameters](#pre-training-parameters)
|
|
|
- * [Fine tuning parameters](#fine-tune-parameters)
|
|
|
+ * [Fine tuning parameters](#fine-tuning-parameters)
|
|
|
* [Multi-node](#multi-node)
|
|
|
* [Fine-tuning parameters](#fine-tuning-parameters-1)
|
|
|
* [Command-line options](#command-line-options)
|
|
|
@@ -44,34 +44,35 @@ This repository provides a script and recipe to train the BERT model for PyTorch
|
|
|
* [Pre-training loss results](#pre-training-loss-results)
|
|
|
* [Pre-training loss curves](#pre-training-loss-curves)
|
|
|
* [Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-results-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
+ * [Fine-tuning accuracy results: NVIDIA DGX-2 (16x V100 32G)](#fine-tuning-accuracy-results-nvidia-dgx-2-16x-v100-32g)
|
|
|
* [Fine-tuning accuracy results: NVIDIA DGX-1 (8x V100 16G)](#fine-tuning-accuracy-results-nvidia-dgx-1-8x-v100-16g)
|
|
|
* [Training stability test](#training-stability-test)
|
|
|
* [Pre-training stability test](#pre-training-stability-test)
|
|
|
* [Fine-tuning stability test](#fine-tuning-stability-test)
|
|
|
- * [Training performance results](#training-performance-results)
|
|
|
- * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
- * [Pre-training NVIDIA DGX A100 (8x A100 40GB)](#pre-training-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
- * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
- * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
|
|
|
- * [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
|
|
|
- * [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
|
|
|
- * [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
|
|
|
- * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
|
|
|
- * [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
|
|
|
- * [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
|
|
|
- * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
|
|
|
- * [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
|
|
|
- * [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
|
|
|
- * [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
|
|
|
- * [Inference performance results](#inference-performance-results)
|
|
|
- * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
|
|
|
- * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb)
|
|
|
- * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
|
|
|
- * [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
|
|
|
- * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
|
|
|
- * [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
|
|
|
- * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
|
|
|
- * [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
|
|
|
+ * [Training performance results](#training-performance-results)
|
|
|
+ * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
+ * [Pre-training NVIDIA DGX A100 (8x A100 40GB)](#pre-training-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
+ * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb)
|
|
|
+ * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
|
|
|
+ * [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
|
|
|
+ * [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
|
|
|
+ * [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
|
|
|
+ * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
|
|
|
+ * [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
|
|
|
+ * [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
|
|
|
+ * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
|
|
|
+ * [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
|
|
|
+ * [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
|
|
|
+ * [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
|
|
|
+ * [Inference performance results](#inference-performance-results)
|
|
|
+ * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
|
|
|
+ * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb)
|
|
|
+ * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
|
|
|
+ * [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
|
|
|
+ * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
|
|
|
+ * [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
|
|
|
+ * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
|
|
|
+ * [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
|
|
|
- [Release notes](#release-notes)
|
|
|
* [Changelog](#changelog)
|
|
|
* [Known issues](#known-issues)
|
|
|
@@ -278,7 +279,6 @@ The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpu
|
|
|
- Download Wikipedia and BookCorpus
|
|
|
|
|
|
Users are welcome to download BookCorpus from other sources to match our accuracy, or repeatedly try our script until the required number of files are downloaded by running the following:
|
|
|
-
|
|
|
`/workspace/bert/data/create_datasets_from_start.sh wiki_books`
|
|
|
|
|
|
Note: Not using BookCorpus can potentially change final accuracy on a few downstream tasks.
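
The "repeatedly try our script" advice above can be wrapped in a small retry loop. This is only a sketch: it assumes the download script exits with a nonzero status while files are still missing, which should be verified against your copy of the script.

```shell
# Retry a command until it succeeds, up to a fixed number of attempts.
retry() {
    local attempts=$1; shift
    local i
    for i in $(seq 1 "$attempts"); do
        "$@" && return 0
        echo "attempt $i failed; retrying..." >&2
        sleep 1    # brief pause between attempts; tune as needed
    done
    return 1
}

# Usage, under the assumption stated above:
# retry 10 bash /workspace/bert/data/create_datasets_from_start.sh wiki_books
```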
|
|
|
@@ -296,11 +296,15 @@ To run on multiple nodes, see the [Multi-node](#multi-node) section.
|
|
|
|
|
|
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
|
|
|
`bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`
|
|
|
+
|
|
|
+8. Start fine-tuning with the GLUE tasks.
|
|
|
|
|
|
+The above pretrained BERT representations can be fine tuned with just one additional output layer for GLUE tasks. Running the following script launches fine-tuning for paraphrase detection with the MRPC dataset:
|
|
|
+`bash scripts/run_glue.sh /workspace/bert/checkpoints/<downloaded_checkpoint>`
|
|
|
|
|
|
9. Start validation/evaluation.
|
|
|
|
|
|
-Validation can be performed with the `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `eval` in `scripts/run_squad.sh` as follows:
|
|
|
+For both SQuAD and GLUE, validation can be performed by running `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>` or `bash scripts/run_glue.sh /workspace/bert/checkpoints/<downloaded_checkpoint>`, with `mode` set to `eval` in `scripts/run_squad.sh` or `scripts/run_glue.sh` as follows:
|
|
|
|
|
|
```
|
|
|
mode=${11:-"eval"}
|
|
|
@@ -308,22 +312,28 @@ mode=${11:-"eval"}
|
|
|
|
|
|
10. Start inference/predictions.
|
|
|
|
|
|
-Inference can be performed with the `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`, setting `mode` to `prediction` in `scripts/run_squad.sh` as follows:
|
|
|
+Inference can be performed by running `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>` or `bash scripts/run_glue.sh /workspace/bert/checkpoints/<downloaded_checkpoint>`, with `mode` set to `prediction` in `scripts/run_squad.sh` or `scripts/run_glue.sh` as follows:
|
|
|
|
|
|
```
|
|
|
mode=${11:-"prediction"}
|
|
|
```
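
The `mode=${11:-"prediction"}` line is plain Bash default expansion: positional parameter 11 if supplied, otherwise the literal default. A minimal illustration of the same idiom, using a hypothetical `demo_mode` function rather than the real script:

```shell
# demo_mode mirrors the pattern above: positional parameter 2 if supplied,
# otherwise the literal default "prediction".
demo_mode() {
    local mode=${2:-"prediction"}
    echo "$mode"
}

demo_mode ckpt.pt eval    # prints: eval
demo_mode ckpt.pt         # prints: prediction
```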
|
|
|
|
|
|
-Inference predictions are saved to `<OUT_DIR>/predictions.json`, set in `scripts/run_squad.sh` as follows:
|
|
|
+Inference predictions are saved to `<OUT_DIR>/predictions.json`, set in `scripts/run_squad.sh` or `scripts/run_glue.sh` as follows:
|
|
|
|
|
|
```
|
|
|
-OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
|
|
|
+OUT_DIR=${10:-"/workspace/bert/results/SQuAD"} # For SQuAD.
|
|
|
+# Or...
|
|
|
+out_dir=${5:-"/workspace/bert/results/MRPC"} # For MRPC.
|
|
|
+# Or...
|
|
|
+out_dir=${5:-"/workspace/bert/results/SST-2"} # For SST-2.
|
|
|
```
|
|
|
|
|
|
-This repository contains a number of predefined configurations to run the SQuAD and pretraining on NVIDIA DGX-1, NVIDIA DGX-2H or NVIDIA DGX A100 nodes in `scripts/configs/squad_config.sh` and `scripts/configs/pretrain_config.sh`. For example, to use the default DGX A100 8 gpu config, run:
|
|
|
+This repository contains a number of predefined configurations to run SQuAD and GLUE fine-tuning and pretraining on NVIDIA DGX-1, NVIDIA DGX-2H, or NVIDIA DGX A100 nodes in `scripts/configs/squad_config.sh`, `scripts/configs/glue_config.sh`, and `scripts/configs/pretrain_config.sh`. For example, to use the default DGX A100 8-GPU config, run:
|
|
|
|
|
|
```
|
|
|
bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_fp16)
|
|
|
+bash scripts/run_glue.sh $(source scripts/configs/glue_config.sh && mrpc_dgxa100_8gpu_fp16) # For the MRPC dataset.
|
|
|
+bash scripts/run_glue.sh $(source scripts/configs/glue_config.sh && sst-2_dgxa100_8gpu_fp16) # For the SST-2 dataset.
|
|
|
bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_fp16)
|
|
|
```
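
The `$(source … && <config_function>)` pattern works because each config function prints a space-separated argument list, which the unquoted command substitution word-splits into the script's positional parameters. A minimal sketch with a hypothetical `mock_8gpu_fp16` function (the values shown are illustrative, not the repository's real config):

```shell
# Hypothetical config function: prints a space-separated argument list, which
# is what the functions in scripts/configs/*.sh are assumed to do.
mock_8gpu_fp16() {
    echo "16 2.4e-5 fp16 8"
}

# Unquoted command substitution word-splits the output into $1 $2 $3 $4,
# just as in `bash scripts/run_squad.sh $(source ... && dgxa100_8gpu_fp16)`.
set -- $(mock_8gpu_fp16)
echo "batch=$1 lr=$2 precision=$3 gpus=$4"
# prints: batch=16 lr=2.4e-5 precision=fp16 gpus=8
```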
|
|
|
|
|
|
@@ -339,11 +349,13 @@ Descriptions of the key scripts and folders are provided below.
|
|
|
- `scripts/` - Contains shell scripts to launch data download, pre-training, and fine-tuning.
|
|
|
- `data_download.sh` - Launches download and processing of required datasets.
|
|
|
- `run_squad.sh` - Interface for launching question answering fine-tuning with `run_squad.py`.
|
|
|
+- `run_glue.sh` - Interface for launching paraphrase detection and sentiment analysis fine-tuning with `run_glue.py`.
|
|
|
- `run_pretraining.sh` - Interface for launching BERT pre-training with `run_pretraining.py`.
|
|
|
- `create_pretraining_data.py` - Creates `.hdf5` files from shared text files in the final step of dataset creation.
|
|
|
- `model.py` - Implements the BERT pre-training and fine-tuning model architectures with PyTorch.
|
|
|
- `optimization.py` - Implements the LAMB optimizer with PyTorch.
|
|
|
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
|
|
|
+- `run_glue.py` - Implements fine tuning training and evaluation for [GLUE](https://gluebenchmark.com/) tasks.
|
|
|
- `run_pretraining.py` - Implements BERT pre-training.
|
|
|
- `run_pretraining_inference.py` - Implements evaluation of a BERT pre-trained model.
|
|
|
|
|
|
@@ -434,6 +446,8 @@ The complete list of the available parameters for the `run_pretraining.py` scrip
|
|
|
```
|
|
|
|
|
|
#### Fine tuning parameters
|
|
|
+
|
|
|
+* SQuAD
|
|
|
|
|
|
Default arguments are listed below in the order `scripts/run_squad.sh` expects:
|
|
|
|
|
|
@@ -452,6 +466,26 @@ Default arguments are listed below in the order `scripts/run_squad.sh` expects:
|
|
|
|
|
|
The script saves the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
|
|
|
|
|
|
+* GLUE
|
|
|
+
|
|
|
+Default arguments are listed below in the order `scripts/run_glue.sh` expects:
|
|
|
+
|
|
|
+- Initial checkpoint - The default is `/workspace/bert/checkpoints/bert_uncased.pt`.
|
|
|
+- Data directory - The default is `/workspace/bert/data/download/glue/MRPC/`.
|
|
|
+- Vocabulary file (token to ID mapping) - The default is `/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt`.
|
|
|
+- Config file for the BERT model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
|
|
|
+- Output directory for result - The default is `/workspace/bert/results/MRPC`.
|
|
|
+- The name of the GLUE task (`mrpc` or `sst-2`) - The default is `mrpc`.
|
|
|
+- Number of GPUs - The default is `8`.
|
|
|
+- Batch size per GPU - The default is `16`.
|
|
|
+- Number of update steps to accumulate before performing a backward/update pass (for a fixed effective batch size, this option reduces the per-GPU memory footprint by the same factor) - The default is `1`.
|
|
|
+- Learning rate - The default is `2.4e-5`.
|
|
|
+- The proportion of training samples used to warm up the learning rate - The default is `0.1`.
|
|
|
+- Number of training epochs - The default is `3`.
|
|
|
+- Total number of training steps to perform - The default is `-1.0`, which means it is determined by the number of epochs.
|
|
|
+- Precision (either `fp16`, `tf32` or `fp32`) - The default is `fp16`.
|
|
|
+- Seed - The default is `2`.
|
|
|
+- Mode (`train`, `eval`, `prediction`, `train eval`, `train prediction`, `eval prediction`, `train eval prediction`) - The default is `train eval`.
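
Assuming `scripts/run_glue.sh` follows the same `${N:-default}` pattern as `run_squad.sh`, its header presumably assigns the positionals roughly as below. The variable names here are guesses for illustration (only the `out_dir` line appears verbatim elsewhere in this README); consult the actual script for the authoritative list:

```shell
# Hypothetical header of scripts/run_glue.sh, mapping the defaults listed above
# onto positional parameters with Bash default expansion.
init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
data_dir=${2:-"/workspace/bert/data/download/glue/MRPC/"}
vocab_file=${3:-"/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
config_file=${4:-"/workspace/bert/bert_config.json"}
out_dir=${5:-"/workspace/bert/results/MRPC"}
task_name=${6:-"mrpc"}
num_gpu=${7:-"8"}
batch_size=${8:-"16"}
gradient_accumulation_steps=${9:-"1"}
learning_rate=${10:-"2.4e-5"}
warmup_proportion=${11:-"0.1"}
epochs=${12:-"3"}
max_steps=${13:-"-1.0"}
precision=${14:-"fp16"}
seed=${15:-"2"}
mode=${16:-"train eval"}
```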
|
|
|
|
|
|
#### Multi-node
|
|
|
|
|
|
@@ -474,6 +508,8 @@ Refer to the files contents to see the full list of variables to adjust for your
|
|
|
|
|
|
|
|
|
#### Fine-tuning parameters
|
|
|
+
|
|
|
+* SQuAD
|
|
|
|
|
|
The `run_squad.py` script contains many of the same arguments as `run_pretraining.py`.
|
|
|
|
|
|
@@ -533,6 +569,76 @@ The main script specific parameters are:
|
|
|
best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
|
|
|
```
|
|
|
|
|
|
+* GLUE
|
|
|
+
|
|
|
+The `run_glue.py` script contains many of the same arguments as `run_pretraining.py`.
|
|
|
+
|
|
|
+The main script specific parameters are:
|
|
|
+
|
|
|
+```
|
|
|
+ --data_dir DATA_DIR The input data dir. Should contain the .tsv files (or
|
|
|
+ other data files) for the task.
|
|
|
+ --bert_model BERT_MODEL
|
|
|
+ Bert pre-trained model selected in the list: bert-
|
|
|
+ base-uncased, bert-large-uncased, bert-base-cased,
|
|
|
+ bert-large-cased, bert-base-multilingual-uncased,
|
|
|
+ bert-base-multilingual-cased, bert-base-chinese.
|
|
|
+ --task_name {cola,mnli,mrpc,sst-2}
|
|
|
+ The name of the task to train.
|
|
|
+ --output_dir OUTPUT_DIR
|
|
|
+ The output directory where the model predictions and
|
|
|
+ checkpoints will be written.
|
|
|
+ --init_checkpoint INIT_CHECKPOINT
|
|
|
+ The checkpoint file from pretraining
|
|
|
+ --max_seq_length MAX_SEQ_LENGTH
|
|
|
+ The maximum total input sequence length after
|
|
|
+ WordPiece tokenization. Sequences longer than this
|
|
|
+ will be truncated, and sequences shorter than this
|
|
|
+ will be padded.
|
|
|
+ --do_train Whether to run training.
|
|
|
+ --do_eval Whether to get model-task performance on the dev set
|
|
|
+ by running eval.
|
|
|
+ --do_predict Whether to output prediction results on the dev set by
|
|
|
+ running eval.
|
|
|
+ --do_lower_case Set this flag if you are using an uncased model.
|
|
|
+ --train_batch_size TRAIN_BATCH_SIZE
|
|
|
+ Batch size per GPU for training.
|
|
|
+ --eval_batch_size EVAL_BATCH_SIZE
|
|
|
+ Batch size per GPU for eval.
|
|
|
+ --learning_rate LEARNING_RATE
|
|
|
+ The initial learning rate for Adam.
|
|
|
+ --num_train_epochs NUM_TRAIN_EPOCHS
|
|
|
+ Total number of training epochs to perform.
|
|
|
+ --max_steps MAX_STEPS
|
|
|
+ Total number of training steps to perform.
|
|
|
+ --warmup_proportion WARMUP_PROPORTION
|
|
|
+ Proportion of training to perform linear learning rate
|
|
|
+ warmup for. E.g., 0.1 = 10% of training.
|
|
|
+ --no_cuda Whether not to use CUDA when available
|
|
|
+ --local_rank LOCAL_RANK
|
|
|
+ local_rank for distributed training on gpus
|
|
|
+ --seed SEED random seed for initialization
|
|
|
+ --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
|
|
+ Number of updates steps to accumulate before
|
|
|
+ performing a backward/update pass.
|
|
|
+ --fp16 Mixed precision training
|
|
|
+ --amp Mixed precision training
|
|
|
+ --loss_scale LOSS_SCALE
|
|
|
+ Loss scaling to improve fp16 numeric stability. Only
|
|
|
+ used when fp16 set to True. 0 (default value): dynamic
|
|
|
+ loss scaling. Positive power of 2: static loss scaling
|
|
|
+ value.
|
|
|
+ --server_ip SERVER_IP
|
|
|
+ Can be used for distant debugging.
|
|
|
+ --server_port SERVER_PORT
|
|
|
+ Can be used for distant debugging.
|
|
|
+ --vocab_file VOCAB_FILE
|
|
|
+                        Vocabulary mapping/file BERT was pretrained on
|
|
|
+ --config_file CONFIG_FILE
|
|
|
+ The BERT model config
|
|
|
+  --skip_checkpoint     Whether to skip saving checkpoints
|
|
|
+```
|
|
|
+
|
|
|
### Command-line options
|
|
|
|
|
|
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
|
|
|
@@ -540,6 +646,8 @@ To see the full list of available options and their descriptions, use the `-h` o
|
|
|
`python run_pretraining.py --help`
|
|
|
|
|
|
`python run_squad.py --help`
|
|
|
+
|
|
|
+`python run_glue.py --help`
|
|
|
|
|
|
Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
|
|
|
|
|
|
@@ -564,6 +672,8 @@ The tools used for preparing the BookCorpus and Wikipedia datasets can be applie
|
|
|
For fine-tuning a pre-trained BERT model for specific tasks, by default this repository prepares the following dataset:
|
|
|
|
|
|
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
|
|
|
+- [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398): for paraphrase detection.
|
|
|
+- [SST-2](https://nlp.stanford.edu/sentiment/index.html): for sentiment analysis.
|
|
|
|
|
|
Depending on the speed of your internet connection, this process takes about a day to complete. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time.
|
|
|
|
|
|
@@ -663,18 +773,24 @@ In order to run pre-training routine on an initial checkpoint, do the following
|
|
|
Fine-tuning is provided for a variety of tasks. The following tasks are included with this repository through the following scripts:
|
|
|
|
|
|
- Question Answering (`scripts/run_squad.sh`)
|
|
|
+- Paraphrase Detection and Sentiment Analysis (`scripts/run_glue.sh`)
|
|
|
|
|
|
By default, each Python script implements fine-tuning a pre-trained BERT model for a specified number of training epochs as well as evaluation of the fine-tuned model. Each shell script invokes the associated Python script with the following default parameters:
|
|
|
|
|
|
- Uses 8 GPUs
|
|
|
- Has FP16 precision enabled
|
|
|
-- Saves a checkpoint at the end of training to the `/results/<dataset_name>` folder
|
|
|
+- Saves a checkpoint at the end of training to the `results/<dataset_name>` folder
|
|
|
|
|
|
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA’s [APEX](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
|
|
|
|
|
|
-All fine-tuning shell scripts have the same positional arguments, outlined below:
|
|
|
+The fine-tuning shell scripts have positional arguments outlined below:
|
|
|
|
|
|
-```bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQuAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>```
|
|
|
+```
|
|
|
+# For SQuAD.
|
|
|
+bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQuAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>
|
|
|
+# For GLUE.
|
|
|
+bash scripts/run_glue.sh <checkpoint_to_load> <data_directory> <vocab_file> <config_file> <out_dir> <task_name> <number of GPUs to use> <batch size per GPU> <gradient_accumulation_steps> <learning_rate> <warmup_proportion> <epochs> <precision (either `fp16`, `fp32` or `tf32`)> <seed> <mode (either `train`, `eval`, `prediction`, `train eval`, `train prediction`, `eval prediction` or `train eval prediction`)>
|
|
|
+```
|
|
|
|
|
|
By default, the mode positional argument is set to train eval. See the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
|
|
|
|
|
|
@@ -691,14 +807,16 @@ Fine-tuning inference can be run in order to obtain predictions on fine-tuning t
|
|
|
Evaluation fine-tuning is enabled by the same scripts as training:
|
|
|
|
|
|
- Question Answering (`scripts/run_squad.sh`)
|
|
|
+- Paraphrase Detection and Sentiment Analysis (`scripts/run_glue.sh`)
|
|
|
|
|
|
The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned BERT model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
|
|
|
|
|
|
-Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction` flag, you can choose between running predictions and evaluating them on a given dataset or just the former.
|
|
|
+Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction`, you can choose between running predictions and evaluating them on a given dataset, or just obtaining the model predictions.
|
|
|
|
|
|
`bash scripts/run_squad.sh <path to fine-tuned model checkpoint>`
|
|
|
+`bash scripts/run_glue.sh <path to fine-tuned model checkpoint>`
|
|
|
|
|
|
-To run inference interactively on question-context pairs, use the script `inference.py` as follows:
|
|
|
+For SQuAD, to run inference interactively on question-context pairs, use the script `inference.py` as follows:
|
|
|
|
|
|
`python inference.py --bert_model "bert-large-uncased" --init_checkpoint=<fine_tuned_checkpoint> --config_file="bert_config.json" --vocab_file=<path to vocab file> --question="What food does Harry like?" --context="My name is Harry and I grew up in Canada. I love apples."`
|
|
|
|
|
|
@@ -715,9 +833,9 @@ The following section shows how to run benchmarks measuring the model performanc
|
|
|
|
|
|
#### Training performance benchmark
|
|
|
|
|
|
-Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Training process](#training-process).
|
|
|
+Training performance benchmarks for pretraining can be obtained by running `scripts/run_pretraining.sh`; for fine-tuning, run `scripts/run_squad.sh` or `scripts/run_glue.sh` for SQuAD or GLUE, respectively. The required parameters can be passed through the command-line as described in [Training process](#training-process).
|
|
|
|
|
|
-To benchmark the training performance on a specific batch size, run:
|
|
|
+As an example, to benchmark the training performance on a specific batch size for SQuAD, run:
|
|
|
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> train <BERT config path> <max steps>`
|
|
|
|
|
|
An example call used to generate throughput numbers:
|
|
|
@@ -727,9 +845,9 @@ An example call used to generate throughput numbers:
|
|
|
|
|
|
#### Inference performance benchmark
|
|
|
|
|
|
-Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Inference process](#inference-process).
|
|
|
+Inference performance benchmarks can be obtained by running `scripts/run_pretraining_inference.sh` for pretraining, and `scripts/run_squad.sh` or `scripts/run_glue.sh` for fine-tuning. The required parameters can be passed through the command-line as described in [Inference process](#inference-process).
|
|
|
|
|
|
-To benchmark the inference performance on a specific batch size, run:
|
|
|
+As an example, to benchmark the inference performance on a specific batch size for SQuAD, run:
|
|
|
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
|
|
|
|
|
|
An example call used to generate throughput numbers:
|
|
|
@@ -778,11 +896,41 @@ Following results were obtained by running on pytorch:19.07-py3 NGC container.
|
|
|
|
|
|
##### Fine-tuning accuracy results: NVIDIA DGX A100 (8x A100 40GB)
|
|
|
|
|
|
+* SQuAD
|
|
|
+
|
|
|
| GPUs | Batch size / GPU (TF32 and FP16) | Accuracy - TF32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - TF32 | Time to train(hours) - mixed precision | Time to train speedup (TF32 to mixed precision)
|
|
|
|---|------------|---------|--------|-------|--------|-----
|
|
|
|8|16 and 32|91.344|91.34|0.174|0.065|2.68
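
The speedup column is simply the ratio of the two time-to-train figures; for instance, for the row above:

```shell
# TF32-to-mixed-precision speedup for the SQuAD row: 0.174 h / 0.065 h.
speedup=$(awk 'BEGIN { printf "%.2f", 0.174 / 0.065 }')
echo "$speedup"    # prints: 2.68
```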
|
|
|
|
|
|
+* MRPC
|
|
|
+
|
|
|
+| GPUs | Batch size / GPU (TF32 and FP16) | Accuracy - TF32(%) | Accuracy - mixed precision(%) | Time to train(seconds) - TF32 | Time to train(seconds) - mixed precision | Time to train speedup (TF32 to mixed precision)
|
|
|
+|---|------------|---------|--------|-------|--------|-----
|
|
|
+|8|16| 88.97 | 88.73 | 21.5 | 8.9 | 2.4
|
|
|
+
|
|
|
+* SST-2
|
|
|
+
|
|
|
+| GPUs | Batch size / GPU (TF32 and FP16) | Accuracy - TF32(%) | Accuracy - mixed precision(%) | Time to train(seconds) - TF32 | Time to train(seconds) - mixed precision | Time to train speedup (TF32 to mixed precision)
|
|
|
+|---|------------|---------|--------|-------|--------|-----
|
|
|
+|8|64 and 128| 93.00 | 93.58 | 159.0 | 60.0 | 2.7
|
|
|
+
|
|
|
+##### Fine-tuning accuracy results: NVIDIA DGX-2 (16x V100 32G)
|
|
|
+
|
|
|
+* MRPC
|
|
|
+
|
|
|
+| GPUs | Batch size / GPU (FP32 and FP16) | Accuracy - FP32(%) | Accuracy - mixed precision(%) | Time to train(seconds) - FP32 | Time to train(seconds) - mixed precision | Time to train speedup (FP32 to mixed precision)
|
|
|
+|---|------------|---------|--------|-------|--------|-----
|
|
|
+|16|8|89.22|88.97|34.9|13.8|2.5
|
|
|
+
|
|
|
+* SST-2
|
|
|
+
|
|
|
+| GPUs | Batch size / GPU (FP32 and FP16) | Accuracy - FP32(%) | Accuracy - mixed precision(%) | Time to train(seconds) - FP32 | Time to train(seconds) - mixed precision | Time to train speedup (FP32 to mixed precision)
|
|
|
+|---|------------|---------|--------|-------|--------|-----
|
|
|
+|16|64|93.46|93.92|253.0|63.4|4.0
|
|
|
+
|
|
|
##### Fine-tuning accuracy results: NVIDIA DGX-1 (8x V100 16G)
|
|
|
+
|
|
|
+* SQuAD
|
|
|
|
|
|
| GPUs | Batch size / GPU | Accuracy - FP32(% F1) | Accuracy - mixed precision(% F1) | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
|
|
|
|---|---|---|---|---|---|---
|
|
|
@@ -797,6 +945,8 @@ Following results were obtained by running on pytorch:19.07-py3 NGC container.
|
|
|
|Final Loss| 1.344 | 1.328 | 1.324 | 1.326 | 1.333 | 1.331 | 0.009
|
|
|
|
|
|
###### Fine-tuning stability test
|
|
|
+
|
|
|
+* SQuAD
|
|
|
|
|
|
Training stability with 8 GPUs, FP16 computations, batch size of 4:
|
|
|
|
|
|
@@ -805,6 +955,23 @@ Training stability with 8 GPUs, FP16 computations, batch size of 4:
|
|
|
|Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | .200
|
|
|
| f1 % | 91.29 | 91.01 | 91.14 | 91.10 | 90.85 | 91.08 | 0.162
|
|
|
|
|
|
+* MRPC
|
|
|
+
|
|
|
+Training stability with 8 A100 GPUs, FP16 computations, batch size of 16 per GPU:
|
|
|
+
|
|
|
+| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|
|
|
+|---|---|---|---|---|---|---|---
|
|
|
+|Accuracy %| 85.78 | 84.31 | 85.05 | 88.73 | 79.17 | 84.61 | 3.472
|
|
|
+
|
|
|
+> Note: Since MRPC is a very small dataset prone to overfitting, the resulting validation accuracy can have high variance. Repeating the above experiment over 100 seeds gives a maximum accuracy of 88.73 and an average accuracy of 82.56 with a standard deviation of 6.01.
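
The mean and standard deviation reported in these stability tables can be recomputed from the per-seed numbers; for example, for the five MRPC seed accuracies above (sample standard deviation, n-1):

```shell
# Mean and sample (n-1) standard deviation of the five MRPC seed accuracies.
stats=$(echo "85.78 84.31 85.05 88.73 79.17" | awk '{
    for (i = 1; i <= NF; i++) sum += $i
    mean = sum / NF
    for (i = 1; i <= NF; i++) { d = $i - mean; ss += d * d }
    printf "mean=%.2f sd=%.3f", mean, sqrt(ss / (NF - 1))
}')
echo "$stats"    # prints: mean=84.61 sd=3.472
```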
|
|
|
+
|
|
|
+* SST-2
|
|
|
+
|
|
|
+Training stability with 8 A100 GPUs, FP16 computations, batch size of 128 per GPU:
|
|
|
+
|
|
|
+| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|
|
|
+|---|---|---|---|---|---|---|---
|
|
|
+|Accuracy %| 93.00 | 93.58 | 93.00 | 92.78 | 92.55 | 92.98 | 0.384
|
|
|
|
|
|
#### Training performance results
|
|
|
|
|
|
@@ -824,54 +991,60 @@ Our results were obtained by running the `scripts/run_pretraining.sh` training s
|
|
|
| 8| 4096 and 4096| 512 and 256| 512| 318 |620 | 1.94| 7.95| 7.76
|
|
|
|
|
|
###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB)
|
|
|
+
|
|
|
+* SQuAD
|
|
|
|
|
|
| GPUs | Batch size / GPU (TF32 and FP16) | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision
|
|
|
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
|
|
|
|1 | 16 and 32|44 |116 | 2.63| 1.00| 1.00
|
|
|
|4 | 16 and 32|165 |441 | 2.67| 3.75| 3.80
|
|
|
| 8| 16 and 32|324 |861 | 2.65| 7.42| 7.36
|
|
|
-
|
|
|
-
|
|
|
-##### Training performance: NVIDIA DGX-1 (8x V100 16G)
|
|
|
|
|
|
-Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
|
|
|
+##### Training performance: NVIDIA DGX-2 (16x V100 32G)
|
|
|
|
|
|
-###### Pre-training NVIDIA DGX-1 With 16G
|
|
|
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
|
|
|
+
|
|
|
+###### Pre-training NVIDIA DGX-2 With 32G
|
|
|
|
|
|
| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 65536 and 65536 | 8192 and 4096| 128| 40 |164 |4.1 |1.00 | 1.00
-|4 | 16384 and 16384 | 2048 and 1024| 128| 155 |615 | 3.96| 3.88| 3.75
-|8 | 8192 and 8192 | 1024 and 512| 128| 313 |1236 | 3.94| 7.83| 7.54
-|1 | 32768 and 32768 | 16384 and 8192| 512| 9 |34 |3.77 |1.00 | 1.00
-|4 | 8192 and 8192 | 4096 and 2048| 512| 35 |131 | 3.74| 3.89| 3.85
-| 8| 4096 and 4096 | 2048 and 1024| 512| 71 |263 | 3.70| 7.89| 7.74
-
-
-###### Pre-training on multiple NVIDIA DGX-1 With 16G
-
-Following numbers were obtained on NGC pytorch:19.07-py3 NGC container.
+|1 | 65536 and 65536 | 8192 and 4096| 128| 42 |173 |4.11 |1.00 | 1.00
+|4 | 16384 and 16384 | 2048 and 1024| 128| 166 |669 | 4.03| 3.95| 3.87
+|8 | 8192 and 8192 | 1024 and 512| 128| 330 |1324 | 4.01| 7.86| 7.65
+|16 | 4096 and 4096 | 512 and 256| 128| 658 |2557 | 3.88| 15.67| 14.78
+|1 | 32768 and 32768 | 16384 and 8192| 512| 10 |36 |3.6 |1.00 | 1.00
+|4 | 8192 and 8192 | 4096 and 2048| 512| 37 |137 | 3.70| 3.70| 3.81
+| 8| 4096 and 4096 | 2048 and 1024| 512| 75 |273 | 3.64| 7.50| 7.58
+| 16| 2048 and 2048 | 1024 and 512| 512| 150 |551 | 3.67| 15.00| 15.31
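In the pre-training tables, `Batch size / GPU` is the total per-GPU batch accumulated before each optimizer step, so the per-step micro-batch is that value divided by `Accumulation steps` (the convention used by the repository's gradient-accumulation option). A small illustrative check (editor's sketch, values taken from the 1-GPU, sequence-length-128 rows above):

```python
# Per-step micro-batch = accumulated per-GPU batch / gradient-accumulation steps.
effective_batch = 65536
accum_fp32, accum_fp16 = 8192, 4096

micro_fp32 = effective_batch // accum_fp32   # sequences per forward/backward pass, FP32
micro_fp16 = effective_batch // accum_fp16   # FP16 fits a 2x larger micro-batch in memory
print(micro_fp32, micro_fp16)
```

The halved accumulation count under FP16 is why mixed precision helps here twice over: faster math per step and fewer accumulation steps per optimizer update.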
-| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
-|1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
-|4 |8 | N/A | 16| 128| N/A |3089.76 | N/A| N/A| 3.53
-|16 |8 | N/A | 16| 128| N/A |12144.64 | N/A| N/A| 13.89
-|1 |8 | N/A | 4| 512| N/A |195.93 |N/A |N/A | 1.00
-|4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
-|16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02
+###### Pre-training on multiple NVIDIA DGX-2H With 32G
+
+Note: Multi-node performance numbers below are on DGX-2H, whereas the single-node performance numbers above are on DGX-2.
+
+The following numbers were obtained with the pytorch:19.07-py3 NGC container.

-###### Fine-tuning NVIDIA DGX-1 With 16G
+| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
+|1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
+|4 |16 | N/A | 64| 128| N/A |12709.88 | N/A| N/A| 3.76
+|16 |16 | N/A | 64| 128| N/A |51937.28 | N/A| N/A| 15.37
+|64 |16 | 32 | 64| 128| 46628.86 |188088.32 | 4.03 | N/A| 55.66
+|1 |16 | N/A | 8| 512| N/A |625.66 |N/A |N/A | 1.00
+|4 |16 | N/A | 8| 512| N/A |2386.38 | N/A| N/A| 3.81
+|16| 16| N/A | 8| 512| N/A |9932.8 | N/A| N/A| 15.87
+|64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
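The multi-node weak-scaling column is the multi-node throughput divided by the single-node throughput at the same precision. As an editor's illustration using the mixed-precision, sequence-length-128 numbers above:

```python
# Weak scaling across nodes = multi-node throughput / 1-node throughput
# (mixed precision, sequence length 128, DGX-2H rows above).
single_node = 3379.2                               # sequences/sec on 1 node
multi = {4: 12709.88, 16: 51937.28, 64: 188088.32}  # sequences/sec on N nodes

for nodes, throughput in multi.items():
    print(f"{nodes} nodes: weak scaling {throughput / single_node:.2f}")
```

At 64 nodes the scaling efficiency is 55.66/64 ≈ 87%, i.e. inter-node communication costs roughly 13% of the ideal throughput.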

+###### Fine-tuning NVIDIA DGX-2 With 32G
+
+* SQuAD

| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 4 and 10|9 |50 | 5.55| 1.00| 1.00
-|4 | 4 and 10|32 |183 | 5.71| 3.56| 3.66
-| 8| 4 and 10|61 |359 | 5.88| 6.78| 7.18
-
-
+|1 |8 and 10 |12| 53| 4.41| 1.00| 1.00
+|4 |8 and 10 | 47| 188| 4.00| 3.92| 3.55
+|8 | 8 and 10| 92| 369| 4.01| 7.67| 6.96
+|16 | 8 and 10| 178| 700| 3.93| 14.83| 13.21
+
##### Training performance: NVIDIA DGX-1 (8x V100 32G)

Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
@@ -889,55 +1062,54 @@ Our results were obtained by running the `scripts/run_pretraining.sh` and `scrip
###### Fine-tuning NVIDIA DGX-1 With 32G
+
+* SQuAD

| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
|1 | 8 and 10|12 |49 | 4.08| 1.00| 1.00
|4 | 8 and 10|42 |178 | 4.23| 3.5| 3.63
| 8| 8 and 10|67 |351 | 5.23| 5.58| 7.16
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16G)

-##### Training performance: NVIDIA DGX-2 (16x V100 32G)
-
-Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.

-###### Pre-training NVIDIA DGX-2 With 32G
+###### Pre-training NVIDIA DGX-1 With 16G

| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 | 65536 and 65536 | 8192 and 4096| 128| 42 |173 |4.11 |1.00 | 1.00
-|4 | 16384 and 16384 | 2048 and 1024| 128| 166 |669 | 4.03| 3.95| 3.87
-|8 | 8192 and 8192 | 1024 and 512| 128| 330 |1324 | 4.01| 7.86| 7.65
-|16 | 4096 and 4096 | 512 and 256| 128| 658 |2557 | 3.88| 15.67| 14.78
-|1 | 32768 and 32768 | 16384 and 8192| 512| 10 |36 |3.6 |1.00 | 1.00
-|4 | 8192 and 8192 | 4096 and 2048| 512| 37 |137 | 3.70| 3.70| 3.81
-| 8| 4096 and 4096 | 2048 and 1024| 512| 75 |273 | 3.64| 7.50| 7.58
-| 16| 2048 and 2048 | 1024 and 512| 512| 150 |551 | 3.67| 15.00| 15.31
-
-###### Pre-training on multiple NVIDIA DGX-2H With 32G
+|1 | 65536 and 65536 | 8192 and 4096| 128| 40 |164 |4.1 |1.00 | 1.00
+|4 | 16384 and 16384 | 2048 and 1024| 128| 155 |615 | 3.96| 3.88| 3.75
+|8 | 8192 and 8192 | 1024 and 512| 128| 313 |1236 | 3.94| 7.83| 7.54
+|1 | 32768 and 32768 | 16384 and 8192| 512| 9 |34 |3.77 |1.00 | 1.00
+|4 | 8192 and 8192 | 4096 and 2048| 512| 35 |131 | 3.74| 3.89| 3.85
+| 8| 4096 and 4096 | 2048 and 1024| 512| 71 |263 | 3.70| 7.89| 7.74

-Note: Multi-node performance numbers below are on DGX-2H whereas the single node performance numbers above are on DGX-2.
-
-Following numbers are obtained on pytorch:19.07-py3 NGC container.

+###### Pre-training on multiple NVIDIA DGX-1 With 16G
+
+The following numbers were obtained with the pytorch:19.07-py3 NGC container.
+
| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
-|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
-|1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
-|4 |16 | N/A | 64| 128| N/A |12709.88 | N/A| N/A| 3.76
-|16 |16 | N/A | 64| 128| N/A |51937.28 | N/A| N/A| 15.37
-|64 |16 | 32 | 64| 128| 46628.86 |188088.32 | 4.03 | N/A| 55.66
-|1 |16 | N/A | 8| 512| N/A |625.66 |N/A |N/A | 1.00
-|4 |16 | N/A | 8| 512| N/A |2386.38 | N/A| N/A| 3.81
-|16| 16| N/A | 8| 512| N/A |9932.8 | N/A| N/A| 15.87
-|64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
+|1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
+|4 |8 | N/A | 16| 128| N/A |3089.76 | N/A| N/A| 3.53
+|16 |8 | N/A | 16| 128| N/A |12144.64 | N/A| N/A| 13.89
+|1 |8 | N/A | 4| 512| N/A |195.93 |N/A |N/A | 1.00
+|4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
+|16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02

-###### Fine-tuning NVIDIA DGX-2 With 32G
+
+###### Fine-tuning NVIDIA DGX-1 With 16G
+
+* SQuAD

| GPUs | Batch size / GPU (FP32 and FP16) | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
-|1 |8 and 10 |12| 53| 4.41| 1.00| 1.00
-|4 |8 and 10 | 47| 188| 4| 3.92| 3.55
-|8 | 8 and 10| 92| 369| 4.01| 7.67| 6.96
-|16 | 8 and 10| 178| 700| 3.93| 14.83| 13.21
+|1 | 4 and 10|9 |50 | 5.55| 1.00| 1.00
+|4 | 4 and 10|32 |183 | 5.71| 3.56| 3.66
+| 8| 4 and 10|61 |359 | 5.88| 6.78| 7.18

To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -948,41 +1120,84 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.

###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)
+
+* SQuAD

| GPUs | Batch Size \(TF32/FP16\) | Sequence Length | Throughput \- TF32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
|------|---------------------------|-----------------|-------------------|------------------------------------------------|
| 1 | 8/8 | 384 | 188 | 283 |

+* MRPC

-##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
-
-Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
+| GPUs | Batch Size \(TF32/FP16\) | Sequence Length | Throughput \- TF32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
+|------|---------------------------|-----------------|-------------------|------------------------------------------------|
+| 1 | 1 | 128 | 47.77 | 56.18 |
+| 1 | 2 | 128 | 109.89 | 114.17 |
+| 1 | 4 | 128 | 158.30 | 238.81 |
+| 1 | 8 | 128 | 176.72 | 463.49 |
+
+* SST-2
+
+| GPUs | Batch Size \(TF32/FP16\) | Sequence Length | Throughput \- TF32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
+|------|---------------------------|-----------------|-------------------|------------------------------------------------|
+| 1 | 1 | 128 | 51.16 | 57.67 |
+| 1 | 2 | 128 | 104.59 | 115.21 |
+| 1 | 4 | 128 | 207.64 | 232.52 |
+| 1 | 8 | 128 | 446.57 | 469.30 |
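One pattern worth noting in the inference tables above: the mixed-precision advantage grows with batch size, since larger batches keep the Tensor Cores better utilized. An editor's illustration using the MRPC numbers above:

```python
# Mixed-precision speedup over TF32 at each inference batch size
# (MRPC rows from the table above).
tf32 = {1: 47.77, 2: 109.89, 4: 158.30, 8: 176.72}  # sequences/sec, TF32
amp = {1: 56.18, 2: 114.17, 4: 238.81, 8: 463.49}   # sequences/sec, mixed precision

for bs in tf32:
    print(f"batch {bs}: {amp[bs] / tf32[bs]:.2f}x")
```

At batch size 1 the speedup is marginal (~1.18x) because the GPU is latency-bound; at batch size 8 it exceeds 2.6x.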
+
+##### Inference performance: NVIDIA DGX-2 (1x V100 32G)

-###### Fine-tuning inference on NVIDIA DGX-1 with 16G
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.

+###### Fine-tuning inference on NVIDIA DGX-2 with 32G
+
+* SQuAD
+
| GPUs | Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
|------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1 | 8/8 | 384 | 42 | 153 |
+| 1 | 8/8 | 384 | 43 | 148 |
+
+* MRPC
+
+| GPUs | Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
+|------|---------------------------|-----------------|-------------------|------------------------------------------------|
+| 1 | 1 | 128 | 59.07 | 60.53 |
+| 1 | 2 | 128 | 99.58 | 121.27 |
+| 1 | 4 | 128 | 136.92 | 228.77 |
+| 1 | 8 | 128 | 148.20 | 502.32 |
+
+* SST-2
+
+| GPUs | Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
+|------|---------------------------|-----------------|-------------------|------------------------------------------------|
+| 1 | 1 | 128 | 60.04 | 59.83 |
+| 1 | 2 | 128 | 111.25 | 117.59 |
+| 1 | 4 | 128 | 136.77 | 239.03 |
+| 1 | 8 | 128 | 146.58 | 504.10 |

##### Inference performance: NVIDIA DGX-1 (1x V100 32G)

Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.

###### Fine-tuning inference on NVIDIA DGX-1 with 32G
+
+* SQuAD

| GPUs | Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
|------|---------------------------|-----------------|-------------------|------------------------------------------------|
| 1 | 8/8 | 384 | 48 | 143 |

-##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
+##### Inference performance: NVIDIA DGX-1 (1x V100 16G)

-Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
+Our results were obtained by running `scripts/run_squad.sh` in the pytorch:20.06-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.

-###### Fine-tuning inference on NVIDIA DGX-2 with 32G
+###### Fine-tuning inference on NVIDIA DGX-1 with 16G

+* SQuAD
+
| GPUs | Batch Size \(FP32/FP16\) | Sequence Length | Throughput \- FP32\(sequences/sec\) | Throughput \- Mixed Precision\(sequences/sec\) |
|------|---------------------------|-----------------|-------------------|------------------------------------------------|
-| 1 | 8/8 | 384 |43 | 148 |
+| 1 | 8/8 | 384 | 42 | 153 |

To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -994,6 +1209,7 @@ The inference performance metrics used were items/second.
July 2020
- Updated accuracy and performance tables to include A100 results
+- Fine-tuning with the MRPC and SST-2 datasets.

March 2020
- TRITON Inference Server support.