Bladeren bron

[BERT/PyT] Support for multi-node

Przemek Strzelczyk 6 jaren geleden
bovenliggende
commit
6fe463fe27
43 gewijzigde bestanden met toevoegingen van 929 en 264 verwijderingen
  1. 19 7
      PyTorch/LanguageModeling/BERT/.dockerignore
  2. 3 6
      PyTorch/LanguageModeling/BERT/.gitignore
  3. 14 16
      PyTorch/LanguageModeling/BERT/Dockerfile
  4. 2 1
      PyTorch/LanguageModeling/BERT/LICENSE
  5. 278 186
      PyTorch/LanguageModeling/BERT/README.md
  6. 13 0
      PyTorch/LanguageModeling/BERT/bind_pyt.py
  7. 182 0
      PyTorch/LanguageModeling/BERT/configurations.yml
  8. 3 2
      PyTorch/LanguageModeling/BERT/create_pretraining_data.py
  9. 12 0
      PyTorch/LanguageModeling/BERT/data/BooksDownloader.py
  10. 12 0
      PyTorch/LanguageModeling/BERT/data/BookscorpusTextFormatting.py
  11. 12 0
      PyTorch/LanguageModeling/BERT/data/Downloader.py
  12. 11 0
      PyTorch/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py
  13. 11 0
      PyTorch/LanguageModeling/BERT/data/MRPCDownloader.py
  14. 11 0
      PyTorch/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py
  15. 11 0
      PyTorch/LanguageModeling/BERT/data/SquadDownloader.py
  16. 11 0
      PyTorch/LanguageModeling/BERT/data/TextSharding.py
  17. 12 3
      PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
  18. 11 0
      PyTorch/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
  19. 12 0
      PyTorch/LanguageModeling/BERT/data/__init__.py
  20. 13 2
      PyTorch/LanguageModeling/BERT/data/bertPrep.py
  21. 14 1
      PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
  22. 13 0
      PyTorch/LanguageModeling/BERT/data/glue/download_mrpc.sh
  23. 13 0
      PyTorch/LanguageModeling/BERT/data/squad/squad_download.sh
  24. 1 0
      PyTorch/LanguageModeling/BERT/extract_features.py
  25. 14 0
      PyTorch/LanguageModeling/BERT/file_utils.py
  26. 2 2
      PyTorch/LanguageModeling/BERT/modeling.py
  27. 2 0
      PyTorch/LanguageModeling/BERT/optimization.py
  28. 74 0
      PyTorch/LanguageModeling/BERT/run.sub
  29. 2 2
      PyTorch/LanguageModeling/BERT/run_glue.py
  30. 5 26
      PyTorch/LanguageModeling/BERT/run_pretraining.py
  31. 1 0
      PyTorch/LanguageModeling/BERT/run_pretraining_inference.py
  32. 7 4
      PyTorch/LanguageModeling/BERT/run_squad.py
  33. 2 2
      PyTorch/LanguageModeling/BERT/run_swag.py
  34. 14 0
      PyTorch/LanguageModeling/BERT/schedulers.py
  35. 14 0
      PyTorch/LanguageModeling/BERT/scripts/data_download.sh
  36. 15 1
      PyTorch/LanguageModeling/BERT/scripts/run_glue.sh
  37. 13 0
      PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh
  38. 13 0
      PyTorch/LanguageModeling/BERT/scripts/run_pretraining_inference.sh
  39. 13 1
      PyTorch/LanguageModeling/BERT/scripts/run_squad.sh
  40. 15 1
      PyTorch/LanguageModeling/BERT/scripts/run_swag.sh
  41. 14 0
      PyTorch/LanguageModeling/BERT/scripts/start_pretraining.sh
  42. 2 1
      PyTorch/LanguageModeling/BERT/tokenization.py
  43. 13 0
      PyTorch/LanguageModeling/BERT/utils.py

+ 19 - 7
PyTorch/LanguageModeling/BERT/.dockerignore

@@ -1,8 +1,20 @@
-data/download/
-data/extracted/
-data/formatted_one_article_per_line/
-data/sharded/
-data/hdf5/
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+data/download
+data/extracted
+data/formatted_one_article_per_line
+data/sharded
+data/hdf5
 vocab/
-results/
-checkpoints/*
+results/

+ 3 - 6
PyTorch/LanguageModeling/BERT/.gitignore

@@ -8,14 +8,11 @@ __pycache__/
 # C extensions
 *.so
 
-#Data       
+#Data checkpoints and results       
 data/*/*/   
 data/*/*.zip
-data/*
-
-#checkpoints and results
-checkpoints/*
-results/*
+checkpoints/
+results/
 
 # Distribution / packaging
 .Python

+ 14 - 16
PyTorch/LanguageModeling/BERT/Dockerfile

@@ -1,24 +1,22 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.07-py3
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.08-py3
 FROM ${FROM_IMAGE_NAME}
 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
 
 ENV BERT_PREP_WORKING_DIR /workspace/bert/data
 
-WORKDIR /opt
-RUN rm -rf /opt/pytorch/apex ; \
-  git clone https://github.com/NVIDIA/apex.git pytorch/apex ; \
-  cd pytorch/apex ; \
-  pip uninstall --yes apex; \
-  git checkout 880ab925bce9f817a93988b021e12db5f67f7787;  \
-  git pull; \
-  pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
-
-#WORKDIR /opt
-#RUN cd pytorch/apex \
-# && git fetch origin pull/334/head:multi_tensor_lamb_optimizer \
-# && git checkout multi_tensor_lamb_optimizer \
-# && python setup.py develop --cuda_ext --cpp_ext
-
 WORKDIR /workspace
 RUN git clone https://github.com/attardi/wikiextractor.git
 RUN git clone https://github.com/soskek/bookcorpus.git

+ 2 - 1
PyTorch/LanguageModeling/BERT/LICENSE

@@ -1,4 +1,3 @@
-
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -176,6 +175,8 @@
 
    END OF TERMS AND CONDITIONS
 
+   Copyright 2019 NVIDIA CORPORATION. All rights reserved.
+
    APPENDIX: How to apply the Apache License to your work.
 
       To apply the Apache License to your work, attach the following

+ 278 - 186
PyTorch/LanguageModeling/BERT/README.md

@@ -1,8 +1,8 @@
 # BERT For PyTorch
 
-This repository provides a script and recipe to train the BERT model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
+This repository provides a script and recipe to train the BERT model for PyTorch to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
 
-**Table Of Contents**
+## Table Of Contents
 
 - [Model overview](#model-overview)
     * [Model architecture](#model-architecture)
@@ -11,6 +11,7 @@ This repository provides a script and recipe to train the BERT model to achieve
         * [Features](#features)
     * [Mixed precision training](#mixed-precision-training)
         * [Enabling mixed precision](#enabling-mixed-precision)
+        * [Glossary](#glossary)
 - [Setup](#setup)
     * [Requirements](#requirements)
 - [Quick Start Guide](#quick-start-guide)
@@ -18,14 +19,12 @@ This repository provides a script and recipe to train the BERT model to achieve
     * [Scripts and sample code](#scripts-and-sample-code)
     * [Parameters](#parameters)
         * [Pre-training parameters](#pre-training-parameters)
+        * [Multi-node](#multi-node)
         * [Fine-tuning parameters](#fine-tuning-parameters)     
     * [Command-line options](#command-line-options)
     * [Getting the data](#getting-the-data)
         * [Dataset guidelines](#dataset-guidelines)
         * [Multi-dataset](#multi-dataset)
-            * [Relocating hdf5 files](#relocating-hdf5-files)
-            * [Inter sequence-pair mixing](#inter-sequence-pair-mixing)
-            * [Retaining document-level granularity](#retaining-document-level-granularity)
     * [Training process](#training-process)
         * [Pre-training](#pre-training)
         * [Fine-tuning](#fine-tuning)   
@@ -43,31 +42,34 @@ This repository provides a script and recipe to train the BERT model to achieve
             * [Training stability test](#training-stability-test)
                 * [Pre-training stability test](#pre-training-stability-test)
                 * [Fine-tuning stability test](#fine-tuning-stability-test) 
-            * [Training performance results](#training-performance-results)
-                * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
-                    * [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
-                    * [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)   
-                * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
-                    * [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
-                    * [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)   
-                * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
-                    * [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
-                    * [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)   
-            * [Inference performance results](#inference-performance-results)
-                * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
-                    * [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
-                    * [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
-                * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
-                    * [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
-                    * [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
-                * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
-                    * [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
-                    * [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
+          * [Training performance results](#training-performance-results)
+              * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
+                  * [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
+                  * [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
+                  * [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)   
+              * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
+                  * [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
+                  * [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)   
+              * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
+                  * [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
+                  * [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
+                  * [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)   
+          * [Inference performance results](#inference-performance-results)
+              * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
+                  * [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
+                  * [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
+              * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
+                  * [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
+                  * [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
+              * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
+                  * [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
+                  * [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
 - [Release notes](#release-notes)
     * [Changelog](#changelog)
     * [Known issues](#known-issues)
 
 
+
 ## Model overview
  
 BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for faster training times while maintaining target accuracy.
@@ -75,22 +77,25 @@ BERT, or Bidirectional Encoder Representations from Transformers, is a new metho
 The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pre-training and fine-tuning for tasks such as question answering. The major differences between the original implementation of the paper and this version of BERT are as follows:
 
 -   Scripts to download Wikipedia and BookCorpus datasets
--   Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion.
+-   Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion
 -   Fused [LAMB](https://arxiv.org/pdf/1904.00962.pdf) optimizer to support training with larger batches
 -   Fused Adam optimizer for fine tuning tasks
 -   Fused CUDA kernels for better performance LayerNorm
--   Automatic Mixed precision training support
+-   Automatic mixed precision (AMP) training support
+-   Scripts to launch on multiple nodes
 
 Other publicly available implementations of BERT include:
-
-1.  [Google's official implementation](https://github.com/google-research/bert)
-2.  [codertimo](https://github.com/codertimo/BERT-pytorch)
+1. [NVIDIA TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)
+2. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
+3. [codertimo](https://github.com/codertimo/BERT-pytorch)
+4. [gluon-nlp](https://github.com/dmlc/gluon-nlp/tree/master/scripts/bert)
+5. [Google's implementation](https://github.com/google-research/bert)
     
 This model trains with mixed precision Tensor Cores on Volta and provides a push-button solution to pretraining on a corpus of choice. As a result, researchers can get results 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
 
 ### Model architecture
 
-The BERT architecture uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, a positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
+The BERT architecture uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
 
 An illustration of the architecture taken from the [Transformer paper](https://arxiv.org/pdf/1706.03762.pdf) is shown below.
 
@@ -100,14 +105,14 @@ An illustration of the architecture taken from the [Transformer paper](https://a
 
 The architecture of the BERT model is almost identical to the Transformer model that was first introduced in the [Attention Is All You Need paper](https://arxiv.org/pdf/1706.03762.pdf). The main innovation of BERT lies in the pre-training step, where the model is trained on two unsupervised prediction tasks using a large text corpus. Training on these unsupervised tasks produces a generic language model, which can then be quickly fine-tuned to achieve state-of-the-art performance on language processing tasks such as question answering.
 
-The BERT paper reports results two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.  
+The BERT paper reports the results for two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.  
 
 | **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
 |:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
 |BERTBASE |12 encoder| 768| 12|4 x  768|512|110M|
 |BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
 
-Additionally, this implementation supports training on multiple GPUs. Mixed precision training and inference with dynamic loss scaling is also supported.
+
 
 ### Feature support matrix
 
@@ -118,12 +123,13 @@ The following features are supported by this model.
 |APEX AMP|Yes|
 |APEX DDP|Yes|
 |LAMB|Yes|
+|Multi-node|Yes|
 
 #### Features
  
-[APEX](https://github.com/NVIDIA/apex) is a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. 
+[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training, whereas [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
  
-[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training, where as [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
+[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training.
 
 [LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer and is a large batch optimization technique that helps accelerate training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512 respectively, compared to a batch size of 256 for Adam. The optimized implementation accumulates 1024 gradient batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs resulting in training speedups of up to 72x in comparison to [Adam](https://arxiv.org/pdf/1412.6980.pdf). Adam has limitations on the learning rate that can be used since it is applied globally on all parameters whereas LAMB follows a layerwise learning rate strategy.
 
@@ -135,10 +141,9 @@ Mixed precision is the combined use of different numerical precisions in a compu
 2.  Adding loss scaling to preserve small gradient values.
 
 For information about:
-
 -   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
 -   Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
--   APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
+-   APEX tools for mixed precision training, see the [NVIDIA APEX: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
  
 #### Enabling mixed precision
 
@@ -149,15 +154,35 @@ Automatic mixed precision can be enabled with the following code changes:
 ```
 from apex import amp
 if fp16:
-	# Wrap optimizer and model
-	model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale=”dynamic”)
+    # Wrap optimizer and model
+    model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
  
 if fp16:
-	with amp.scale_loss(loss, optimizer) as scaled_loss:
-    	scaled_loss.backward()
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
    ```
 
-Where `<opt_level>` is the optimization level. In the pretraining, “O2” is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to the pre-training and fine-tuning Python scripts. Shell scripts all have a positional argument available to enable mixed precision training.
+Where `<opt_level>` is the optimization level. In the pretraining, `O2` is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to the `run_pretraining.py` and `run_squad.py`. All shell scripts have a positional argument available to enable mixed precision training.
+
+### Glossary
+
+**Fine-tuning**  
+Training an already pretrained model further using a task-specific dataset for subject-specific refinements, by adding task-specific layers on top if required.
+
+**Language Model**  
+Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
+
+**Pre-training**  
+Training a model on vast amounts of data on the same (or different) task to build general understandings.
+
+**Transformer**  
+The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
+
+**Phase1**  
+Pretraining on samples of sequence length 128 and 20 masked predictions per sequence.
+
+**Phase2**  
+Pretraining on samples of sequence length 512 and 80 masked predictions per sequence.
  
 ## Setup
 
@@ -178,9 +203,14 @@ For more information about how to get started with NGC containers, see the follo
 
 For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
 
+For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
+
+More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide).
+
+
 ## Quick Start Guide
 
-To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see [Advanced](#advanced).
+To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
  
 
 1. Clone the repository.
@@ -190,11 +220,11 @@ To train your model using mixed precision with Tensor Cores or using FP32, perfo
 `cd DeepLearningExamples/PyTorch/LanguageModeling/BERT`
 
 
-2. Download NVIDIA pretrained checkpoint.
+2. Download the NVIDIA pretrained checkpoint.
 
-If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Make sure to place the downloaded checkpoint in `checkpoints/` folder.
+If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Ensure you place the downloaded checkpoint in the `checkpoints/` folder.
 
-3. Build the BERT 19.07 NGC container.
+3. Build the BERT 19.08 NGC container.
 
 `bash scripts/docker/build.sh`
 
@@ -202,7 +232,7 @@ If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/c
 
 `bash scripts/docker/launch.sh`
 
-Resultant logs and checkpoints of pretraining and finetuning routines get stored in the `results/` folder.
+Resultant logs and checkpoints of pretraining and fine-tuning routines get stored in the `results/` folder.
 
 `data` and `vocab.txt` are downloaded in `data/` directory by default. Refer to the [Getting the data](#getting-the-data) section for more details on how to process a custom corpus as required for BERT pretraining.
 
@@ -214,25 +244,29 @@ This repository provides scripts to download, verify and extract the following d
 -   Wikipedia (pre-training)
 -   BookCorpus (pre-training)
 
-To download, verify, extract the datasets, and create the shards in hdf5 format, run:
+To download, verify, extract the datasets, and create the shards in hdf5 format, run:  
 `/workspace/bert/data/create_datasets_from_start.sh`
 
-6. Start pre-training.
+Depending on the speed of your internet connection, this process takes about a day to complete.
+
+6. Start pretraining.
 
-BERT is designed to pre-train deep bidirectional representations for language representations. The following scripts are to replicate pretraining on Wikipedia+Book Corpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
+BERT is designed to pre-train deep bidirectional networks for language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
 
-From within the container, you can use the following script to run pre-training.
+To run on a single node, from within the container, you can use the following script to run pre-training.  
 `bash scripts/run_pretraining.sh`
 
-More details can be found in Details/Training Process
+The default hyperparameters are set to run on 8 x V100 32G cards.
+
+To run on multiple nodes, see the [Multi-node](#multi-node) section.  
  
-7. Start fine-tuning with the SQUAD dataset.
+7. Start fine-tuning with the SQuAD dataset.
 
-The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuaD dataset.
+The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
 
 `bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`
 
-Default arguments are listed below in order,
+Default arguments are listed below in the order the script expects:
 
 -   Initial checkpoint - The default is `/workspace/checkpoints/bert_uncased.pt`.
 -   Number of training Epochs - The default is `2`.
@@ -244,18 +278,18 @@ Default arguments are listed below in order,
 -   SQuAD directory -  The default is `/workspace/bert/data/v1.1`.
 -   Vocabulary file (token to ID mapping) - The default is `/workspace/bert/vocab/vocab`.
 -   Output directory for result - The default is `/results/SQuAD`.
--   Mode (“train”, “eval”, “train eval”, "predict") - The default is `train`.
--   Config file for the bert model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
+-   Mode (`train`, `eval`, `train eval`, `predict`) - The default is `train`.
+-   Config file for the BERT model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
 
-The script will save the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
+The script saves the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
 
 9. Start validation/evaluation.
 
-Validation can be performed with the same script as above, setting `Mode` to "prediction".
+Validation can be performed with the same script as above, setting `Mode` to `prediction`.
 
 10. Start inference/predictions.
 
-Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions get saved to `<OUTPUT_DIRECTORY>/predictions.json`.
+Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
 
 ## Advanced
 
@@ -273,7 +307,7 @@ Descriptions of the key scripts and folders are provided below.
 -   `create_pretraining_data.py` - Creates `.hdf5` files from sharded text files in the final step of dataset creation.
 -   `model.py` - Implements the BERT pre-training and fine-tuning model architectures with PyTorch.
 -   `optimization.py` - Implements the LAMB optimizer with PyTorch.
--   `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuaD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
+-   `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
 -   `run_pretraining.py` - Implements BERT pre-training.
 -   `run_pretraining_inference.py` - Implements evaluation of a BERT pre-trained model.
 
@@ -284,145 +318,169 @@ Descriptions of the key scripts and folders are provided below.
 The complete list of the available parameters for the `run_pretraining.py` script is:
 
 ```
-  --input_dir INPUT_DIR   	- The input data directory.
-                            	Should contain .hdf5 files for the task.
+  --input_dir INPUT_DIR       - The input data directory.
+                                Should contain .hdf5 files for the task.
 
-  --config_file CONFIG_FILE   - Path to a json file describing the BERT model
-                            	configuration. This file configures the model
-                            	architecture, such as the number of transformer
-                            	blocks, number of attention heads, etc.
+  --config_file CONFIG_FILE      - Path to a json file describing the BERT model
+                                configuration. This file configures the model
+                                architecture, such as the number of transformer
+                                blocks, number of attention heads, etc.
 
-  --bert_model BERT_MODEL 	- Specifies the type of BERT model to use;
-                            	should be one of the following:
-    	bert-base-uncased
-    	bert-large-uncased
-    	bert-base-cased
-    	bert-base-multilingual
-    	bert-base-chinese
+  --bert_model BERT_MODEL        - Specifies the type of BERT model to use;
+                                should be one of the following:
+        bert-base-uncased
+        bert-large-uncased
+        bert-base-cased
+        bert-base-multilingual
+        bert-base-chinese
 
-  --output_dir OUTPUT_DIR 	- Path to the output directory where the model
-                            	checkpoints will be written.
+  --output_dir OUTPUT_DIR        - Path to the output directory where the model
+                                checkpoints will be written.
 
   --max_seq_length MAX_SEQ_LENGTH
-                          	- The maximum total input sequence length after
-                            	WordPiece tokenization. Sequences longer than
-                            	this will be truncated, and sequences shorter
-                            	than this will be padded.
+                              - The maximum total input sequence length after
+                                WordPiece tokenization. Sequences longer than
+                                this will be truncated, and sequences shorter
+                                than this will be padded.
 
   --max_predictions_per_seq MAX_PREDICTIONS_PER_SEQ
-                          	- The maximum total of masked tokens per input
-                            	sequence for Masked LM.
+                              - The maximum total of masked tokens per input
+                                sequence for Masked LM.
 
   --train_batch_size TRAIN_BATCH_SIZE
-                          	- Batch size per GPU for training.
+                              - Batch size per GPU for training.
 
   --learning_rate LEARNING_RATE
-                          	- The initial learning rate for LAMB optimizer.
+                              - The initial learning rate for LAMB optimizer.
 
-  --max_steps MAX_STEPS   	- Total number of training steps to perform.
+  --max_steps MAX_STEPS       - Total number of training steps to perform.
 
   --warmup_proportion WARMUP_PROPORTION
-                          	- Proportion of training to perform linear learning
-                            	rate warmup for. For example, 0.1 = 10% of training.
+                              - Proportion of training to perform linear learning
+                                rate warmup for. For example, 0.1 = 10% of training.
 
-  --seed SEED             	- Sets the seed to use for random number generation.
+  --seed SEED                 - Sets the seed to use for random number generation.
 
   --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
-                          	- Number of update steps to accumulate before
-                            	performing a backward/update pass.
+                              - Number of update steps to accumulate before
+                                performing a backward/update pass.
 
-  --fp16                  	- If set, will perform computations using
-                            	automatic mixed precision.
+  --fp16                      - If set, will perform computations using
+                                automatic mixed precision.
 
-  --loss_scale LOSS_SCALE 	- Sets the loss scaling value to use when
-                            	mixed precision is used. The default value (0)
-                            	tells the script to use dynamic loss scaling
-                            	instead of fixed loss scaling.
+  --loss_scale LOSS_SCALE        - Sets the loss scaling value to use when
+                                mixed precision is used. The default value (0)
+                                tells the script to use dynamic loss scaling
+                                instead of fixed loss scaling.
 
-  --log_freq LOG_FREQ     	- If set, the script will output the training
-                            	loss every LOG_FREQ steps.
+  --log_freq LOG_FREQ         - If set, the script will output the training
+                                loss every LOG_FREQ steps.
  
-  --resume_from_checkpoint	- If set, training will resume from a checkpoint
-                            	that currently exists in OUTPUT_DIR.
+  --resume_from_checkpoint       - If set, training will resume from a checkpoint
+                                that currently exists in OUTPUT_DIR.
 
   --num_steps_per_checkpoint NUM_STEPS_PER_CHECKPOINT
-                          	- Number of update steps until a model checkpoint
-                            	is saved to disk.`
+                              - Number of update steps until a model checkpoint
+                                is saved to disk.
+  --phase2                 - Specified if training on phase 2 only. If not specified, default pretraining is on phase 1.
 
+  --phase1_end_step        - The number of steps phase 1 was trained for. To resume
+                             phase 2 correctly, phase1_end_step should match the
+                             --max_steps value that phase 1 was trained with.
 ```
  
+
+#### Multi-node
+
+Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script. For example, the following commands launch phase 1 and phase 2 on a 4-node DGX-1 cluster:
+```
+BATCHSIZE=2048 LR=6e-3 GRADIENT_STEPS=128 PHASE=1 sbatch -N4 --ntasks-per-node=8 run.sub
+BATCHSIZE=1024 LR=4e-3 GRADIENT_STEPS=256 PHASE=2 sbatch -N4 --ntasks-per-node=8 run.sub
+```
+
+
+The checkpoint after phase 1 is saved in the `checkpointdir` specified in `run.sub` and is automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
+
+Variables to re-run the [Training performance results](#training-performance-results) are available in the `configurations.yml` file. 
+
+The batch variables `BATCHSIZE`, `LR`, `GRADIENT_STEPS`, and `PHASE` refer to the Python arguments `train_batch_size`, `learning_rate`, `gradient_accumulation_steps`, and `phase2`, respectively.
+
+Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase. 
+
+Refer to the file's contents to see the full list of variables to adjust for your system.
+
+
 #### Fine-tuning parameters
 
-The run_squad.py script contains many of the same arguments as `run_pretraining.py`.
+The `run_squad.py` script contains many of the same arguments as `run_pretraining.py`.
 The main script specific parameters are:
 
 ```
- --bert_model BERT_MODEL  	- Specifies the type of BERT model to use;
-                            	should be one of the following:
-    	bert-base-uncased
-    	bert-large-uncased
-    	bert-base-cased
-    	bert-base-multilingual
-    	bert-base-chinese
+ --bert_model BERT_MODEL      - Specifies the type of BERT model to use;
+                                should be one of the following:
+        bert-base-uncased
+        bert-large-uncased
+        bert-base-cased
+        bert-base-multilingual
+        bert-base-chinese
 
- --train_file TRAIN_FILE  	- Path to the SQuAD json for training.
-                            	For example, train-v1.1.json.
+ --train_file TRAIN_FILE      - Path to the SQuAD json for training.
+                                For example, train-v1.1.json.
 
- --predict_file PREDICT_FILE  - Path to the SQuAD json for predictions.
-                            	For example, dev-v1.1.json or test-v1.1.json.
+ --predict_file PREDICT_FILE     - Path to the SQuAD json for predictions.
+                                For example, dev-v1.1.json or test-v1.1.json.
 
  --max_seq_length MAX_SEQ_LENGTH
-                          	- The maximum total input sequence length
-                            	after WordPiece tokenization.
-                            	Sequences longer than this will be truncated,
-                            	and sequences shorter than this will be padded.
+                              - The maximum total input sequence length
+                                after WordPiece tokenization.
+                                Sequences longer than this will be truncated,
+                                and sequences shorter than this will be padded.
 
- --doc_stride DOC_STRIDE  	- When splitting up a long document into chunks
-                            	this parameters sets how much stride to take
-                            	between chunks of tokens.
+ --doc_stride DOC_STRIDE      - When splitting up a long document into chunks
+                                this parameter sets how much stride to take
+                                between chunks of tokens.
 
  --max_query_length MAX_QUERY_LENGTH
-                          	- The maximum number of tokens for the question.
-                            	Questions longer than <max_query_length>
-                            	will be truncated to the value specified.
+                              - The maximum number of tokens for the question.
+                                Questions longer than <max_query_length>
+                                will be truncated to the value specified.
 
- --n_best_size N_BEST_SIZE	- The total number of n-best predictions to
-                            	generate in the nbest_predictions.json
-                            	output file.
+ --n_best_size N_BEST_SIZE       - The total number of n-best predictions to
+                                generate in the nbest_predictions.json
+                                output file.
 
  --max_answer_length MAX_ANSWER_LENGTH
-                          	- The maximum length of an answer that can be
-                            	generated. This is needed because the start and
-                            	end predictions are not conditioned on one another.
+                              - The maximum length of an answer that can be
+                                generated. This is needed because the start and
+                                end predictions are not conditioned on one another.
 
- --verbose_logging        	- If true, all the warnings related to data
-                            	processing will be printed. A number of warnings
-                            	are expected for a normal SQuAD evaluation.
+ --verbose_logging            - If true, all the warnings related to data
+                                processing will be printed. A number of warnings
+                                are expected for a normal SQuAD evaluation.
 
- --do_lower_case          	- Whether to lower case the input text. Set to
-                            	true for uncased models and false for cased models.
+ --do_lower_case              - Whether to lower case the input text. Set to
+                                true for uncased models and false for cased models.
 
- --version_2_with_negative	- If true, the SQuAD examples contain questions
-                            	that do not have an answer.
+ --version_2_with_negative       - If true, the SQuAD examples contain questions
+                                that do not have an answer.
 
  --null_score_diff_threshold NULL_SCORE_DIFF_THRESHOLD
-                          	- A null answer will be predicted if null_score if
-                            	best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
+                              - A null answer will be predicted if null_score -
+                                best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
 ```
 
 ### Command-line options
 
-To see the full list of available options and their descriptions, use the -h or --help command line option, for example:
+To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
 
 `python run_pretraining.py --help`
 
 `python run_squad.py --help`
 
-Detailed descriptions of command line options can be found in the Parameters section above.
+Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
 
 ### Getting the data
 
-For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as Book Corpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
+For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
 
 The preparation of pre-training dataset is described in the `bertPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows:
 
@@ -436,12 +494,11 @@ The preparation of pre-training dataset is described in the `bertPrep.py` script
 
 5.  hdf5 file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding hdf5 file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
 
-The tools used for preparing the Bookcorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
-
+The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
 
 For fine-tuning a pre-trained BERT model for specific tasks, by default this repository prepares the following dataset:
 
--   [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering 
+-   [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
 
 #### Dataset guidelines
 
@@ -469,7 +526,7 @@ The training process consists of two steps: pre-training and fine-tuning.
 
 Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
 
-The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and BookCorpus datasets as training data using LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
+The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using Wikipedia and BookCorpus datasets as training data using the LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
 
 Phase 1: (Maximum sequence length of 128)
 -   Runs on 8 GPUs with training batch size of 64 per GPU
@@ -487,7 +544,7 @@ Phase 2: (Maximum sequence length of 512)
 -   Saves a checkpoint every 200 iterations (keeps only the latest 3 checkpoints) and at the end of training. All checkpoints, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
 -   Creates a log file containing all the output
 
-These parameters will train on Wikipedia and BooksCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
+These parameters will train on Wikipedia and BookCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
 
 `bash run_pretraining.sh <training_batch_size> <learning_rate> <precision> <num_gpus> <warmup_proportion> <training_steps> <save_checkpoint_steps> <resume_training> <create_logfile> <accumulate_gradients> <gradient_accumulation_steps> <seed> <job_name> <allreduce_post_accumulation> <allreduce_post_accumulation_fp16> <accumulate_into_fp16> <train_batch_size_phase2> <learning_rate_phase2> <warmup_proportion_phase2> <train_steps_phase2> <gradient_accumulation_steps_phase2>`
 
@@ -496,14 +553,14 @@ Where:
 -   `<training_batch_size>` is per-GPU batch size used for training. Larger batch sizes run more efficiently, but require more memory.
 -   `<learning_rate>` is the base learning rate for training
 -   `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
-	-   FP32: 32-bit IEEE single precision floats.
-	-   FP16: Mixed precision 16 and 32 bit floats.
+    -   FP32: 32-bit IEEE single precision floats.
+    -   FP16: Mixed precision 16 and 32 bit floats.
 -   `<num_gpus>` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
 -   `<warmup_proportion>` is the percentage of training steps used for warm-up at the start of training.
 -   `<training_steps>` is the total number of training steps.
 -   `<save_checkpoint_steps>` controls how often checkpoints are saved.
--   `<resume_training>` if set to true, training should resume from latest model in /results/checkpoints. Default is false.
--   `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are true or false. true indicates output should be saved to a log file.)
+-   `<resume_training>` if set to `true`, training should resume from latest model in `/results/checkpoints`. Default is `false`.
+-   `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a log file.)
 -   `<accumulate_gradient>` a flag indicating whether a larger batch should be simulated with gradient accumulation.
 -   `<gradient_accumulation_steps>` an integer indicating the number of steps to accumulate gradients over. Effective batch size = `training_batch_size` / `gradient_accumulation_steps`.
 -   `<seed>` random seed for the run.
@@ -522,7 +579,7 @@ For example:
 
 Trains BERT-large from scratch on a DGX-1 32G using FP16 arithmetic. 90% of the training steps are done with sequence length 128 (phase1 of training) and 10% of the training steps are done with sequence length 512 (phase2 of training).
 
-In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`
+In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`.
 
 In order to train on a DGX-2 32G, set `train_batch_size` to `4096`, `train_batch_size_phase2` to `2048`, `num_gpus` to `16`, `gradient_accumulation_steps` to `64` and `gradient_accumulation_steps_phase2` to `256` in `scripts/run_pretraining.sh`.
 
@@ -538,17 +595,17 @@ By default, each Python script implements fine-tuning a pre-trained BERT model f
 -   Has FP16 precision enabled
 -   Saves a checkpoint at the end of training to the `/results/<dataset_name>` folder
 
-Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA’s [Apex](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, consult the [Parameters](#parameters) section.
+Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA’s [APEX](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
 
 All fine-tuning shell scripts have the same positional arguments, outlined below:
 
-`bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQUAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>`
+```bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQuAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>```
 
 By default, the mode positional argument is set to train eval. See the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
 
 Note: The first positional argument (the path to the checkpoint to load) is required.
 
-Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory or separate path can be a command line input to `run_squad.sh`.
+Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory; alternatively, a separate path can be passed as a command-line input to `run_squad.sh`.
 
 ### Inference process
 
@@ -578,13 +635,13 @@ Where:
 
 -   `<evaluation_batch_size>` is per-GPU batch size used for inference. Larger batch sizes run more efficiently, but require more memory.
 -   `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
-	-   `fp32`: 32-bit IEEE single precision floats
-	-   `fp16`: 16-bit floats for 3.2x faster inference
+    -   `fp32`: 32-bit IEEE single precision floats
+    -   `fp16`: 16-bit floats for 3.2x faster inference
 -   `<num_gpus>` is the number of GPUs to use for inference. Must be equal to or smaller than the number of GPUs attached to your node.
 -   `<inference_mode>` is either `--eval` for evaluation or `--prediction` for inference
--   `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the checkpoints folder.
+-   `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the `checkpoints` folder.
 -   `<inference_steps>` is the total number of inference steps per process. Default is `-1`, which iterates over the entire dataset.
--   `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are true or false. true indicates output should be saved to a logfile.)
+-   `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a logfile.)
 
 For example:
 
@@ -598,11 +655,10 @@ Evaluation fine-tuning is enabled by the same scripts as training:
 
 The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned BERT model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
 
-Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting `mode` variable in the script to either `eval` or `prediction` flag, you can choose between running evaluation on a given dataset or doing prediction.
+Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction`, you can choose between running predictions and evaluating them on a given dataset, or only running predictions.
 
 `bash scripts/run_squad.sh <path to fine-tuned model checkpoint>`
 
-Note: Fine-tuning evaluation is only supported on single GPU.
 
 ## Performance
 
@@ -612,11 +668,11 @@ The following section shows how to run benchmarks measuring the model performanc
 
 #### Training performance benchmark
 
-Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Training process](#training-process).
+Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Training process](#training-process).
 
 To benchmark the training performance on a specific batch size, run:
 
-`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> train <BERT config path] <max steps>`
+`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> train <BERT config path> <max steps>`
 
 An example call used to generate throughput numbers:
 
@@ -626,11 +682,11 @@ An example call used to generate throughput numbers:
 
 #### Inference performance benchmark
 
-Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Inference process](#inference-process).
+Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Inference process](#inference-process).
 
 To benchmark the inference performance on a specific batch size, run:
 
-`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
+`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
 
 An example call used to generate throughput numbers:
 
@@ -644,18 +700,20 @@ An example call used to generate throughput numbers:
 #### Training accuracy results
 
 
-
-Our results were obtained by running `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
-
-Note: Pretraining results were obtained with a dataset that was created using an earlier version of the data preprocessing scripts than are currently in this repository, and with an an earlier snapshot of wikidumps. The results in the table will be updated soon with results using the latest data prep scripts. Early data show the results are quite similar.
+Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
 
 
 ##### Pre-training loss results
 
-| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(days) - FP32 | Time to train(days) - mixed precision | Time to train speedup (FP32 to mixed precision)
+| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
 |---|---|---|---|---|---|---|---|---
-| NVIDIA DGX-1 With 16G|8|8192 and 4196 |512 and 1024|-|1.53|-|6.84|- 
-| NVIDIA DGX-2 With 32G|16|4096 and 2048 |64 and 256|-|1.52|-|2.71|- 
+| 1 x NVIDIA DGX-1 With 16G|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
+| 1 x NVIDIA DGX-2H With 32G|16|4096 and 2048 |64 and 256|-|1.35|-|58.4|-
+| 4 x NVIDIA DGX-1 With 16G|8|2048 and 1024 |128 and 256|-|1.34|-|39.27|-
+| 4 x NVIDIA DGX-2H With 32G|16|1024 and 512 |16 and 64|-|1.33|-|15.35|-
+| 16 x NVIDIA DGX-1 With 16G|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
+| 16 x NVIDIA DGX-2H With 32G|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
+| 64 x NVIDIA DGX-2H With 32G|16|64 and 32 |(1 and 4)FP16 and (2 and 8)FP32|1.33|1.331|4.338|1.124|3.85
 
 ##### Fine-tuning accuracy results
 
@@ -667,9 +725,9 @@ Note: Pretraining results were obtained with a dataset that was created using an
 
 ###### Pre-training stability test
 
-| Accuracy Metric | Seed 1
-|---|---
-| Final Loss | 1.52
+| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
+|---|---|---|---|---|---|---|---
+|Final Loss| 1.344 | 1.328 | 1.324 | 1.326 | 1.333 | 1.331 | 0.009
 
 ###### Fine-tuning stability test
 
@@ -680,11 +738,12 @@ Training stability with 8 GPUs, FP16 computations, batch size of 4:
 |Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | .200
 | f1 % | 91.29 | 91.01 | 91.14 |  91.10 | 90.85 | 91.08 | 0.162
 
+
 #### Training performance results
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 
-Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.shtraining scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
 
 ###### Pre-training NVIDIA DGX-1 With 16G
 
@@ -698,6 +757,18 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
 | 8| 2| 4| 512| 56.16 |194.56 | 3.46| 7.43| 7.30
 
 
+###### Pre-training on multiple NVIDIA DGX-1 With 16G
+
+| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
+|1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
+|4 |8 | N/A | 16| 128| N/A |3089.76 | N/A| N/A| 3.53
+|16 |8 | N/A | 16| 128| N/A |12144.64 | N/A| N/A| 13.89
+|1 |8 | N/A | 4| 512| N/A |195.93 |N/A |N/A | 1.00
+|4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
+|16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02
+
+
 ###### Fine-tuning NVIDIA DGX-1 With 16G
 
 
@@ -713,7 +784,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 32G)
 
-Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
 
 ###### Pre-training NVIDIA DGX-1 With 32G
 
@@ -729,6 +800,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
 |4 |N/A | 10| 512|N/A |164.00 | N/A| N/A| 3.57
 | 8|N/A | 10| 512|N/A |325.60| N/A| N/A| 7.08
 
+
 ###### Fine-tuning NVIDIA DGX-1 With 32G
 
 | GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@@ -743,7 +815,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
 
 ##### Training performance: NVIDIA DGX-2 (16x V100 32G)
 
-Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
+Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
 
 ###### Pre-training NVIDIA DGX-2 With 32G
 
@@ -762,6 +834,22 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
 |8 | N/A | 10| 512| N/A| 325.60| N/A| N/A| 6.87
 |16 | N/A | 10| 512| N/A| 648.00| N/A| N/A| 13.67
 
+###### Pre-training on multiple NVIDIA DGX-2H With 32G
+
+Note: The multi-node performance numbers below were measured on DGX-2H, whereas the single-node performance numbers above were measured on DGX-2.
+
+
+| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
+|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
+|1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
+|4 |16 | N/A | 64| 128| N/A |12709.88 | N/A| N/A| 3.76
+|16 |16 | N/A | 64| 128| N/A |51937.28 | N/A| N/A| 15.37
+|64 |16 | 32 | 64| 128| 46628.86 |188088.32 | 4.03 | N/A| 55.66
+|1 |16 | N/A | 8| 512| N/A |625.66 |N/A |N/A | 1.00
+|4 |16 | N/A | 8| 512| N/A |2386.38 | N/A| N/A| 3.81
+|16| 16| N/A | 8| 512| N/A |9932.8 | N/A| N/A| 15.87
+|64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
+
 ###### Fine-tuning NVIDIA DGX-2 With 32G
 
 | GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@@ -781,7 +869,7 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
 
-Our results were obtained by running `scripts/run_pretraining_inference.sh` on data of sequence length 512 and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
 
 ###### Pre-training inference on NVIDIA DGX-1 with 16G
 
@@ -797,7 +885,7 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` on d
 
 ##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
 
-Our results were obtained by running `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
 
 ###### Pre-training inference on NVIDIA DGX-1 with 32G
 
@@ -813,13 +901,13 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` and
 
 ##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
 
-Our results were obtained by running `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
+Our results were obtained by running the `scripts/run_pretraining_inference.sh` and  `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
 
 ###### Pre-training inference on NVIDIA DGX-2 with 32G
 
 |GPUs | Throughput - FP32(sequences/sec)|Throughput - Mixed Precision(sequences/sec)
 |---------- |---------|---------------
-| 1| 30.24 97.72
+| 1| 30.24| 97.72
 
 ###### Fine-tuning inference on NVIDIA DGX-2 with 32G
 
@@ -835,16 +923,20 @@ The inference performance metrics used were items/second.
 
 ### Changelog
 
-August 2019
-
-- Pretraining support with LAMB optimizer
+September 2019
+- Scripts to support multi-node launch
+- Update pretraining loss results based on the latest data preparation scripts
 
+August 2019
+- Pre-training support with LAMB optimizer
 - Updated Data download and Preprocessing
 
 July 2019
-
 - Initial release
 
 ### Known issues
 
 There are no known issues with this model.
+
+
+

+ 13 - 0
PyTorch/LanguageModeling/BERT/bind_pyt.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import sys
 import subprocess
 import os

+ 182 - 0
PyTorch/LanguageModeling/BERT/configurations.yml

@@ -0,0 +1,182 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#1 DGX1 phase1
+bert--DGX1:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "8192"
+    LR: "6e-3"
+    GRADIENT_STEPS: "512"
+    PHASE: "1"
+
+#4 DGX1 phase1
+bert--DGX1_4x8x16x128:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "2048"
+    LR: "6e-3"
+    GRADIENT_STEPS: "128"
+    PHASE: "1"
+
+#16 DGX1 phase1
+bert--DGX1_16x8x16x32:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "512"
+    LR: "6e-3"
+    GRADIENT_STEPS: "32"
+    PHASE: "1"
+
+#1 DGX2 phase1
+bert--DGX2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "4096"
+    LR: "6e-3"
+    GRADIENT_STEPS: "64"
+    PHASE: "1"
+
+#4 DGX2 phase1
+bert--DGX2_4x16x64x16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "1024"
+    LR: "6e-3"
+    GRADIENT_STEPS: "16"
+    PHASE: "1"
+
+#16 DGX2 phase1
+bert--DGX2_16x16x64x4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "256"
+    LR: "6e-3"
+    GRADIENT_STEPS: "4"
+    PHASE: "1"
+
+#64 DGX2 phase1
+bert--DGX2_64x16x64:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "64"
+    BATCHSIZE: "64"
+    LR: "6e-3"
+    GRADIENT_STEPS: "1"
+    PHASE: "1"
+
+#1 DGX1 phase2
+bert--DGX1_1x8x4x1024:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "4096"
+    LR: "4e-3"
+    GRADIENT_STEPS: "1024"
+    PHASE: "2"
+
+#4 DGX1 phase2
+bert--DGX1_4x8x4x256:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "1024"
+    LR: "4e-3"
+    GRADIENT_STEPS: "256"
+    PHASE: "2"
+
+#16 DGX1 phase2
+bert--DGX1_16x8x4x64:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "256"
+    LR: "4e-3"
+    GRADIENT_STEPS: "64"
+    PHASE: "2"
+
+#1 DGX2 phase2
+bert--DGX2_1x16x8x256:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "2048"
+    LR: "4e-3"
+    GRADIENT_STEPS: "256"
+    PHASE: "2"
+
+#4 DGX2 phase2
+bert--DGX2_4x16x8x64:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "512"
+    LR: "4e-3"
+    GRADIENT_STEPS: "64"
+    PHASE: "2"
+
+#16 DGX2 phase2
+bert--DGX2_16x16x8x16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "128"
+    LR: "4e-3"
+    GRADIENT_STEPS: "16"
+    PHASE: "2"
+
+#64 DGX2 phase2
+bert--DGX2_64x16x8x4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "64"
+    BATCHSIZE: "32"
+    LR: "4e-3"
+    GRADIENT_STEPS: "4"
+    PHASE: "2"
+

+ 3 - 2
PyTorch/LanguageModeling/BERT/create_pretraining_data.py

@@ -1,6 +1,6 @@
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Create masked LM/next sentence masked_lm TF examples for BERT."""
 from __future__ import absolute_import, division, print_function, unicode_literals
 

+ 12 - 0
PyTorch/LanguageModeling/BERT/data/BooksDownloader.py

@@ -1,4 +1,16 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import subprocess
 
 class BooksDownloader:

+ 12 - 0
PyTorch/LanguageModeling/BERT/data/BookscorpusTextFormatting.py

@@ -1,4 +1,16 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import glob
 import os
 

+ 12 - 0
PyTorch/LanguageModeling/BERT/data/Downloader.py

@@ -1,4 +1,16 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
 from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
 from WikiDownloader import WikiDownloader

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import hashlib
 import os

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/MRPCDownloader.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import bz2
 import os

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import os
 

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/SquadDownloader.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import bz2
 import os

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/TextSharding.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from collections import defaultdict
 from itertools import islice

+ 12 - 3
PyTorch/LanguageModeling/BERT/data/WikiDownloader.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import bz2
 import os
@@ -43,6 +54,4 @@ class WikiDownloader:
             subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
 
         else:
-            assert False, 'WikiDownloader not implemented for this language yet.'
-
-
+            assert False, 'WikiDownloader not implemented for this language yet.'

+ 11 - 0
PyTorch/LanguageModeling/BERT/data/WikicorpusTextFormatting.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import glob
 import os

+ 12 - 0
PyTorch/LanguageModeling/BERT/data/__init__.py

@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 13 - 2
PyTorch/LanguageModeling/BERT/data/bertPrep.py

@@ -1,4 +1,15 @@
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import BookscorpusTextFormatting
 import Downloader
@@ -70,14 +81,13 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+                #wikiextractor_process.communicate()
 
             wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
             output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
             wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
             wiki_formatter.merge()
 
-            assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
-            
         elif args.dataset == 'wikicorpus_zh':
             assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
             if args.skip_wikiextractor == 0:
@@ -85,6 +95,7 @@ def main(args):
                 wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                 print('WikiExtractor Command:', wikiextractor_command)
                 wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+                #wikiextractor_process.communicate()
 
             wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
             output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'

+ 14 - 1
PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
+
 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Download
 python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
 python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
@@ -26,4 +39,4 @@ python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset bo
 
 # Create HDF5 files Phase 2
 python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 \
- --max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
+ --max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1

+ 13 - 0
PyTorch/LanguageModeling/BERT/data/glue/download_mrpc.sh

@@ -1,5 +1,18 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Downloading MRPC data"
 
 wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py

+ 13 - 0
PyTorch/LanguageModeling/BERT/data/squad/squad_download.sh

@@ -1,5 +1,18 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Downloading dataset for squad..."
 
 # Download SQuAD

+ 1 - 0
PyTorch/LanguageModeling/BERT/extract_features.py

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Extract pre-computed feature vectors from a PyTorch BERT model."""
 
 from __future__ import absolute_import

+ 14 - 0
PyTorch/LanguageModeling/BERT/file_utils.py

@@ -1,8 +1,22 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 Utilities for working with the local dataset cache.
 This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
 Copyright by the AllenNLP authors.
 """
+
 from __future__ import (absolute_import, division, print_function, unicode_literals)
 
 import json

+ 2 - 2
PyTorch/LanguageModeling/BERT/modeling.py

@@ -1,7 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """PyTorch BERT model."""
 
 from __future__ import absolute_import, division, print_function, unicode_literals

+ 2 - 0
PyTorch/LanguageModeling/BERT/optimization.py

@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """PyTorch optimization for BERT model."""
 
 import math
@@ -24,6 +25,7 @@ from torch.nn.utils import clip_grad_norm_
 from apex.optimizers import FusedAdam
 from apex.multi_tensor_apply import multi_tensor_applier
 import amp_C
+
 multi_tensor_l2norm = amp_C.multi_tensor_l2norm
 lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
 lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda

+ 74 - 0
PyTorch/LanguageModeling/BERT/run.sub

@@ -0,0 +1,74 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --overcommit
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eux
+
+# The following variables need to be set
+# Base container to be used  
+readonly docker_image="nvcr.io/nvidia/pytorch:19.08-py3"
+# Location of dataset for phase 1
+readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
+# Location of dataset for phase 2
+readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
+# Path to where trained checkpoints will be saved on the system
+readonly checkpointdir="$PWD/checkpoints"
+
+readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
+
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
+
+PHASE1="\
+    --train_batch_size=${BATCHSIZE:-16} \
+    --learning_rate=${LR:-6e-3} \
+    --warmup_proportion=${WARMUP_UPDATES:-0.2843} \
+    --input_dir=/workspace/data \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=7038 \
+    --num_steps_per_checkpoint=2500 \
+    "
+PHASE2="\
+    --train_batch_size=${BATCHSIZE:-4096} \
+    --learning_rate=${LR:-4e-3} \
+    --warmup_proportion=${WARMUP_UPDATES:-0.128} \
+    --input_dir=/workspace/data_phase2 \
+    --phase2 \
+    --max_seq_length=512 \
+    --max_predictions_per_seq=80 \
+    --max_steps=1563 \
+    --num_steps_per_checkpoint=1000 \
+    --resume_from_checkpoint --phase1_end_step=7038 \
+    "
+PHASES=( "$PHASE1" "$PHASE2" ) 
+
+PHASE=${PHASE:-1}
+
+BERT_CMD="\
+    python -u /workspace/bert/run_pretraining.py \
+    --seed=42 \
+    ${PHASES[$((PHASE-1))]} \
+    --do_train \
+    --config_file=/workspace/bert/bert_config.json \
+    --output_dir=/results \
+    --fp16 \
+    --allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
+    --gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
+    --log_freq=1 \
+    --local_rank=\${SLURM_LOCALID}"
+
+srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"

+ 2 - 2
PyTorch/LanguageModeling/BERT/run_glue.py

@@ -1,7 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """BERT finetuning runner."""
 
 from __future__ import absolute_import, division, print_function

+ 5 - 26
PyTorch/LanguageModeling/BERT/run_pretraining.py

@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """BERT finetuning runner."""
 
 from __future__ import absolute_import
@@ -65,7 +66,6 @@ def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
     train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                   batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
                                   pin_memory=True)
-    # shared_list["0"] = (train_dataloader, input_file)
     return train_dataloader, input_file
 
 class pretraining_dataset(Dataset):
@@ -179,7 +179,7 @@ def parse_arguments():
                         type=float, default=0.0,
                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
     parser.add_argument('--log_freq',
-                        type=float, default=50.0,
+                        type=float, default=1.0,
                         help='frequency of logging loss.')
     parser.add_argument('--checkpoint_activations',
                         default=False,
@@ -253,7 +253,7 @@ def setup_training(args):
         raise ValueError(" `do_train`  must be True.")
 
     if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
-            os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
+            os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
 
     if not args.resume_from_checkpoint:
@@ -478,8 +478,7 @@ def main():
 
             for f_id in range(f_start_id + 1 , len(files)):
                 
-                # torch.cuda.synchronize()
-                # f_start = time.time()    
+   
                 if torch.distributed.get_world_size() > num_files:
                     data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_id)%num_files]
                 else:
@@ -489,23 +488,10 @@ def main():
 
                 previous_file = data_file
 
-                # train_dataloader = shared_file_list["0"][0]
-
-                # thread = multiprocessing.Process(
-                #     name="LOAD DATA:" + str(f_id) + ":" + str(data_file),
-                #     target=create_pretraining_dataset,
-                #     args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu)
-                # )
-                # thread.start()
                 dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args)
-                # torch.cuda.synchronize()
-                # f_end = time.time()
-                # print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start))
 
                 train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
                 for step, batch in enumerate(train_iter):
-                    # torch.cuda.synchronize()
-                    # iter_start = time.time()
 
                     training_steps += 1
                     batch = [t.to(device) for t in batch]
@@ -533,7 +519,7 @@ def main():
                         global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
 
                     if global_step >= args.max_steps:
-                        last_num_steps = global_step % args.log_freq
+                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                         last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                         average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                         average_loss = average_loss / (last_num_steps * divisor)
@@ -578,13 +564,6 @@ def main():
                             # thread.join()
                             return args
 
-
-                    # torch.cuda.synchronize()
-                    # iter_end = time.time()
-
-                    # if torch.distributed.get_rank() == 0:
-                    #     print('step {} : {}'.format(global_step, iter_end - iter_start))
-
                 del train_dataloader
                 # thread.join()
                 # Make sure pool has finished and switch train_dataloader

+ 1 - 0
PyTorch/LanguageModeling/BERT/run_pretraining_inference.py

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """BERT finetuning runner."""
 
 from __future__ import absolute_import

+ 7 - 4
PyTorch/LanguageModeling/BERT/run_squad.py

@@ -1,7 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Run BERT on SQuAD."""
 
 from __future__ import absolute_import, division, print_function
@@ -40,6 +40,7 @@ from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
 from optimization import BertAdam, warmup_linear
 from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
+from utils import is_main_process
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -923,9 +924,11 @@ def main():
     model = BertForQuestionAnswering(config)
     # model = BertForQuestionAnswering.from_pretrained(args.bert_model,
                 # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
-    print("USING CHECKOINT")
+    if is_main_process():
+        print("LOADING CHECKOINT")
     model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
-    print("USED CHECKPOINT \n\n")
+    if is_main_process():
+        print("LOADED CHECKPOINT")
     model.to(device)
     if args.fp16 and args.old:
         model.half()

+ 2 - 2
PyTorch/LanguageModeling/BERT/run_swag.py

@@ -1,7 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """BERT finetuning runner."""
 
 import argparse

+ 14 - 0
PyTorch/LanguageModeling/BERT/schedulers.py

@@ -1,3 +1,17 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import torch
 from torch.optim.optimizer import Optimizer

+ 14 - 0
PyTorch/LanguageModeling/BERT/scripts/data_download.sh

@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 DATA_DIR=${1:-/workspace/bert/data}
 
 # Download vocab files from pretrained model

+ 15 - 1
PyTorch/LanguageModeling/BERT/scripts/run_glue.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 MRPC_DIR=/workspace/bert/data/glue/MRPC
 OUT_DIR=/results/MRPC
 
@@ -55,7 +68,8 @@ CMD+="$use_fp16"
 LOGFILE=$OUT_DIR/logfile
 $CMD |& tee $LOGFILE
 
-sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
+sed -r 's/
+|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
 
 throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
 

+ 13 - 0
PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 train_batch_size=${1:-8192}
 learning_rate=${2:-"6e-3"}

+ 13 - 0
PyTorch/LanguageModeling/BERT/scripts/run_pretraining_inference.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
 DATASET=wikipedia_corpus # change this for other datasets

+ 13 - 1
PyTorch/LanguageModeling/BERT/scripts/run_squad.sh

@@ -1,7 +1,19 @@
 #!/usr/bin/env bash
 
-#OUT_DIR=/results/SQuAD
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+#OUT_DIR=/results/SQuAD
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 

+ 15 - 1
PyTorch/LanguageModeling/BERT/scripts/run_swag.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SWAG_DIR=/workspace/bert/data/swag
 OUT_DIR=/results/SWAG
 
@@ -54,7 +67,8 @@ CMD+="$use_fp16"
 LOGFILE=$OUT_DIR/logfile
 $CMD |& tee $LOGFILE
 
-sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
+sed -r 's/
+|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
 
 throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
 

+ 14 - 0
PyTorch/LanguageModeling/BERT/scripts/start_pretraining.sh

@@ -1,4 +1,18 @@
 #!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # purpose: for multinode training on slurm clusters
 node_type=${1:-"dgx1"}
 num_nodes=${2:-1}

+ 2 - 1
PyTorch/LanguageModeling/BERT/tokenization.py

@@ -1,6 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Tokenization classes."""
 
 from __future__ import absolute_import, division, print_function, unicode_literals

+ 13 - 0
PyTorch/LanguageModeling/BERT/utils.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed as dist