3 år sedan · 8cdaba13eb
--- a/PaddlePaddle/LanguageModeling/BERT/Dockerfile
+++ b/PaddlePaddle/LanguageModeling/BERT/Dockerfile
@@ -1,15 +1,20 @@
 
				-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:22.08-py3
			
 
				-
			
 
				+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:22.12-py3
			
 
				 FROM ${FROM_IMAGE_NAME}
			
 
				-
			
 
				 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
			
 
				 
			
 
				 ENV BERT_PREP_WORKING_DIR /workspace/bert/data
			
 
				-ADD requirements.txt /workspace/
			
 
				+
			
 
				 WORKDIR /workspace/
			
 
				-RUN pip install --no-cache-dir -r requirements.txt
			
 
				-RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
			
 
				-RUN git clone https://github.com/soskek/bookcorpus.git
			
 
				 
			
 
				-ADD . /workspace/bert
			
 
				 WORKDIR /workspace/bert
			
 
				+RUN pip install --no-cache-dir \
			
 
				+ tqdm boto3 requests six ipdb h5py nltk progressbar tokenizers>=0.7\
			
 
				+ git+https://github.com/NVIDIA/dllogger wget
			
 
				+
			
 
				+RUN apt-get install -y iputils-ping
			
 
				+
			
 
				+COPY . .
			
 
				+
			
 
				+RUN apt-get install -y libjemalloc-dev
			
 
				+RUN pip install git+https://github.com/NVIDIA/lddl.git
			
 
				+RUN python -m nltk.downloader punkt
			
--- a/PaddlePaddle/LanguageModeling/BERT/README.md
+++ b/PaddlePaddle/LanguageModeling/BERT/README.md
@@ -20,7 +20,8 @@ This repository provides a script and recipe to train the BERT model for PaddleP
 
				     * [Scripts and sample code](#scripts-and-sample-code)
			
 
				     * [Parameters](#parameters)
			
 
				         * [Pre-training parameters](#pre-training-parameters)
			
 
				-        * [Fine tuning parameters](#fine-tuning-parameters)    
			
 
				+        * [Fine tuning parameters](#fine-tuning-parameters)
			
 
				+        * [Multi-node](#multi-node)
			
 
				     * [Command-line options](#command-line-options)
			
 
				     * [Getting the data](#getting-the-data)
			
 
				         * [Dataset guidelines](#dataset-guidelines)
			
@@ -43,6 +44,7 @@ This repository provides a script and recipe to train the BERT model for PaddleP
 
				         * [Training performance results](#training-performance-results)
			
 
				             * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
			
 
				                 * [Pre-training NVIDIA DGX A100 (8x A100 80GB)](#pre-training-nvidia-dgx-a100-8x-a100-80gb)
			
 
				+                * [Pre-training NVIDIA DGX A100 (8x A100 80GB) Multi-node Scaling](#pre-training-nvidia-dgx-a100-8x-a100-80gb-multi-node-scaling)
			
 
				                 * [Fine-tuning NVIDIA DGX A100 (8x A100 80GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-80gb)
			
 
				         * [Inference performance results](#inference-performance-results)
			
 
				             * [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb)
			
@@ -105,13 +107,17 @@ The following features are supported by this model.
 
				 | [Paddle AMP](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/performance_improving/amp_en.html)           |   Yes    |
			
 
				 | [Paddle Fleet](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/distributed/fleet/Fleet_en.html#fleet) |   Yes    |
			
 
				 | [LAMB](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/optimizer/Lamb_en.html)                        |   Yes    |
			
 
				+| [LDDL](https://github.com/NVIDIA/LDDL)  |   Yes    |
			
 
				+| Multi-node  |   Yes   |
			
 
				  
			
 
				 #### Features
			
 
				  
			
 
				 [Fleet](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/distributed/fleet/Fleet_en.html#fleet) is a unified API for distributed training of PaddlePaddle.
			
 
				  
			
 
				 [LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer, which is a large batch optimization technique that helps accelerate the training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512, respectively, compared to a batch size of 256 for [Adam](https://arxiv.org/pdf/1412.6980.pdf). The optimized implementation accumulates 1024 gradient batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in a 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs resulting in training speedups of up to 72x in comparison to Adam. Adam has limitations on the learning rate that can be used since it is applied globally on all parameters, whereas LAMB follows a layerwise learning rate strategy.
			
 
				- 
			
 
				+
			
 
				+[LDDL](https://github.com/NVIDIA/LDDL) is a library that enables scalable data preprocessing and loading. LDDL is used by this PaddlePaddle BERT example.
			
 
				+
			
 
				 
			
 
				 ### Mixed precision training
			
 
				 
			
@@ -193,7 +199,7 @@ The following section lists the requirements you need to meet to start training
 
				 This repository contains a Dockerfile that extends the CUDA NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
			
 
				  
			
 
				 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
			
 
				-* [PaddlePaddle 22.08-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
			
 
				+* [PaddlePaddle 22.12-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
			
 
				 * Supported GPUs:
			
 
				     * [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
			
 
				 
			
@@ -204,7 +210,11 @@ DGX Documentation:
 
				 * [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
			
 
				 
			
 
				 For those unable to use the PaddlePaddle NGC container, to set up the required environment or create your own container, refer to the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
			
 
				+
			
 
				+For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
			
 
				  
			
 
				+More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide).
			
 
				+
			
 
				  
			
 
				 ## Quick Start Guide
			
 
				  
			
@@ -218,7 +228,10 @@ cd DeepLearningExamples/PaddlePaddle/LanguageModeling/BERT
 
				 ```
			
 
				  
			
 
				 2. Download the NVIDIA pre-trained checkpoint.
			
 
				-Pre-trained checkpoints link is coming soon. 
			
 
				+If you want to use a pre-trained checkpoint, visit [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/bert_large_paddle_ckpt_mode-pretrain/files). This pre-trained checkpoint is used to fine-tune on SQuAD. Ensure you unzip the downloaded file and place the checkpoint in the `checkpoints/` folder. For a checkpoint already fine-tuned for QA on SQuAD v1.1 visit [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/bert_large_paddle_ckpt_mode-qa_ds-squad11/files).
			
 
				+
			
 
				+
			
 
				+
			
 
				 
			
 
				 3. Build BERT on top of the NGC container.
			
 
				 ```
			
@@ -235,36 +248,23 @@ By default:
 
				 - Paddle native logs are stored in the `log/` folder.
			
 
				 - DLLogger's outputs are stored in the `results/` folder. 
			
 
				 
			
 
				-5. Download and preprocess the dataset.
			
 
				+5. Download the dataset.
			
 
				 
			
 
				 This repository provides scripts to download, verify, and extract the following datasets:
			
 
				  
			
 
				--   [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering)
			
 
				--   Wikipedia (pre-training)
			
 
				--   BookCorpus (pre-training)
			
 
				+- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering)
			
 
				+- Wikipedia (pre-training)
			
 
				+
			
 
				  
			
 
				-To download, verify, extract the datasets, and create the shards in `.hdf5` format, run:  
			
 
				+To download, verify, extract the datasets, run:  
			
 
				 ```shell
			
 
				 bash data/create_datasets_from_start.sh
			
 
				 ```
			
 
				 
			
 
				-Note: For fine tuning only, Wikipedia and Bookscorpus dataset download and preprocessing can be skipped by commenting it out.
			
 
				-
			
 
				-- Download Wikipedia only for pretraining
			
 
				-
			
 
				-The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server, most of the time, gets overloaded and contains broken links resulting in HTTP 403 and 503 errors. Hence, it is recommended to skip downloading BookCorpus data by running:
			
 
				-```shell
			
 
				-bash data/create_datasets_from_start.sh wiki_only
			
 
				-```
			
 
				-
			
 
				-- Download Wikipedia and BookCorpus
			
 
				-
			
 
				-Users are welcome to download BookCorpus from other sources to match our accuracy or repeatedly try our script until the required number of files are downloaded by running the following:
			
 
				-```shell
			
 
				-bash data/create_datasets_from_start.sh wiki_books
			
 
				-```
			
 
				+Note: For fine-tuning only, downloading the Wikipedia dataset can be skipped by commenting it out.
			
 
				 
			
 
				-Note: Ensure a complete Wikipedia download. If, in any case, the download breaks, remove the output file `wikicorpus_en.xml.bz2` and start again. If a partially downloaded file exists, the script assumes a successful download, which causes the extraction to fail. Not using BookCorpus can potentially change the final accuracy on a few downstream tasks.
			
 
				+Note: Ensure a complete Wikipedia download. But if the download failed in LDDL,
			
 
				+remove the output directory `data/wikipedia/` and start over again. 
			
 
				 
			
 
				 
			
 
				 6. Start pre-training.
			
@@ -276,16 +276,18 @@ bash scripts/run_pretraining.sh
 
				  
			
 
				 The default hyperparameters are set to run on 8x A100 80G cards.
			
 
				 
			
 
				+To run on multiple nodes, refer to the [Multi-node](#multi-node) section.  
			
 
				+
			
 
				 7. Start fine-tuning with the SQuAD dataset.
			
 
				  
			
 
				 The above pre-trained BERT representations can be fine-tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
			
 
				 ```
			
 
				-bash scripts/run_squad.sh <path to pretrained_model>
			
 
				+bash scripts/run_squad.sh /workspace/bert/checkpoints/<pre-trained_checkpoint>
			
 
				 ```
			
 
				 
			
 
				 8. Start validation/evaluation.
			
 
				  
			
 
				-For SQuAD, validation can be performed with the `bash scripts/run_squad.sh <path to pretrained_model>`, setting `mode` to `eval` in `scripts/run_squad.sh` as follows:
			
 
				+For SQuAD, validation can be performed with the `bash scripts/run_squad.sh /workspace/bert/checkpoints/<pre-trained_checkpoint>`, setting `mode` to `eval` in `scripts/run_squad.sh` as follows:
			
 
				 
			
 
				 ```
			
 
				 mode=${12:-"eval"}
			
@@ -293,7 +295,7 @@ mode=${12:-"eval"}
 
				 
			
 
				  
			
 
				 9. Start inference/predictions.
			
 
				-Inference can be performed with the `bash scripts/run_squad.sh <path to pretrained_model>`, setting `mode` to `prediction` in `scripts/run_squad.sh` as follows:
			
 
				+Inference can be performed with the `bash scripts/run_squad.sh /workspace/bert/checkpoints/<pre-trained_checkpoint>`, setting `mode` to `prediction` in `scripts/run_squad.sh` as follows:
			
 
				 
			
 
				 ```
			
 
				 mode=${12:-"prediction"}
			
@@ -366,6 +368,8 @@ The complete list of the available parameters for the `run_pretraining.py` scrip
 
				 Global:
			
 
				   --input-dir INPUT_DIR
			
 
				                         The input data directory. Should be specified by users and contain .hdf5 files for the task. (default: None)
			
 
				+  --vocab-file VOCAB_FILE
			
 
				+                        Vocabulary mapping/file BERT was pretrainined on. (default: None)
			
 
				   --output-dir OUTPUT_DIR
			
 
				                         The output directory where the model checkpoints will be written. Should be specified by users. (default: None)
			
 
				   --bert-model {bert-base-uncased,bert-base-cased,bert-large-uncased,bert-large-cased,custom}
			
@@ -466,6 +470,24 @@ Note:
 
				 - For SQuAD fine-tuning, `<--max-steps>` is not required since it's usually trained for two or three epochs. If `<--max-steps>` is not set or set to -1, it will be trained for `<--epochs>` epochs. If `<--max-steps>` is set to a positive number, the total training steps is calculated by: `total_steps = min(max_steps, epochs * steps_per_epoch)`.
			
 
				 - For pre-training, `<--max-steps>` is required and `<--epochs>` is deprecated. Because We typically train for a specified number of steps rather than epochs.
			
 
				 
			
 
				+#### Multi-node
			
 
				+Multi-node runs can be launched on a pyxis/enroot Slurm cluster (refer to [Requirements](#requirements)) with the `run.sub` script with the following command for a 4-node DGX-A100 example for both phase 1 and phase 2:
			
 
				+ 
			
 
				+```
			
 
				+TRAIN_BATCH_SIZE=256 GRADIENT_ACCUMULATION_STEPS=8 PHASE=1 sbatch -N4 run.sub
			
 
				+TRAIN_BATCH_SIZE=32 GRADIENT_ACCUMULATION_STEPS=32 PHASE=2 sbatch -N4 run.sub
			
 
				+```
			
 
				+ 
			
 
				+Checkpoints  after phase 1 will be saved in `checkpointdir` specified in `run.sub`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
			
 
				+ 
			
 
				+ 
			
 
				+The batch variables `BATCHSIZE`, `GRADIENT_STEPS`,`PHASE` refer to the Python arguments `--batch-size`, `--gradient-merge-steps`, `--phase1/--phase2` respectively.
			
 
				+ 
			
 
				+Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase. 
			
 
				+ 
			
 
				+Refer to the file’s contents to find the full list of variables to adjust for your system.
			
 
				+
			
 
				+
			
 
				 ### Command-line options
			
 
				  
			
 
				 To view the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
			
@@ -477,27 +499,25 @@ To view the full list of available options and their descriptions, use the `-h`
 
				 Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
			
 
				  
			
 
				 ### Getting the data
			
 
				-For pre-training BERT, we use the concatenation of Wikipedia (2500M words) and BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document-level corpus rather than a shuffled sentence-level corpus because it is critical to extract long contiguous sentences.
			
 
				- 
			
 
				-The preparation of the pre-training dataset is described in the `bertPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows:
			
 
				- 
			
 
				-1.  Data download and extract - the dataset is downloaded and extracted.
			
 
				- 
			
 
				-2.  Clean and format - document tags, and so on. are removed from the dataset.
			
 
				- 
			
 
				-3.  Sentence segmentation - the corpus text file is processed into separate sentences.
			
 
				- 
			
 
				-4.  Sharding - the sentence segmented corpus file is split into a number of uniformly distributed smaller text documents.
			
 
				- 
			
 
				-5.  `hdf5` file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding `hdf5` file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
			
 
				- 
			
 
				-The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and `hdf5` file creation given an arbitrary text file containing a document-separated text corpus.
			
 
				- 
			
 
				-For fine-tuning a pre-trained BERT model for specific tasks, by default this repository prepares the following dataset:
			
 
				+
			
 
				+For pre-training BERT, we use the Wikipedia (2500M words) dataset. We extract 
			
 
				+only the text passages and ignore headers, lists, and tables. BERT requires that
			
 
				+datasets are structured as a document level corpus rather than a shuffled 
			
 
				+sentence-level corpus because it is critical to extract long contiguous 
			
 
				+sentences. `data/create_datasets_from_start.sh` uses the LDDL downloader to 
			
 
				+download the Wikipedia dataset, and `scripts/run_pretraining.sh` uses the LDDL 
			
 
				+preprocessor and load balancer to preprocess the Wikipedia dataset into Parquet
			
 
				+shards which are then streamed during the pre-training by the LDDL data loader.
			
 
				+Refer to [LDDL's README](https://github.com/NVIDIA/LDDL/blob/main/README.md) for more 
			
 
				+information on how to use LDDL. Depending on the speed of your internet 
			
 
				+connection, downloading and extracting the Wikipedia dataset takes a few hours,
			
 
				+and running the LDDL preprocessor and load balancer takes half an hour on a 
			
 
				+single DGXA100 node.
			
 
				+
			
 
				+For fine-tuning a pre-trained BERT model for specific tasks, by default, this repository prepares the following dataset:
			
 
				  
			
 
				 -   [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
			
 
				- 
			
 
				-Depending on the speed of your internet connection, this process takes about a day to complete. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time. 
			
 
				+
			
 
				  
			
 
				 #### Dataset guidelines
			
 
				  
			
@@ -511,8 +531,6 @@ BERT pre-training optimizes for two unsupervised classification tasks. The first
 
				  
			
 
				 The second task is next sentence prediction. One training instance of BERT pre-training is two sentences (a sentence pair). A sentence pair may be constructed by simply taking two adjacent sentences from a single document or by pairing up two random sentences with equal probability. The goal of this task is to predict whether or not the second sentence followed the first in the original document.
			
 
				 
			
 
				-The `create_pretraining_data.py` script takes in raw text and creates training instances for both pre-training tasks.
			
 
				-
			
 
				 
			
 
				 ### Training process
			
 
				  
			
@@ -522,7 +540,7 @@ The training process consists of two steps: pre-training and fine-tuning.
 
				 
			
 
				 Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
			
 
				  
			
 
				-The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using Wikipedia and BookCorpus datasets as training data using the LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8x A100 80G cards:
			
 
				+The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using Wikipedia datasets as training data using the LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8x A100 80G cards:
			
 
				 
			
 
				 Phase 1: (Maximum sequence length of 128)
			
 
				 -   Runs on 8 GPUs with a training batch size of 256 per GPU.
			
@@ -565,6 +583,13 @@ bash run_pretraining.sh \
 
				     <dataset_dir_phase2> \
			
 
				     <code_dir> \
			
 
				     <init_checkpoint_dir> \
			
 
				+    <wikipedia_source> \
			
 
				+    <num_dask_workers> \
			
 
				+    <num_shards_per_workers> \
			
 
				+    <num_workers> \
			
 
				+    <sample_ratio> \
			
 
				+    <phase2_bin_size> \
			
 
				+    <masking> \
			
 
				     <bert_config_file> \
			
 
				     <enable_benchmark> \
			
 
				     <benchmark_steps> \
			
@@ -593,6 +618,13 @@ Where:
 
				 -   `<dataset_dir_phase12` is the path to dataset of phase 2. It should be a path to the folder containing `.hdf` files.
			
 
				 -   `<code_dir>` is the root path to bert code.
			
 
				 -   `<init_checkpoint_dir>` is the path to the checkpoint to start the pretraining routine on (Usually a BERT pre-trained checkpoint).
			
 
				+-   `wikipedia_source` is the path to the 'source' subdirectory for the Wikipedia corpus.
			
 
				+-   `num_dask_workers` is the number of dask workers to preprocess the bert dataset.
			
 
				+-   `num_shards_per_workers` is the number of the output parquet/txt shards per worker.
			
 
				+-   `num_workers` is the number of workers for dataloading.
			
 
				+-   `sample_ratio` is the ratio of how many articles/documents are sampled from each corpus.
			
 
				+-   `phase2_bin_size` is the stride of the sequence length for each binbin size for phase2.
			
 
				+-   `masking` LDDL supports both static and dynamic masking. Refer to [LDDL's README](https://github.com/NVIDIA/LDDL/blob/main/README.md) for more information.
			
 
				 -   `<bert_config_file>` is the path to the bert config file.
			
 
				 -   `<enable_benchmark>` a flag to enable benchmark. The train process will warmup for `<benchmark_warmup_steps>` and then measure the throughput of the following `<benchmark_steps>`.
			
 
				 
			
@@ -609,7 +641,10 @@ bash scripts/run_pretraining.sh \
 
				     /path/to/dataset/phase1 \
			
 
				     /path/to/dataset/phase2 \
			
 
				     /workspace/bert \
			
 
				-    None None false
			
 
				+    None \
			
 
				+    /path/to/wikipedia/source \
			
 
				+    32 128 4 0.9 64 static \
			
 
				+    None false
			
 
				 ```
			
 
				  
			
 
				 To run the pre-training routine on an initial checkpoint, point the `from-checkpoint` variable to the location of the checkpoint folder in `scripts/run_pretraining.sh`.
			
@@ -622,6 +657,7 @@ python3 -m paddle.distributed.launch \
 
				     --gpus="0,1,2,3,4,5,6,7" \
			
 
				     ./run_pretraining.py \
			
 
				     --input-dir=/path/to/dataset/phase1 \
			
 
				+    --vocab-file=vocab/bert-large-uncased-vocab.txt \
			
 
				     --output-dir=./results \
			
 
				     --bert-model=bert-large-uncased \
			
 
				     --from-checkpoint=./results/bert-large-uncased/phase1 \
			
@@ -773,7 +809,10 @@ bash scripts/run_pretraining.sh \
 
				     /path/to/dataset/phase1 \
			
 
				     /path/to/dataset/phase2 \
			
 
				     /workspace/bert \
			
 
				-    None None true 10 10
			
 
				+    None \
			
 
				+    /path/to/wikipedia/source \
			
 
				+    32 128 4 0.9 64 static \
			
 
				+    None true 10 10
			
 
				 ```
			
 
				 
			
 
				 To benchmark the training performance on a specific batch size for SQuAD, refer to [Fine-tuning](#fine-tuning) and turn on the `<benchmark>` flags. An example call to run training for 200 steps (100 steps for warmup and 100 steps to measure), and generate throughput numbers:
			
@@ -831,8 +870,8 @@ Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run
 
				 
			
 
				 | DGX System         | GPUs / Node | Precision | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) |     Final Loss    | Time to train(hours) | Time to train speedup (TF32 to mixed precision) |
			
 
				 |--------------------|-------------|-----------|----------------------------------------------------|------------------------------------------|-------------------|----------------------|-------------------------------------------------|
			
 
				-|  1 x DGX A100 80GB | 8           | AMP       | 256 and 32                                         | 32 and 128                               |       1.409       |    ~ 50 hours        | 1.72                                            |
			
 
				-|  1 x DGX A100 80GB | 8           | TF32      | 128 and 16                                         | 64 and 256                               |       1.421       |    ~ 86 hours        | 1                                               |
			
 
				+| 32 x DGX A100 80GB | 8           | AMP       | 256 and 128                                        | 1 and 4                                  |       1.409       |    ~ 1.2 hours       | 1.72                                            |
			
 
				+| 32 x DGX A100 80GB | 8           | TF32      | 128 and 16b                                        | 2 and 8                                  |       1.421       |    ~ 2.5 hours       | 1                                               |
			
 
				 
			
 
				 
			
 
				 ##### Pre-training loss curves
			
@@ -869,16 +908,34 @@ Training stability with 8 GPUs, FP16 computations, batch size of 32:
 
				 
			
 
				 ##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
			
 
				 
			
 
				-Our results were obtained by running the script `run_pretraining.sh` in the PaddlePaddle:22.08-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
			
 
				+Our results were obtained by running the script `run_pretraining.sh` in the PaddlePaddle:22.12-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in sequences per second) were averaged over a few training iterations.
			
 
				 
			
 
				 ###### Pre-training NVIDIA DGX A100 (8x A100 80GB)
			
 
				 
			
 
				 | GPUs | Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
			
 
				 |------|----------------------------------|------------------------------------|-----------------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------|
			
 
				-| 1    | 8192 and 8192                    | 64 and 32                          | 128             |  304                             |   529                                       | 1.74                                        | 1.00                | 1.00                           |
			
 
				-| 8    | 8192 and 8192                    | 64 and 32                          | 128             | 2410                             |  4200                                       | 1.74                                        | 7.93                | 7.94                           |
			
 
				-| 1    | 4096 and 4096                    | 256 and 128                        | 512             |   59                             |   103                                       | 1.75                                        | 1.00                | 1.00                           |
			
 
				-| 8    | 4096 and 4096                    | 256 and 128                        | 512             |  469                             |   823                                       | 1.75                                        | 7.95                | 7.99                           |
			
 
				+| 1    | 8192 and 8192                    | 64 and 32                          | 128             |  307                             |   633                                       | 2.06                                        | 1.00                | 1.00                           |
			
 
				+| 8    | 8192 and 8192                    | 64 and 32                          | 128             | 2428                             |  4990                                       | 2.06                                        | 7.91                | 7.88                           |
			
 
				+| 1    | 4096 and 4096                    | 256 and 128                        | 512             |  107                             |   219                                       | 2.05                                        | 1.00                | 1.00                           |
			
 
				+| 8    | 4096 and 4096                    | 256 and 128                        | 512             |  851                             |  1724                                       | 2.26                                        | 7.95                | 7.87                           |
			
 
				+
			
 
				+
			
 
				+###### Pre-training NVIDIA DGX A100 (8x A100 80GB) Multi-node Scaling
			
 
				+
			
 
				+| Nodes | GPUs / node | Batch size / GPU (TF32 and FP16) | Accumulated Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Mixed Precision Throughput | Mixed Precision Strong Scaling | TF32 Throughput | TF32 Strong Scaling | Speedup (Mixed Precision to TF32) |
			
 
				+|-------|-------------|----------------------------------|------------------------------------|-----------------|----------------------------|--------------------------------|-----------------|---------------------|-----------------------------------|-----|
			
 
				+| 1     | 8           | 126 and 256 | 8192 and 8192                    | 64 and 32             | 128             |   4990               | 1                              |   2428          |  1                  |  2.06               |
			
 
				+| 2     | 8           | 126 and 256 | 4096 and 4096                    | 32 and 16             | 128             |   9581               | 1.92                           |   4638          |  1.91               |  2.07               |
			
 
				+| 4     | 8           | 126 and 256 | 2048 and 2048                    | 16 and 8              | 128             |   19262              | 3.86                           |   9445          |  3.89               |  2.04               |
			
 
				+| 8     | 8           | 126 and 256 | 1024 and 1024                    | 8 and 4               | 128             |   37526              | 7.52                           |   18335         |  7.55               |  2.05               |
			
 
				+| 16    | 8           | 126 and 256 | 512 and 512                      | 4 and 2               | 128             |   71156              | 14.26                          |   35526         |  14.63              |  2.00               |
			
 
				+| 32    | 8           | 126 and 256 | 256 and 256                      | 2 and 1               | 128             |   142087             | 28.47                          |   69701         |  28.71              |  2.04               |
			
 
				+| 1     | 8           | 16  and 32  | 4096 and 4096                    | 256 and 128           | 512             |   1724               | 1                              |   851           |  1                  |  2.03               |
			
 
				+| 2     | 8           | 16  and 32  | 2048 and 2048                    | 128 and 64            | 512             |   3305               | 1.92                           |   1601          |  1.88               |  2.06               |
			
 
				+| 4     | 8           | 16  and 32  | 1024 and 1024                    | 64 and 32             | 512             |   6492               | 3.77                           |   3240          |  3.81               |  2.00               |
			
 
				+| 8     | 8           | 16  and 32  | 512 and 512                      | 32 and 16             | 512             |   12884              | 7.47                           |   6329          |  7.44               |  2.04               |
			
 
				+| 16    | 8           | 16  and 32  | 256 and 256                      | 16 and 8              | 512             |   25493              | 14.79                          |   12273         |  14.42              |  2.08               |
			
 
				+| 32    | 8           | 16  and 32  | 128 and 128                      | 8 and 4               | 512             |   49307              | 28.60                          |   24047         |  28.26              |  2.05               |
			
 
				 
			
 
				 
			
 
				 ###### Fine-tuning NVIDIA DGX A100 (8x A100 80GB)
			
@@ -912,7 +969,11 @@ The inference performance metrics used were items/second.
 
				 ## Release notes
			
 
				  
			
 
				 ### Changelog
			
 
				- 
			
 
				+
			
 
				+January 2023
			
 
				+- [Pre-training using Language Datasets and Data Loaders (LDDL)](https://github.com/NVIDIA/LDDL)
			
 
				+- Binned pretraining for phase2 with LDDL using a bin size of 64
			
 
				+
			
 
				 August 2022
			
 
				 - Pre-training support with LAMB optimizer.
			
 
				 - Updated Data download and Preprocessing.
			
--- a/PaddlePaddle/LanguageModeling/BERT/data/create_datasets_from_start.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/data/create_datasets_from_start.sh
@@ -13,36 +13,5 @@
 
				 # limitations under the License.
			
 
				 
			
 
				 #Download
			
 
				-to_download=${1:-"wiki_only"}
			
 
				-
			
 
				-#Download
			
 
				-if [ "$to_download" = "wiki_books" ] ; then
			
 
				-    python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
			
 
				-fi
			
 
				-
			
 
				-python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
			
 
				+download_wikipedia --outdir ${BERT_PREP_WORKING_DIR}/wikipedia/
			
 
				 python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
			
 
				-
			
 
				-# Properly format the text files
			
 
				-if [ "$to_download" = "wiki_books" ] ; then
			
 
				-    python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
			
 
				-fi
			
 
				-python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
			
 
				-
			
 
				-if [ "$to_download" = "wiki_books" ] ; then
			
 
				-    DATASET="books_wiki_en_corpus"
			
 
				-else
			
 
				-    DATASET="wikicorpus_en"
			
 
				-    # Shard the text files
			
 
				-fi
			
 
				-
			
 
				-# Shard the text files
			
 
				-python3 /workspace/bert/data/bertPrep.py --action sharding --dataset $DATASET
			
 
				-
			
 
				-# Create HDF5 files Phase 1
			
 
				-python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 128 \
			
 
				---max_predictions_per_seq 20 --vocab_file /workspace/bert/vocab/bert-large-uncased-vocab.txt --do_lower_case 1
			
 
				-
			
 
				-# Create HDF5 files Phase 2
			
 
				-python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 512 \
			
 
				---max_predictions_per_seq 80 --vocab_file /workspace/bert/vocab/bert-large-uncased-vocab.txt --do_lower_case 1
			
--- a/PaddlePaddle/LanguageModeling/BERT/loss.py
+++ b/PaddlePaddle/LanguageModeling/BERT/loss.py
@@ -13,7 +13,6 @@
 
				 # limitations under the License.
			
 
				 
			
 
				 import paddle
			
 
				-import paddle.nn.functional as F
			
 
				 
			
 
				 
			
 
				 class CrossEntropyLossForSQuAD(paddle.nn.Layer):
			
@@ -53,7 +52,7 @@ class BertPretrainingCriterion(paddle.nn.Layer):
 
				         self.vocab_size = vocab_size
			
 
				 
			
 
				     def forward(self, prediction_scores, seq_relationship_score,
			
 
				-                masked_lm_labels, next_sentence_labels, masked_lm_scale):
			
 
				+                masked_lm_labels, next_sentence_labels):
			
 
				         """
			
 
				         Args:
			
 
				             prediction_scores(Tensor):
			
@@ -80,12 +79,11 @@ class BertPretrainingCriterion(paddle.nn.Layer):
 
				             Its data type should be float32 and its shape is [1].
			
 
				         """
			
 
				         with paddle.static.amp.fp16_guard():
			
 
				-            masked_lm_loss = F.cross_entropy(
			
 
				-                prediction_scores,
			
 
				-                masked_lm_labels,
			
 
				-                reduction='none',
			
 
				-                ignore_index=-1)
			
 
				-            masked_lm_loss = masked_lm_loss / masked_lm_scale
			
 
				-            next_sentence_loss = F.cross_entropy(
			
 
				-                seq_relationship_score, next_sentence_labels, reduction='none')
			
 
				-        return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss)
			
 
				+            masked_lm_labels_flat = masked_lm_labels.reshape([-1])
			
 
				+            mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -1]
			
 
				+            masked_lm_loss = self.loss_fn(prediction_scores, mlm_labels)
			
 
				+            if next_sentence_labels.ndim == 1:
			
 
				+                next_sentence_labels = next_sentence_labels.unsqueeze(axis=-1)
			
 
				+            next_sentence_loss = self.loss_fn(seq_relationship_score,
			
 
				+                                              next_sentence_labels)
			
 
				+        return masked_lm_loss + next_sentence_loss
			
--- a/PaddlePaddle/LanguageModeling/BERT/modeling.py
+++ b/PaddlePaddle/LanguageModeling/BERT/modeling.py
@@ -89,17 +89,15 @@ class BertEmbeddings(nn.Layer):
 
				         self.layer_norm = nn.LayerNorm(bert_config.hidden_size, epsilon=1e-12)
			
 
				         self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
			
 
				 
			
 
				-    def forward(self, input_ids, token_type_ids=None, position_ids=None):
			
 
				+    def forward(self, input_ids, token_type_ids=None):
			
 
				         """
			
 
				         Args:
			
 
				             See class `BertModel`.
			
 
				         """
			
 
				-        if position_ids is None:
			
 
				-            ones = paddle.ones_like(input_ids, dtype="int64")
			
 
				-            seq_length = paddle.cumsum(ones, axis=-1)
			
 
				-
			
 
				-            position_ids = seq_length - ones
			
 
				-            position_ids.stop_gradient = True
			
 
				+        ones = paddle.ones_like(input_ids, dtype="int64")
			
 
				+        seq_length = paddle.cumsum(ones, axis=-1)
			
 
				+        position_ids = seq_length - ones
			
 
				+        position_ids.stop_gradient = True
			
 
				         if token_type_ids is None:
			
 
				             token_type_ids = paddle.zeros_like(input_ids, dtype="int64")
			
 
				 
			
@@ -174,18 +172,13 @@ class BertModel(nn.Layer):
 
				                 dropout=bert_config.hidden_dropout_prob,
			
 
				                 activation=bert_config.hidden_act,
			
 
				                 attn_dropout=bert_config.attention_probs_dropout_prob,
			
 
				-                act_dropout=0,
			
 
				-                enable_cudnn=False)
			
 
				+                act_dropout=0)
			
 
				             self.encoder = nn.TransformerEncoder(encoder_layer,
			
 
				                                                  bert_config.num_hidden_layers)
			
 
				 
			
 
				         self.pooler = BertPooler(bert_config.hidden_size)
			
 
				 
			
 
				-    def forward(self,
			
 
				-                input_ids,
			
 
				-                token_type_ids=None,
			
 
				-                position_ids=None,
			
 
				-                attention_mask=None):
			
 
				+    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
			
 
				         """
			
 
				         Args:
			
 
				             input_ids(Tensor):
			
@@ -198,11 +191,6 @@ class BertModel(nn.Layer):
 
				                 to a `sentence A` and type 1 corresponds to a `sentence B` token.
			
 
				                 (see BERT paper for more details). Its data type should be `int64`
			
 
				                 Defaults: None, which means we don't add segment embeddings.
			
 
				-            position_ids(Tensor, optional):
			
 
				-                An optional Tensor of shape [batch_size, num_tokens] with the position
			
 
				-                indices of each input sequence tokens in the position embeddings.
			
 
				-                Selected in the range [0, max_position_embeddings - 1].
			
 
				-                Its data type should be `int64`. Defaults: None.
			
 
				             attention_mask(Tensor, optional):
			
 
				                 An optional Tensor of shape [batch_size, sequence_length] with indices of
			
 
				                 mask used in multi-head attention to avoid performing attention on to some
			
@@ -234,9 +222,7 @@ class BertModel(nn.Layer):
 
				                 attention_mask = attention_mask.unsqueeze(axis=[1, 2])
			
 
				 
			
 
				         embedding_output = self.embeddings(
			
 
				-            input_ids=input_ids,
			
 
				-            position_ids=position_ids,
			
 
				-            token_type_ids=token_type_ids)
			
 
				+            input_ids=input_ids, token_type_ids=token_type_ids)
			
 
				 
			
 
				         if self.fuse:
			
 
				             encoder_output = embedding_output
			
@@ -263,11 +249,7 @@ class BertForQuestionAnswering(nn.Layer):
 
				         self.bert = BertModel(bert_config)
			
 
				         self.classifier = nn.Linear(bert_config.hidden_size, 2)
			
 
				 
			
 
				-    def forward(self,
			
 
				-                input_ids,
			
 
				-                token_type_ids=None,
			
 
				-                position_ids=None,
			
 
				-                attention_mask=None):
			
 
				+    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
			
 
				         """
			
 
				         Args:
			
 
				             See class `BertModel`.
			
@@ -282,7 +264,6 @@ class BertForQuestionAnswering(nn.Layer):
 
				         encoder_output, _ = self.bert(
			
 
				             input_ids,
			
 
				             token_type_ids=token_type_ids,
			
 
				-            position_ids=position_ids,
			
 
				             attention_mask=attention_mask)
			
 
				 
			
 
				         logits = self.classifier(encoder_output)
			
@@ -322,13 +303,7 @@ class BertLMPredictionHead(nn.Layer):
 
				         self.decoder_bias = self.create_parameter(
			
 
				             shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True)
			
 
				 
			
 
				-    def forward(self, hidden_states, masked_positions=None):
			
 
				-        if masked_positions is not None:
			
 
				-            hidden_states = paddle.reshape(hidden_states,
			
 
				-                                           [-1, hidden_states.shape[-1]])
			
 
				-            hidden_states = paddle.tensor.gather(hidden_states,
			
 
				-                                                 masked_positions)
			
 
				-        # gather masked tokens might be more quick
			
 
				+    def forward(self, hidden_states):
			
 
				         hidden_states = self.transform(hidden_states)
			
 
				         hidden_states = self.activation(hidden_states)
			
 
				         hidden_states = self.layer_norm(hidden_states)
			
@@ -362,7 +337,7 @@ class BertPretrainingHeads(nn.Layer):
 
				                                                 activation, embedding_weights)
			
 
				         self.seq_relationship = nn.Linear(hidden_size, 2)
			
 
				 
			
 
				-    def forward(self, encoder_output, pooled_output, masked_positions=None):
			
 
				+    def forward(self, encoder_output, pooled_output, masked_lm_labels):
			
 
				         """
			
 
				         Args:
			
 
				             sequence_output(Tensor):
			
@@ -384,7 +359,12 @@ class BertPretrainingHeads(nn.Layer):
 
				                 A Tensor of shape [batch_size, 2] with the scores of next sentence prediction.
			
 
				                 Its data type should be float32.
			
 
				         """
			
 
				-        prediction_scores = self.predictions(encoder_output, masked_positions)
			
 
				+
			
 
				+        sequence_flattened = paddle.index_select(
			
 
				+            encoder_output.reshape([-1, encoder_output.shape[-1]]),
			
 
				+            paddle.nonzero(masked_lm_labels.reshape([-1]) != -1).squeeze(),
			
 
				+            axis=0)
			
 
				+        prediction_scores = self.predictions(sequence_flattened)
			
 
				         seq_relationship_score = self.seq_relationship(pooled_output)
			
 
				         return prediction_scores, seq_relationship_score
			
 
				 
			
@@ -406,18 +386,13 @@ class BertForPretraining(nn.Layer):
 
				             bert_config.hidden_act,
			
 
				             embedding_weights=self.bert.embeddings.word_embeddings.weight)
			
 
				 
			
 
				-    def forward(self,
			
 
				-                input_ids,
			
 
				-                token_type_ids=None,
			
 
				-                position_ids=None,
			
 
				-                attention_mask=None,
			
 
				-                masked_positions=None):
			
 
				+    def forward(self, input_ids, token_type_ids, attention_mask,
			
 
				+                masked_lm_labels):
			
 
				         """
			
 
				 
			
 
				         Args:
			
 
				             input_ids(Tensor): See class `BertModel`.
			
 
				             token_type_ids(Tensor, optional): See class `BertModel`.
			
 
				-            position_ids(Tensor, optional): See class `BertModel`.
			
 
				             attention_mask(Tensor, optional): See class `BertModel`.
			
 
				             masked_positions(Tensor, optional): See class `BertPretrainingHeads`.
			
 
				 
			
@@ -434,9 +409,8 @@ class BertForPretraining(nn.Layer):
 
				             outputs = self.bert(
			
 
				                 input_ids,
			
 
				                 token_type_ids=token_type_ids,
			
 
				-                position_ids=position_ids,
			
 
				                 attention_mask=attention_mask)
			
 
				             sequence_output, pooled_output = outputs[:2]
			
 
				             prediction_scores, seq_relationship_score = self.cls(
			
 
				-                sequence_output, pooled_output, masked_positions)
			
 
				+                sequence_output, pooled_output, masked_lm_labels)
			
 
				             return prediction_scores, seq_relationship_score
			
--- a/PaddlePaddle/LanguageModeling/BERT/pretraining_dataset.py
+++ b/PaddlePaddle/LanguageModeling/BERT/pretraining_dataset.py
@@ -1,169 +0,0 @@
 
				-# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
			
 
				-#
			
 
				-# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				-# you may not use this file except in compliance with the License.
			
 
				-# You may obtain a copy of the License at
			
 
				-#
			
 
				-#     http://www.apache.org/licenses/LICENSE-2.0
			
 
				-#
			
 
				-# Unless required by applicable law or agreed to in writing, software
			
 
				-# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				-# See the License for the specific language governing permissions and
			
 
				-# limitations under the License.
			
 
				-
			
 
				-import random
			
 
				-import h5py
			
 
				-import numpy as np
			
 
				-import paddle
			
 
				-from paddle.io import DataLoader, Dataset
			
 
				-from utils.collate import Stack
			
 
				-
			
 
				-
			
 
				-def create_pretraining_dataset(args,
			
 
				-                               input_file,
			
 
				-                               data_holders,
			
 
				-                               worker_init=None,
			
 
				-                               places=None):
			
 
				-    train_data = PretrainingDataset(
			
 
				-        input_file=input_file, max_pred_length=args.max_predictions_per_seq)
			
 
				-    train_batch_sampler = paddle.io.BatchSampler(
			
 
				-        train_data, batch_size=args.batch_size, shuffle=True)
			
 
				-
			
 
				-    def _collate_data(data, stack_fn=Stack()):
			
 
				-        num_fields = len(data[0])
			
 
				-        out = [None] * num_fields
			
 
				-        [
			
 
				-            input_ids, segment_ids, input_mask, masked_lm_positions,
			
 
				-            masked_lm_labels, next_sentence_labels, masked_lm_scale
			
 
				-        ] = [0, 1, 2, 3, 4, 5, 6]
			
 
				-        for i in (input_ids, segment_ids, input_mask, next_sentence_labels):
			
 
				-            out[i] = stack_fn([x[i] for x in data])
			
 
				-        _, seq_length = out[input_ids].shape
			
 
				-        size = sum(len(x[masked_lm_positions]) for x in data)
			
 
				-        if size % 8 != 0:
			
 
				-            size += 8 - (size % 8)
			
 
				-        out[masked_lm_positions] = np.full(size, 0, dtype=np.int32)
			
 
				-        out[masked_lm_labels] = np.full([size, 1], -1, dtype=np.int64)
			
 
				-        mask_token_num = 0
			
 
				-        for i, x in enumerate(data):
			
 
				-            for j, pos in enumerate(x[masked_lm_positions]):
			
 
				-                out[masked_lm_positions][mask_token_num] = i * seq_length + pos
			
 
				-                out[masked_lm_labels][mask_token_num] = x[masked_lm_labels][j]
			
 
				-                mask_token_num += 1
			
 
				-        # The value of masked_lm_scale is equal to mask_token_num,
			
 
				-        # which would be used to compute average masked_lm_loss.
			
 
				-        out.append(np.asarray([mask_token_num], dtype=np.float32))
			
 
				-        if args.amp and args.use_pure_fp16:
			
 
				-            #out[input_mask] = out[input_mask].astype(np.float16)
			
 
				-            out[masked_lm_scale] = out[masked_lm_scale].astype(np.float16)
			
 
				-        return out
			
 
				-
			
 
				-    train_data_loader = DataLoader(
			
 
				-        dataset=train_data,
			
 
				-        places=places,
			
 
				-        feed_list=data_holders,
			
 
				-        batch_sampler=train_batch_sampler,
			
 
				-        collate_fn=_collate_data,
			
 
				-        num_workers=0,
			
 
				-        worker_init_fn=worker_init,
			
 
				-        return_list=False)
			
 
				-
			
 
				-    return train_data_loader
			
 
				-
			
 
				-
			
 
				-def create_pretraining_data_holder():
			
 
				-    input_ids = paddle.static.data(
			
 
				-        name="input_ids", shape=[-1, -1], dtype="int64")
			
 
				-    segment_ids = paddle.static.data(
			
 
				-        name="segment_ids", shape=[-1, -1], dtype="int64")
			
 
				-    input_mask = paddle.static.data(
			
 
				-        name="input_mask", shape=[-1, 1, 1, -1], dtype="int64")
			
 
				-    masked_lm_positions = paddle.static.data(
			
 
				-        name="masked_lm_positions", shape=[-1], dtype="int32")
			
 
				-    masked_lm_labels = paddle.static.data(
			
 
				-        name="masked_lm_labels", shape=[-1, 1], dtype="int64")
			
 
				-    next_sentence_labels = paddle.static.data(
			
 
				-        name="next_sentence_labels", shape=[-1, 1], dtype="int64")
			
 
				-    masked_lm_scale = paddle.static.data(
			
 
				-        name="masked_lm_scale", shape=[-1, 1], dtype="float32")
			
 
				-    return [
			
 
				-        input_ids, segment_ids, input_mask, masked_lm_positions,
			
 
				-        masked_lm_labels, next_sentence_labels, masked_lm_scale
			
 
				-    ]
			
 
				-
			
 
				-
			
 
				-def select_dataset_file_for_each_worker(files, f_start_id, num_trainers,
			
 
				-                                        trainer_id):
			
 
				-    """
			
 
				-    Spliting the train file according to the worker index.
			
 
				-    """
			
 
				-    num_files = len(files)
			
 
				-    if num_trainers > num_files:
			
 
				-        remainder = num_trainers % num_files
			
 
				-        data_file = files[(
			
 
				-            f_start_id * num_trainers + trainer_id + remainder * f_start_id) %
			
 
				-                          num_files]
			
 
				-    else:
			
 
				-        data_file = files[(f_start_id * num_trainers + trainer_id) % num_files]
			
 
				-    return data_file
			
 
				-
			
 
				-
			
 
				-class WorkerInitObj:
			
 
				-    "Construct the object with different seed, and the Dataloader will generate the data "
			
 
				-    "with different seed in each worker."
			
 
				-
			
 
				-    def __init__(self, seed):
			
 
				-        self.seed = seed
			
 
				-
			
 
				-    def __call__(self, pid):
			
 
				-        np.random.seed(seed=self.seed + pid)
			
 
				-        random.seed(self.seed + pid)
			
 
				-
			
 
				-
			
 
				-class PretrainingDataset(Dataset):
			
 
				-    def __init__(self, input_file, max_pred_length):
			
 
				-        self.input_file = input_file
			
 
				-        self.max_pred_length = max_pred_length
			
 
				-        f = h5py.File(input_file, "r")
			
 
				-        keys = [
			
 
				-            'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
			
 
				-            'masked_lm_ids', 'next_sentence_labels'
			
 
				-        ]
			
 
				-        self.inputs = [np.asarray(f[key][:]) for key in keys]
			
 
				-        f.close()
			
 
				-
			
 
				-    def __len__(self):
			
 
				-        'Denotes the total number of samples'
			
 
				-        return len(self.inputs[0])
			
 
				-
			
 
				-    def __getitem__(self, index):
			
 
				-        # convert next_sentence_labels (index=5) to np.ndarray type
			
 
				-        [
			
 
				-            input_ids, input_mask, segment_ids, masked_lm_positions,
			
 
				-            masked_lm_ids, next_sentence_labels
			
 
				-        ] = [
			
 
				-            input[index].astype(np.int64)
			
 
				-            if indice < 5 else np.asarray(input[index].astype(np.int64))
			
 
				-            for indice, input in enumerate(self.inputs)
			
 
				-        ]
			
 
				-        # input_mask = (1 - np.reshape(
			
 
				-        #     input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])) * -1e4
			
 
				-        input_mask = np.reshape(input_mask, [1, 1, input_mask.shape[0]])
			
 
				-
			
 
				-        index = self.max_pred_length
			
 
				-        padded_mask_indices = (masked_lm_positions == 0).nonzero()[0]
			
 
				-        if len(padded_mask_indices) != 0:
			
 
				-            index = padded_mask_indices[0].item()
			
 
				-        else:
			
 
				-            index = self.max_pred_length
			
 
				-        masked_lm_labels = masked_lm_ids[:index]
			
 
				-        masked_lm_positions = masked_lm_positions[:index]
			
 
				-        # softmax_with_cross_entropy enforce last dim size equal 1
			
 
				-        masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1)
			
 
				-        next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1)
			
 
				-
			
 
				-        return [
			
 
				-            input_ids, segment_ids, input_mask, masked_lm_positions,
			
 
				-            masked_lm_labels, next_sentence_labels
			
 
				-        ]
			
--- a/PaddlePaddle/LanguageModeling/BERT/program.py
+++ b/PaddlePaddle/LanguageModeling/BERT/program.py
@@ -12,29 +12,44 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 
			
 
				-from concurrent.futures import ThreadPoolExecutor
			
 
				 import os
			
 
				 import time
			
 
				 import logging
			
 
				 import shutil
			
 
				-import numpy as np
			
 
				 import paddle
			
 
				 import paddle.distributed.fleet as fleet
			
 
				 from modeling import BertForPretraining, BertConfig
			
 
				 from loss import BertPretrainingCriterion
			
 
				 from utils.save_load import save_model
			
 
				-from utils.utility import get_num_trainers, get_trainer_id
			
 
				+from utils.utility import get_trainer_id
			
 
				 from lr_scheduler import build_lr_scheduler
			
 
				 from optimizer import build_optimizer
			
 
				-from pretraining_dataset import create_pretraining_dataset, select_dataset_file_for_each_worker, WorkerInitObj
			
 
				 import dllogger
			
 
				 
			
 
				 
			
 
				-def create_strategy(use_distributed_fused_lamb=False):
			
 
				+def create_pretraining_data_holder():
			
 
				+    input_ids = paddle.static.data(
			
 
				+        name="input_ids", shape=[-1, -1], dtype="int64")
			
 
				+    token_type_ids = paddle.static.data(
			
 
				+        name="token_type_ids", shape=[-1, -1], dtype="int64")
			
 
				+    attention_mask = paddle.static.data(
			
 
				+        name="attention_mask", shape=[-1, 1, 1, -1], dtype="int64")
			
 
				+    next_sentence_labels = paddle.static.data(
			
 
				+        name="next_sentence_labels", shape=[-1, 1], dtype="int64")
			
 
				+    masked_lm_labels = paddle.static.data(
			
 
				+        name="masked_lm_labels", shape=[-1, -1], dtype="int64")
			
 
				+    return [
			
 
				+        input_ids, token_type_ids, attention_mask, next_sentence_labels,
			
 
				+        masked_lm_labels
			
 
				+    ]
			
 
				+
			
 
				+
			
 
				+def create_strategy(use_amp, use_distributed_fused_lamb=False):
			
 
				     """
			
 
				     Create paddle.static.BuildStrategy and paddle.static.ExecutionStrategy with arguments.
			
 
				 
			
 
				     Args:
			
 
				+        use_amp(bool): Whether to use amp.
			
 
				         use_distributed_fused_lamb(bool, optional): Whether to use distributed fused lamb.
			
 
				     Returns:
			
 
				         build_strategy(paddle.static.BuildStrategy): A instance of BuildStrategy.
			
@@ -44,6 +59,8 @@ def create_strategy(use_distributed_fused_lamb=False):
 
				     exec_strategy = paddle.static.ExecutionStrategy()
			
 
				 
			
 
				     build_strategy.enable_addto = True
			
 
				+    if use_amp:
			
 
				+        build_strategy.fuse_gemm_epilogue = True
			
 
				 
			
 
				     if use_distributed_fused_lamb:
			
 
				         build_strategy.fuse_all_reduce_ops = False
			
@@ -69,7 +86,8 @@ def dist_optimizer(args, optimizer):
 
				         optimizer(fleet.distributed_optimizer): A distributed optimizer.
			
 
				     """
			
 
				     use_distributed_fused_lamb = True if args.optimizer == 'DistributedFusedLamb' else False
			
 
				-    build_strategy, exec_strategy = create_strategy(use_distributed_fused_lamb)
			
 
				+    build_strategy, exec_strategy = create_strategy(args.amp,
			
 
				+                                                    use_distributed_fused_lamb)
			
 
				     dist_strategy = fleet.DistributedStrategy()
			
 
				 
			
 
				     if use_distributed_fused_lamb:
			
@@ -111,31 +129,33 @@ def dist_optimizer(args, optimizer):
 
				     return optimizer
			
 
				 
			
 
				 
			
 
				-def build(args, main_prog, startup_prog, feeds, is_train=True):
			
 
				+def build(args, main_prog, startup_prog, is_train=True):
			
 
				     """
			
 
				     Build a executable paddle.static.Program via following 3 steps:
			
 
				-        1. Create model.
			
 
				-        2. Create loss.
			
 
				-        3. Create optimizer if is_train==True.
			
 
				+        1. Create feeds.
			
 
				+        2. Create model.
			
 
				+        3. Create loss.
			
 
				+        4. Create optimizer if is_train==True.
			
 
				 
			
 
				     Args:
			
 
				         args(Namespace): Arguments obtained from ArgumentParser.
			
 
				         main_prog(paddle.static.Program):The main program.
			
 
				         startup_prog(paddle.static.Program):The startup program.
			
 
				-        feeds(dict): A dict of mapping variables' names to their values
			
 
				         is_train(bool, optional): Whether the main programe created is for training. Default: True.
			
 
				     Returns:
			
 
				         model(paddle.nn.Layer): An instance of BERT Model defined in modeling.py.
			
 
				         lr_scheduler(paddle.optimizer.lr.LRScheduler): A learning rate scheduler.
			
 
				         optimizer(Optimizer): An optimizer with distributed/AMP strategy.
			
 
				         loss(variable): The output variable of loss function.
			
 
				+        feeds(dict): A dict of mapping variables' names to their values
			
 
				     """
			
 
				 
			
 
				     with paddle.static.program_guard(main_prog, startup_prog):
			
 
				         with paddle.utils.unique_name.guard():
			
 
				+            feeds = create_pretraining_data_holder()
			
 
				             [
			
 
				-                input_ids, segment_ids, input_mask, masked_lm_positions,
			
 
				-                masked_lm_labels, next_sentence_labels, masked_lm_scale
			
 
				+                input_ids, token_type_ids, attention_mask,
			
 
				+                next_sentence_labels, masked_lm_labels
			
 
				             ] = feeds
			
 
				             bert_config = BertConfig.from_json_file(args.config_file)
			
 
				             if bert_config.vocab_size % 8 != 0:
			
@@ -144,12 +164,11 @@ def build(args, main_prog, startup_prog, feeds, is_train=True):
 
				             criterion = BertPretrainingCriterion(bert_config.vocab_size)
			
 
				             prediction_scores, seq_relationship_score = model(
			
 
				                 input_ids=input_ids,
			
 
				-                token_type_ids=segment_ids,
			
 
				-                attention_mask=input_mask,
			
 
				-                masked_positions=masked_lm_positions)
			
 
				+                token_type_ids=token_type_ids,
			
 
				+                attention_mask=attention_mask,
			
 
				+                masked_lm_labels=masked_lm_labels)
			
 
				             loss = criterion(prediction_scores, seq_relationship_score,
			
 
				-                             masked_lm_labels, next_sentence_labels,
			
 
				-                             masked_lm_scale)
			
 
				+                             masked_lm_labels, next_sentence_labels)
			
 
				 
			
 
				             lr_scheduler = None
			
 
				             optimizer = None
			
@@ -158,10 +177,16 @@ def build(args, main_prog, startup_prog, feeds, is_train=True):
 
				                 optimizer = build_optimizer(args, lr_scheduler)
			
 
				                 optimizer = dist_optimizer(args, optimizer)
			
 
				                 optimizer.minimize(loss)
			
 
				-        return model, lr_scheduler, optimizer, loss
			
 
				+        return model, lr_scheduler, optimizer, loss, feeds
			
 
				 
			
 
				 
			
 
				-def run(exe, program, args, lr_scheduler, loss, feeds, progress=None):
			
 
				+def run(exe,
			
 
				+        program,
			
 
				+        args,
			
 
				+        lr_scheduler,
			
 
				+        loss,
			
 
				+        train_dataloader,
			
 
				+        progress=None):
			
 
				     """
			
 
				     Execute program.
			
 
				 
			
@@ -172,20 +197,14 @@ def run(exe, program, args, lr_scheduler, loss, feeds, progress=None):
 
				         lr_scheduler(paddle.optimizer.lr.LRScheduler): A learning rate scheduler.
			
 
				                                                                  Default: None.
			
 
				         loss(variable): The output variable of loss function.
			
 
				-        feeds(dict): A dict of mapping variables' names to their values
			
 
				         progress(dict, optional): A dict to record the training progress of checkpoint.
			
 
				     Returns:
			
 
				         global_step(int): Final step id of this run.
			
 
				         loss_return(float): Final loss of this run.
			
 
				         train_time_raw(float): Time to train of this run.
			
 
				     """
			
 
				-    pool = ThreadPoolExecutor(1)
			
 
				-
			
 
				-    num_trainers = get_num_trainers()
			
 
				     trainer_id = get_trainer_id()
			
 
				 
			
 
				-    worker_init = WorkerInitObj(args.seed + trainer_id)
			
 
				-
			
 
				     batch_size_per_gpu = args.batch_size
			
 
				     log_steps = args.log_freq
			
 
				     save_steps = args.num_steps_per_checkpoint
			
@@ -195,12 +214,9 @@ def run(exe, program, args, lr_scheduler, loss, feeds, progress=None):
 
				     last_step = args.last_step_of_checkpoint
			
 
				     train_iter = 0
			
 
				     epoch = 0
			
 
				-    resume_from_ckpt = False
			
 
				     if progress is None:
			
 
				         progress = dict()
			
 
				     else:
			
 
				-        resume_from_ckpt = True
			
 
				-        last_step = progress.get('global_step', 0)
			
 
				         epoch = progress.get('epoch', 0)
			
 
				 
			
 
				     global_step = 0 + last_step
			
@@ -216,94 +232,64 @@ def run(exe, program, args, lr_scheduler, loss, feeds, progress=None):
 
				             max_steps = args.steps_this_run + last_step
			
 
				             logging.warning(
			
 
				                 f"{args.steps_this_run} steps will be performed in this run.")
			
 
				+
			
 
				     total_samples = 0
			
 
				+    raw_train_start = time.time()
			
 
				     step_start = time.time()
			
 
				-    raw_train_start = None
			
 
				+    avg_loss = 0
			
 
				 
			
 
				     while True:
			
 
				-        input_dir = args.input_dir
			
 
				-        if not resume_from_ckpt or progress.get('files', None) is None:
			
 
				-            files = [
			
 
				-                os.path.join(input_dir, f) for f in os.listdir(input_dir)
			
 
				-                if os.path.isfile(os.path.join(input_dir, f)) and "training" in
			
 
				-                f
			
 
				-            ]
			
 
				-            files.sort()
			
 
				-            np.random.shuffle(files)
			
 
				-            f_start_id = 0
			
 
				-        else:
			
 
				-            f_start_id = progress['f_id']
			
 
				-            files = progress['files']
			
 
				-        resume_from_ckpt = False
			
 
				-
			
 
				-        # Select one file for each worker and create the DataLoader for the file
			
 
				-        data_file = select_dataset_file_for_each_worker(
			
 
				-            files, f_start_id, num_trainers, trainer_id)
			
 
				-        train_data_loader = create_pretraining_dataset(
			
 
				-            args, data_file, feeds, worker_init, paddle.static.cuda_places())
			
 
				-
			
 
				-        for f_id in range(f_start_id + 1, len(files)):
			
 
				-            data_file = select_dataset_file_for_each_worker(
			
 
				-                files, f_id, num_trainers, trainer_id)
			
 
				-            dataset_future = pool.submit(create_pretraining_dataset, args,
			
 
				-                                         data_file, feeds, worker_init,
			
 
				-                                         paddle.static.cuda_places())
			
 
				-
			
 
				-            if raw_train_start is None:
			
 
				+        for batch in train_dataloader:
			
 
				+
			
 
				+            train_iter += 1
			
 
				+            loss_return = exe.run(program, feed=batch, fetch_list=[loss])
			
 
				+            total_samples += batch_size_per_gpu
			
 
				+            avg_loss += loss_return[0].item()
			
 
				+
			
 
				+            lr = lr_scheduler.get_lr()
			
 
				+
			
 
				+            if train_iter % (log_steps * gradient_merge_steps) == 0:
			
 
				+                step_cost = time.time() - step_start
			
 
				+                dllogger_it_data = {
			
 
				+                    'loss': avg_loss / gradient_merge_steps,
			
 
				+                    'learning_rate': lr,
			
 
				+                    'step_cost': step_cost,
			
 
				+                    'step_samples': total_samples,
			
 
				+                    'seqs_per_sec': total_samples / step_cost,
			
 
				+                }
			
 
				+                dllogger.log((epoch, global_step + 1), data=dllogger_it_data)
			
 
				+                total_samples = 0
			
 
				+                step_start = time.time()
			
 
				+
			
 
				+            if train_iter % gradient_merge_steps == 0:
			
 
				+                global_step += 1
			
 
				+                lr_scheduler.step()
			
 
				+                avg_loss = 0
			
 
				+
			
 
				+            if args.benchmark and train_iter == (args.benchmark_warmup_steps *
			
 
				+                                                 gradient_merge_steps):
			
 
				                 raw_train_start = time.time()
			
 
				 
			
 
				-            for batch in train_data_loader:
			
 
				-                train_iter += 1
			
 
				-                loss_return = exe.run(program, feed=batch, fetch_list=[loss])
			
 
				-                total_samples += batch_size_per_gpu
			
 
				-
			
 
				-                lr = lr_scheduler.get_lr()
			
 
				-                if train_iter % gradient_merge_steps == 0:
			
 
				-                    global_step += 1
			
 
				-                    lr_scheduler.step()
			
 
				-
			
 
				-                if train_iter % (log_steps * gradient_merge_steps) == 0:
			
 
				-                    step_cost = time.time() - step_start
			
 
				-                    dllogger_it_data = {
			
 
				-                        'loss': loss_return[0].item(),
			
 
				-                        'learning_rate': lr,
			
 
				-                        'step_cost': step_cost,
			
 
				-                        'step_samples': total_samples,
			
 
				-                        'seqs_per_sec': total_samples / step_cost,
			
 
				+            if train_iter % (save_steps * gradient_merge_steps
			
 
				+                             ) == 0 or global_step >= max_steps:
			
 
				+                if trainer_id == 0:
			
 
				+                    model_path = os.path.join(
			
 
				+                        args.output_dir, args.bert_model, "phase1"
			
 
				+                        if args.phase1 else "phase2", f"{global_step}")
			
 
				+                    progress = {
			
 
				+                        'epoch': epoch,
			
 
				+                        'global_step': global_step,
			
 
				+                        'phase': 1 if args.phase1 else 2,
			
 
				                     }
			
 
				-                    dllogger.log((epoch, global_step), data=dllogger_it_data)
			
 
				-                    total_samples = 0
			
 
				-                    step_start = time.time()
			
 
				-
			
 
				-                if args.benchmark and train_iter == (
			
 
				-                        args.benchmark_warmup_steps * gradient_merge_steps):
			
 
				-                    raw_train_start = time.time()
			
 
				-
			
 
				-                if train_iter % (save_steps * gradient_merge_steps
			
 
				-                                 ) == 0 or global_step >= max_steps:
			
 
				-                    if trainer_id == 0:
			
 
				-                        model_path = os.path.join(
			
 
				-                            args.output_dir, args.bert_model, "phase1"
			
 
				-                            if args.phase1 else "phase2", f"{global_step}")
			
 
				-                        progress = {
			
 
				-                            'files': files,
			
 
				-                            'epoch': epoch,
			
 
				-                            'global_step': global_step,
			
 
				-                            'f_id': f_id,
			
 
				-                            'phase': 1 if args.phase1 else 2,
			
 
				-                        }
			
 
				-                        save_model(program, model_path, args.model_prefix,
			
 
				-                                   progress)
			
 
				-                        most_recent_ckpts_paths.append(model_path)
			
 
				-                        if len(most_recent_ckpts_paths) > 3:
			
 
				-                            ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
			
 
				-                            shutil.rmtree(ckpt_to_be_removed)
			
 
				-                if (global_step >= max_steps) or (
			
 
				-                        args.benchmark and global_step >=
			
 
				-                        args.benchmark_steps + args.benchmark_warmup_steps):
			
 
				-                    train_time_raw = time.time() - raw_train_start
			
 
				-                    del train_data_loader
			
 
				-                    return global_step, loss_return[0].item(), train_time_raw
			
 
				-            del train_data_loader
			
 
				-            train_data_loader = dataset_future.result(timeout=None)
			
 
				+                    save_model(program, model_path, args.model_prefix,
			
 
				+                               progress)
			
 
				+                    most_recent_ckpts_paths.append(model_path)
			
 
				+                    if len(most_recent_ckpts_paths) > 3:
			
 
				+                        ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
			
 
				+                        shutil.rmtree(ckpt_to_be_removed)
			
 
				+            if (global_step >= max_steps) or (
			
 
				+                    args.benchmark and global_step >=
			
 
				+                    args.benchmark_steps + args.benchmark_warmup_steps):
			
 
				+                train_time_raw = time.time() - raw_train_start
			
 
				+                return global_step, loss_return[0].item(), train_time_raw
			
 
				         epoch += 1
			
--- a/PaddlePaddle/LanguageModeling/BERT/requirements.txt
+++ b/PaddlePaddle/LanguageModeling/BERT/requirements.txt
@@ -1,4 +0,0 @@
 
				-nltk
			
 
				-h5py
			
 
				-tqdm
			
 
				-git+https://github.com/NVIDIA/dllogger#egg=dllogger
			
--- a/PaddlePaddle/LanguageModeling/BERT/run.sub
+++ b/PaddlePaddle/LanguageModeling/BERT/run.sub
@@ -0,0 +1,268 @@
 
				+#!/bin/bash
			
 
				+#SBATCH --exclusive
			
 
				+#SBATCH --mem=0
			
 
				+#SBATCH --overcommit
			
 
				+#SBATCH --parsable
			
 
				+
			
 
				+# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+set -eux
			
 
				+
			
 
				+#
			
 
				+# Job Configurations
			
 
				+#
			
 
				+# Tag to the built image.
			
 
				+IMAGE_VERSION=${IMAGE_VERSION:-"22.12-py3"}
			
 
				+# Number of processes per node used for the LDDL preprocessor.
			
 
				+DASK_TASKS_PER_NODE=${DASK_TASKS_PER_NODE:-128}
			
 
				+# 1 or 2 .
			
 
				+PHASE=${PHASE:-1}
			
 
				+# An integer that specifies the pretraining seed. 
			
 
				+SEED=${SEED:-42}
			
 
				+# The percentage of the articles from the Wikipedia dataset to sample and used
			
 
				+# for pretraining. 0 < ${SAMPLE_RATIO} < 1.0
			
 
				+SAMPLE_RATIO=${SAMPLE_RATIO:-0.9}
			
 
				+# Number of GPUs per node. 0 < ${GPUS} <= 8.
			
 
				+GPUS=${GPUS:-"8"}
			
 
				+# The bin size for binned LDDL data loading. 'none' or an integer that divides 
			
 
				+# 128 (for Phase1) or 512 (for Phase2).
			
 
				+BIN_SIZE=${BIN_SIZE:-"none"}
			
 
				+# Number of parquet shards per each LDDL data loader worker process. 'none' or 
			
 
				+# an integer.
			
 
				+NUM_SHARDS_PER_WORKER=${NUM_SHARDS_PER_WORKER:-"none"}
			
 
				+# Number of LDDL data loader worker processes per rank.
			
 
				+NUM_WORKERS=${NUM_WORKERS:-4}
			
 
				+# Should we rerun the LDDL preprocessor every time? 'true' or 'false' .
			
 
				+RERUN_DASK=${RERUN_DASK:-"true"}
			
 
				+# 'static' or 'dynamic' .
			
 
				+MASKING=${MASKING:-"static"}
			
 
				+# Should we use jemalloc for the LDDL preprocessor? 'true' or 'false' .
			
 
				+USE_JEMALLOC=${USE_JEMALLOC:-"true"}
			
 
				+# 'fp16' or 'tf32' .
			
 
				+PRECISION=${PRECISION:-"fp16"}
			
 
				+# The path to the initial checkpoint (from Phase1) used to start Phase2. 'none'
			
 
				+# or an absolute path.
			
 
				+INIT_CHECKPOINT=${INIT_CHECKPOINT:-"none"}
			
 
				+# The per-rank batch size before being divided by the gradient accumulation
			
 
				+# steps.
			
 
				+TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-"256"}
			
 
				+# The gradient accumulation steps.
			
 
				+GRADIENT_ACCUMULATION_STEPS=${GRADIENT_ACCUMULATION_STEPS:-"32"}
			
 
				+
			
 
				+#
			
 
				+# Static Configurations
			
 
				+#
			
 
				+# Container URL.
			
 
				+# Replace this with the URL of the docker image that you build 
			
 
				+# with scripts/docker/build.sh .
			
 
				+readonly docker_image="bert:${IMAGE_VERSION}" 
			
 
				+# Where the datasets are stored on the system.
			
 
				+readonly host_datadir="/home/${USER}/datasets"
			
 
				+readonly container_datadir="/datasets"
			
 
				+# Replace these with the path to the 'source' subdirectory of the LDDL Wikipedia
			
 
				+# dataset.
			
 
				+readonly host_wikipedia_source="${host_datadir}/wikipedia/source"
			
 
				+readonly container_wikipedia_source="${container_datadir}/wikipedia/source"
			
 
				+readonly wikipedia_mount="${host_wikipedia_source}:${container_wikipedia_source}"
			
 
				+# Replace these with where you want to store the Parquet shards in case 
			
 
				+# ${RERUN_DASK} is 'false'.
			
 
				+readonly host_pretrain="${host_datadir}/pretrain"
			
 
				+readonly container_pretrain="${container_datadir}/pretrain"
			
 
				+readonly pretrain_mount="${host_pretrain}:${container_pretrain}"
			
 
				+# Replace these with where you want to store the pretrained checkpoints on 
			
 
				+# the system.
			
 
				+readonly host_output="$PWD/results/${SLURM_JOB_ID}"
			
 
				+mkdir -p "${host_output}"
			
 
				+readonly container_output="/results"
			
 
				+readonly output_mount="${host_output}:${container_output}"
			
 
				+# If INIT_CHECKPOINT is 'none', infer INIT_CHECKPOINT based on job dependency.
			
 
				+if [ "${INIT_CHECKPOINT}" == "none" ] && [ "${PHASE}" == "2" ] ; then
			
 
				+  INIT_CHECKPOINT="$PWD/results/${SLURM_JOB_DEPENDENCY}/bert-large-uncased/phase1/7038"
			
 
				+fi
			
 
				+# Define mounts.
			
 
				+mounts="${PWD}:/workspace/bert,${wikipedia_mount},${pretrain_mount},${output_mount}"
			
 
				+# Add the mount path of the initial checkpoint for Phase2.
			
 
				+if [ "${PHASE}" == "1" ]; then
			
 
				+  echo "No init. mounted for Phase1!"
			
 
				+  readonly container_init_checkpoint=""
			
 
				+elif [ "${PHASE}" == "2" ]; then
			
 
				+  if [ ! -f "${INIT_CHECKPOINT}" ]; then
			
 
				+    echo "No init. checkpoint found for Phase2!"
			
 
				+    exit 1
			
 
				+  else
			
 
				+    mounts="${mounts},$(dirname "${INIT_CHECKPOINT}"):/checkpoints"
			
 
				+    readonly container_init_checkpoint="/checkpoints"
			
 
				+  fi
			
 
				+else
			
 
				+  echo "\${PHASE} = ${PHASE} unknown!"
			
 
				+  exit 1
			
 
				+fi
			
 
				+# Determine where the parquet shards should be stored.
			
 
				+if [ "${RERUN_DASK}" == "true" ]; then
			
 
				+  # Always rerun the dask pipeline. Therefore, use the output directory to store
			
 
				+  # the parquets.
			
 
				+  readonly host_pretrain_parquet="${host_output}/parquet"
			
 
				+  readonly container_pretrain_parquet="${container_output}/parquet"
			
 
				+elif [ "${RERUN_DASK}" == "false" ]; then
			
 
				+  echo "Use existing parquets if they exists."
			
 
				+  if [ "${BIN_SIZE}" == "none" ]; then
			
 
				+      readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/unbinned/parquet"
			
 
				+      readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/unbinned/parquet"
			
 
				+  else
			
 
				+      readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
			
 
				+      readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
			
 
				+  fi
			
 
				+else
			
 
				+  echo "\${RERUN_DASK} = ${RERUN_DASK} unknown!"
			
 
				+  exit 1
			
 
				+fi
			
 
				+
			
 
				+readonly PHASE1="\
			
 
				+    --learning-rate=6e-3 \
			
 
				+    --warmup-proportion=0.2843 \
			
 
				+    --phase1 \
			
 
				+    --max-seq-length=128 \
			
 
				+    --max-predictions-per-seq=20 \
			
 
				+    --max-steps=7038 \
			
 
				+    --num-steps-per-checkpoint=2500 \
			
 
				+    "
			
 
				+
			
 
				+readonly PHASE2="\
			
 
				+    --learning-rate=4e-3 \
			
 
				+    --warmup-proportion=0.128 \
			
 
				+    --phase2 \
			
 
				+    --max-seq-length=512 \
			
 
				+    --max-predictions-per-seq=80 \
			
 
				+    --max-steps=1563 \
			
 
				+    --num-steps-per-checkpoint=1000 \
			
 
				+    --from-pretrained-params=${container_init_checkpoint} \
			
 
				+    "
			
 
				+
			
 
				+# Arguments for fp16.
			
 
				+if [ "${PRECISION}" == "fp16" ]; then
			
 
				+  readonly fp16_flags="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
			
 
				+elif [ "${PRECISION}" == "tf32" ]; then
			
 
				+  readonly fp16_flags=""
			
 
				+else
			
 
				+  echo "\${PRECISION} = ${PRECISION} unknown!"
			
 
				+  exit 1
			
 
				+fi
			
 
				+
			
 
				+# Get the ip address of all nodes.
			
 
				+IP_CMD="hostname -i"
			
 
				+IP_STR=$(srun -pmix --ntasks-per-node=1 bash -c "${IP_CMD}")
			
 
				+IP_STR=$(echo $IP_STR | sed 's/ /,/g')
			
 
				+echo "\${IP_STR} = ${IP_STR}"
			
 
				+
			
 
				+# Get the actual pretraining command.
			
 
				+readonly PHASES=( "$PHASE1" "$PHASE2" ) 
			
 
				+readonly BERT_CMD="\
			
 
				+    python -m paddle.distributed.launch \
			
 
				+    --gpus=0,1,2,3,4,5,6,7 \
			
 
				+    --ips="${IP_STR}" \
			
 
				+    /workspace/bert/run_pretraining.py \
			
 
				+    ${PHASES[$((PHASE - 1))]} \
			
 
				+    --batch-size=${TRAIN_BATCH_SIZE} \
			
 
				+    --input-dir=${container_pretrain_parquet} \
			
 
				+    --output-dir=${container_output} \
			
 
				+    --vocab-file=/workspace/bert/vocab/bert-large-uncased-vocab.txt \
			
 
				+    --bert-model=bert-large-uncased \
			
 
				+    --config-file=/workspace/bert/bert_configs/bert-large-uncased.json \
			
 
				+    --gradient-merge-steps=${GRADIENT_ACCUMULATION_STEPS} \
			
 
				+    --log-freq=1 \
			
 
				+    --seed=12345 \
			
 
				+    --optimizer=Lamb \
			
 
				+    ${fp16_flags} "
			
 
				+
			
 
				+echo "nodes: ${SLURM_JOB_NUM_NODES}, TRAIN_BATCH_SIZE: ${TRAIN_BATCH_SIZE}, GRADIENT_ACCUMULATION_STEPS: ${GRADIENT_ACCUMULATION_STEPS}"
			
 
				+
			
 
				+#
			
 
				+# Running the LDDL preprocessor and load balancer.
			
 
				+# 
			
 
				+# Determine the number of parquet shards in total.
			
 
				+if [ "${NUM_SHARDS_PER_WORKER}" == "none" ]; then
			
 
				+  readonly num_blocks=4096
			
 
				+else
			
 
				+  readonly num_blocks=$((NUM_SHARDS_PER_WORKER * $(( NUM_WORKERS > 0 ? NUM_WORKERS : 1 )) * SLURM_JOB_NUM_NODES * GPUS))
			
 
				+fi
			
 
				+echo "num_blocks: ${num_blocks}"
			
 
				+# Run the LDDL preprocessor and load balancer only when there is no file in 
			
 
				+# where the parquets are supposed to be stored.
			
 
				+if [ ! -d "${host_pretrain_parquet}" ] || [ -z "$(ls -A "${host_pretrain_parquet}")" ]; then
			
 
				+  # The sequence length is 128 for Phase1, but 512 for Phase2.
			
 
				+  if [ "${PHASE}" == "1" ]; then
			
 
				+    readonly target_seq_len_flag=""
			
 
				+  elif [ "${PHASE}" == "2" ]; then
			
 
				+    readonly target_seq_len_flag="--target-seq-length 512"
			
 
				+  else
			
 
				+    echo "\${PHASE} = ${PHASE} unknown!"
			
 
				+    exit 1
			
 
				+  fi
			
 
				+  # Should we use sequence binning?
			
 
				+  if [ "${BIN_SIZE}" == "none" ]; then
			
 
				+    readonly bin_size_flag=""
			
 
				+  else
			
 
				+    readonly bin_size_flag="--bin-size ${BIN_SIZE}"
			
 
				+  fi
			
 
				+  # Static masking or dynamic masking?
			
 
				+  if [ "${MASKING}" == "dynamic" ]; then
			
 
				+    readonly masking_flag=""
			
 
				+  elif [ "${MASKING}" == "static" ]; then
			
 
				+    readonly masking_flag="--masking"
			
 
				+  else
			
 
				+    echo "\${MASKING} = ${MASKING} unknown!"
			
 
				+    exit 1
			
 
				+  fi
			
 
				+  # Should we use jemalloc for the LDDL preprocessor?
			
 
				+  if [ "${USE_JEMALLOC}" == "true" ]; then
			
 
				+    readonly use_jemalloc_flag="--export=ALL,LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so"
			
 
				+  elif [ "${USE_JEMALLOC}" == "false" ]; then
			
 
				+    readonly use_jemalloc_flag=""
			
 
				+  else
			
 
				+    echo "\${USE_JEMALLOC} = ${USE_JEMALLOC} unknown!"
			
 
				+    exit 1
			
 
				+  fi
			
 
				+  # Run the LDDL preprocessor.
			
 
				+  srun -l \
			
 
				+    --mpi=pmix \
			
 
				+    --container-image="${docker_image}" \
			
 
				+    --container-mounts="${mounts}"  \
			
 
				+    --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
			
 
				+    ${use_jemalloc_flag} \
			
 
				+    preprocess_bert_pretrain \
			
 
				+      --schedule mpi \
			
 
				+      ${target_seq_len_flag} \
			
 
				+      --wikipedia ${container_wikipedia_source} \
			
 
				+      --sink "${container_pretrain_parquet}" \
			
 
				+      --vocab-file /workspace/bert/vocab/bert-large-uncased-vocab.txt \
			
 
				+      --num-blocks "${num_blocks}" \
			
 
				+      --sample-ratio "${SAMPLE_RATIO}" \
			
 
				+      ${bin_size_flag} \
			
 
				+      ${masking_flag} \
			
 
				+      --seed "${SEED}"
			
 
				+  # Run the LDDL load balancer.
			
 
				+  srun -l \
			
 
				+    --mpi=pmix \
			
 
				+    --container-image="${docker_image}" \
			
 
				+    --container-mounts="${mounts}"  \
			
 
				+    --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
			
 
				+    balance_dask_output \
			
 
				+      --indir "${container_pretrain_parquet}" \
			
 
				+      --num-shards "${num_blocks}"
			
 
				+fi
			
 
				+
			
 
				+# 
			
 
				+# Run pretraining.
			
 
				+#
			
 
				+srun -l -pmix --container-image="${docker_image}" --container-mounts="${mounts}" --ntasks-per-node=1 bash -c "${BERT_CMD}"
			
--- a/PaddlePaddle/LanguageModeling/BERT/run_pretraining.py
+++ b/PaddlePaddle/LanguageModeling/BERT/run_pretraining.py
@@ -12,11 +12,12 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 
			
 
				+import os
			
 
				 import time
			
 
				+import logging
			
 
				 import paddle
			
 
				 import paddle.distributed.fleet as fleet
			
 
				 
			
 
				-from pretraining_dataset import create_pretraining_data_holder
			
 
				 from utils.config import parse_args, print_args
			
 
				 from utils.save_load import init_program
			
 
				 from utils.logger import setup_loggers
			
@@ -24,6 +25,7 @@ from utils.affinity import set_cpu_affinity
 
				 from utils.utility import set_seed, get_trainer_id, get_num_trainers
			
 
				 import program
			
 
				 import dllogger
			
 
				+from lddl.paddle import get_bert_pretrain_data_loader
			
 
				 
			
 
				 
			
 
				 def main():
			
@@ -42,12 +44,11 @@ def main():
 
				     if args.show_config:
			
 
				         print_args(args)
			
 
				 
			
 
				+    device = paddle.set_device('gpu')
			
 
				     fleet.init(is_collective=True)
			
 
				     if args.enable_cpu_affinity:
			
 
				         set_cpu_affinity()
			
 
				 
			
 
				-    device = paddle.set_device('gpu')
			
 
				-
			
 
				     # Create the random seed for the worker
			
 
				     set_seed(args.seed + get_trainer_id())
			
 
				 
			
@@ -60,21 +61,34 @@ def main():
 
				     main_program = paddle.static.default_main_program()
			
 
				     startup_program = paddle.static.default_startup_program()
			
 
				 
			
 
				-    feeds = create_pretraining_data_holder()
			
 
				-
			
 
				-    model, lr_scheduler, optimizer, loss = program.build(
			
 
				-        args, main_program, startup_program, feeds)
			
 
				+    model, lr_scheduler, optimizer, loss, feeds = program.build(
			
 
				+        args, main_program, startup_program)
			
 
				 
			
 
				     exe = paddle.static.Executor(device)
			
 
				     exe.run(startup_program)
			
 
				 
			
 
				     progress = init_program(args, program=main_program, exe=exe, model=model)
			
 
				+    train_dataloader = get_bert_pretrain_data_loader(
			
 
				+        args.input_dir,
			
 
				+        vocab_file=args.vocab_file,
			
 
				+        data_loader_kwargs={
			
 
				+            'batch_size': args.batch_size,
			
 
				+            'num_workers': args.num_workers,
			
 
				+            'persistent_workers': True,
			
 
				+            'feed_list': feeds
			
 
				+        },
			
 
				+        base_seed=args.seed,
			
 
				+        log_dir=None if args.output_dir is None else
			
 
				+        os.path.join(args.output_dir, 'lddl_log'),
			
 
				+        log_level=logging.WARNING,
			
 
				+        start_epoch=0 if progress is None else progress.get("epoch", 0), )
			
 
				 
			
 
				     if args.amp:
			
 
				         optimizer.amp_init(device)
			
 
				 
			
 
				     global_step, final_loss, train_time_raw = program.run(
			
 
				-        exe, main_program, args, lr_scheduler, loss, feeds, progress)
			
 
				+        exe, main_program, args, lr_scheduler, loss, train_dataloader,
			
 
				+        progress)
			
 
				 
			
 
				     if get_trainer_id() == 0:
			
 
				         e2e_time = time.time() - now
			
--- a/PaddlePaddle/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
@@ -30,14 +30,22 @@ dgxa100-80g_8gpu_amp ()
 
				     warmup_proportion_phase2="0.128"
			
 
				     train_steps_phase2=1563
			
 
				     gradient_accumulation_steps_phase2=128
			
 
				-    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
			
 
				+    DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
			
 
				     DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
			
 
				-    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
			
 
				+    DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
			
 
				     DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
			
 
				     CODEDIR=/workspace/bert
			
 
				     init_checkpoint="None"
			
 
				+    VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
			
 
				     RESULTS_DIR=$CODEDIR/results
			
 
				     CHECKPOINTS_DIR=$RESULTS_DIR
			
 
				+    wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
			
 
				+    num_dask_workers=128
			
 
				+    num_shards_per_worker=128
			
 
				+    num_workers=4
			
 
				+    sample_ratio="0.9"
			
 
				+    phase2_bin_size=64
			
 
				+    masking=static
			
 
				     BERT_CONFIG=bert_configs/bert-large-uncased.json
			
 
				     enable_benchmark="false"
			
 
				     benchmark_steps=10  # It takes effect only after the enable_benchmark is set to true
			
@@ -45,9 +53,11 @@ dgxa100-80g_8gpu_amp ()
 
				     echo $train_batch_size $learning_rate $precision $num_gpus \
			
 
				          $warmup_proportion $train_steps $save_checkpoint_steps \
			
 
				          $create_logfile $gradient_accumulation_steps $seed $job_name \
			
 
				-	 $train_batch_size_phase2 $learning_rate_phase2 \
			
 
				+         $train_batch_size_phase2 $learning_rate_phase2 \
			
 
				          $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
			
 
				          $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
			
 
				+         $wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
			
 
				+         $sample_ratio $phase2_bin_size $masking \
			
 
				          $BERT_CONFIG $enable_benchmark $benchmark_steps $benchmark_warmup_steps
			
 
				 }
			
 
				 
			
@@ -69,14 +79,22 @@ dgxa100-80g_8gpu_tf32 ()
 
				     warmup_proportion_phase2="0.128"
			
 
				     train_steps_phase2=1563
			
 
				     gradient_accumulation_steps_phase2=256
			
 
				-    DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
			
 
				+    DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
			
 
				     DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
			
 
				-    DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets
			
 
				+    DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
			
 
				     DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
			
 
				     CODEDIR=/workspace/bert
			
 
				     init_checkpoint="None"
			
 
				+    VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
			
 
				     RESULTS_DIR=$CODEDIR/results
			
 
				     CHECKPOINTS_DIR=$RESULTS_DIR
			
 
				+    wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
			
 
				+    num_dask_workers=128
			
 
				+    num_shards_per_worker=128
			
 
				+    num_workers=4
			
 
				+    sample_ratio="0.9"
			
 
				+    phase2_bin_size=64
			
 
				+    masking=static
			
 
				     BERT_CONFIG=bert_configs/bert-large-uncased.json
			
 
				     enable_benchmark="false"
			
 
				     benchmark_steps=10  # It takes effect only after the enable_benchmark is set to true
			
@@ -84,8 +102,10 @@ dgxa100-80g_8gpu_tf32 ()
 
				     echo $train_batch_size $learning_rate $precision $num_gpus \
			
 
				          $warmup_proportion $train_steps $save_checkpoint_steps \
			
 
				          $create_logfile $gradient_accumulation_steps $seed $job_name \
			
 
				-	 $train_batch_size_phase2 $learning_rate_phase2 \
			
 
				+         $train_batch_size_phase2 $learning_rate_phase2 \
			
 
				          $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
			
 
				          $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
			
 
				+         $wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
			
 
				+         $sample_ratio $phase2_bin_size $masking \
			
 
				          $BERT_CONFIG $enable_benchmark $benchmark_steps $benchmark_warmup_steps
			
 
				 }
			
--- a/PaddlePaddle/LanguageModeling/BERT/scripts/docker/build.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/scripts/docker/build.sh
@@ -1,6 +1,6 @@
 
				 #!/bin/bash
			
 
				 
			
 
				-# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
			
 
				+# Copyright (c) 2023 NVIDIA Corporation.  All rights reserved.
			
 
				 #
			
 
				 # Licensed under the Apache License, Version 2.0 (the "License");
			
 
				 # you may not use this file except in compliance with the License.
			
@@ -14,4 +14,24 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 
			
 
				-docker build --network=host . --rm --pull --no-cache -t bert
			
 
				+URL=${1:-"bert"}
			
 
				+PUSH=${2:-"none"}  # 'push' or 'none'
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+docker build \
			
 
				+  --network=host \
			
 
				+  --rm \
			
 
				+  --pull \
			
 
				+  --no-cache \
			
 
				+  -t ${URL} \
			
 
				+  .
			
 
				+
			
 
				+if [ "${PUSH}" == "push" ]; then
			
 
				+  docker push ${URL}
			
 
				+elif [ "${PUSH}" == "none" ]; then
			
 
				+  echo "Keep the built image locally."
			
 
				+else
			
 
				+  echo "Invalid \${PUSH} option: ${PUSH} !"
			
 
				+  exit 1
			
 
				+fi
			
--- a/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining.sh
@@ -32,25 +32,87 @@ warmup_proportion_phase2=${14:-"0.128"}
 
				 train_steps_phase2=${15:-1563}
			
 
				 gradient_accumulation_steps_phase2=${16:-128}
			
 
				 #change this for other datasets
			
 
				-DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en
			
 
				+DATASET=pretrain/phase1/unbinned/parquet
			
 
				 DATA_DIR_PHASE1=${17:-$BERT_PREP_WORKING_DIR/${DATASET}/}
			
 
				 #change this for other datasets
			
 
				-DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en
			
 
				+DATASET2=pretrain/phase2/bin_size_64/parquet
			
 
				 DATA_DIR_PHASE2=${18:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
			
 
				 CODEDIR=${19:-"/workspace/bert"}
			
 
				 init_checkpoint=${20:-"None"}
			
 
				+VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
			
 
				 RESULTS_DIR=$CODEDIR/results
			
 
				 CHECKPOINTS_DIR=$RESULTS_DIR
			
 
				-BERT_CONFIG=${21:-"None"}
			
 
				-enable_benchmark=${22:-"false"}
			
 
				-benchmark_steps=${23:-"10"}
			
 
				-benchmark_warmup_steps=${24:-"10"}
			
 
				+wikipedia_source=${21:-$BERT_PREP_WORKING_DIR/wikipedia/source/}
			
 
				+num_dask_workers=${22:-$(nproc)}
			
 
				+num_shards_per_worker=${23:-128}
			
 
				+num_workers=${24:-4}
			
 
				+num_nodes=1
			
 
				+sample_ratio=${25:-0.9}
			
 
				+phase2_bin_size=${26:-64}
			
 
				+masking=${27:-static}
			
 
				+BERT_CONFIG=${28:-"None"}
			
 
				+enable_benchmark=${29:-"false"}
			
 
				+benchmark_steps=${30:-"10"}
			
 
				+benchmark_warmup_steps=${31:-"10"}
			
 
				+
			
 
				+# Calculate the total number of shards.
			
 
				+readonly num_blocks=$((num_shards_per_worker * $(( num_workers > 0 ? num_workers : 1 )) * num_nodes * num_gpus))
			
 
				+
			
 
				+if [ "${phase2_bin_size}" == "none" ]; then
			
 
				+   readonly phase2_bin_size_flag=""
			
 
				+elif [[ "${phase2_bin_size}" =~ ^(32|64|128|256|512)$ ]]; then
			
 
				+   readonly phase2_bin_size_flag="--bin-size ${phase2_bin_size}"
			
 
				+else
			
 
				+   echo "Error! phase2_bin_size=${phase2_bin_size} not supported!"
			
 
				+   return -1
			
 
				+fi
			
 
				+
			
 
				+if [ "${masking}" == "static" ]; then
			
 
				+   readonly masking_flag="--masking"
			
 
				+elif [ "${masking}" == "dynamic" ]; then
			
 
				+   readonly masking_flag=""
			
 
				+else
			
 
				+   echo "Error! masking=${masking} not supported!"
			
 
				+   return -1
			
 
				+fi
			
 
				 
			
 
				 mkdir -p $CHECKPOINTS_DIR
			
 
				 
			
 
				 
			
 
				-if [ ! -d "$DATA_DIR_PHASE1" ] ; then
			
 
				-   echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start"
			
 
				+if [ ! -d "${DATA_DIR_PHASE1}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE1})" ]; then
			
 
				+   echo "Warning! ${DATA_DIR_PHASE1} directory missing."
			
 
				+   if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
			
 
				+      echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
			
 
				+      return -1
			
 
				+   fi
			
 
				+   preprocess_cmd=" \
			
 
				+      mpirun \
			
 
				+         --oversubscribe \
			
 
				+         --allow-run-as-root \
			
 
				+         -np ${num_dask_workers} \
			
 
				+         -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
			
 
				+            preprocess_bert_pretrain \
			
 
				+               --schedule mpi \
			
 
				+               --vocab-file ${VOCAB_FILE} \
			
 
				+               --wikipedia ${wikipedia_source} \
			
 
				+               --sink ${DATA_DIR_PHASE1} \
			
 
				+               --num-blocks ${num_blocks} \
			
 
				+               --sample-ratio ${sample_ratio} \
			
 
				+               ${masking_flag} \
			
 
				+               --seed ${seed}"
			
 
				+   echo "Running ${preprocess_cmd} ..."
			
 
				+   ${preprocess_cmd}
			
 
				+
			
 
				+   balance_load_cmd=" \
			
 
				+      mpirun \
			
 
				+         --oversubscribe \
			
 
				+         --allow-run-as-root \
			
 
				+         -np ${num_dask_workers} \
			
 
				+            balance_dask_output \
			
 
				+               --indir ${DATA_DIR_PHASE1} \
			
 
				+               --num-shards ${num_blocks}"
			
 
				+   echo "Running ${balance_load_cmd} ..."
			
 
				+   ${balance_load_cmd}
			
 
				 fi
			
 
				 if [ ! -d "$RESULTS_DIR" ] ; then
			
 
				    echo "Error! $RESULTS_DIR directory missing."
			
@@ -119,6 +181,7 @@ echo $DATA_DIR_PHASE1
 
				 INPUT_DIR=$DATA_DIR_PHASE1
			
 
				 CMD=" $CODEDIR/run_pretraining.py"
			
 
				 CMD+=" --input-dir=$DATA_DIR_PHASE1"
			
 
				+CMD+=" --vocab-file=$VOCAB_FILE"
			
 
				 CMD+=" --output-dir=$CHECKPOINTS_DIR"
			
 
				 CMD+=" $CONFIG "
			
 
				 CMD+=" --bert-model=bert-large-uncased"
			
@@ -180,11 +243,49 @@ fi
 
				 ACCUMULATE_GRADIENTS="--gradient-merge-steps=$gradient_accumulation_steps_phase2"
			
 
				 
			
 
				 
			
 
				+if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
			
 
				+   echo "Warning! ${DATA_DIR_PHASE2} directory missing."
			
 
				+   if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
			
 
				+      echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
			
 
				+      return -1
			
 
				+   fi
			
 
				+   preprocess_cmd=" \
			
 
				+      mpirun \
			
 
				+         --oversubscribe \
			
 
				+         --allow-run-as-root \
			
 
				+         -np ${num_dask_workers} \
			
 
				+         -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
			
 
				+            preprocess_bert_pretrain \
			
 
				+               --schedule mpi \
			
 
				+               --vocab-file ${VOCAB_FILE} \
			
 
				+               --wikipedia ${wikipedia_source} \
			
 
				+               --sink ${DATA_DIR_PHASE2} \
			
 
				+               --target-seq-length 512 \
			
 
				+               --num-blocks ${num_blocks} \
			
 
				+               --sample-ratio ${sample_ratio} \
			
 
				+               ${phase2_bin_size_flag} \
			
 
				+               ${masking_flag} \
			
 
				+               --seed ${seed}"
			
 
				+   echo "Running ${preprocess_cmd} ..."
			
 
				+   ${preprocess_cmd}
			
 
				+
			
 
				+   balance_load_cmd=" \
			
 
				+      mpirun \
			
 
				+         --oversubscribe \
			
 
				+         --allow-run-as-root \
			
 
				+         -np ${num_dask_workers} \
			
 
				+            balance_dask_output \
			
 
				+               --indir ${DATA_DIR_PHASE2} \
			
 
				+               --num-shards ${num_blocks}"
			
 
				+   echo "Running ${balance_load_cmd} ..."
			
 
				+   ${balance_load_cmd}
			
 
				+fi
			
 
				 echo $DATA_DIR_PHASE2
			
 
				 INPUT_DIR=$DATA_DIR_PHASE2
			
 
				 PHASE1_END_CKPT_DIR="${CHECKPOINTS_DIR}/bert-large-uncased/phase1/${train_steps}"
			
 
				 CMD=" $CODEDIR/run_pretraining.py"
			
 
				 CMD+=" --input-dir=$DATA_DIR_PHASE2"
			
 
				+CMD+=" --vocab-file=$VOCAB_FILE"
			
 
				 CMD+=" --output-dir=$CHECKPOINTS_DIR"
			
 
				 CMD+=" $CONFIG "
			
 
				 CMD+=" --bert-model=bert-large-uncased"
			
--- a/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p1.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p1.sh
@@ -15,7 +15,8 @@
 
				 python3 -m paddle.distributed.launch \
			
 
				 --gpus="0,1,2,3,4,5,6,7" \
			
 
				 ./run_pretraining.py \
			
 
				---input-dir=./data/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en \
			
 
				+--input-dir=pretrain/phase1/unbinned/parquet \
			
 
				+--vocab-file=vocab/bert-large-uncased-vocab.txt \
			
 
				 --output-dir=./results/checkpoints \
			
 
				 --bert-model=bert-large-uncased \
			
 
				 --from-checkpoint=./results/checkpoints/bert-large-uncased/phase1 \
			
--- a/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p2.sh
+++ b/PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p2.sh
@@ -15,7 +15,8 @@
 
				 python3 -m paddle.distributed.launch \
			
 
				 --gpus="0,1,2,3,4,5,6,7" \
			
 
				 ./run_pretraining.py \
			
 
				---input-dir=./data/hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en \
			
 
				+--input-dir=pretrain/phase2/bin_size_64/parquet \
			
 
				+--vocab-file=vocab/bert-large-uncased-vocab.txt \
			
 
				 --output-dir=./results/checkpoints \
			
 
				 --bert-model=bert-large-uncased \
			
 
				 --from-checkpoint=./results/checkpoints/bert-large-uncased/phase2 \
			
--- a/PaddlePaddle/LanguageModeling/BERT/utils/config.py
+++ b/PaddlePaddle/LanguageModeling/BERT/utils/config.py
@@ -157,6 +157,7 @@ def add_global_args(parser, task=Task.pretrain):
 
				             required=True,
			
 
				             help='The input data directory. Should be specified by users and contain .hdf5 files for the task.'
			
 
				         )
			
 
				+        group.add_argument('--num-workers', default=4, type=int)
			
 
				     if task == Task.squad:
			
 
				         group.add_argument(
			
 
				             '--train-file',
			
@@ -169,12 +170,6 @@ def add_global_args(parser, task=Task.pretrain):
 
				             default=None,
			
 
				             help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json'
			
 
				         )
			
 
				-        group.add_argument(
			
 
				-            '--vocab-file',
			
 
				-            type=str,
			
 
				-            default=None,
			
 
				-            required=True,
			
 
				-            help="Vocabulary mapping/file BERT was pretrainined on")
			
 
				         group.add_argument(
			
 
				             "--eval-script",
			
 
				             help="Script to evaluate squad predictions",
			
@@ -186,6 +181,12 @@ def add_global_args(parser, task=Task.pretrain):
 
				             default=3,
			
 
				             help='The number of epochs for training.')
			
 
				 
			
 
				+    group.add_argument(
			
 
				+        '--vocab-file',
			
 
				+        type=str,
			
 
				+        default=None,
			
 
				+        required=True,
			
 
				+        help="Vocabulary mapping/file BERT was pretrainined on")
			
 
				     group.add_argument(
			
 
				         '--output-dir',
			
 
				         type=str,