
[BERT/Paddle] Update base image and integrate cuDNN fused MHA

Shijie Wang 2 years ago
parent
commit
a5388a45f7

+ 1 - 1
PaddlePaddle/LanguageModeling/BERT/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:22.12-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.06-py3
 FROM ${FROM_IMAGE_NAME}
 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract


+ 39 - 25
PaddlePaddle/LanguageModeling/BERT/README.md

@@ -437,6 +437,7 @@ Advanced Training:
   --use-dynamic-loss-scaling
                         Enable dynamic loss scaling in AMP training, only applied when --amp is set. (default: False)
   --use-pure-fp16       Enable pure FP16 training, only applied when --amp is set. (default: False)
+  --fuse-mha            Enable multihead attention fusion. Requires cuDNN version >= 8.9.1.
 ```
 
 
 
 
@@ -463,6 +464,7 @@ Default arguments are listed below in the order `scripts/run_squad.sh` expects:
 -   Enable benchmark - The default is `false`.
 -   Benchmark steps - The default is `100`.
 -   Benchmark warmup steps - The default is `100`.
+-   Enable MHA fusion - The default is `true`.
 
 
 The script saves the final checkpoint to the `/results/bert-large-uncased/squad` folder.
 
 
@@ -593,7 +595,8 @@ bash run_pretraining.sh \
     <bert_config_file> \
     <enable_benchmark> \
     <benchmark_steps> \
-    <benchmark_warmup_steps>
+    <benchmark_warmup_steps> \
+    <fuse_mha>
 ```
 
 
 Where:
@@ -627,6 +630,7 @@ Where:
 -   `masking` LDDL supports both static and dynamic masking. Refer to [LDDL's README](https://github.com/NVIDIA/LDDL/blob/main/README.md) for more information.
 -   `<bert_config_file>` is the path to the BERT config file.
 -   `<enable_benchmark>` a flag to enable benchmarking. The train process will warm up for `<benchmark_warmup_steps>` and then measure the throughput of the following `<benchmark_steps>`.
+-   `<fuse_mha>` a flag to enable cuDNN MHA fusion.
 
 
 Note that:
 - If users follow [Quick Start Guide](#quick-start-guide) to set up container and dataset, there is no need to set any parameters. For example:
@@ -670,6 +674,7 @@ python3 -m paddle.distributed.launch \
     --max-predictions-per-seq=20 \
     --gradient-merge-steps=32 \
     --amp \
+    --fuse-mha \
     --use-dynamic-loss-scaling \
     --optimizer=Lamb \
     --phase1 \
@@ -769,7 +774,8 @@ bash scripts/run_squad.sh \
     <max_steps> \
     <enable_benchmark> \
     <benchmark_steps> \
-    <benchmark_warmup_steps>
+    <benchmark_warmup_steps> \
+    <fuse_mha>
 ```
  
  
 By default, the `mode` argument is set to `train eval`. Refer to the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
@@ -812,7 +818,7 @@ bash scripts/run_pretraining.sh \
     None \
     /path/to/wikipedia/source \
     32 128 4 0.9 64 static \
-    None true 10 10
+    None true 10 10 true
 ```
 
 
 To benchmark the training performance on a specific batch size for SQuAD, refer to [Fine-tuning](#fine-tuning) and turn on the `<benchmark>` flags. An example call to run training for 200 steps (100 steps for warmup and 100 steps to measure), and generate throughput numbers:
@@ -825,7 +831,7 @@ bash scripts/run_squad.sh \
     results/checkpoints \
     train \
     bert_configs/bert-large-uncased.json \
-    -1 true 100 100
+    -1 true 100 100 true
 ```
  
  
 #### Inference performance benchmark
@@ -841,7 +847,8 @@ bash scripts/run_squad.sh \
     <results directory> \
     eval \
     <BERT config path> \
-    <max steps> <benchmark> <benchmark_steps> <benchmark_warmup_steps>
+    <max steps> <benchmark> <benchmark_steps> <benchmark_warmup_steps> \
+    <fuse_mha>
 ```
  
  
 An example call to run inference and generate throughput numbers:
@@ -854,7 +861,7 @@ bash scripts/run_squad.sh \
     results/checkpoints \
     eval \
     bert_configs/bert-large-uncased.json \
-    -1 true 100 100
+    -1 true 100 100 true
 ```
 
 
  
  
@@ -870,7 +877,7 @@ Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run
 
 
 | DGX System         | GPUs / Node | Precision | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) |     Final Loss    | Time to train(hours) | Time to train speedup (TF32 to mixed precision) |
 |--------------------|-------------|-----------|----------------------------------------------------|------------------------------------------|-------------------|----------------------|-------------------------------------------------|
-| 32 x DGX A100 80GB | 8           | AMP       | 256 and 128                                        | 1 and 4                                  |       1.409       |    ~ 1.2 hours       | 1.72                                            |
+| 32 x DGX A100 80GB | 8           | AMP       | 256 and 128                                        | 1 and 4                                  |       1.409       |    ~ 1.1 hours       | 2.27                                            |
 | 32 x DGX A100 80GB | 8           | TF32      | 128 and 16b                                        | 2 and 8                                  |       1.421       |    ~ 2.5 hours       | 1                                               |
 
 
 
 
@@ -914,28 +921,28 @@ Our results were obtained by running the script `run_pretraining.sh` in the Padd
 
 
 | GPUs | Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
 |------|----------------------------------|------------------------------------|-----------------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------|
-| 1    | 8192 and 8192                    | 64 and 32                          | 128             |  307                             |   633                                       | 2.06                                        | 1.00                | 1.00                           |
-| 8    | 8192 and 8192                    | 64 and 32                          | 128             | 2428                             |  4990                                       | 2.06                                        | 7.91                | 7.88                           |
-| 1    | 4096 and 4096                    | 256 and 128                        | 512             |  107                             |   219                                       | 2.05                                        | 1.00                | 1.00                           |
-| 8    | 4096 and 4096                    | 256 and 128                        | 512             |  851                             |  1724                                       | 2.26                                        | 7.95                | 7.87                           |
+| 1    | 8192 and 8192                    | 64 and 32                          | 128             |  307                             |   694                                       | 2.26                                        | 1.00                | 1.00                           |
+| 8    | 8192 and 8192                    | 64 and 32                          | 128             | 2428                             |  5541                                       | 2.28                                        | 7.91                | 7.98                           |
+| 1    | 4096 and 4096                    | 256 and 128                        | 512             |  107                             |   264                                       | 2.47                                        | 1.00                | 1.00                           |
+| 8    | 4096 and 4096                    | 256 and 128                        | 512             |  851                             |  2109                                       | 2.48                                        | 7.95                | 7.99                           |
 
 
 
 
 ###### Pre-training NVIDIA DGX A100 (8x A100 80GB) Multi-node Scaling
 
 
 | Nodes | GPUs / node | Batch size / GPU (TF32 and FP16) | Accumulated Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Mixed Precision Throughput | Mixed Precision Strong Scaling | TF32 Throughput | TF32 Strong Scaling | Speedup (Mixed Precision to TF32) |
 |-------|-------------|----------------------------------|------------------------------------|-----------------|----------------------------|--------------------------------|-----------------|---------------------|-----------------------------------|-----|
-| 1     | 8           | 126 and 256 | 8192 and 8192                    | 64 and 32             | 128             |   4990               | 1                              |   2428          |  1                  |  2.06               |
-| 2     | 8           | 126 and 256 | 4096 and 4096                    | 32 and 16             | 128             |   9581               | 1.92                           |   4638          |  1.91               |  2.07               |
-| 4     | 8           | 126 and 256 | 2048 and 2048                    | 16 and 8              | 128             |   19262              | 3.86                           |   9445          |  3.89               |  2.04               |
-| 8     | 8           | 126 and 256 | 1024 and 1024                    | 8 and 4               | 128             |   37526              | 7.52                           |   18335         |  7.55               |  2.05               |
-| 16    | 8           | 126 and 256 | 512 and 512                      | 4 and 2               | 128             |   71156              | 14.26                          |   35526         |  14.63              |  2.00               |
-| 32    | 8           | 126 and 256 | 256 and 256                      | 2 and 1               | 128             |   142087             | 28.47                          |   69701         |  28.71              |  2.04               |
-| 1     | 8           | 16  and 32  | 4096 and 4096                    | 256 and 128           | 512             |   1724               | 1                              |   851           |  1                  |  2.03               |
-| 2     | 8           | 16  and 32  | 2048 and 2048                    | 128 and 64            | 512             |   3305               | 1.92                           |   1601          |  1.88               |  2.06               |
-| 4     | 8           | 16  and 32  | 1024 and 1024                    | 64 and 32             | 512             |   6492               | 3.77                           |   3240          |  3.81               |  2.00               |
-| 8     | 8           | 16  and 32  | 512 and 512                      | 32 and 16             | 512             |   12884              | 7.47                           |   6329          |  7.44               |  2.04               |
-| 16    | 8           | 16  and 32  | 256 and 256                      | 16 and 8              | 512             |   25493              | 14.79                          |   12273         |  14.42              |  2.08               |
-| 32    | 8           | 16  and 32  | 128 and 128                      | 8 and 4               | 512             |   49307              | 28.60                          |   24047         |  28.26              |  2.05               |
+| 1     | 8           | 126 and 256 | 8192 and 8192                    | 64 and 32             | 128             |   5541               | 1                              |   2428          |  1                  |  2.28               |
+| 2     | 8           | 126 and 256 | 4096 and 4096                    | 32 and 16             | 128             |   10646              | 1.92                           |   4638          |  1.91               |  2.29               |
+| 4     | 8           | 126 and 256 | 2048 and 2048                    | 16 and 8              | 128             |   21389              | 3.86                           |   9445          |  3.89               |  2.26               |
+| 8     | 8           | 126 and 256 | 1024 and 1024                    | 8 and 4               | 128             |   41681              | 7.52                           |   18335         |  7.55               |  2.27               |
+| 16    | 8           | 126 and 256 | 512 and 512                      | 4 and 2               | 128             |   79023              | 14.26                          |   35526         |  14.63              |  2.22               |
+| 32    | 8           | 126 and 256 | 256 and 256                      | 2 and 1               | 128             |   157952             | 28.51                          |   69701         |  28.71              |  2.27               |
+| 1     | 8           | 16  and 32  | 4096 and 4096                    | 256 and 128           | 512             |   2109               | 1                              |   851           |  1                  |  2.48               |
+| 2     | 8           | 16  and 32  | 2048 and 2048                    | 128 and 64            | 512             |   4051               | 1.92                           |   1601          |  1.88               |  2.53               |
+| 4     | 8           | 16  and 32  | 1024 and 1024                    | 64 and 32             | 512             |   7972               | 3.78                           |   3240          |  3.81               |  2.46               |
+| 8     | 8           | 16  and 32  | 512 and 512                      | 32 and 16             | 512             |   15760              | 7.47                           |   6329          |  7.44               |  2.49               |
+| 16    | 8           | 16  and 32  | 256 and 256                      | 16 and 8              | 512             |   31129              | 14.76                          |   12273         |  14.42              |  2.54               |
+| 32    | 8           | 16  and 32  | 128 and 128                      | 8 and 4               | 512             |   60206              | 28.55                          |   24047         |  28.26              |  2.50               |
 
 
 
 
 ###### Fine-tuning NVIDIA DGX A100 (8x A100 80GB)
@@ -944,8 +951,8 @@ Our results were obtained by running the script `run_pretraining.sh` in the Padd
   
   
 | GPUs | Batch size / GPU (TF32 and FP16) | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
 |------|----------------------------------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------|
-| 1    | 32 and 32                        |          83                      |               120                           |               1.45                          | 1.00                | 1.00                           |
-| 8    | 32 and 32                        |         629                      |               876                           |               1.39                          | 7.59                | 7.30                           |
+| 1    | 32 and 32                        |          83                      |               123                           |               1.48                          | 1.00                | 1.00                           |
+| 8    | 32 and 32                        |         629                      |               929                           |               1.48                          | 7.59                | 7.55                           |
  
  
 #### Inference performance results
 
 
@@ -983,6 +990,13 @@ August 2022
 - SQuAD finetune support with AdamW optimizer.
 - Updated accuracy and performance tables tested on A100.
 - Initial release.
+
+March 2023
+- Pre-training using [Language Datasets and Data Loaders (LDDL)](https://github.com/NVIDIA/LDDL)
+- Binned pretraining for phase2 with LDDL using a bin size of 64
+
+July 2023
+- Optimize AMP training with cuDNN fused dot product attention kernel.
  
  
 ### Known issues
  
  

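The `--fuse-mha` switch documented in the README diff above behaves like an ordinary boolean flag that only takes effect alongside `--amp`. A minimal argparse sketch of that behavior (hypothetical; the real parser lives in `utils/config.py` and may differ in help text and grouping):

```python
import argparse

# Hypothetical sketch of the two switches; argparse maps "--fuse-mha"
# to the attribute name "fuse_mha".
parser = argparse.ArgumentParser()
parser.add_argument("--amp", action="store_true",
                    help="Enable automatic mixed precision training.")
parser.add_argument("--fuse-mha", action="store_true",
                    help="Enable multihead attention fusion (requires cuDNN >= 8.9.1).")

args = parser.parse_args(["--amp", "--fuse-mha"])
print(args.amp, args.fuse_mha)  # True True
```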
+ 2 - 1
PaddlePaddle/LanguageModeling/BERT/modeling.py

@@ -172,7 +172,8 @@ class BertModel(nn.Layer):
                 dropout=bert_config.hidden_dropout_prob,
                 activation=bert_config.hidden_act,
                 attn_dropout=bert_config.attention_probs_dropout_prob,
-                act_dropout=0)
+                act_dropout=0,
+                fuse_qkv=bert_config.fuse_mha)
             self.encoder = nn.TransformerEncoder(encoder_layer,
                                                  bert_config.num_hidden_layers)
 
 

+ 11 - 6
PaddlePaddle/LanguageModeling/BERT/program.py

@@ -44,12 +44,12 @@ def create_pretraining_data_holder():
     ]
 
 
 
 
-def create_strategy(use_amp, use_distributed_fused_lamb=False):
+def create_strategy(args, use_distributed_fused_lamb=False):
     """
     """
     Create paddle.static.BuildStrategy and paddle.static.ExecutionStrategy with arguments.
     Create paddle.static.BuildStrategy and paddle.static.ExecutionStrategy with arguments.
 
 
     Args:
     Args:
-        use_amp(bool): Whether to use amp.
+        args(Namespace): Arguments obtained from ArgumentParser.
         use_distributed_fused_lamb(bool, optional): Whether to use distributed fused lamb.
     Returns:
         build_strategy(paddle.static.BuildStrategy): An instance of BuildStrategy.
@@ -59,8 +59,9 @@ def create_strategy(use_amp, use_distributed_fused_lamb=False):
     exec_strategy = paddle.static.ExecutionStrategy()

     build_strategy.enable_addto = True
-    if use_amp:
+    if args.amp:
         build_strategy.fuse_gemm_epilogue = True
+        build_strategy.fuse_dot_product_attention = args.fuse_mha
 
 
     if use_distributed_fused_lamb:
         build_strategy.fuse_all_reduce_ops = False
@@ -86,7 +87,7 @@ def dist_optimizer(args, optimizer):
         optimizer(fleet.distributed_optimizer): A distributed optimizer.
     """
     use_distributed_fused_lamb = True if args.optimizer == 'DistributedFusedLamb' else False
-    build_strategy, exec_strategy = create_strategy(args.amp,
+    build_strategy, exec_strategy = create_strategy(args,
                                                     use_distributed_fused_lamb)
     dist_strategy = fleet.DistributedStrategy()
 
 
@@ -160,6 +161,7 @@ def build(args, main_prog, startup_prog, is_train=True):
             bert_config = BertConfig.from_json_file(args.config_file)
             if bert_config.vocab_size % 8 != 0:
                 bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
+            bert_config.fuse_mha = args.fuse_mha
             model = BertForPretraining(bert_config)
             criterion = BertPretrainingCriterion(bert_config.vocab_size)
             prediction_scores, seq_relationship_score = model(
@@ -224,6 +226,7 @@ def run(exe,
     logging.info(f"Training will start at the {last_step+1}th step")
 
 
     max_steps = args.max_steps
+    steps_this_run = max_steps
     if args.steps_this_run is not None:
         if args.steps_this_run + last_step > max_steps:
             logging.info(
@@ -231,12 +234,14 @@ def run(exe,
             )
         else:
             steps_this_run = args.steps_this_run
-            if args.benchmark:
-                steps_this_run = min(steps_this_run, args.benchmark_warmup_steps + args.benchmark_steps)
             max_steps = steps_this_run + last_step
             logging.warning(
                 f"{steps_this_run} steps will be performed in this run.")
 
 
+    if args.benchmark:
+        max_steps = args.benchmark_warmup_steps + args.benchmark_steps + last_step
+
+
     total_samples = 0
     raw_train_start = time.time()
     step_start = time.time()

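The step-accounting change in `program.py` above moves the benchmark clamp out of the `else` branch, so benchmark mode now overrides `max_steps` unconditionally. A standalone sketch of the resulting logic (hypothetical helper name; the real code mutates locals inline inside `run`):

```python
def resolve_max_steps(max_steps, last_step, steps_this_run=None,
                      benchmark=False, benchmark_steps=0,
                      benchmark_warmup_steps=0):
    # An explicit --steps-this-run shortens the run if it fits within
    # max_steps; benchmark mode then overrides max_steps unconditionally,
    # mirroring the moved "if args.benchmark" block in the diff.
    if steps_this_run is not None and steps_this_run + last_step <= max_steps:
        max_steps = steps_this_run + last_step
    if benchmark:
        max_steps = benchmark_warmup_steps + benchmark_steps + last_step
    return max_steps

# A benchmark run of 100 warmup + 100 measured steps stops at step 200.
print(resolve_max_steps(1000, 0, benchmark=True,
                        benchmark_steps=100, benchmark_warmup_steps=100))  # 200
```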
+ 2 - 1
PaddlePaddle/LanguageModeling/BERT/run_pretraining.py

@@ -81,7 +81,8 @@ def main():
         log_dir=None if args.output_dir is None else
         os.path.join(args.output_dir, 'lddl_log'),
         log_level=logging.WARNING,
-        start_epoch=0 if progress is None else progress.get("epoch", 0), )
+        start_epoch=0 if progress is None else progress.get("epoch", 0),
+        sequence_length_alignment=64)
 
 
     if args.amp:
         optimizer.amp_init(device)

+ 2 - 0
PaddlePaddle/LanguageModeling/BERT/run_squad.py

@@ -186,9 +186,11 @@ def main(args):
 
 
     with paddle.static.program_guard(main_program, startup_program):
         bert_config = BertConfig.from_json_file(args.config_file)
+        bert_config.fuse_mha = args.fuse_mha
         if bert_config.vocab_size % 8 != 0:
             bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
 
 
+
         model = BertForQuestionAnswering(bert_config)
         criterion = CrossEntropyLossForSQuAD()
         logits = model(input_ids=input_ids, token_type_ids=segment_ids)

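Both `program.py` and `run_squad.py` pad `vocab_size` up to a multiple of 8 before building the model (a Tensor Core friendly shape). The padding arithmetic in isolation:

```python
def pad_vocab_size(vocab_size, multiple=8):
    # Round vocab_size up to the next multiple of 8, exactly as the
    # "vocab_size += 8 - (vocab_size % 8)" lines in the diff do.
    if vocab_size % multiple != 0:
        vocab_size += multiple - (vocab_size % multiple)
    return vocab_size

print(pad_vocab_size(30522))  # 30528
```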
+ 6 - 0
PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining.sh

@@ -54,6 +54,7 @@ BERT_CONFIG=${28:-"None"}
 enable_benchmark=${29:-"false"}
 benchmark_steps=${30:-"10"}
 benchmark_warmup_steps=${31:-"10"}
+fuse_mha=${32:-"true"}
 
 
 # Calculate the total number of shards.
 readonly num_blocks=$((num_shards_per_worker * $(( num_workers > 0 ? num_workers : 1 )) * num_nodes * num_gpus))
@@ -130,8 +131,12 @@ if [ "$BERT_CONFIG" != "None" ] ; then
 fi

 PREC=""
+FUSE_MHA=""
 if [ "$precision" = "amp" ] ; then
    PREC="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
+   if [ "$fuse_mha" = "true" ] ; then
+      FUSE_MHA="--fuse-mha"
+   fi
 elif [ "$precision" = "fp32" ] ; then
    PREC=""
 elif [ "$precision" = "tf32" ] ; then
@@ -197,6 +202,7 @@ CMD+=" --log-freq=1"
 CMD+=" --optimizer=Lamb"
 CMD+=" --phase1"
 CMD+=" $PREC"
+CMD+=" $FUSE_MHA"
 CMD+=" $ACCUMULATE_GRADIENTS"
 CMD+=" $INIT_CHECKPOINT"
 CMD+=" $BENCH"

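The flag plumbing added to `scripts/run_pretraining.sh` can be exercised on its own: the 32nd positional argument defaults to `"true"`, and `--fuse-mha` is only emitted for AMP runs. A sketch with the inputs hard-coded for illustration:

```shell
# Values hard-coded here for illustration; in the real script,
# precision is a positional argument and fuse_mha=${32:-"true"}.
precision="amp"
fuse_mha="true"

FUSE_MHA=""
if [ "$precision" = "amp" ] ; then
   if [ "$fuse_mha" = "true" ] ; then
      FUSE_MHA="--fuse-mha"
   fi
fi
# Appended to the launch command as CMD+=" $FUSE_MHA"; empty for
# fp32/tf32 runs, so the flag never reaches non-AMP training.
echo "FUSE_MHA=$FUSE_MHA"
```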
+ 1 - 0
PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p1.sh

@@ -31,6 +31,7 @@ python3 -m paddle.distributed.launch \
 --amp \
 --use-dynamic-loss-scaling \
 --optimizer=Lamb \
+--fuse-mha \
 --phase1 \
 --scale-loss=1048576 \
 --learning-rate=6e-3 \

+ 1 - 0
PaddlePaddle/LanguageModeling/BERT/scripts/run_pretraining_p2.sh

@@ -32,6 +32,7 @@ python3 -m paddle.distributed.launch \
 --amp \
 --use-dynamic-loss-scaling \
 --optimizer=Lamb \
+--fuse-mha \
 --phase2 \
 --scale-loss=1048576 \
 --learning-rate=4e-3 \

+ 6 - 0
PaddlePaddle/LanguageModeling/BERT/scripts/run_squad.sh

@@ -31,6 +31,7 @@ max_steps=${14:-"-1"}
 enable_benchmark=${15:-"false"}
 benchmark_steps=${16:-"100"}
 benchmark_warmup_steps=${17:-"100"}
+fuse_mha=${18:-"true"}
 
 
 
 
 echo "out dir is $OUT_DIR"
@@ -41,9 +42,13 @@ if [ ! -d "$OUT_DIR" ]; then
 fi

 amp=""
+FUSE_MHA=""
 if [ "$precision" = "amp" ] ; then
   echo "amp activated!"
   amp=" --amp --use-dynamic-loss-scaling --scale-loss=128.0"
+  if [ "$fuse_mha" = "true" ] ; then
+    FUSE_MHA="--fuse-mha"
+  fi
 fi

 CONFIG=""
@@ -119,6 +124,7 @@ CMD+=" --max-steps=$max_steps "
 CMD+=" --optimizer=AdamW "
 CMD+=" --log-freq=100 "
 CMD+=" $amp "
+CMD+=" $FUSE_MHA "
 CMD+=" $BENCH "
 CMD+=" $BENCH "
 CMD+=" --report-file $OUT_DIR/dllogger_${num_gpus}_${precision}.json "
 CMD+=" --report-file $OUT_DIR/dllogger_${num_gpus}_${precision}.json "
 
 

+ 196 - 101
PaddlePaddle/LanguageModeling/BERT/utils/config.py

@@ -18,6 +18,7 @@ import argparse
 import distutils.util
 import logging
 import dllogger
+import paddle
 from utils.task import Task
 from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX, _PROGRESS_SUFFIX
 
@@ -27,7 +28,7 @@ _DEFAULT_BERT_CONFIG = {
     'bert-large-uncased': './bert_configs/bert-large-uncased.json',
     'bert-large-cased': './bert_configs/bert-large-cased.json',
     'bert-base-uncased': './bert_configs/bert-base-uncased.json',
-    'bert-base-cased': './bert_configs/bert-base-cased.json'
+    'bert-base-cased': './bert_configs/bert-base-cased.json',
 }
 
 
@@ -41,28 +42,34 @@ def _get_full_path_of_ckpt(args):
         pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
         progress_path = path_with_prefix + _PROGRESS_SUFFIX
         found = False
-        if os.path.exists(pdopt_path) and os.path.exists(
-                pdparams_path) and os.path.exists(progress_path):
+        if (
+            os.path.exists(pdopt_path)
+            and os.path.exists(pdparams_path)
+            and os.path.exists(progress_path)
+        ):
             found = True
         return found, pdopt_path, pdparams_path, progress_path
 
     if not os.path.exists(args.from_checkpoint):
         logging.warning(
-            f"Start training from scratch since no checkpoint is found.")
+            f"Start training from scratch since no checkpoint is found."
+        )
         args.from_checkpoint = None
         args.last_step_of_checkpoint = 0
         return
 
-    target_from_checkpoint = os.path.join(args.from_checkpoint,
-                                          args.model_prefix)
+    target_from_checkpoint = os.path.join(
+        args.from_checkpoint, args.model_prefix
+    )
     if args.last_step_of_checkpoint is None:
         args.last_step_of_checkpoint = 0
     elif args.last_step_of_checkpoint == _AUTO_LAST_EPOCH:
         folders = os.listdir(args.from_checkpoint)
         args.last_step_of_checkpoint = 0
         for folder in folders:
-            tmp_ckpt_path = os.path.join(args.from_checkpoint, folder,
-                                         args.model_prefix)
+            tmp_ckpt_path = os.path.join(
+                args.from_checkpoint, folder, args.model_prefix
+            )
 
             try:
                 folder = int(folder)
@@ -72,23 +79,32 @@ def _get_full_path_of_ckpt(args):
                 )
                 continue
 
-            if folder > args.last_step_of_checkpoint and \
-                _check_file_exist(tmp_ckpt_path)[0]:
+            if (
+                folder > args.last_step_of_checkpoint
+                and _check_file_exist(tmp_ckpt_path)[0]
+            ):
                 args.last_step_of_checkpoint = folder
-        step_with_prefix = os.path.join(str(args.last_step_of_checkpoint), args.model_prefix) \
-                            if args.last_step_of_checkpoint > 0 else args.model_prefix
-        target_from_checkpoint = os.path.join(args.from_checkpoint,
-                                              step_with_prefix)
+        step_with_prefix = (
+            os.path.join(str(args.last_step_of_checkpoint), args.model_prefix)
+            if args.last_step_of_checkpoint > 0
+            else args.model_prefix
+        )
+        target_from_checkpoint = os.path.join(
+            args.from_checkpoint, step_with_prefix
+        )
     else:
         try:
             args.last_step_of_checkpoint = int(args.last_step_of_checkpoint)
         except ValueError:
-            raise ValueError(f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}"  \
-                            f" or integer >= 0, but receive {args.last_step_of_checkpoint}")
+            raise ValueError(
+                f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}"
+                f" or integer >= 0, but receive {args.last_step_of_checkpoint}"
+            )
 
     args.from_checkpoint = target_from_checkpoint
     found, pdopt_path, pdparams_path, progress_path = _check_file_exist(
-        args.from_checkpoint)
+        args.from_checkpoint
+    )
     if not found:
         args.from_checkpoint = None
         args.last_step_of_checkpoint = 0
@@ -98,19 +114,28 @@ def _get_full_path_of_ckpt(args):
 
 
 def _get_full_path_of_pretrained_params(args, task=Task.pretrain):
-    if args.from_pretrained_params is None and args.from_phase1_final_params is None:
+    if (
+        args.from_pretrained_params is None
+        and args.from_phase1_final_params is None
+    ):
         args.last_step_of_checkpoint = 0
         return
-    if task == Task.pretrain and args.from_phase1_final_params is not None and args.last_step_of_checkpoint == 0:
+    if (
+        task == Task.pretrain
+        and args.from_phase1_final_params is not None
+        and args.last_step_of_checkpoint == 0
+    ):
         args.from_pretrained_params = args.from_phase1_final_params
 
-    args.from_pretrained_params = os.path.join(args.from_pretrained_params,
-                                               args.model_prefix)
+    args.from_pretrained_params = os.path.join(
+        args.from_pretrained_params, args.model_prefix
+    )
     pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
     if not os.path.exists(pdparams_path):
         args.from_pretrained_params = None
         logging.warning(
-            f"Cannot find {pdparams_path}, disable --from-pretrained-params.")
+            f"Cannot find {pdparams_path}, disable --from-pretrained-params."
+        )
     args.last_step_of_checkpoint = 0
 
 
@@ -121,20 +146,28 @@ def print_args(args):
 
 
 def check_and_process_args(args, task=Task.pretrain):
     if task == Task.pretrain:
-        assert not (args.from_checkpoint is not None and \
-            args.from_pretrained_params is not None), \
-           "--from-pretrained-params and --from-checkpoint should " \
-           "not be set simultaneously."
-        assert not (args.phase1 and args.phase2), \
-            "--phase1 and --phase2 should not be set simultaneously in bert pretraining."
+        assert not (
+            args.from_checkpoint is not None
+            and args.from_pretrained_params is not None
+        ), (
+            "--from-pretrained-params and --from-checkpoint should "
+            "not be set simultaneously."
+        )
+        assert not (
+            args.phase1 and args.phase2
+        ), "--phase1 and --phase2 should not be set simultaneously in bert pretraining."
         if args.from_phase1_final_params is not None:
-            assert args.phase2, "--from-phase1-final-params should only be used in phase2"
+            assert (
+                args.phase2
+            ), "--from-phase1-final-params should only be used in phase2"
 
         # SQuAD finetuning does not support suspend-resume yet.(TODO)
         _get_full_path_of_ckpt(args)
 
     if args.bert_model == 'custom':
-        assert args.config_file is not None, "--config-file must be specified if --bert-model=custom"
+        assert (
+            args.config_file is not None
+        ), "--config-file must be specified if --bert-model=custom"
     elif args.config_file is None:
         args.config_file = _DEFAULT_BERT_CONFIG[args.bert_model]
         logging.info(
@@ -144,7 +177,19 @@ def check_and_process_args(args, task=Task.pretrain):
         _get_full_path_of_pretrained_params(args, task)
 
     assert os.path.isfile(
-        args.config_file), f"Cannot find config file in {args.config_file}"
+        args.config_file
+    ), f"Cannot find config file in {args.config_file}"
+
+    # cudnn mha fusion is only supported after v8.9.1 on Ampere and Hopper GPU
+    device_capability = paddle.device.cuda.get_device_capability()
+    cudnn_mha_supported = paddle.get_cudnn_version() >= 8901 and (
+        device_capability == (8, 0) or device_capability == (9, 0)
+    )
+    if (not cudnn_mha_supported or args.amp is False) and args.fuse_mha is True:
+        logging.info(
+            f"cudnn mha fusion is not supported, fall back to unfused mha"
+        )
+        args.fuse_mha = False
 
 
 def add_global_args(parser, task=Task.pretrain):
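The new gate in `check_and_process_args` can be factored into a pure predicate for illustration (a sketch with hypothetical helper names; the actual code queries `paddle.get_cudnn_version()` and `paddle.device.cuda.get_device_capability()` at runtime):

```python
def cudnn_mha_supported(cudnn_version, capability):
    # cuDNN encodes version 8.9.1 as the integer 8901; fused MHA also
    # requires an Ampere (compute capability 8.0) or Hopper (9.0) GPU.
    return cudnn_version >= 8901 and capability in ((8, 0), (9, 0))

def resolve_fuse_mha(fuse_mha, amp, cudnn_version, capability):
    # --fuse-mha silently falls back to the unfused path unless AMP is
    # enabled and the cuDNN/GPU combination qualifies.
    if fuse_mha and (not amp or not cudnn_mha_supported(cudnn_version, capability)):
        return False
    return fuse_mha
```

The key consequence for users: passing `--fuse-mha` without `--amp`, or on a pre-Ampere GPU or cuDNN older than 8.9.1, does not error out; it logs a message and disables the fusion.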
@@ -155,145 +200,165 @@ def add_global_args(parser, task=Task.pretrain):
             type=str,
             default=None,
             required=True,
-            help='The input data directory. Should be specified by users and contain .hdf5 files for the task.'
+            help='The input data directory. Should be specified by users and contain .hdf5 files for the task.',
         )
-        group.add_argument('--num-workers', default=4, type=int)
+        group.add_argument('--num-workers', default=1, type=int)
     if task == Task.squad:
         group.add_argument(
             '--train-file',
             type=str,
             default=None,
-            help='SQuAD json for training. E.g., train-v1.1.json')
+            help='SQuAD json for training. E.g., train-v1.1.json',
+        )
         group.add_argument(
             '--predict-file',
             type=str,
             default=None,
-            help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json'
+            help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json',
         )
         group.add_argument(
             "--eval-script",
             help="Script to evaluate squad predictions",
             default="evaluate.py",
-            type=str)
+            type=str,
+        )
         group.add_argument(
             '--epochs',
             type=int,
             default=3,
-            help='The number of epochs for training.')
+            help='The number of epochs for training.',
+        )
 
     group.add_argument(
         '--vocab-file',
         type=str,
         default=None,
         required=True,
-        help="Vocabulary mapping/file BERT was pretrainined on")
+        help="Vocabulary mapping/file BERT was pretrainined on",
+    )
     group.add_argument(
         '--output-dir',
         type=str,
         default=None,
         required=True,
-        help='The output directory where the model checkpoints will be written. Should be specified by users.'
+        help='The output directory where the model checkpoints will be written. Should be specified by users.',
     )
     group.add_argument(
         '--bert-model',
         type=str,
         default='bert-large-uncased',
-        choices=('bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
-                 'bert-large-cased', 'custom'),
+        choices=(
+            'bert-base-uncased',
+            'bert-base-cased',
+            'bert-large-uncased',
+            'bert-large-cased',
+            'custom',
+        ),
         help='Specifies the type of BERT model to use. If it is set as custom, '
-        'the path to the config file must be given by specifying --config-file')
+        'the path to the config file must be given by specifying --config-file',
+    )
     group.add_argument(
         '--config-file',
         type=str,
         default=None,
-        help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.'
+        help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.',
     )
     group.add_argument(
         '--max-steps',
         type=int,
         default=None,
         required=True if task == Task.pretrain else False,
-        help='Total number of training steps to perform.')
+        help='Total number of training steps to perform.',
+    )
     group.add_argument(
-        '--log-freq', type=int, default=10, help='Frequency of logging loss.')
+        '--log-freq', type=int, default=10, help='Frequency of logging loss.'
+    )
     group.add_argument(
         '--num-steps-per-checkpoint',
         type=int,
         default=100,
-        help='Number of update steps until a model checkpoint is saved to disk.'
+        help='Number of update steps until a model checkpoint is saved to disk.',
     )
     # Init model
     group.add_argument(
         '--from-pretrained-params',
         type=str,
         default=None,
-        help='Path to pretrained parameters. If set to None, no pretrained params will be used.'
+        help='Path to pretrained parameters. If set to None, no pretrained params will be used.',
     )
     group.add_argument(
         '--from-checkpoint',
         type=str,
         default=None,
-        help='A checkpoint path to resume training. If set to None, no checkpoint will be used. ' \
-             'If not None, --from-pretrained-params will be ignored.')
+        help='A checkpoint path to resume training. If set to None, no checkpoint will be used. '
+        'If not None, --from-pretrained-params will be ignored.',
+    )
     group.add_argument(
         '--last-step-of-checkpoint',
         type=str,
         default=None,
-        help='The step id of the checkpoint given by --from-checkpoint. ' \
-             'It should be None, auto, or integer > 0. If it is set as ' \
-             'None, then training will start from the 1-th epoch. If it is set as ' \
-             'auto, then it will search largest integer-convertable folder ' \
-             ' --from-checkpoint, which contains required checkpoint. '
+        help='The step id of the checkpoint given by --from-checkpoint. '
+        'It should be None, auto, or integer > 0. If it is set as '
+        'None, then training will start from the 1-th epoch. If it is set as '
+        'auto, then it will search largest integer-convertable folder '
+        ' --from-checkpoint, which contains required checkpoint. ',
     )
     if task == Task.pretrain:
         group.add_argument(
             '--from-phase1-final-params',
             type=str,
             default=None,
-            help='Path to final checkpoint of phase1, which will be used to ' \
-   'initialize the parameter in the first step of phase2, and ' \
-                 'ignored in the rest steps of phase2.'
+            help='Path to final checkpoint of phase1, which will be used to '
+            'initialize the parameter in the first step of phase2, and '
+            'ignored in the rest steps of phase2.',
         )
         group.add_argument(
             '--steps-this-run',
             type=int,
             default=None,
-            help='If provided, only run this many steps before exiting.' \
+            help='If provided, only run this many steps before exiting.',
         )
     group.add_argument(
-        '--seed', type=int, default=42, help="random seed for initialization")
+        '--seed', type=int, default=42, help="random seed for initialization"
+    )
     group.add_argument(
         '--report-file',
         type=str,
         default='./report.json',
-        help='A file in which to store JSON experiment report.')
+        help='A file in which to store JSON experiment report.',
+    )
     group.add_argument(
         '--model-prefix',
         type=str,
         default='bert_paddle',
-        help='The prefix name of model files to save/load.')
+        help='The prefix name of model files to save/load.',
+    )
     group.add_argument(
         '--show-config',
         type=distutils.util.strtobool,
         default=True,
-        help='To show arguments.')
+        help='To show arguments.',
+    )
     group.add_argument(
         '--enable-cpu-affinity',
         type=distutils.util.strtobool,
         default=True,
-        help='To enable in-built GPU-CPU affinity.')
+        help='To enable in-built GPU-CPU affinity.',
+    )
     group.add_argument(
-        '--benchmark', action='store_true', help='To enable benchmark mode.')
+        '--benchmark', action='store_true', help='To enable benchmark mode.'
+    )
     group.add_argument(
         '--benchmark-steps',
         type=int,
         default=20,
-        help='Steps for a benchmark run, only applied when --benchmark is set.')
+        help='Steps for a benchmark run, only applied when --benchmark is set.',
+    )
     group.add_argument(
         '--benchmark-warmup-steps',
         type=int,
         default=20,
-        help='Warmup steps for a benchmark run, only applied when --benchmark is set.'
+        help='Warmup steps for a benchmark run, only applied when --benchmark is set.',
     )
     return parser
 
@@ -305,145 +370,166 @@ def add_training_args(parser, task=Task.pretrain):
         default='Lamb',
         default='Lamb',
         metavar="OPTIMIZER",
         metavar="OPTIMIZER",
         choices=('Lamb', 'AdamW'),
         choices=('Lamb', 'AdamW'),
-        help='The name of optimizer. It should be one of {Lamb, AdamW}.')
+        help='The name of optimizer. It should be one of {Lamb, AdamW}.',
+    )
     group.add_argument(
     group.add_argument(
         '--gradient-merge-steps',
         '--gradient-merge-steps',
         type=int,
         type=int,
         default=1,
         default=1,
-        help="Number of update steps to accumualte before performing a backward/update pass."
+        help="Number of update steps to accumualte before performing a backward/update pass.",
     )
     )
     group.add_argument(
     group.add_argument(
         '--learning-rate',
         '--learning-rate',
         type=float,
         type=float,
         default=1e-4,
         default=1e-4,
-        help='The initial learning rate.')
+        help='The initial learning rate.',
+    )
     group.add_argument(
     group.add_argument(
         '--warmup-start-lr',
         '--warmup-start-lr',
         type=float,
         type=float,
         default=0.0,
         default=0.0,
-        help='The initial learning rate for warmup.')
+        help='The initial learning rate for warmup.',
+    )
     group.add_argument(
     group.add_argument(
         '--warmup-proportion',
         '--warmup-proportion',
         type=float,
         type=float,
         default=0.01,
         default=0.01,
         help='Proportion of training to perform linear learning rate warmup for. '
         help='Proportion of training to perform linear learning rate warmup for. '
-        'For example, 0.1 = 10%% of training.')
+        'For example, 0.1 = 10%% of training.',
+    )
     group.add_argument(
     group.add_argument(
         '--beta1',
         '--beta1',
         type=float,
         type=float,
         default=0.9,
         default=0.9,
-        help='The exponential decay rate for the 1st moment estimates.')
+        help='The exponential decay rate for the 1st moment estimates.',
+    )
     group.add_argument(
     group.add_argument(
         '--beta2',
         '--beta2',
         type=float,
         type=float,
         default=0.999,
         default=0.999,
-        help='The exponential decay rate for the 2st moment estimates.')
+        help='The exponential decay rate for the 2st moment estimates.',
+    )
     group.add_argument(
     group.add_argument(
         '--epsilon',
         '--epsilon',
         type=float,
         type=float,
         default=1e-6,
         default=1e-6,
-        help='A small float value for numerical stability.')
+        help='A small float value for numerical stability.',
+    )
     group.add_argument(
     group.add_argument(
         '--weight-decay',
         '--weight-decay',
         type=float,
         type=float,
         default=0.01,
         default=0.01,
-        help='The weight decay coefficient.')
+        help='The weight decay coefficient.',
+    )
     group.add_argument(
     group.add_argument(
         '--max-seq-length',
         '--max-seq-length',
         default=512,
         default=512,
         type=int,
         type=int,
         help='The maximum total input sequence length after WordPiece tokenization. \n'
         help='The maximum total input sequence length after WordPiece tokenization. \n'
         'Sequences longer than this will be truncated, and sequences shorter \n'
         'Sequences longer than this will be truncated, and sequences shorter \n'
-        'than this will be padded.')
+        'than this will be padded.',
+    )
     if task == Task.pretrain:
     if task == Task.pretrain:
         group.add_argument(
         group.add_argument(
             '--batch-size',
             '--batch-size',
             type=int,
             type=int,
             default=32,
             default=32,
-            help='The batch size for training')
+            help='The batch size for training',
+        )
         group.add_argument(
         group.add_argument(
             '--phase1',
             '--phase1',
             action='store_true',
             action='store_true',
-            help='The phase of BERT pretraining. It should not be set ' \
-                'with --phase2 at the same time.'
+            help='The phase of BERT pretraining. It should not be set '
+            'with --phase2 at the same time.',
         )
         )
         group.add_argument(
         group.add_argument(
             '--phase2',
             '--phase2',
             action='store_true',
             action='store_true',
-            help='The phase of BERT pretraining. It should not be set ' \
-                'with --phase1 at the same time.'
+            help='The phase of BERT pretraining. It should not be set '
+            'with --phase1 at the same time.',
         )
         )
         group.add_argument(
         group.add_argument(
             '--max-predictions-per-seq',
             '--max-predictions-per-seq',
             default=80,
             default=80,
             type=int,
             type=int,
-            help='The maximum total of masked tokens in the input sequence')
+            help='The maximum total of masked tokens in the input sequence',
+        )
 
 
     if task == Task.squad:
     if task == Task.squad:
         group.add_argument(
         group.add_argument(
-            "--do-train", action='store_true', help="Whether to run training.")
+            "--do-train", action='store_true', help="Whether to run training."
+        )
         group.add_argument(
         group.add_argument(
             "--do-predict",
             "--do-predict",
             action='store_true',
             action='store_true',
-            help="Whether to run eval on the dev set.")
+            help="Whether to run eval on the dev set.",
+        )
         group.add_argument(
         group.add_argument(
             "--do-eval",
             "--do-eval",
             action='store_true',
             action='store_true',
-            help="Whether to use evaluate accuracy of predictions")
+            help="Whether to use evaluate accuracy of predictions",
+        )
         group.add_argument(
         group.add_argument(
             "--train-batch-size",
             "--train-batch-size",
             default=32,
             default=32,
             type=int,
             type=int,
-            help="Total batch size for training.")
+            help="Total batch size for training.",
+        )
         group.add_argument(
         group.add_argument(
             "--predict-batch-size",
             "--predict-batch-size",
             default=8,
             default=8,
             type=int,
             type=int,
-            help="Total batch size for predictions.")
+            help="Total batch size for predictions.",
+        )
         group.add_argument(
         group.add_argument(
             "--verbose-logging",
             "--verbose-logging",
             action='store_true',
             action='store_true',
             help="If true, all of the warnings related to data processing will be printed. "
             help="If true, all of the warnings related to data processing will be printed. "
-            "A number of warnings are expected for a normal SQuAD evaluation.")
+            "A number of warnings are expected for a normal SQuAD evaluation.",
+        )
         group.add_argument(
         group.add_argument(
             "--doc-stride",
             "--doc-stride",
             default=128,
             default=128,
             type=int,
             type=int,
             help="When splitting up a long document into chunks, how much stride to take "
             help="When splitting up a long document into chunks, how much stride to take "
-            "between chunks.")
+            "between chunks.",
+        )
         group.add_argument(
         group.add_argument(
             "--max-query-length",
             "--max-query-length",
             default=64,
             default=64,
             type=int,
             type=int,
             help="The maximum number of tokens for the question. Questions longer than this "
             help="The maximum number of tokens for the question. Questions longer than this "
-            "will be truncated to this length.")
+            "will be truncated to this length.",
+        )
         group.add_argument(
         group.add_argument(
             "--n-best-size",
             "--n-best-size",
             default=20,
             default=20,
             type=int,
             type=int,
             help="The total number of n-best predictions to generate in the nbest_predictions.json "
             help="The total number of n-best predictions to generate in the nbest_predictions.json "
-            "output file.")
+            "output file.",
+        )
         group.add_argument(
         group.add_argument(
             "--max-answer-length",
             "--max-answer-length",
             default=30,
             default=30,
             type=int,
             type=int,
             help="The maximum length of an answer that can be generated. This is needed because the start "
             help="The maximum length of an answer that can be generated. This is needed because the start "
-            "and end predictions are not conditioned on one another.")
+            "and end predictions are not conditioned on one another.",
+        )
         group.add_argument(
         group.add_argument(
             "--do-lower-case",
             "--do-lower-case",
             action='store_true',
             action='store_true',
-            help="Whether to lower case the input text. True for uncased models, False for cased models."
+            help="Whether to lower case the input text. True for uncased models, False for cased models.",
         )
         group.add_argument(
             '--version-2-with-negative',
             action='store_true',
-            help='If true, the SQuAD examples contain some that do not have an answer.'
+            help='If true, the SQuAD examples contain some that do not have an answer.',
         )
         group.add_argument(
             '--null-score-diff-threshold',
             type=float,
             default=0.0,
-            help="If null_score - best_non_null is greater than the threshold predict null."
+            help="If null_score - best_non_null is greater than the threshold predict null.",
         )
     return parser

@@ -453,22 +539,29 @@ def add_advance_args(parser):
     group.add_argument(
         '--amp',
         action='store_true',
-        help='Enable automatic mixed precision training (AMP).')
+        help='Enable automatic mixed precision training (AMP).',
+    )
     group.add_argument(
         '--scale-loss',
         type=float,
         default=1.0,
-        help='The loss scalar for AMP training, only applied when --amp is set.'
+        help='The loss scalar for AMP training, only applied when --amp is set.',
     )
     group.add_argument(
         '--use-dynamic-loss-scaling',
         action='store_true',
-        help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.'
+        help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.',
     )
     group.add_argument(
         '--use-pure-fp16',
         action='store_true',
-        help='Enable pure FP16 training, only applied when --amp is set.')
+        help='Enable pure FP16 training, only applied when --amp is set.',
+    )
+    group.add_argument(
+        '--fuse-mha',
+        action='store_true',
+        help='Enable multihead attention fusion. Require cudnn version >= 8.9.1',
+    )

     return parser

@@ -476,8 +569,10 @@ def add_advance_args(parser):
 def parse_args(task=Task.pretrain):
     parser = argparse.ArgumentParser(
         description="PaddlePaddle BERT pretraining script"
-        if task == Task.pretrain else "PaddlePaddle SQuAD finetuning script",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        if task == Task.pretrain
+        else "PaddlePaddle SQuAD finetuning script",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )

     parser = add_global_args(parser, task)
     parser = add_training_args(parser, task)
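The second hunk above adds a `--fuse-mha` store-true flag to the advanced-training option group. A minimal standalone sketch of that group's behavior after the change (a hypothetical reconstruction for illustration; the repository's real `parse_args` assembles several more groups):

```python
import argparse

def build_parser():
    # Mirrors the advanced-training options shown in the diff; names and
    # defaults are taken from the diff, everything else is simplified.
    parser = argparse.ArgumentParser(
        description="PaddlePaddle BERT pretraining script",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    group = parser.add_argument_group("Advanced Training")
    group.add_argument(
        '--amp',
        action='store_true',
        help='Enable automatic mixed precision training (AMP).',
    )
    group.add_argument(
        '--scale-loss',
        type=float,
        default=1.0,
        help='The loss scalar for AMP training, only applied when --amp is set.',
    )
    group.add_argument(
        '--fuse-mha',
        action='store_true',
        help='Enable multihead attention fusion. Require cudnn version >= 8.9.1',
    )
    return parser

# argparse maps '--fuse-mha' to the attribute 'fuse_mha'.
args = build_parser().parse_args(['--amp', '--fuse-mha'])
```

Because the new option is `store_true`, fused MHA stays off unless the flag is passed explicitly, matching the `run_squad.sh` default of `true` being set by the script rather than by the parser.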