|
@@ -18,6 +18,7 @@ import argparse
|
|
|
import distutils.util
|
|
import distutils.util
|
|
|
import logging
|
|
import logging
|
|
|
import dllogger
|
|
import dllogger
|
|
|
|
|
+import paddle
|
|
|
from utils.task import Task
|
|
from utils.task import Task
|
|
|
from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX, _PROGRESS_SUFFIX
|
|
from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX, _PROGRESS_SUFFIX
|
|
|
|
|
|
|
@@ -27,7 +28,7 @@ _DEFAULT_BERT_CONFIG = {
|
|
|
'bert-large-uncased': './bert_configs/bert-large-uncased.json',
|
|
'bert-large-uncased': './bert_configs/bert-large-uncased.json',
|
|
|
'bert-large-cased': './bert_configs/bert-large-cased.json',
|
|
'bert-large-cased': './bert_configs/bert-large-cased.json',
|
|
|
'bert-base-uncased': './bert_configs/bert-base-uncased.json',
|
|
'bert-base-uncased': './bert_configs/bert-base-uncased.json',
|
|
|
- 'bert-base-cased': './bert_configs/bert-base-cased.json'
|
|
|
|
|
|
|
+ 'bert-base-cased': './bert_configs/bert-base-cased.json',
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@@ -41,28 +42,34 @@ def _get_full_path_of_ckpt(args):
|
|
|
pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
|
|
pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
|
|
|
progress_path = path_with_prefix + _PROGRESS_SUFFIX
|
|
progress_path = path_with_prefix + _PROGRESS_SUFFIX
|
|
|
found = False
|
|
found = False
|
|
|
- if os.path.exists(pdopt_path) and os.path.exists(
|
|
|
|
|
- pdparams_path) and os.path.exists(progress_path):
|
|
|
|
|
|
|
+ if (
|
|
|
|
|
+ os.path.exists(pdopt_path)
|
|
|
|
|
+ and os.path.exists(pdparams_path)
|
|
|
|
|
+ and os.path.exists(progress_path)
|
|
|
|
|
+ ):
|
|
|
found = True
|
|
found = True
|
|
|
return found, pdopt_path, pdparams_path, progress_path
|
|
return found, pdopt_path, pdparams_path, progress_path
|
|
|
|
|
|
|
|
if not os.path.exists(args.from_checkpoint):
|
|
if not os.path.exists(args.from_checkpoint):
|
|
|
logging.warning(
|
|
logging.warning(
|
|
|
- f"Start training from scratch since no checkpoint is found.")
|
|
|
|
|
|
|
+ f"Start training from scratch since no checkpoint is found."
|
|
|
|
|
+ )
|
|
|
args.from_checkpoint = None
|
|
args.from_checkpoint = None
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
- target_from_checkpoint = os.path.join(args.from_checkpoint,
|
|
|
|
|
- args.model_prefix)
|
|
|
|
|
|
|
+ target_from_checkpoint = os.path.join(
|
|
|
|
|
+ args.from_checkpoint, args.model_prefix
|
|
|
|
|
+ )
|
|
|
if args.last_step_of_checkpoint is None:
|
|
if args.last_step_of_checkpoint is None:
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
|
elif args.last_step_of_checkpoint == _AUTO_LAST_EPOCH:
|
|
elif args.last_step_of_checkpoint == _AUTO_LAST_EPOCH:
|
|
|
folders = os.listdir(args.from_checkpoint)
|
|
folders = os.listdir(args.from_checkpoint)
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
|
for folder in folders:
|
|
for folder in folders:
|
|
|
- tmp_ckpt_path = os.path.join(args.from_checkpoint, folder,
|
|
|
|
|
- args.model_prefix)
|
|
|
|
|
|
|
+ tmp_ckpt_path = os.path.join(
|
|
|
|
|
+ args.from_checkpoint, folder, args.model_prefix
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
folder = int(folder)
|
|
folder = int(folder)
|
|
@@ -72,23 +79,32 @@ def _get_full_path_of_ckpt(args):
|
|
|
)
|
|
)
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- if folder > args.last_step_of_checkpoint and \
|
|
|
|
|
- _check_file_exist(tmp_ckpt_path)[0]:
|
|
|
|
|
|
|
+ if (
|
|
|
|
|
+ folder > args.last_step_of_checkpoint
|
|
|
|
|
+ and _check_file_exist(tmp_ckpt_path)[0]
|
|
|
|
|
+ ):
|
|
|
args.last_step_of_checkpoint = folder
|
|
args.last_step_of_checkpoint = folder
|
|
|
- step_with_prefix = os.path.join(str(args.last_step_of_checkpoint), args.model_prefix) \
|
|
|
|
|
- if args.last_step_of_checkpoint > 0 else args.model_prefix
|
|
|
|
|
- target_from_checkpoint = os.path.join(args.from_checkpoint,
|
|
|
|
|
- step_with_prefix)
|
|
|
|
|
|
|
+ step_with_prefix = (
|
|
|
|
|
+ os.path.join(str(args.last_step_of_checkpoint), args.model_prefix)
|
|
|
|
|
+ if args.last_step_of_checkpoint > 0
|
|
|
|
|
+ else args.model_prefix
|
|
|
|
|
+ )
|
|
|
|
|
+ target_from_checkpoint = os.path.join(
|
|
|
|
|
+ args.from_checkpoint, step_with_prefix
|
|
|
|
|
+ )
|
|
|
else:
|
|
else:
|
|
|
try:
|
|
try:
|
|
|
args.last_step_of_checkpoint = int(args.last_step_of_checkpoint)
|
|
args.last_step_of_checkpoint = int(args.last_step_of_checkpoint)
|
|
|
except ValueError:
|
|
except ValueError:
|
|
|
- raise ValueError(f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}" \
|
|
|
|
|
- f" or integer >= 0, but receive {args.last_step_of_checkpoint}")
|
|
|
|
|
|
|
+ raise ValueError(
|
|
|
|
|
+ f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}"
|
|
|
|
|
+ f" or integer >= 0, but receive {args.last_step_of_checkpoint}"
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
args.from_checkpoint = target_from_checkpoint
|
|
args.from_checkpoint = target_from_checkpoint
|
|
|
found, pdopt_path, pdparams_path, progress_path = _check_file_exist(
|
|
found, pdopt_path, pdparams_path, progress_path = _check_file_exist(
|
|
|
- args.from_checkpoint)
|
|
|
|
|
|
|
+ args.from_checkpoint
|
|
|
|
|
+ )
|
|
|
if not found:
|
|
if not found:
|
|
|
args.from_checkpoint = None
|
|
args.from_checkpoint = None
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
@@ -98,19 +114,28 @@ def _get_full_path_of_ckpt(args):
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_full_path_of_pretrained_params(args, task=Task.pretrain):
|
|
def _get_full_path_of_pretrained_params(args, task=Task.pretrain):
|
|
|
- if args.from_pretrained_params is None and args.from_phase1_final_params is None:
|
|
|
|
|
|
|
+ if (
|
|
|
|
|
+ args.from_pretrained_params is None
|
|
|
|
|
+ and args.from_phase1_final_params is None
|
|
|
|
|
+ ):
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
|
return
|
|
return
|
|
|
- if task == Task.pretrain and args.from_phase1_final_params is not None and args.last_step_of_checkpoint == 0:
|
|
|
|
|
|
|
+ if (
|
|
|
|
|
+ task == Task.pretrain
|
|
|
|
|
+ and args.from_phase1_final_params is not None
|
|
|
|
|
+ and args.last_step_of_checkpoint == 0
|
|
|
|
|
+ ):
|
|
|
args.from_pretrained_params = args.from_phase1_final_params
|
|
args.from_pretrained_params = args.from_phase1_final_params
|
|
|
|
|
|
|
|
- args.from_pretrained_params = os.path.join(args.from_pretrained_params,
|
|
|
|
|
- args.model_prefix)
|
|
|
|
|
|
|
+ args.from_pretrained_params = os.path.join(
|
|
|
|
|
+ args.from_pretrained_params, args.model_prefix
|
|
|
|
|
+ )
|
|
|
pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
|
|
pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
|
|
|
if not os.path.exists(pdparams_path):
|
|
if not os.path.exists(pdparams_path):
|
|
|
args.from_pretrained_params = None
|
|
args.from_pretrained_params = None
|
|
|
logging.warning(
|
|
logging.warning(
|
|
|
- f"Cannot find {pdparams_path}, disable --from-pretrained-params.")
|
|
|
|
|
|
|
+ f"Cannot find {pdparams_path}, disable --from-pretrained-params."
|
|
|
|
|
+ )
|
|
|
args.last_step_of_checkpoint = 0
|
|
args.last_step_of_checkpoint = 0
|
|
|
|
|
|
|
|
|
|
|
|
@@ -121,20 +146,28 @@ def print_args(args):
|
|
|
|
|
|
|
|
def check_and_process_args(args, task=Task.pretrain):
|
|
def check_and_process_args(args, task=Task.pretrain):
|
|
|
if task == Task.pretrain:
|
|
if task == Task.pretrain:
|
|
|
- assert not (args.from_checkpoint is not None and \
|
|
|
|
|
- args.from_pretrained_params is not None), \
|
|
|
|
|
- "--from-pretrained-params and --from-checkpoint should " \
|
|
|
|
|
- "not be set simultaneously."
|
|
|
|
|
- assert not (args.phase1 and args.phase2), \
|
|
|
|
|
- "--phase1 and --phase2 should not be set simultaneously in bert pretraining."
|
|
|
|
|
|
|
+ assert not (
|
|
|
|
|
+ args.from_checkpoint is not None
|
|
|
|
|
+ and args.from_pretrained_params is not None
|
|
|
|
|
+ ), (
|
|
|
|
|
+ "--from-pretrained-params and --from-checkpoint should "
|
|
|
|
|
+ "not be set simultaneously."
|
|
|
|
|
+ )
|
|
|
|
|
+ assert not (
|
|
|
|
|
+ args.phase1 and args.phase2
|
|
|
|
|
+ ), "--phase1 and --phase2 should not be set simultaneously in bert pretraining."
|
|
|
if args.from_phase1_final_params is not None:
|
|
if args.from_phase1_final_params is not None:
|
|
|
- assert args.phase2, "--from-phase1-final-params should only be used in phase2"
|
|
|
|
|
|
|
+ assert (
|
|
|
|
|
+ args.phase2
|
|
|
|
|
+ ), "--from-phase1-final-params should only be used in phase2"
|
|
|
|
|
|
|
|
# SQuAD finetuning does not support suspend-resume yet.(TODO)
|
|
# SQuAD finetuning does not support suspend-resume yet.(TODO)
|
|
|
_get_full_path_of_ckpt(args)
|
|
_get_full_path_of_ckpt(args)
|
|
|
|
|
|
|
|
if args.bert_model == 'custom':
|
|
if args.bert_model == 'custom':
|
|
|
- assert args.config_file is not None, "--config-file must be specified if --bert-model=custom"
|
|
|
|
|
|
|
+ assert (
|
|
|
|
|
+ args.config_file is not None
|
|
|
|
|
+ ), "--config-file must be specified if --bert-model=custom"
|
|
|
elif args.config_file is None:
|
|
elif args.config_file is None:
|
|
|
args.config_file = _DEFAULT_BERT_CONFIG[args.bert_model]
|
|
args.config_file = _DEFAULT_BERT_CONFIG[args.bert_model]
|
|
|
logging.info(
|
|
logging.info(
|
|
@@ -144,7 +177,19 @@ def check_and_process_args(args, task=Task.pretrain):
|
|
|
_get_full_path_of_pretrained_params(args, task)
|
|
_get_full_path_of_pretrained_params(args, task)
|
|
|
|
|
|
|
|
assert os.path.isfile(
|
|
assert os.path.isfile(
|
|
|
- args.config_file), f"Cannot find config file in {args.config_file}"
|
|
|
|
|
|
|
+ args.config_file
|
|
|
|
|
+ ), f"Cannot find config file in {args.config_file}"
|
|
|
|
|
+
|
|
|
|
|
+ # cudnn mha fusion is only supported after v8.9.1 on Ampere and Hopper GPU
|
|
|
|
|
+ device_capability = paddle.device.cuda.get_device_capability()
|
|
|
|
|
+ cudnn_mha_supported = paddle.get_cudnn_version() >= 8901 and (
|
|
|
|
|
+ device_capability == (8, 0) or device_capability == (9, 0)
|
|
|
|
|
+ )
|
|
|
|
|
+ if (not cudnn_mha_supported or args.amp is False) and args.fuse_mha is True:
|
|
|
|
|
+ logging.info(
|
|
|
|
|
+ f"cudnn mha fusion is not supported, fall back to unfused mha"
|
|
|
|
|
+ )
|
|
|
|
|
+ args.fuse_mha = False
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_global_args(parser, task=Task.pretrain):
|
|
def add_global_args(parser, task=Task.pretrain):
|
|
@@ -155,145 +200,165 @@ def add_global_args(parser, task=Task.pretrain):
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
required=True,
|
|
required=True,
|
|
|
- help='The input data directory. Should be specified by users and contain .hdf5 files for the task.'
|
|
|
|
|
|
|
+ help='The input data directory. Should be specified by users and contain .hdf5 files for the task.',
|
|
|
)
|
|
)
|
|
|
- group.add_argument('--num-workers', default=4, type=int)
|
|
|
|
|
|
|
+ group.add_argument('--num-workers', default=1, type=int)
|
|
|
if task == Task.squad:
|
|
if task == Task.squad:
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--train-file',
|
|
'--train-file',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='SQuAD json for training. E.g., train-v1.1.json')
|
|
|
|
|
|
|
+ help='SQuAD json for training. E.g., train-v1.1.json',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--predict-file',
|
|
'--predict-file',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json'
|
|
|
|
|
|
|
+ help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--eval-script",
|
|
"--eval-script",
|
|
|
help="Script to evaluate squad predictions",
|
|
help="Script to evaluate squad predictions",
|
|
|
default="evaluate.py",
|
|
default="evaluate.py",
|
|
|
- type=str)
|
|
|
|
|
|
|
+ type=str,
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--epochs',
|
|
'--epochs',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=3,
|
|
default=3,
|
|
|
- help='The number of epochs for training.')
|
|
|
|
|
|
|
+ help='The number of epochs for training.',
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--vocab-file',
|
|
'--vocab-file',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
required=True,
|
|
required=True,
|
|
|
- help="Vocabulary mapping/file BERT was pretrainined on")
|
|
|
|
|
|
|
+ help="Vocabulary mapping/file BERT was pretrainined on",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--output-dir',
|
|
'--output-dir',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
required=True,
|
|
required=True,
|
|
|
- help='The output directory where the model checkpoints will be written. Should be specified by users.'
|
|
|
|
|
|
|
+ help='The output directory where the model checkpoints will be written. Should be specified by users.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--bert-model',
|
|
'--bert-model',
|
|
|
type=str,
|
|
type=str,
|
|
|
default='bert-large-uncased',
|
|
default='bert-large-uncased',
|
|
|
- choices=('bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
|
|
|
|
|
- 'bert-large-cased', 'custom'),
|
|
|
|
|
|
|
+ choices=(
|
|
|
|
|
+ 'bert-base-uncased',
|
|
|
|
|
+ 'bert-base-cased',
|
|
|
|
|
+ 'bert-large-uncased',
|
|
|
|
|
+ 'bert-large-cased',
|
|
|
|
|
+ 'custom',
|
|
|
|
|
+ ),
|
|
|
help='Specifies the type of BERT model to use. If it is set as custom, '
|
|
help='Specifies the type of BERT model to use. If it is set as custom, '
|
|
|
- 'the path to the config file must be given by specifying --config-file')
|
|
|
|
|
|
|
+ 'the path to the config file must be given by specifying --config-file',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--config-file',
|
|
'--config-file',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.'
|
|
|
|
|
|
|
+ help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--max-steps',
|
|
'--max-steps',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=None,
|
|
default=None,
|
|
|
required=True if task == Task.pretrain else False,
|
|
required=True if task == Task.pretrain else False,
|
|
|
- help='Total number of training steps to perform.')
|
|
|
|
|
|
|
+ help='Total number of training steps to perform.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
- '--log-freq', type=int, default=10, help='Frequency of logging loss.')
|
|
|
|
|
|
|
+ '--log-freq', type=int, default=10, help='Frequency of logging loss.'
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--num-steps-per-checkpoint',
|
|
'--num-steps-per-checkpoint',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=100,
|
|
default=100,
|
|
|
- help='Number of update steps until a model checkpoint is saved to disk.'
|
|
|
|
|
|
|
+ help='Number of update steps until a model checkpoint is saved to disk.',
|
|
|
)
|
|
)
|
|
|
# Init model
|
|
# Init model
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--from-pretrained-params',
|
|
'--from-pretrained-params',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='Path to pretrained parameters. If set to None, no pretrained params will be used.'
|
|
|
|
|
|
|
+ help='Path to pretrained parameters. If set to None, no pretrained params will be used.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--from-checkpoint',
|
|
'--from-checkpoint',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='A checkpoint path to resume training. If set to None, no checkpoint will be used. ' \
|
|
|
|
|
- 'If not None, --from-pretrained-params will be ignored.')
|
|
|
|
|
|
|
+ help='A checkpoint path to resume training. If set to None, no checkpoint will be used. '
|
|
|
|
|
+ 'If not None, --from-pretrained-params will be ignored.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--last-step-of-checkpoint',
|
|
'--last-step-of-checkpoint',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='The step id of the checkpoint given by --from-checkpoint. ' \
|
|
|
|
|
- 'It should be None, auto, or integer > 0. If it is set as ' \
|
|
|
|
|
- 'None, then training will start from the 1-th epoch. If it is set as ' \
|
|
|
|
|
- 'auto, then it will search largest integer-convertable folder ' \
|
|
|
|
|
- ' --from-checkpoint, which contains required checkpoint. '
|
|
|
|
|
|
|
+ help='The step id of the checkpoint given by --from-checkpoint. '
|
|
|
|
|
+ 'It should be None, auto, or integer > 0. If it is set as '
|
|
|
|
|
+ 'None, then training will start from the 1-th epoch. If it is set as '
|
|
|
|
|
+ 'auto, then it will search largest integer-convertable folder '
|
|
|
|
|
+ ' --from-checkpoint, which contains required checkpoint. ',
|
|
|
)
|
|
)
|
|
|
if task == Task.pretrain:
|
|
if task == Task.pretrain:
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--from-phase1-final-params',
|
|
'--from-phase1-final-params',
|
|
|
type=str,
|
|
type=str,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='Path to final checkpoint of phase1, which will be used to ' \
|
|
|
|
|
- 'initialize the parameter in the first step of phase2, and ' \
|
|
|
|
|
- 'ignored in the rest steps of phase2.'
|
|
|
|
|
|
|
+ help='Path to final checkpoint of phase1, which will be used to '
|
|
|
|
|
+ 'initialize the parameter in the first step of phase2, and '
|
|
|
|
|
+ 'ignored in the rest steps of phase2.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--steps-this-run',
|
|
'--steps-this-run',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=None,
|
|
default=None,
|
|
|
- help='If provided, only run this many steps before exiting.' \
|
|
|
|
|
|
|
+ help='If provided, only run this many steps before exiting.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
- '--seed', type=int, default=42, help="random seed for initialization")
|
|
|
|
|
|
|
+ '--seed', type=int, default=42, help="random seed for initialization"
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--report-file',
|
|
'--report-file',
|
|
|
type=str,
|
|
type=str,
|
|
|
default='./report.json',
|
|
default='./report.json',
|
|
|
- help='A file in which to store JSON experiment report.')
|
|
|
|
|
|
|
+ help='A file in which to store JSON experiment report.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--model-prefix',
|
|
'--model-prefix',
|
|
|
type=str,
|
|
type=str,
|
|
|
default='bert_paddle',
|
|
default='bert_paddle',
|
|
|
- help='The prefix name of model files to save/load.')
|
|
|
|
|
|
|
+ help='The prefix name of model files to save/load.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--show-config',
|
|
'--show-config',
|
|
|
type=distutils.util.strtobool,
|
|
type=distutils.util.strtobool,
|
|
|
default=True,
|
|
default=True,
|
|
|
- help='To show arguments.')
|
|
|
|
|
|
|
+ help='To show arguments.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--enable-cpu-affinity',
|
|
'--enable-cpu-affinity',
|
|
|
type=distutils.util.strtobool,
|
|
type=distutils.util.strtobool,
|
|
|
default=True,
|
|
default=True,
|
|
|
- help='To enable in-built GPU-CPU affinity.')
|
|
|
|
|
|
|
+ help='To enable in-built GPU-CPU affinity.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
- '--benchmark', action='store_true', help='To enable benchmark mode.')
|
|
|
|
|
|
|
+ '--benchmark', action='store_true', help='To enable benchmark mode.'
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--benchmark-steps',
|
|
'--benchmark-steps',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=20,
|
|
default=20,
|
|
|
- help='Steps for a benchmark run, only applied when --benchmark is set.')
|
|
|
|
|
|
|
+ help='Steps for a benchmark run, only applied when --benchmark is set.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--benchmark-warmup-steps',
|
|
'--benchmark-warmup-steps',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=20,
|
|
default=20,
|
|
|
- help='Warmup steps for a benchmark run, only applied when --benchmark is set.'
|
|
|
|
|
|
|
+ help='Warmup steps for a benchmark run, only applied when --benchmark is set.',
|
|
|
)
|
|
)
|
|
|
return parser
|
|
return parser
|
|
|
|
|
|
|
@@ -305,145 +370,166 @@ def add_training_args(parser, task=Task.pretrain):
|
|
|
default='Lamb',
|
|
default='Lamb',
|
|
|
metavar="OPTIMIZER",
|
|
metavar="OPTIMIZER",
|
|
|
choices=('Lamb', 'AdamW'),
|
|
choices=('Lamb', 'AdamW'),
|
|
|
- help='The name of optimizer. It should be one of {Lamb, AdamW}.')
|
|
|
|
|
|
|
+ help='The name of optimizer. It should be one of {Lamb, AdamW}.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--gradient-merge-steps',
|
|
'--gradient-merge-steps',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=1,
|
|
default=1,
|
|
|
- help="Number of update steps to accumualte before performing a backward/update pass."
|
|
|
|
|
|
|
+ help="Number of update steps to accumualte before performing a backward/update pass.",
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--learning-rate',
|
|
'--learning-rate',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=1e-4,
|
|
default=1e-4,
|
|
|
- help='The initial learning rate.')
|
|
|
|
|
|
|
+ help='The initial learning rate.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--warmup-start-lr',
|
|
'--warmup-start-lr',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.0,
|
|
default=0.0,
|
|
|
- help='The initial learning rate for warmup.')
|
|
|
|
|
|
|
+ help='The initial learning rate for warmup.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--warmup-proportion',
|
|
'--warmup-proportion',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.01,
|
|
default=0.01,
|
|
|
help='Proportion of training to perform linear learning rate warmup for. '
|
|
help='Proportion of training to perform linear learning rate warmup for. '
|
|
|
- 'For example, 0.1 = 10%% of training.')
|
|
|
|
|
|
|
+ 'For example, 0.1 = 10%% of training.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--beta1',
|
|
'--beta1',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.9,
|
|
default=0.9,
|
|
|
- help='The exponential decay rate for the 1st moment estimates.')
|
|
|
|
|
|
|
+ help='The exponential decay rate for the 1st moment estimates.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--beta2',
|
|
'--beta2',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.999,
|
|
default=0.999,
|
|
|
- help='The exponential decay rate for the 2st moment estimates.')
|
|
|
|
|
|
|
+ help='The exponential decay rate for the 2st moment estimates.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--epsilon',
|
|
'--epsilon',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=1e-6,
|
|
default=1e-6,
|
|
|
- help='A small float value for numerical stability.')
|
|
|
|
|
|
|
+ help='A small float value for numerical stability.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--weight-decay',
|
|
'--weight-decay',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.01,
|
|
default=0.01,
|
|
|
- help='The weight decay coefficient.')
|
|
|
|
|
|
|
+ help='The weight decay coefficient.',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--max-seq-length',
|
|
'--max-seq-length',
|
|
|
default=512,
|
|
default=512,
|
|
|
type=int,
|
|
type=int,
|
|
|
help='The maximum total input sequence length after WordPiece tokenization. \n'
|
|
help='The maximum total input sequence length after WordPiece tokenization. \n'
|
|
|
'Sequences longer than this will be truncated, and sequences shorter \n'
|
|
'Sequences longer than this will be truncated, and sequences shorter \n'
|
|
|
- 'than this will be padded.')
|
|
|
|
|
|
|
+ 'than this will be padded.',
|
|
|
|
|
+ )
|
|
|
if task == Task.pretrain:
|
|
if task == Task.pretrain:
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--batch-size',
|
|
'--batch-size',
|
|
|
type=int,
|
|
type=int,
|
|
|
default=32,
|
|
default=32,
|
|
|
- help='The batch size for training')
|
|
|
|
|
|
|
+ help='The batch size for training',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--phase1',
|
|
'--phase1',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='The phase of BERT pretraining. It should not be set ' \
|
|
|
|
|
- 'with --phase2 at the same time.'
|
|
|
|
|
|
|
+ help='The phase of BERT pretraining. It should not be set '
|
|
|
|
|
+ 'with --phase2 at the same time.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--phase2',
|
|
'--phase2',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='The phase of BERT pretraining. It should not be set ' \
|
|
|
|
|
- 'with --phase1 at the same time.'
|
|
|
|
|
|
|
+ help='The phase of BERT pretraining. It should not be set '
|
|
|
|
|
+ 'with --phase1 at the same time.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--max-predictions-per-seq',
|
|
'--max-predictions-per-seq',
|
|
|
default=80,
|
|
default=80,
|
|
|
type=int,
|
|
type=int,
|
|
|
- help='The maximum total of masked tokens in the input sequence')
|
|
|
|
|
|
|
+ help='The maximum total of masked tokens in the input sequence',
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
if task == Task.squad:
|
|
if task == Task.squad:
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
- "--do-train", action='store_true', help="Whether to run training.")
|
|
|
|
|
|
|
+ "--do-train", action='store_true', help="Whether to run training."
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--do-predict",
|
|
"--do-predict",
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help="Whether to run eval on the dev set.")
|
|
|
|
|
|
|
+ help="Whether to run eval on the dev set.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--do-eval",
|
|
"--do-eval",
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help="Whether to use evaluate accuracy of predictions")
|
|
|
|
|
|
|
+ help="Whether to use evaluate accuracy of predictions",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--train-batch-size",
|
|
"--train-batch-size",
|
|
|
default=32,
|
|
default=32,
|
|
|
type=int,
|
|
type=int,
|
|
|
- help="Total batch size for training.")
|
|
|
|
|
|
|
+ help="Total batch size for training.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--predict-batch-size",
|
|
"--predict-batch-size",
|
|
|
default=8,
|
|
default=8,
|
|
|
type=int,
|
|
type=int,
|
|
|
- help="Total batch size for predictions.")
|
|
|
|
|
|
|
+ help="Total batch size for predictions.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--verbose-logging",
|
|
"--verbose-logging",
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
help="If true, all of the warnings related to data processing will be printed. "
|
|
help="If true, all of the warnings related to data processing will be printed. "
|
|
|
- "A number of warnings are expected for a normal SQuAD evaluation.")
|
|
|
|
|
|
|
+ "A number of warnings are expected for a normal SQuAD evaluation.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--doc-stride",
|
|
"--doc-stride",
|
|
|
default=128,
|
|
default=128,
|
|
|
type=int,
|
|
type=int,
|
|
|
help="When splitting up a long document into chunks, how much stride to take "
|
|
help="When splitting up a long document into chunks, how much stride to take "
|
|
|
- "between chunks.")
|
|
|
|
|
|
|
+ "between chunks.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--max-query-length",
|
|
"--max-query-length",
|
|
|
default=64,
|
|
default=64,
|
|
|
type=int,
|
|
type=int,
|
|
|
help="The maximum number of tokens for the question. Questions longer than this "
|
|
help="The maximum number of tokens for the question. Questions longer than this "
|
|
|
- "will be truncated to this length.")
|
|
|
|
|
|
|
+ "will be truncated to this length.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--n-best-size",
|
|
"--n-best-size",
|
|
|
default=20,
|
|
default=20,
|
|
|
type=int,
|
|
type=int,
|
|
|
help="The total number of n-best predictions to generate in the nbest_predictions.json "
|
|
help="The total number of n-best predictions to generate in the nbest_predictions.json "
|
|
|
- "output file.")
|
|
|
|
|
|
|
+ "output file.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--max-answer-length",
|
|
"--max-answer-length",
|
|
|
default=30,
|
|
default=30,
|
|
|
type=int,
|
|
type=int,
|
|
|
help="The maximum length of an answer that can be generated. This is needed because the start "
|
|
help="The maximum length of an answer that can be generated. This is needed because the start "
|
|
|
- "and end predictions are not conditioned on one another.")
|
|
|
|
|
|
|
+ "and end predictions are not conditioned on one another.",
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
"--do-lower-case",
|
|
"--do-lower-case",
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help="Whether to lower case the input text. True for uncased models, False for cased models."
|
|
|
|
|
|
|
+ help="Whether to lower case the input text. True for uncased models, False for cased models.",
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--version-2-with-negative',
|
|
'--version-2-with-negative',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='If true, the SQuAD examples contain some that do not have an answer.'
|
|
|
|
|
|
|
+ help='If true, the SQuAD examples contain some that do not have an answer.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--null-score-diff-threshold',
|
|
'--null-score-diff-threshold',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=0.0,
|
|
default=0.0,
|
|
|
- help="If null_score - best_non_null is greater than the threshold predict null."
|
|
|
|
|
|
|
+ help="If null_score - best_non_null is greater than the threshold predict null.",
|
|
|
)
|
|
)
|
|
|
return parser
|
|
return parser
|
|
|
|
|
|
|
@@ -453,22 +539,29 @@ def add_advance_args(parser):
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--amp',
|
|
'--amp',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='Enable automatic mixed precision training (AMP).')
|
|
|
|
|
|
|
+ help='Enable automatic mixed precision training (AMP).',
|
|
|
|
|
+ )
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--scale-loss',
|
|
'--scale-loss',
|
|
|
type=float,
|
|
type=float,
|
|
|
default=1.0,
|
|
default=1.0,
|
|
|
- help='The loss scalar for AMP training, only applied when --amp is set.'
|
|
|
|
|
|
|
+ help='The loss scalar for AMP training, only applied when --amp is set.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--use-dynamic-loss-scaling',
|
|
'--use-dynamic-loss-scaling',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.'
|
|
|
|
|
|
|
+ help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.',
|
|
|
)
|
|
)
|
|
|
group.add_argument(
|
|
group.add_argument(
|
|
|
'--use-pure-fp16',
|
|
'--use-pure-fp16',
|
|
|
action='store_true',
|
|
action='store_true',
|
|
|
- help='Enable pure FP16 training, only applied when --amp is set.')
|
|
|
|
|
|
|
+ help='Enable pure FP16 training, only applied when --amp is set.',
|
|
|
|
|
+ )
|
|
|
|
|
+ group.add_argument(
|
|
|
|
|
+ '--fuse-mha',
|
|
|
|
|
+ action='store_true',
|
|
|
|
|
+ help='Enable multihead attention fusion. Require cudnn version >= 8.9.1',
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
return parser
|
|
return parser
|
|
|
|
|
|
|
@@ -476,8 +569,10 @@ def add_advance_args(parser):
|
|
|
def parse_args(task=Task.pretrain):
|
|
def parse_args(task=Task.pretrain):
|
|
|
parser = argparse.ArgumentParser(
|
|
parser = argparse.ArgumentParser(
|
|
|
description="PaddlePaddle BERT pretraining script"
|
|
description="PaddlePaddle BERT pretraining script"
|
|
|
- if task == Task.pretrain else "PaddlePaddle SQuAD finetuning script",
|
|
|
|
|
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
|
|
|
|
|
|
+ if task == Task.pretrain
|
|
|
|
|
+ else "PaddlePaddle SQuAD finetuning script",
|
|
|
|
|
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
parser = add_global_args(parser, task)
|
|
parser = add_global_args(parser, task)
|
|
|
parser = add_training_args(parser, task)
|
|
parser = add_training_args(parser, task)
|