# config.py
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import argparse
# NOTE(review): distutils was removed in Python 3.12; `strtobool` (used below
# for --show-config / --enable-cpu-affinity) needs a replacement when upgrading.
import distutils.util
import logging
import dllogger
import paddle
from utils.task import Task
from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX, _PROGRESS_SUFFIX

# Sentinel value for --last-step-of-checkpoint: auto-detect the newest
# integer-named step folder under --from-checkpoint.
_AUTO_LAST_EPOCH = 'auto'

# Default config JSON shipped with the repo for each known BERT variant;
# used when --config-file is not given and --bert-model != 'custom'.
_DEFAULT_BERT_CONFIG = {
    'bert-large-uncased': './bert_configs/bert-large-uncased.json',
    'bert-large-cased': './bert_configs/bert-large-cased.json',
    'bert-base-uncased': './bert_configs/bert-base-uncased.json',
    'bert-base-cased': './bert_configs/bert-base-cased.json',
}
def _get_full_path_of_ckpt(args):
    """Resolve ``args.from_checkpoint`` to a full checkpoint path prefix.

    Mutates ``args`` in place:
      * ``args.from_checkpoint`` becomes ``<dir>[/<step>]/<model_prefix>``
        when a complete checkpoint is found, otherwise ``None``.
      * ``args.last_step_of_checkpoint`` becomes the resolved step id
        (0 when training starts from scratch).

    Raises:
        ValueError: if ``--last-step-of-checkpoint`` is neither None,
            'auto', nor an integer-convertible string.
    """
    if args.from_checkpoint is None:
        # No resume requested: train from scratch.
        args.last_step_of_checkpoint = 0
        return

    def _check_file_exist(path_with_prefix):
        # A checkpoint is usable only when optimizer state (.pdopt),
        # parameters (.pdparams) and the progress file are all present.
        pdopt_path = path_with_prefix + _PDOPT_SUFFIX
        pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
        progress_path = path_with_prefix + _PROGRESS_SUFFIX
        found = False
        if (
            os.path.exists(pdopt_path)
            and os.path.exists(pdparams_path)
            and os.path.exists(progress_path)
        ):
            found = True
        return found, pdopt_path, pdparams_path, progress_path

    if not os.path.exists(args.from_checkpoint):
        # Checkpoint directory missing: fall back to scratch training.
        logging.warning(
            f"Start training from scratch since no checkpoint is found."
        )
        args.from_checkpoint = None
        args.last_step_of_checkpoint = 0
        return

    target_from_checkpoint = os.path.join(
        args.from_checkpoint, args.model_prefix
    )
    if args.last_step_of_checkpoint is None:
        args.last_step_of_checkpoint = 0
    elif args.last_step_of_checkpoint == _AUTO_LAST_EPOCH:
        # 'auto' mode: scan <from_checkpoint>/<step>/ folders and pick the
        # largest integer-named step that holds a complete checkpoint.
        folders = os.listdir(args.from_checkpoint)
        args.last_step_of_checkpoint = 0
        for folder in folders:
            tmp_ckpt_path = os.path.join(
                args.from_checkpoint, folder, args.model_prefix
            )
            try:
                folder = int(folder)
            except ValueError:
                logging.warning(
                    f"Skip folder '{folder}' since its name is not integer-convertable."
                )
                continue
            if (
                folder > args.last_step_of_checkpoint
                and _check_file_exist(tmp_ckpt_path)[0]
            ):
                args.last_step_of_checkpoint = folder
        # Step 0 means no step folder was found; use the bare prefix then.
        step_with_prefix = (
            os.path.join(str(args.last_step_of_checkpoint), args.model_prefix)
            if args.last_step_of_checkpoint > 0
            else args.model_prefix
        )
        target_from_checkpoint = os.path.join(
            args.from_checkpoint, step_with_prefix
        )
    else:
        # Explicit step id given on the command line; must be an integer.
        try:
            args.last_step_of_checkpoint = int(args.last_step_of_checkpoint)
        except ValueError:
            raise ValueError(
                f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}"
                f" or integer >= 0, but receive {args.last_step_of_checkpoint}"
            )

    args.from_checkpoint = target_from_checkpoint
    found, pdopt_path, pdparams_path, progress_path = _check_file_exist(
        args.from_checkpoint
    )
    if not found:
        # Resolved checkpoint is incomplete: disable resume entirely.
        args.from_checkpoint = None
        args.last_step_of_checkpoint = 0
        logging.warning(
            f"Cannot find {pdopt_path} and {pdparams_path} and {progress_path}, disable --from-checkpoint."
        )
  103. def _get_full_path_of_pretrained_params(args, task=Task.pretrain):
  104. if (
  105. args.from_pretrained_params is None
  106. and args.from_phase1_final_params is None
  107. ):
  108. args.last_step_of_checkpoint = 0
  109. return
  110. if (
  111. task == Task.pretrain
  112. and args.from_phase1_final_params is not None
  113. and args.last_step_of_checkpoint == 0
  114. ):
  115. args.from_pretrained_params = args.from_phase1_final_params
  116. args.from_pretrained_params = os.path.join(
  117. args.from_pretrained_params, args.model_prefix
  118. )
  119. pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
  120. if not os.path.exists(pdparams_path):
  121. args.from_pretrained_params = None
  122. logging.warning(
  123. f"Cannot find {pdparams_path}, disable --from-pretrained-params."
  124. )
  125. args.last_step_of_checkpoint = 0
  126. def print_args(args):
  127. args_for_log = copy.deepcopy(args)
  128. dllogger.log(step='PARAMETER', data=vars(args_for_log))
def check_and_process_args(args, task=Task.pretrain):
    """Validate cross-option constraints and normalize ``args`` in place.

    Resolves checkpoint/pretrained-parameter paths, fills in the default
    config file for known BERT variants, and disables ``--fuse-mha`` when
    the runtime does not support cudnn MHA fusion.

    Raises:
        AssertionError: on mutually-exclusive or missing options.
    """
    # NOTE(review): validation below uses `assert`, which is stripped under
    # `python -O`; keep that in mind if the scripts are ever run optimized.
    if task == Task.pretrain:
        assert not (
            args.from_checkpoint is not None
            and args.from_pretrained_params is not None
        ), (
            "--from-pretrained-params and --from-checkpoint should "
            "not be set simultaneously."
        )
        assert not (
            args.phase1 and args.phase2
        ), "--phase1 and --phase2 should not be set simultaneously in bert pretraining."
        if args.from_phase1_final_params is not None:
            assert (
                args.phase2
            ), "--from-phase1-final-params should only be used in phase2"

        # SQuAD finetuning does not support suspend-resume yet.(TODO)
        _get_full_path_of_ckpt(args)

    if args.bert_model == 'custom':
        assert (
            args.config_file is not None
        ), "--config-file must be specified if --bert-model=custom"
    elif args.config_file is None:
        # Fall back to the bundled config that matches the model name.
        args.config_file = _DEFAULT_BERT_CONFIG[args.bert_model]
        logging.info(
            f"According to the name of bert_model, the default config_file: {args.config_file} will be used."
        )

    # Pretrained params are only consulted when no checkpoint resume is active.
    if args.from_checkpoint is None:
        _get_full_path_of_pretrained_params(args, task)

    assert os.path.isfile(
        args.config_file
    ), f"Cannot find config file in {args.config_file}"

    # cudnn mha fusion is only supported after v8.9.1 on Ampere and Hopper GPU
    device_capability = paddle.device.cuda.get_device_capability()
    cudnn_mha_supported = paddle.get_cudnn_version() >= 8901 and (
        device_capability == (8, 0) or device_capability == (9, 0)
    )
    # Fusion also requires AMP; silently downgrade instead of failing.
    if (not cudnn_mha_supported or args.amp is False) and args.fuse_mha is True:
        logging.info(
            f"cudnn mha fusion is not supported, fall back to unfused mha"
        )
        args.fuse_mha = False
def add_global_args(parser, task=Task.pretrain):
    """Register global CLI options (I/O paths, model selection, resume).

    Args:
        parser: the ``argparse.ArgumentParser`` to extend.
        task: current ``Task``; gates the pretrain-only and SQuAD-only
            options registered below.

    Returns:
        The same ``parser``, for call chaining.
    """
    group = parser.add_argument_group('Global')
    # Pretraining-only input options.
    if task == Task.pretrain:
        group.add_argument(
            '--input-dir',
            type=str,
            default=None,
            required=True,
            help='The input data directory. Should be specified by users and contain .hdf5 files for the task.',
        )
        group.add_argument('--num-workers', default=1, type=int)
    # SQuAD-only data/eval options.
    if task == Task.squad:
        group.add_argument(
            '--train-file',
            type=str,
            default=None,
            help='SQuAD json for training. E.g., train-v1.1.json',
        )
        group.add_argument(
            '--predict-file',
            type=str,
            default=None,
            help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json',
        )
        group.add_argument(
            "--eval-script",
            help="Script to evaluate squad predictions",
            default="evaluate.py",
            type=str,
        )
        group.add_argument(
            '--epochs',
            type=int,
            default=3,
            help='The number of epochs for training.',
        )
    group.add_argument(
        '--vocab-file',
        type=str,
        default=None,
        required=True,
        help="Vocabulary mapping/file BERT was pretrainined on",
    )
    group.add_argument(
        '--output-dir',
        type=str,
        default=None,
        required=True,
        help='The output directory where the model checkpoints will be written. Should be specified by users.',
    )
    group.add_argument(
        '--bert-model',
        type=str,
        default='bert-large-uncased',
        choices=(
            'bert-base-uncased',
            'bert-base-cased',
            'bert-large-uncased',
            'bert-large-cased',
            'custom',
        ),
        help='Specifies the type of BERT model to use. If it is set as custom, '
        'the path to the config file must be given by specifying --config-file',
    )
    group.add_argument(
        '--config-file',
        type=str,
        default=None,
        help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.',
    )
    group.add_argument(
        '--max-steps',
        type=int,
        default=None,
        # Pretraining is step-driven, so the step budget is mandatory there.
        required=True if task == Task.pretrain else False,
        help='Total number of training steps to perform.',
    )
    group.add_argument(
        '--log-freq', type=int, default=10, help='Frequency of logging loss.'
    )
    group.add_argument(
        '--num-steps-per-checkpoint',
        type=int,
        default=100,
        help='Number of update steps until a model checkpoint is saved to disk.',
    )
    # Init model
    group.add_argument(
        '--from-pretrained-params',
        type=str,
        default=None,
        help='Path to pretrained parameters. If set to None, no pretrained params will be used.',
    )
    group.add_argument(
        '--from-checkpoint',
        type=str,
        default=None,
        help='A checkpoint path to resume training. If set to None, no checkpoint will be used. '
        'If not None, --from-pretrained-params will be ignored.',
    )
    group.add_argument(
        '--last-step-of-checkpoint',
        type=str,
        default=None,
        help='The step id of the checkpoint given by --from-checkpoint. '
        'It should be None, auto, or integer > 0. If it is set as '
        'None, then training will start from the 1-th epoch. If it is set as '
        'auto, then it will search largest integer-convertable folder '
        ' --from-checkpoint, which contains required checkpoint. ',
    )
    # Phase2 bootstrap and step-limit options only make sense for pretraining.
    if task == Task.pretrain:
        group.add_argument(
            '--from-phase1-final-params',
            type=str,
            default=None,
            help='Path to final checkpoint of phase1, which will be used to '
            'initialize the parameter in the first step of phase2, and '
            'ignored in the rest steps of phase2.',
        )
        group.add_argument(
            '--steps-this-run',
            type=int,
            default=None,
            help='If provided, only run this many steps before exiting.',
        )
    group.add_argument(
        '--seed', type=int, default=42, help="random seed for initialization"
    )
    group.add_argument(
        '--report-file',
        type=str,
        default='./report.json',
        help='A file in which to store JSON experiment report.',
    )
    group.add_argument(
        '--model-prefix',
        type=str,
        default='bert_paddle',
        help='The prefix name of model files to save/load.',
    )
    # NOTE(review): distutils.util.strtobool is gone in Python 3.12; these two
    # options need a stdlib-free bool parser when the project upgrades.
    group.add_argument(
        '--show-config',
        type=distutils.util.strtobool,
        default=True,
        help='To show arguments.',
    )
    group.add_argument(
        '--enable-cpu-affinity',
        type=distutils.util.strtobool,
        default=True,
        help='To enable in-built GPU-CPU affinity.',
    )
    group.add_argument(
        '--benchmark', action='store_true', help='To enable benchmark mode.'
    )
    group.add_argument(
        '--benchmark-steps',
        type=int,
        default=20,
        help='Steps for a benchmark run, only applied when --benchmark is set.',
    )
    group.add_argument(
        '--benchmark-warmup-steps',
        type=int,
        default=20,
        help='Warmup steps for a benchmark run, only applied when --benchmark is set.',
    )
    return parser
def add_training_args(parser, task=Task.pretrain):
    """Register optimizer/schedule/sequence options plus task-gated knobs.

    Args:
        parser: the ``argparse.ArgumentParser`` to extend.
        task: current ``Task``; gates the pretrain-only and SQuAD-only
            options registered below.

    Returns:
        The same ``parser``, for call chaining.
    """
    group = parser.add_argument_group('Training')
    group.add_argument(
        '--optimizer',
        default='Lamb',
        metavar="OPTIMIZER",
        choices=('Lamb', 'AdamW'),
        help='The name of optimizer. It should be one of {Lamb, AdamW}.',
    )
    group.add_argument(
        '--gradient-merge-steps',
        type=int,
        default=1,
        # NOTE(review): "accumualte" typo lives in user-visible help text;
        # left untouched here since it is a runtime string.
        help="Number of update steps to accumualte before performing a backward/update pass.",
    )
    group.add_argument(
        '--learning-rate',
        type=float,
        default=1e-4,
        help='The initial learning rate.',
    )
    group.add_argument(
        '--warmup-start-lr',
        type=float,
        default=0.0,
        help='The initial learning rate for warmup.',
    )
    group.add_argument(
        '--warmup-proportion',
        type=float,
        default=0.01,
        help='Proportion of training to perform linear learning rate warmup for. '
        'For example, 0.1 = 10%% of training.',
    )
    group.add_argument(
        '--beta1',
        type=float,
        default=0.9,
        help='The exponential decay rate for the 1st moment estimates.',
    )
    group.add_argument(
        '--beta2',
        type=float,
        default=0.999,
        help='The exponential decay rate for the 2st moment estimates.',
    )
    group.add_argument(
        '--epsilon',
        type=float,
        default=1e-6,
        help='A small float value for numerical stability.',
    )
    group.add_argument(
        '--weight-decay',
        type=float,
        default=0.01,
        help='The weight decay coefficient.',
    )
    group.add_argument(
        '--max-seq-length',
        default=512,
        type=int,
        help='The maximum total input sequence length after WordPiece tokenization. \n'
        'Sequences longer than this will be truncated, and sequences shorter \n'
        'than this will be padded.',
    )
    # Pretraining-only options (phases + masked-LM settings).
    if task == Task.pretrain:
        group.add_argument(
            '--batch-size',
            type=int,
            default=32,
            help='The batch size for training',
        )
        group.add_argument(
            '--phase1',
            action='store_true',
            help='The phase of BERT pretraining. It should not be set '
            'with --phase2 at the same time.',
        )
        group.add_argument(
            '--phase2',
            action='store_true',
            help='The phase of BERT pretraining. It should not be set '
            'with --phase1 at the same time.',
        )
        group.add_argument(
            '--max-predictions-per-seq',
            default=80,
            type=int,
            help='The maximum total of masked tokens in the input sequence',
        )
    # SQuAD-only options (train/predict modes and decoding limits).
    if task == Task.squad:
        group.add_argument(
            "--do-train", action='store_true', help="Whether to run training."
        )
        group.add_argument(
            "--do-predict",
            action='store_true',
            help="Whether to run eval on the dev set.",
        )
        group.add_argument(
            "--do-eval",
            action='store_true',
            help="Whether to use evaluate accuracy of predictions",
        )
        group.add_argument(
            "--train-batch-size",
            default=32,
            type=int,
            help="Total batch size for training.",
        )
        group.add_argument(
            "--predict-batch-size",
            default=8,
            type=int,
            help="Total batch size for predictions.",
        )
        group.add_argument(
            "--verbose-logging",
            action='store_true',
            help="If true, all of the warnings related to data processing will be printed. "
            "A number of warnings are expected for a normal SQuAD evaluation.",
        )
        group.add_argument(
            "--doc-stride",
            default=128,
            type=int,
            help="When splitting up a long document into chunks, how much stride to take "
            "between chunks.",
        )
        group.add_argument(
            "--max-query-length",
            default=64,
            type=int,
            help="The maximum number of tokens for the question. Questions longer than this "
            "will be truncated to this length.",
        )
        group.add_argument(
            "--n-best-size",
            default=20,
            type=int,
            help="The total number of n-best predictions to generate in the nbest_predictions.json "
            "output file.",
        )
        group.add_argument(
            "--max-answer-length",
            default=30,
            type=int,
            help="The maximum length of an answer that can be generated. This is needed because the start "
            "and end predictions are not conditioned on one another.",
        )
        group.add_argument(
            "--do-lower-case",
            action='store_true',
            help="Whether to lower case the input text. True for uncased models, False for cased models.",
        )
        group.add_argument(
            '--version-2-with-negative',
            action='store_true',
            help='If true, the SQuAD examples contain some that do not have an answer.',
        )
        group.add_argument(
            '--null-score-diff-threshold',
            type=float,
            default=0.0,
            help="If null_score - best_non_null is greater than the threshold predict null.",
        )
    return parser
  507. def add_advance_args(parser):
  508. group = parser.add_argument_group('Advanced Training')
  509. group.add_argument(
  510. '--amp',
  511. action='store_true',
  512. help='Enable automatic mixed precision training (AMP).',
  513. )
  514. group.add_argument(
  515. '--scale-loss',
  516. type=float,
  517. default=1.0,
  518. help='The loss scalar for AMP training, only applied when --amp is set.',
  519. )
  520. group.add_argument(
  521. '--use-dynamic-loss-scaling',
  522. action='store_true',
  523. help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.',
  524. )
  525. group.add_argument(
  526. '--use-pure-fp16',
  527. action='store_true',
  528. help='Enable pure FP16 training, only applied when --amp is set.',
  529. )
  530. group.add_argument(
  531. '--fuse-mha',
  532. action='store_true',
  533. help='Enable multihead attention fusion. Require cudnn version >= 8.9.1',
  534. )
  535. return parser
  536. def parse_args(task=Task.pretrain):
  537. parser = argparse.ArgumentParser(
  538. description="PaddlePaddle BERT pretraining script"
  539. if task == Task.pretrain
  540. else "PaddlePaddle SQuAD finetuning script",
  541. formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  542. )
  543. parser = add_global_args(parser, task)
  544. parser = add_training_args(parser, task)
  545. parser = add_advance_args(parser)
  546. args = parser.parse_args()
  547. check_and_process_args(args, task)
  548. return args