# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import argparse
import distutils.util
import logging

import dllogger

from utils.task import Task
from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX, _PROGRESS_SUFFIX

_AUTO_LAST_EPOCH = 'auto'

_DEFAULT_BERT_CONFIG = {
    'bert-large-uncased': './bert_configs/bert-large-uncased.json',
    'bert-large-cased': './bert_configs/bert-large-cased.json',
    'bert-base-uncased': './bert_configs/bert-base-uncased.json',
    'bert-base-cased': './bert_configs/bert-base-cased.json',
}

def _get_full_path_of_ckpt(args):
    if args.from_checkpoint is None:
        args.last_step_of_checkpoint = 0
        return

    def _check_file_exist(path_with_prefix):
        pdopt_path = path_with_prefix + _PDOPT_SUFFIX
        pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
        progress_path = path_with_prefix + _PROGRESS_SUFFIX
        found = False
        if os.path.exists(pdopt_path) and os.path.exists(
                pdparams_path) and os.path.exists(progress_path):
            found = True
        return found, pdopt_path, pdparams_path, progress_path

    if not os.path.exists(args.from_checkpoint):
        logging.warning(
            "Starting training from scratch since no checkpoint was found.")
        args.from_checkpoint = None
        args.last_step_of_checkpoint = 0
        return

    target_from_checkpoint = os.path.join(args.from_checkpoint,
                                          args.model_prefix)
    if args.last_step_of_checkpoint is None:
        args.last_step_of_checkpoint = 0
    elif args.last_step_of_checkpoint == _AUTO_LAST_EPOCH:
        folders = os.listdir(args.from_checkpoint)
        args.last_step_of_checkpoint = 0
        for folder in folders:
            tmp_ckpt_path = os.path.join(args.from_checkpoint, folder,
                                         args.model_prefix)
            try:
                folder = int(folder)
            except ValueError:
                logging.warning(
                    f"Skipping folder '{folder}' since its name is not convertible to an integer."
                )
                continue
            if folder > args.last_step_of_checkpoint and \
                    _check_file_exist(tmp_ckpt_path)[0]:
                args.last_step_of_checkpoint = folder
        step_with_prefix = os.path.join(str(args.last_step_of_checkpoint), args.model_prefix) \
            if args.last_step_of_checkpoint > 0 else args.model_prefix
        target_from_checkpoint = os.path.join(args.from_checkpoint,
                                              step_with_prefix)
    else:
        try:
            args.last_step_of_checkpoint = int(args.last_step_of_checkpoint)
        except ValueError:
            raise ValueError(
                f"The value of --last-step-of-checkpoint should be None, {_AUTO_LAST_EPOCH}"
                f" or an integer >= 0, but received {args.last_step_of_checkpoint}")

    args.from_checkpoint = target_from_checkpoint
    found, pdopt_path, pdparams_path, progress_path = _check_file_exist(
        args.from_checkpoint)
    if not found:
        args.from_checkpoint = None
        args.last_step_of_checkpoint = 0
        logging.warning(
            f"Cannot find {pdopt_path}, {pdparams_path} and {progress_path}; disabling --from-checkpoint."
        )

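# Illustrative layout resolved by _get_full_path_of_ckpt when
# --last-step-of-checkpoint=auto (folder names and the model prefix below are
# placeholders; the file suffixes come from utils.save_load):
#
#   <from_checkpoint>/
#       1000/<model_prefix><_PDPARAMS_SUFFIX | _PDOPT_SUFFIX | _PROGRESS_SUFFIX>
#       2000/<model_prefix><...>
#
# The largest integer-named folder that contains all three files is selected;
# if none qualifies, the files are expected directly under <from_checkpoint>,
# and resuming is disabled when they are missing there as well.
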
def _get_full_path_of_pretrained_params(args, task=Task.pretrain):
    if args.from_pretrained_params is None and args.from_phase1_final_params is None:
        args.last_step_of_checkpoint = 0
        return
    if task == Task.pretrain and args.from_phase1_final_params is not None \
            and args.last_step_of_checkpoint == 0:
        args.from_pretrained_params = args.from_phase1_final_params

    args.from_pretrained_params = os.path.join(args.from_pretrained_params,
                                               args.model_prefix)
    pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
    if not os.path.exists(pdparams_path):
        args.from_pretrained_params = None
        logging.warning(
            f"Cannot find {pdparams_path}, disabling --from-pretrained-params.")
    args.last_step_of_checkpoint = 0

def print_args(args):
    args_for_log = copy.deepcopy(args)
    dllogger.log(step='PARAMETER', data=vars(args_for_log))

def check_and_process_args(args, task=Task.pretrain):
    if task == Task.pretrain:
        assert not (args.from_checkpoint is not None and
                    args.from_pretrained_params is not None), \
            "--from-pretrained-params and --from-checkpoint should " \
            "not be set simultaneously."
        assert not (args.phase1 and args.phase2), \
            "--phase1 and --phase2 should not be set simultaneously in BERT pretraining."
        if args.from_phase1_final_params is not None:
            assert args.phase2, "--from-phase1-final-params should only be used in phase2"

        # SQuAD fine-tuning does not support suspend-resume yet. (TODO)
        _get_full_path_of_ckpt(args)

    if args.bert_model == 'custom':
        assert args.config_file is not None, "--config-file must be specified if --bert-model=custom"
    elif args.config_file is None:
        args.config_file = _DEFAULT_BERT_CONFIG[args.bert_model]
        logging.info(
            f"According to the name of bert_model, the default config_file {args.config_file} will be used."
        )

    if args.from_checkpoint is None:
        _get_full_path_of_pretrained_params(args, task)

    assert os.path.isfile(
        args.config_file), f"Cannot find config file at {args.config_file}"

def add_global_args(parser, task=Task.pretrain):
    group = parser.add_argument_group('Global')
    if task == Task.pretrain:
        group.add_argument(
            '--input-dir',
            type=str,
            default=None,
            required=True,
            help='The input data directory. Should be specified by users and contain .hdf5 files for the task.'
        )
        group.add_argument('--num-workers', default=4, type=int)
    if task == Task.squad:
        group.add_argument(
            '--train-file',
            type=str,
            default=None,
            help='SQuAD json for training. E.g., train-v1.1.json')
        group.add_argument(
            '--predict-file',
            type=str,
            default=None,
            help='SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json'
        )
        group.add_argument(
            "--eval-script",
            help="Script to evaluate SQuAD predictions.",
            default="evaluate.py",
            type=str)
        group.add_argument(
            '--epochs',
            type=int,
            default=3,
            help='The number of epochs for training.')
    group.add_argument(
        '--vocab-file',
        type=str,
        default=None,
        required=True,
        help="Vocabulary mapping/file BERT was pretrained on.")
    group.add_argument(
        '--output-dir',
        type=str,
        default=None,
        required=True,
        help='The output directory where the model checkpoints will be written. Should be specified by users.'
    )
    group.add_argument(
        '--bert-model',
        type=str,
        default='bert-large-uncased',
        choices=('bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
                 'bert-large-cased', 'custom'),
        help='Specifies the type of BERT model to use. If it is set to custom, '
        'the path to the config file must be given by specifying --config-file.')
    group.add_argument(
        '--config-file',
        type=str,
        default=None,
        help='The BERT model config. If set to None, `<--bert-model>.json` in folder `bert_configs` will be used.'
    )
    group.add_argument(
        '--max-steps',
        type=int,
        default=None,
        required=True if task == Task.pretrain else False,
        help='Total number of training steps to perform.')
    group.add_argument(
        '--log-freq', type=int, default=10, help='Frequency of logging loss.')
    group.add_argument(
        '--num-steps-per-checkpoint',
        type=int,
        default=100,
        help='Number of update steps until a model checkpoint is saved to disk.'
    )
    # Init model
    group.add_argument(
        '--from-pretrained-params',
        type=str,
        default=None,
        help='Path to pretrained parameters. If set to None, no pretrained params will be used.'
    )
    group.add_argument(
        '--from-checkpoint',
        type=str,
        default=None,
        help='A checkpoint path to resume training. If set to None, no checkpoint will be used. '
        'If not None, --from-pretrained-params will be ignored.')
    group.add_argument(
        '--last-step-of-checkpoint',
        type=str,
        default=None,
        help='The step id of the checkpoint given by --from-checkpoint. '
        'It should be None, auto, or an integer > 0. If set to None, training '
        'starts from the beginning. If set to auto, the largest integer-named '
        'subfolder of --from-checkpoint that contains the required checkpoint '
        'files will be used.')
    if task == Task.pretrain:
        group.add_argument(
            '--from-phase1-final-params',
            type=str,
            default=None,
            help='Path to the final checkpoint of phase1, which will be used to '
            'initialize the parameters in the first step of phase2, and '
            'ignored in the remaining steps of phase2.')
        group.add_argument(
            '--steps-this-run',
            type=int,
            default=None,
            help='If provided, only run this many steps before exiting.')
    group.add_argument(
        '--seed', type=int, default=42, help='Random seed for initialization.')
    group.add_argument(
        '--report-file',
        type=str,
        default='./report.json',
        help='A file in which to store the JSON experiment report.')
    group.add_argument(
        '--model-prefix',
        type=str,
        default='bert_paddle',
        help='The prefix name of model files to save/load.')
    group.add_argument(
        '--show-config',
        type=distutils.util.strtobool,
        default=True,
        help='To show arguments.')
    group.add_argument(
        '--enable-cpu-affinity',
        type=distutils.util.strtobool,
        default=True,
        help='To enable built-in GPU-CPU affinity.')
    group.add_argument(
        '--benchmark', action='store_true', help='To enable benchmark mode.')
    group.add_argument(
        '--benchmark-steps',
        type=int,
        default=20,
        help='Steps for a benchmark run, only applied when --benchmark is set.')
    group.add_argument(
        '--benchmark-warmup-steps',
        type=int,
        default=20,
        help='Warmup steps for a benchmark run, only applied when --benchmark is set.'
    )
    return parser

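# Illustrative invocation covering the required global flags above (the script
# name and all paths are placeholders, not defined in this module):
#
#   python run_pretraining.py \
#       --input-dir ./hdf5_data --vocab-file ./vocab.txt \
#       --output-dir ./results --bert-model bert-large-uncased --max-steps 1000
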
def add_training_args(parser, task=Task.pretrain):
    group = parser.add_argument_group('Training')
    group.add_argument(
        '--optimizer',
        default='Lamb',
        metavar="OPTIMIZER",
        choices=('Lamb', 'AdamW'),
        help='The name of the optimizer. It should be one of {Lamb, AdamW}.')
    group.add_argument(
        '--gradient-merge-steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass."
    )
    group.add_argument(
        '--learning-rate',
        type=float,
        default=1e-4,
        help='The initial learning rate.')
    group.add_argument(
        '--warmup-start-lr',
        type=float,
        default=0.0,
        help='The initial learning rate for warmup.')
    group.add_argument(
        '--warmup-proportion',
        type=float,
        default=0.01,
        help='Proportion of training to perform linear learning rate warmup for. '
        'For example, 0.1 = 10%% of training.')
    group.add_argument(
        '--beta1',
        type=float,
        default=0.9,
        help='The exponential decay rate for the 1st moment estimates.')
    group.add_argument(
        '--beta2',
        type=float,
        default=0.999,
        help='The exponential decay rate for the 2nd moment estimates.')
    group.add_argument(
        '--epsilon',
        type=float,
        default=1e-6,
        help='A small float value for numerical stability.')
    group.add_argument(
        '--weight-decay',
        type=float,
        default=0.01,
        help='The weight decay coefficient.')
    group.add_argument(
        '--max-seq-length',
        default=512,
        type=int,
        help='The maximum total input sequence length after WordPiece tokenization. '
        'Sequences longer than this will be truncated, and sequences shorter '
        'than this will be padded.')
    if task == Task.pretrain:
        group.add_argument(
            '--batch-size',
            type=int,
            default=32,
            help='The batch size for training.')
        group.add_argument(
            '--phase1',
            action='store_true',
            help='The phase of BERT pretraining. It should not be set '
            'together with --phase2.')
        group.add_argument(
            '--phase2',
            action='store_true',
            help='The phase of BERT pretraining. It should not be set '
            'together with --phase1.')
        group.add_argument(
            '--max-predictions-per-seq',
            default=80,
            type=int,
            help='The maximum total number of masked tokens in the input sequence.')
    if task == Task.squad:
        group.add_argument(
            "--do-train", action='store_true', help="Whether to run training.")
        group.add_argument(
            "--do-predict",
            action='store_true',
            help="Whether to run eval on the dev set.")
        group.add_argument(
            "--do-eval",
            action='store_true',
            help="Whether to evaluate the accuracy of predictions.")
        group.add_argument(
            "--train-batch-size",
            default=32,
            type=int,
            help="Total batch size for training.")
        group.add_argument(
            "--predict-batch-size",
            default=8,
            type=int,
            help="Total batch size for predictions.")
        group.add_argument(
            "--verbose-logging",
            action='store_true',
            help="If true, all of the warnings related to data processing will be printed. "
            "A number of warnings are expected for a normal SQuAD evaluation.")
        group.add_argument(
            "--doc-stride",
            default=128,
            type=int,
            help="When splitting up a long document into chunks, how much stride to take "
            "between chunks.")
        group.add_argument(
            "--max-query-length",
            default=64,
            type=int,
            help="The maximum number of tokens for the question. Questions longer than this "
            "will be truncated to this length.")
        group.add_argument(
            "--n-best-size",
            default=20,
            type=int,
            help="The total number of n-best predictions to generate in the nbest_predictions.json "
            "output file.")
        group.add_argument(
            "--max-answer-length",
            default=30,
            type=int,
            help="The maximum length of an answer that can be generated. This is needed because the start "
            "and end predictions are not conditioned on one another.")
        group.add_argument(
            "--do-lower-case",
            action='store_true',
            help="Whether to lower case the input text. True for uncased models, False for cased models."
        )
        group.add_argument(
            '--version-2-with-negative',
            action='store_true',
            help='If true, the SQuAD examples contain some that do not have an answer.'
        )
        group.add_argument(
            '--null-score-diff-threshold',
            type=float,
            default=0.0,
            help="If null_score - best_non_null is greater than the threshold, predict null."
        )
    return parser

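# Note on the flags above (a general observation, not repository-specific
# guidance): gradients are accumulated for --gradient-merge-steps steps before
# each optimizer update, so the effective batch size per update is the
# per-device batch size times --gradient-merge-steps times the number of
# data-parallel workers, e.g. 32 x 4 x 8 GPUs = 1024 samples per update.
# --phase1 and --phase2 only select which pretraining stage the remaining
# flags apply to; their mutual exclusion is enforced in check_and_process_args.
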
def add_advance_args(parser):
    group = parser.add_argument_group('Advanced Training')
    group.add_argument(
        '--amp',
        action='store_true',
        help='Enable automatic mixed precision training (AMP).')
    group.add_argument(
        '--scale-loss',
        type=float,
        default=1.0,
        help='The loss scalar for AMP training, only applied when --amp is set.'
    )
    group.add_argument(
        '--use-dynamic-loss-scaling',
        action='store_true',
        help='Enable dynamic loss scaling in AMP training, only applied when --amp is set.'
    )
    group.add_argument(
        '--use-pure-fp16',
        action='store_true',
        help='Enable pure FP16 training, only applied when --amp is set.')
    return parser

def parse_args(task=Task.pretrain):
    parser = argparse.ArgumentParser(
        description="PaddlePaddle BERT pretraining script"
        if task == Task.pretrain else "PaddlePaddle SQuAD finetuning script",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser = add_global_args(parser, task)
    parser = add_training_args(parser, task)
    parser = add_advance_args(parser)
    args = parser.parse_args()
    check_and_process_args(args, task)
    return args

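# Minimal manual check (a sketch: the real entry points are the training
# scripts that import parse_args; the module path in the example command is
# hypothetical):
#
#   python -m utils.config --input-dir ./hdf5_data --vocab-file ./vocab.txt \
#       --output-dir ./results --max-steps 10
if __name__ == '__main__':
    # Dump the fully processed pretraining arguments for quick inspection.
    print(vars(parse_args(Task.pretrain)))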