| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525 |
- # Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- import copy
- import argparse
- import logging
- import distutils.util
- import dllogger
- from utils.mode import RunScope
- from utils.utility import get_num_trainers
- from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX
_AUTO_LAST_EPOCH = 'auto'


def _get_full_path_of_ckpt(args):
    """Resolve ``args.from_checkpoint`` into the path prefix of a checkpoint.

    ``args`` is updated in place:

    * ``args.from_checkpoint`` becomes ``<folder>/<model_prefix>``, or
      ``<folder>/<epoch folder>/<model_prefix>`` when the epoch is searched
      automatically. It is reset to ``None`` (with a warning) when the
      optimizer or parameter file for that prefix does not exist.
    * ``args.last_epoch_of_checkpoint`` always ends up as an ``int``;
      ``-1`` means that no checkpoint epoch is known.

    Args:
        args: Parsed arguments providing ``from_checkpoint``,
            ``last_epoch_of_checkpoint`` and ``model_prefix``.

    Raises:
        ValueError: If ``args.last_epoch_of_checkpoint`` is a string that is
            neither ``_AUTO_LAST_EPOCH`` nor convertible to an integer.
    """
    if args.from_checkpoint is None:
        args.last_epoch_of_checkpoint = -1
        return

    def _check_file_exist(path_with_prefix):
        # A checkpoint is usable only when both files are present.
        pdopt_path = path_with_prefix + _PDOPT_SUFFIX
        pdparams_path = path_with_prefix + _PDPARAMS_SUFFIX
        found = os.path.exists(pdopt_path) and os.path.exists(pdparams_path)
        return found, pdopt_path, pdparams_path

    target_from_checkpoint = os.path.join(args.from_checkpoint,
                                          args.model_prefix)
    if args.last_epoch_of_checkpoint is None:
        args.last_epoch_of_checkpoint = -1
    elif args.last_epoch_of_checkpoint == _AUTO_LAST_EPOCH:
        try:
            folders = os.listdir(args.from_checkpoint)
        except (FileNotFoundError, NotADirectoryError):
            # Treated like any other missing checkpoint: the existence check
            # at the end warns and disables --from-checkpoint.
            folders = []

        latest_epoch = -1
        latest_folder = None
        for folder in folders:
            try:
                epoch = int(folder)
            except ValueError:
                logging.warning(
                    f"Skip folder '{folder}' since its name is not integer-convertable."
                )
                continue

            tmp_ckpt_path = os.path.join(args.from_checkpoint, folder,
                                         args.model_prefix)
            if epoch > latest_epoch and _check_file_exist(tmp_ckpt_path)[0]:
                latest_epoch = epoch
                # Remember the on-disk name: str(int('007')) is '7', which
                # would point at a folder that does not exist.
                latest_folder = folder

        args.last_epoch_of_checkpoint = latest_epoch
        if latest_folder is not None:
            target_from_checkpoint = os.path.join(
                args.from_checkpoint, latest_folder, args.model_prefix)
    else:
        try:
            args.last_epoch_of_checkpoint = int(args.last_epoch_of_checkpoint)
        except ValueError:
            raise ValueError(f"The value of --last-epoch-of-checkpoint should be None, {_AUTO_LAST_EPOCH}" \
                             f" or integer >= 0, but receive {args.last_epoch_of_checkpoint}") from None

    args.from_checkpoint = target_from_checkpoint
    found, pdopt_path, pdparams_path = _check_file_exist(args.from_checkpoint)
    if not found:
        args.from_checkpoint = None
        args.last_epoch_of_checkpoint = -1
        logging.warning(
            f"Cannot find {pdopt_path} and {pdparams_path}, disable --from-checkpoint."
        )
def _get_full_path_of_pretrained_params(args):
    """Resolve ``args.from_pretrained_params`` into a parameter path prefix.

    When pretrained parameters are requested, ``args.from_pretrained_params``
    becomes ``<folder>/<model_prefix>``. It is reset to ``None`` (with a
    warning) when the corresponding parameter file does not exist. In both
    cases ``args.last_epoch_of_checkpoint`` is set to ``-1``.

    When no pretrained parameters are requested, ``args`` is left untouched.
    ``check_and_process_args`` calls this helper before
    ``_get_full_path_of_ckpt``, so overwriting
    ``args.last_epoch_of_checkpoint`` here would discard the value the user
    passed together with ``--from-checkpoint``.

    Args:
        args: Parsed arguments providing ``from_pretrained_params`` and
            ``model_prefix``.
    """
    if args.from_pretrained_params is None:
        return

    args.from_pretrained_params = os.path.join(args.from_pretrained_params,
                                               args.model_prefix)
    pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX
    if not os.path.exists(pdparams_path):
        args.from_pretrained_params = None
        logging.warning(
            f"Cannot find {pdparams_path}, disable --from-pretrained-params.")
    args.last_epoch_of_checkpoint = -1
def print_args(args):
    """Log all parsed arguments to dllogger under the 'PARAMETER' step.

    A deep copy of ``args`` is logged, so the caller's namespace keeps its
    ``run_scope`` member unchanged.
    """
    loggable = copy.deepcopy(args)
    # dllogger cannot serialize an Enum into JSON, so log its value instead.
    scope_value = loggable.run_scope.value
    loggable.run_scope = scope_value
    dllogger.log(data=vars(loggable), step='PARAMETER')
def check_and_process_args(args):
    """Validate parsed arguments and derive dependent settings in place.

    The following attributes of ``args`` are written:

    * ``run_scope`` is replaced by the ``RunScope`` member with that value.
    * ``image_channel`` is taken from ``image_shape[0]``; for the NHWC layout
      ``image_shape`` is reordered to ``[height, width, channel]``.
    * ``lr`` is multiplied by ``get_num_trainers()``.
    * ``from_pretrained_params``, ``from_checkpoint`` and
      ``last_epoch_of_checkpoint`` are resolved by the two path helpers, and
      ``start_epoch`` is the epoch following ``last_epoch_of_checkpoint``.
    * ``epochs`` and ``eval_interval`` are overridden for benchmark and
      evaluation-only runs.

    Raises:
        AssertionError: If ``run_scope`` is not a known value, if both model
            loading options are set, or if benchmark mode is combined with a
            run scope other than train-only / eval-only.
    """
    # Process the scope of run.
    matched_scope = next(
        (member for member in RunScope if args.run_scope == member.value),
        None)
    assert matched_scope is not None, \
        f"only support {[scope.value for scope in RunScope]} as run_scope"
    args.run_scope = matched_scope

    # Process image layout and channel.
    chw_shape = args.image_shape
    args.image_channel = chw_shape[0]
    if args.data_layout == "NHWC":
        args.image_shape = [chw_shape[1], chw_shape[2], chw_shape[0]]

    # Process learning rate.
    num_trainers = get_num_trainers()
    args.lr = num_trainers * args.lr

    # Process model loading.
    assert args.from_checkpoint is None or \
        args.from_pretrained_params is None, \
        "--from-pretrained-params and --from-checkpoint should " \
        "not be set simultaneously."
    _get_full_path_of_pretrained_params(args)
    _get_full_path_of_ckpt(args)
    args.start_epoch = args.last_epoch_of_checkpoint + 1

    # Process benchmark.
    if args.benchmark:
        assert args.run_scope in (
            RunScope.TRAIN_ONLY, RunScope.EVAL_ONLY
        ), "If benchmark enabled, run_scope must be `train_only` or `eval_only`"

    eval_only = args.run_scope == RunScope.EVAL_ONLY

    # Only run one epoch when benchmark or eval_only.
    if args.benchmark or eval_only:
        args.epochs = args.start_epoch + 1

    if eval_only:
        args.eval_interval = 1
def add_global_args(parser):
    """Register the 'Global' command-line options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """

    def strtobool(value):
        # Local replacement for distutils.util.strtobool: distutils was
        # removed from the standard library in Python 3.12 (PEP 632). The
        # accepted spellings are unchanged, but a real bool is returned so
        # parsed values have the same type as the defaults. The name is kept
        # because argparse prints it in "invalid strtobool value" errors.
        lowered = value.lower()
        if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
            return True
        if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
            return False
        raise ValueError(f"invalid truth value {value!r}")

    group = parser.add_argument_group('Global')
    group.add_argument(
        '--output-dir',
        type=str,
        default='./output/',
        help='A path to store trained models.')
    group.add_argument(
        '--run-scope',
        default='train_eval',
        choices=('train_eval', 'train_only', 'eval_only'),
        help='Running scope. It should be one of {train_eval, train_only, eval_only}.'
    )
    group.add_argument(
        '--epochs',
        type=int,
        default=90,
        help='The number of epochs for training.')
    group.add_argument(
        '--save-interval',
        type=int,
        default=1,
        help='The iteration interval to save checkpoints.')
    group.add_argument(
        '--eval-interval',
        type=int,
        default=1,
        help='The iteration interval to test trained models on a given validation dataset. ' \
             'Ignored when --run-scope is train_only.'
    )
    group.add_argument(
        '--print-interval',
        type=int,
        default=10,
        help='The iteration interval to show training/evaluation message.')
    group.add_argument(
        '--report-file',
        type=str,
        default='./report.json',
        help='A file in which to store JSON experiment report.')
    group.add_argument(
        '--data-layout',
        default='NCHW',
        choices=('NCHW', 'NHWC'),
        help='Data format. It should be one of {NCHW, NHWC}.')
    group.add_argument(
        '--benchmark', action='store_true', help='To enable benchmark mode.')
    group.add_argument(
        '--benchmark-steps',
        type=int,
        default=100,
        help='Steps for benchmark run, only be applied when --benchmark is set.'
    )
    group.add_argument(
        '--benchmark-warmup-steps',
        type=int,
        default=100,
        help='Warmup steps for benchmark run, only be applied when --benchmark is set.'
    )
    group.add_argument(
        '--model-prefix',
        type=str,
        default="resnet_50_paddle",
        help='The prefix name of model files to save/load.')
    group.add_argument(
        '--from-pretrained-params',
        type=str,
        default=None,
        help='A folder path which contains pretrained parameters, that is a file in name' \
             ' --model-prefix + .pdparams. It should not be set with --from-checkpoint' \
             ' at the same time.'
    )
    group.add_argument(
        '--from-checkpoint',
        type=str,
        default=None,
        help='A checkpoint path to resume training. It should not be set ' \
             'with --from-pretrained-params at the same time. The path provided ' \
             'could be a folder contains < epoch_id/ckpt_files > or < ckpt_files >.'
    )
    group.add_argument(
        '--last-epoch-of-checkpoint',
        type=str,
        default=None,
        help='The epoch id of the checkpoint given by --from-checkpoint. ' \
             'It should be None, auto or integer >= 0. If it is set as ' \
             'None, then training will start from 0-th epoch. If it is set as ' \
             'auto, then it will search largest integer-convertable folder ' \
             ' --from-checkpoint, which contains required checkpoint. ' \
             'Default is None.'
    )
    group.add_argument(
        '--show-config',
        type=strtobool,
        default=True,
        help='To show arguments.')
    group.add_argument(
        '--enable-cpu-affinity',
        type=strtobool,
        default=True,
        help='To enable in-built GPU-CPU affinity.')
    return parser
def add_advance_args(parser):
    """Register the 'Advanced Training' (AMP and ASP) options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """
    advanced = parser.add_argument_group('Advanced Training')

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        # AMP
        ('--amp', dict(
            action='store_true',
            help='Enable automatic mixed precision training (AMP).')),
        ('--scale-loss', dict(
            type=float,
            default=1.0,
            help='The loss scalar for AMP training, only be applied when --amp is set.')),
        ('--use-dynamic-loss-scaling', dict(
            action='store_true',
            help='Enable dynamic loss scaling in AMP training, only be applied when --amp is set.')),
        ('--use-pure-fp16', dict(
            action='store_true',
            help='Enable pure FP16 training, only be applied when --amp is set.')),
        ('--fuse-resunit', dict(
            action='store_true',
            help='Enable CUDNNv8 ResUnit fusion, only be applied when --amp is set.')),
        # ASP
        ('--asp', dict(
            action='store_true',
            help='Enable automatic sparse training (ASP).')),
        ('--prune-model', dict(
            action='store_true',
            help='Prune model to 2:4 sparse pattern, only be applied when --asp is set.')),
        ('--mask-algo', dict(
            default='mask_1d',
            choices=('mask_1d', 'mask_2d_greedy', 'mask_2d_best'),
            help='The algorithm to generate sparse masks. It should be one of ' \
                 '{mask_1d, mask_2d_greedy, mask_2d_best}. This only be applied ' \
                 'when --asp and --prune-model is set.')),
    )
    for flag, options in option_specs:
        advanced.add_argument(flag, **options)

    return parser
def add_dataset_args(parser):
    """Register the 'Dataset' and 'Data Augmentation' options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """

    # The converter names are kept as they are: argparse uses them in its
    # "invalid <name> value" error messages.
    def float_list(x):
        return [float(item) for item in x.split(',')]

    def int_list(x):
        return [int(item) for item in x.split(',')]

    data_opts = parser.add_argument_group('Dataset')
    data_opts.add_argument(
        '--image-root', type=str, default='/imagenet',
        help='A root folder of train/val images. It should contain train and val folders, ' \
             'which store corresponding images.')
    data_opts.add_argument(
        '--image-shape', type=int_list, default=[4, 224, 224],
        help='The image shape. Its shape should be [channel, height, width].')

    # Data Loader
    data_opts.add_argument(
        '--batch-size', type=int, default=256,
        help='The batch size for both training and evaluation.')
    data_opts.add_argument(
        '--dali-random-seed', type=int, default=42,
        help='The random seed for DALI data loader.')
    data_opts.add_argument(
        '--dali-num-threads', type=int, default=4,
        help='The number of threads applied to DALI data loader.')
    data_opts.add_argument(
        '--dali-output-fp16', action='store_true',
        help='Output FP16 data from DALI data loader.')

    # Augmentation
    aug_opts = parser.add_argument_group('Data Augmentation')
    aug_opts.add_argument(
        '--crop-size', type=int, default=224,
        help='The size to crop input images.')
    aug_opts.add_argument(
        '--rand-crop-scale', type=float_list, default=[0.08, 1.],
        help='Range from which to choose a random area fraction.')
    aug_opts.add_argument(
        '--rand-crop-ratio', type=float_list, default=[3.0 / 4, 4.0 / 3],
        help='Range from which to choose a random aspect ratio (width/height).')
    aug_opts.add_argument(
        '--normalize-scale', type=float, default=1.0 / 255.0,
        help='A scalar to normalize images.')
    aug_opts.add_argument(
        '--normalize-mean', type=float_list, default=[0.485, 0.456, 0.406],
        help='The mean values to normalize RGB images.')
    aug_opts.add_argument(
        '--normalize-std', type=float_list, default=[0.229, 0.224, 0.225],
        help='The std values to normalize RGB images.')
    aug_opts.add_argument(
        '--resize-short', type=int, default=256,
        help='The length of the shorter dimension of the resized image.')

    return parser
def add_model_args(parser):
    """Register the 'Model' options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """
    model_opts = parser.add_argument_group('Model')
    register = model_opts.add_argument

    register('--model-arch-name', type=str, default='ResNet50',
             help='The model architecture name. It should be one of {ResNet50}.')
    register('--num-of-class', type=int, default=1000,
             help='The number classes of images.')
    register('--bn-weight-decay', action='store_true',
             help='Apply weight decay to BatchNorm shift and scale.')

    return parser
def add_training_args(parser):
    """Register the 'Training' options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """
    group = parser.add_argument_group('Training')
    group.add_argument(
        '--label-smoothing',
        type=float,
        default=0.1,
        help='The ratio of label smoothing.')
    group.add_argument(
        '--optimizer',
        default='Momentum',
        metavar="OPTIMIZER",
        # Must be a real one-element tuple: ('Momentum') is just the string,
        # and argparse would then accept any substring such as 'Mom'.
        choices=('Momentum',),
        help='The name of optimizer. It should be one of {Momentum}.')
    group.add_argument(
        '--momentum',
        type=float,
        default=0.875,
        help='The momentum value of optimizer.')
    group.add_argument(
        '--weight-decay',
        type=float,
        default=3.0517578125e-05,
        help='The coefficient of weight decay.')
    group.add_argument(
        '--lr-scheduler',
        default='Cosine',
        metavar="LR_SCHEDULER",
        # One-element tuple for the same reason as --optimizer.
        choices=('Cosine',),
        help='The name of learning rate scheduler. It should be one of {Cosine}.'
    )
    group.add_argument(
        '--lr', type=float, default=0.256, help='The initial learning rate.')
    group.add_argument(
        '--warmup-epochs',
        type=int,
        default=5,
        help='The number of epochs for learning rate warmup.')
    group.add_argument(
        '--warmup-start-lr',
        type=float,
        default=0.0,
        help='The initial learning rate for warmup.')
    return parser
def add_trt_args(parser):
    """Register the 'Paddle-TRT' inference options on ``parser``.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser``, to allow chaining.
    """

    def strtobool(value):
        # Local replacement for distutils.util.strtobool: distutils was
        # removed from the standard library in Python 3.12 (PEP 632). The
        # accepted spellings are unchanged, but a real bool is returned so
        # parsed values have the same type as the defaults. The name is kept
        # because argparse prints it in "invalid strtobool value" errors.
        lowered = value.lower()
        if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
            return True
        if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
            return False
        raise ValueError(f"invalid truth value {value!r}")

    group = parser.add_argument_group('Paddle-TRT')
    group.add_argument(
        '--device',
        type=int,
        default=0,
        help='The GPU device id for Paddle-TRT inference.'
    )
    group.add_argument(
        '--trt-inference-dir',
        type=str,
        default='./inference',
        help='A path to store/load inference models. ' \
             'export_model.py would export models to this folder, ' \
             'then inference.py would load from here.'
    )
    group.add_argument(
        '--trt-precision',
        default='FP32',
        choices=('FP32', 'FP16', 'INT8'),
        help='The precision of TensorRT. It should be one of {FP32, FP16, INT8}.'
    )
    # NOTE(review): a default of 1 << 30 looks like a size in bytes rather
    # than MB - confirm against the code that consumes this value before
    # changing the help text.
    group.add_argument(
        '--trt-workspace-size',
        type=int,
        default=(1 << 30),
        help='The memory workspace of TensorRT in MB.')
    group.add_argument(
        '--trt-min-subgraph-size',
        type=int,
        default=3,
        help='The minimal subgraph size to enable PaddleTRT.')
    group.add_argument(
        '--trt-use-static',
        type=strtobool,
        default=False,
        help='Fix TensorRT engine at first running.')
    group.add_argument(
        '--trt-use-calib-mode',
        type=strtobool,
        default=False,
        help='Use the PTQ calibration of PaddleTRT int8.')
    group.add_argument(
        '--trt-export-log-path',
        type=str,
        default='./export.json',
        help='A file in which to store JSON model exporting report.')
    group.add_argument(
        '--trt-log-path',
        type=str,
        default='./inference.json',
        help='A file in which to store JSON inference report.')
    group.add_argument(
        '--trt-use-synthetic',
        type=strtobool,
        default=False,
        help='Apply synthetic data for benchmark.')
    return parser
def parse_args(including_trt=False):
    """Build the command-line parser, parse the arguments and post-process them.

    Args:
        including_trt: When true, the Paddle-TRT options are registered too.

    Returns:
        The parsed namespace after ``check_and_process_args`` has been applied.
    """
    registrars = [
        add_global_args,
        add_dataset_args,
        add_model_args,
        add_training_args,
        add_advance_args,
    ]
    if including_trt:
        registrars.append(add_trt_args)

    parser = argparse.ArgumentParser(
        description="PaddlePaddle RN50v1.5 training script",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for register in registrars:
        parser = register(parser)

    parsed = parser.parse_args()
    check_and_process_args(parsed)
    return parsed
|