| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- #!/usr/bin/env python
- # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- import argparse
- from pathlib import Path
- optparser = argparse.ArgumentParser(description='Train classification models on ImageNet',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- optparser.add_argument('-n', '--ngpus', type=int, default=1, help='number of GPUs to use')
- optparser.add_argument('-b', '--batch-size', type=int, default=192, help='batch size per GPU')
- optparser.add_argument('-e', '--num-epochs', type=int, default=90, help='number of epochs')
- optparser.add_argument('-l', '--lr', type=float, default=0.256, help='learning rate; '
- 'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`')
- optparser.add_argument('--data-root', type=Path, help='Directory with RecordIO data files',
- default=Path('/data/imagenet/train-val-recordio-passthrough'))
- optparser.add_argument('--dtype', help='Precision', default='float16', choices=('float32', 'float16'))
- optparser.add_argument('--kv-store', default='horovod', choices=('device', 'horovod'), help='key-value store type')
- optparser.add_argument('--data-backend', default='dali-gpu', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'),
- help='data backend')
- optparser.add_argument('--launcher', default='horovod', choices=('horovod', 'slurm'), help='type of launcher')
- opts, args = optparser.parse_known_args()
- if opts.dtype == 'float16':
- n_ch = str(4 - int(opts.data_backend == 'mxnet'))
- else:
- n_ch = str(3)
- opts.batch_size *= opts.ngpus
- opts.lr *= opts.batch_size / 256
- command = []
- if 'horovod' in opts.kv_store and opts.launcher == 'horovod':
- command += ['horovodrun', '-np', str(opts.ngpus)]
- command += ['python', str(Path(__file__).parent / "train.py")]
- command += ['--data-train', str(opts.data_root / "train.rec")]
- command += ['--data-train-idx', str(opts.data_root / "train.idx")]
- command += ['--data-val', str(opts.data_root / "val.rec")]
- command += ['--data-val-idx', str(opts.data_root / "val.idx")]
- command += ['--dtype', opts.dtype]
- command += ['--image-shape', n_ch + ',224,224']
- if opts.dtype == 'float16':
- command += '--fuse-bn-relu 1 --fuse-bn-add-relu 1'.split()
- command += '--input-layout NCHW --conv-layout NHWC ' \
- '--batchnorm-layout NHWC --pooling-layout NHWC'.split()
- command += ['--kv-store', opts.kv_store]
- command += ['--data-backend', opts.data_backend]
- command += ['--lr', str(opts.lr)]
- command += ['--gpus', ','.join(list(map(str, range(opts.ngpus))))]
- command += ['--batch-size', str(opts.batch_size)]
- command += ['--num-epochs', str(opts.num_epochs)]
- command += args
- os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
- os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
- os.environ['MXNET_USE_TENSORRT'] = "0"
- os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
- os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
- os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"
- os.environ['HOROVOD_CYCLE_TIME'] = "0.1"
- os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864"
- os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16"
- os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999"
- os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25"
- os.execvp(command[0], command)
|