runner 4.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. #!/usr/bin/env python
  2. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import os
  16. import argparse
  17. from pathlib import Path
  18. optparser = argparse.ArgumentParser(description='Train classification models on ImageNet',
  19. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  20. optparser.add_argument('-n', '--ngpus', type=int, default=1, help='number of GPUs to use')
  21. optparser.add_argument('-b', '--batch-size', type=int, default=192, help='batch size per GPU')
  22. optparser.add_argument('-e', '--num-epochs', type=int, default=90, help='number of epochs')
  23. optparser.add_argument('-l', '--lr', type=float, default=0.256, help='learning rate; '
  24. 'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`')
  25. optparser.add_argument('--data-root', type=Path, help='Directory with RecordIO data files',
  26. default=Path('/data/imagenet/train-val-recordio-passthrough'))
  27. optparser.add_argument('--dtype', help='Precision', default='float16', choices=('float32', 'float16'))
  28. optparser.add_argument('--kv-store', default='horovod', choices=('device', 'horovod'), help='key-value store type')
  29. optparser.add_argument('--data-backend', default='dali-gpu', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'),
  30. help='data backend')
  31. optparser.add_argument('--launcher', default='horovod', choices=('horovod', 'slurm'), help='type of launcher')
  32. opts, args = optparser.parse_known_args()
  33. if opts.dtype == 'float16':
  34. n_ch = str(4 - int(opts.data_backend == 'mxnet'))
  35. else:
  36. n_ch = str(3)
  37. opts.batch_size *= opts.ngpus
  38. opts.lr *= opts.batch_size / 256
  39. command = []
  40. if 'horovod' in opts.kv_store and opts.launcher == 'horovod':
  41. command += ['horovodrun', '-np', str(opts.ngpus)]
  42. command += ['python', str(Path(__file__).parent / "train.py")]
  43. command += ['--data-train', str(opts.data_root / "train.rec")]
  44. command += ['--data-train-idx', str(opts.data_root / "train.idx")]
  45. command += ['--data-val', str(opts.data_root / "val.rec")]
  46. command += ['--data-val-idx', str(opts.data_root / "val.idx")]
  47. command += ['--dtype', opts.dtype]
  48. command += ['--image-shape', n_ch + ',224,224']
  49. if opts.dtype == 'float16':
  50. command += '--fuse-bn-relu 1 --fuse-bn-add-relu 1'.split()
  51. command += '--input-layout NCHW --conv-layout NHWC ' \
  52. '--batchnorm-layout NHWC --pooling-layout NHWC'.split()
  53. command += ['--kv-store', opts.kv_store]
  54. command += ['--data-backend', opts.data_backend]
  55. command += ['--lr', str(opts.lr)]
  56. command += ['--gpus', ','.join(list(map(str, range(opts.ngpus))))]
  57. command += ['--batch-size', str(opts.batch_size)]
  58. command += ['--num-epochs', str(opts.num_epochs)]
  59. command += args
  60. os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
  61. os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
  62. os.environ['MXNET_USE_TENSORRT'] = "0"
  63. os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
  64. os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
  65. os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "60"
  66. os.environ['HOROVOD_CYCLE_TIME'] = "0.1"
  67. os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864"
  68. os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16"
  69. os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999"
  70. os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25"
  71. os.environ['MXNET_ENABLE_CUDA_GRAPHS'] = "1"
  72. os.environ['MXNET_ASYNC_GPU_ENGINE'] = "1"
  73. os.environ['HOROVOD_ENABLE_ASYNC_COMPLETION'] = "1"
  74. os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = "0"
  75. os.environ['HOROVOD_BATCH_D2D_MEMCOPIES'] = "1"
  76. os.environ['HOROVOD_GROUPED_ALLREDUCES'] = "1"
  77. os.environ['OMP_NUM_THREADS'] = "1"
  78. os.execvp(command[0], command)