runner 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. #!/usr/bin/env python
  2. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import os, socket
  16. from argparse import ArgumentParser
  17. import warnings
  18. optparser = ArgumentParser(description="train resnet50 with MXNet")
  19. optparser.add_argument("-n", "--n-GPUs", type=int, default=8, help="number of GPUs to use; " +\
  20. "default = 8")
  21. optparser.add_argument("-b", "--batch-size", type=int, default=208, help="batch size per GPU; " +\
  22. "default = 208")
  23. optparser.add_argument("-e", "--num-epochs", type=int, default=90, help="number of epochs; " +\
  24. "default = 90")
  25. optparser.add_argument("-l", "--lr", type=float, default=0.1, help="learning rate; default = 0.1; " +\
  26. "IMPORTANT: true learning rate will be calculated as `lr * batch_size/256`")
  27. optparser.add_argument("--no-val", action="store_true",
  28. help="if set no validation will be performed")
  29. optparser.add_argument("--no-dali", action="store_true", default=False,
  30. help="use default MXNet pipeline instead of DALI")
  31. optparser.add_argument("--data-root", type=str, help="Directory with RecordIO data files", default="/data/imagenet/train-val-recordio-passthrough")
  32. optparser.add_argument("--data-nthreads", type=int, help="number of threads for data loading; default = 40", default=40)
  33. optparser.add_argument("--dtype", type=str, help="Precision, float16 or float32", default="float16")
  34. opts, args = optparser.parse_known_args()
  35. if opts.dtype == "float16":
  36. n_ch = str(4 - int(opts.no_dali))
  37. else:
  38. n_ch = str(3)
  39. opts.batch_size *= opts.n_GPUs
  40. opts.lr *= opts.batch_size/256
  41. command = ""
  42. command += "python "+os.path.dirname(__file__)+"/train.py"
  43. command += " --num-layers 50"
  44. command += " --data-train " + opts.data_root + "/train.rec"
  45. command += " --data-train-idx " + opts.data_root + "/train.idx"
  46. if not opts.no_val:
  47. command += " --data-val " + opts.data_root + "/val.rec"
  48. command += " --data-val-idx " + opts.data_root + "/val.idx"
  49. command += " --data-nthreads " + str(opts.data_nthreads)
  50. command += " --optimizer sgd --dtype " + opts.dtype
  51. command += " --lr-step-epochs 30,60,80 --max-random-area 1"
  52. command += " --min-random-area 0.05 --max-random-scale 1"
  53. command += " --min-random-scale 1 --min-random-aspect-ratio 0.75"
  54. command += " --max-random-aspect-ratio 1.33 --max-random-shear-ratio 0"
  55. command += " --max-random-rotate-angle 0 --random-resized-crop 1"
  56. command += " --random-crop 0 --random-mirror 1"
  57. command += " --image-shape "+n_ch+",224,224 --warmup-epochs 5"
  58. command += " --disp-batches 20"
  59. command += " --batchnorm-mom 0.9 --batchnorm-eps 1e-5"
  60. if opts.dtype == 'float16':
  61. command += " --fuse-bn-relu 1"
  62. command += " --input-layout NHWC --conv-layout NHWC"
  63. command += " --batchnorm-layout NHWC --pooling-layout NHWC"
  64. command += " --conv-algo 1 --force-tensor-core 1"
  65. command += " --fuse-bn-add-relu 1"
  66. command += " --kv-store device"
  67. if not opts.no_dali:
  68. command += " --use-dali"
  69. command += " --dali-prefetch-queue 2 --dali-nvjpeg-memory-padding 64"
  70. command += " --lr "+str(opts.lr)
  71. command += " --gpus " + str(list(range(opts.n_GPUs))).replace(' ', '').replace('[', '').replace(']', '')
  72. command += " --batch-size " + str(opts.batch_size)
  73. command += " --num-epochs " + str(opts.num_epochs)
  74. for arg in args:
  75. command += " " + arg
  76. os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
  77. os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
  78. os.environ['MXNET_USE_TENSORRT'] = "0"
  79. os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
  80. os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
  81. os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"
  82. exit(os.system('/bin/bash -c "'+command+'"'))