Sfoglia il codice sorgente

Adding ResNet50v1.5 to MxNet/Classification

Przemek Strzelczyk 7 anni fa
parent
commit
a8e1d1cd83
26 ha cambiato i file con 2348 aggiunte e 0 eliminazioni
  1. 202 0
      MxNet/Classification/RN50v1.5/LICENSE
  2. 235 0
      MxNet/Classification/RN50v1.5/README.md
  3. 92 0
      MxNet/Classification/RN50v1.5/benchmark.py
  4. 62 0
      MxNet/Classification/RN50v1.5/benchmarking.py
  5. 163 0
      MxNet/Classification/RN50v1.5/dali.py
  6. 283 0
      MxNet/Classification/RN50v1.5/data.py
  7. 19 0
      MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP16.sh
  8. 19 0
      MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP32.sh
  9. 19 0
      MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP16.sh
  10. 19 0
      MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP32.sh
  11. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_1GPU.sh
  12. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_4GPU.sh
  13. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_8GPU.sh
  14. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_1GPU.sh
  15. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_4GPU.sh
  16. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_8GPU.sh
  17. 19 0
      MxNet/Classification/RN50v1.5/examples/SCORE_FP16.sh
  18. 19 0
      MxNet/Classification/RN50v1.5/examples/SCORE_FP32.sh
  19. 463 0
      MxNet/Classification/RN50v1.5/fit.py
  20. BIN
      MxNet/Classification/RN50v1.5/img/training_accuracy.png
  21. BIN
      MxNet/Classification/RN50v1.5/img/training_loss.png
  22. BIN
      MxNet/Classification/RN50v1.5/img/validation_accuracy.png
  23. 57 0
      MxNet/Classification/RN50v1.5/report.py
  24. 376 0
      MxNet/Classification/RN50v1.5/resnet.py
  25. 96 0
      MxNet/Classification/RN50v1.5/runner
  26. 91 0
      MxNet/Classification/RN50v1.5/train.py

+ 202 - 0
MxNet/Classification/RN50v1.5/LICENSE

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 235 - 0
MxNet/Classification/RN50v1.5/README.md

@@ -0,0 +1,235 @@
+# ResNet50 v1.5 For MXNet
+
+## The model
+The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
+
+The difference between v1 and v1.5 is in the bottleneck blocks which require
+downsampling. ResNet v1 has stride = 2 in the first 1x1 convolution, whereas
+v1.5 has stride = 2 in the 3x3 convolution.
+
+This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
+
+## Training procedure
+
+### Optimizer
+
+This model trains for 90 epochs, with the standard ResNet v1.5 setup:
+
+* SGD with momentum (0.9)
+
+* Learning rate = 0.1 for 256 batch size, for other batch sizes we linearly
+scale the learning rate.
+
+* Learning rate decay - multiply by 0.1 after 30, 60, and 80 epochs
+
+* Linear warmup of the learning rate during first 5 epochs
+according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
+
+* Weight decay: 1e-4
+
+### Data Augmentation
+
+During training, we perform the following augmentation techniques:
+* Normalization
+* Random resized crop to 224x224
+* Scale from 5% to 100%
+* Aspect ratio from 3/4 to 4/3
+* Random horizontal flip
+
+During inference, we perform the following augmentation techniques:
+* Normalization
+* Scale to 256x256
+* Center crop to 224x224
+
+See `data.py` for more info.
+
+# Setup
+
+## Requirements
+
+Ensure your environment meets the following requirements:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [MXNet 18.12-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet) or newer
+* [NVIDIA-DALI 0.5.0](https://github.com/NVIDIA/DALI) -- included in the MXNet container
+* [Python 3.5](https://www.python.org) -- included in the MXNet container
+* [CUDA 10](https://developer.nvidia.com/cuda-toolkit) -- included in the MXNet container
+* [cuDNN 7.4.1](https://developer.nvidia.com/cudnn) -- included in the MXNet container
+* (optional) NVIDIA Volta or Turing GPU (see section below) -- for best training performance using FP16
+
+For more information about how to get started with NGC containers, see the
+following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
+* [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
+
+## Training using mixed precision with Tensor Cores
+
+### Hardware requirements
+Training with mixed precision on NVIDIA Tensor Cores, requires an NVIDIA Volta-based or Turing-based GPU.
+
+
+### Software changes
+
+For information about how to train using mixed precision, see the
+[Mixed Precision Training paper](https://arxiv.org/abs/1710.03740)
+and
+[Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
+
+
+# Quick start guide
+
+## Docker
+
+To run the MXNet Docker container, run:
+
+`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path to prepared dataset>:/data/imagenet/train-val-recordio-passthrough nvcr.io/nvidia/mxnet:18.12-py3`
+
+It will also automatically start downloading the MXNet container if you haven't downloaded it yet. You can also download it manually by running:
+
+`nvidia-docker pull nvcr.io/nvidia/mxnet:18.12-py3`
+
+If you haven't prepared the dataset yet, download the raw ImageNet dataset (see section below), and run:
+
+`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path where prepared dataset should be created>:/data/imagenet/train-val-recordio-passthrough -v <path to raw dataset>:/data/imagenet/raw nvcr.io/nvidia/mxnet:18.12-py3`
+
+and follow the steps in the Prepare Dataset section.
+
+## Prepare Dataset
+
+The MXNet ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from ILSVRC challenge.
+You can download the images from http://image-net.org/download-images
+
+The recommended data format is
+[RecordIO](http://mxnet.io/architecture/note_data_loading.html), which
+concatenates multiple examples into seekable binary files for better read
+efficiency. MXNet provides a tool called `im2rec.py` located in the `/opt/mxnet/tools/` directory.
+The tool converts individual images into `.rec` files.
+
+To prepare a RecordIO file containing ImageNet data, we first need to create .lst files
+which consist of the labels and image paths. We assume that the original images were
+downloaded to `/data/imagenet/raw/train-jpeg` and `/data/imagenet/raw/val-jpeg`.
+
+```bash
+python /opt/mxnet/tools/im2rec.py --list --recursive train /data/imagenet/raw/train-jpeg
+python /opt/mxnet/tools/im2rec.py --list --recursive val /data/imagenet/raw/val-jpeg
+```
+
+Then we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
+to speed up data loading) files. To obtain the best training accuracy
+we do not preprocess the images when creating RecordIO file.
+
+```bash
+python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train /data/imagenet/raw/train-jpeg
+python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val /data/imagenet/raw/val-jpeg
+```
+
+## Running training
+
+To run training for a standard configuration (1/4/8 GPUs, FP16/FP32),
+run one of the scripts in the `./examples` directory
+called `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh`.
+By default the training scripts run the validation and save checkpoint after each epoch.
+Checkpoints will be stored in `model-symbol.json` and `model-<number of epoch>.params` files.
+
+If imagenet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify `--data-root` flag.
+
+To run a non standard configuration use:
+
+`./runner -n <number of gpus> -b <batch size per gpu> --data-root <path to imagenet> --dtype <float32 or float16> --model-prefix <model prefix>`
+
+Checkpoints will be stored in `<model prefix>-symbol.json` and `<model prefix>-<number of epoch>.params` files.
+To generate JSON report with performance and accuracy stats, use `--report <path to report>` flag (see `report.py` for info about JSON report file structure).
+Use `./runner -h` and `python ./train.py -h` to obtain the list of available options.
+
+## Running inference
+
+To run inference on a checkpointed model run:
+* For FP16
+    `./examples/SCORE_FP16.sh <model prefix> <epoch>`
+* For FP32
+    `./examples/SCORE_FP32.sh <model prefix> <epoch>`
+
+
+## Benchmark scripts
+
+To benchmark training and inference, run:
+
+`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per gpu separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
+
+To control benchmark length per epoch, use `-i` flag (defaults to 100 iterations).
+To control number of epochs, use `-e` flag.
+To control number of warmup epochs (epochs which are not taken into account), use `-w` flag.
+To limit length of dataset, use `--num-examples` flag.
+To benchmark only inference, use `--only-inference` flag.
+By default, the same parameters as in `./runner` will be used. Additional flags will be passed to `./runner`.
+
+
+## Training accuracy results
+
+The following results were obtained by running the `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh` scripts in the
+mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with 8 V100 16G GPUs.
+
+| **number of GPUs** | **FP16 top1** | **FP16 training time** | **FP32 top1** | **FP32 training time** |
+|:------------------:|:-------------:|:----------------------:|:-------------:|:----------------------:|
+| 1                  | 76.424        | 22.9h                  | 76.462        | 82.0h                  |
+| 4                  | 76.328        | 6.2h                   | 76.448        | 21.1h                  |
+| 8                  | 76.490        | 3.3h                   | 76.668        | 11.1h                  |
+
+Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
+
+![TrainingLoss](./img/training_loss.png)
+
+![TrainingAccuracy](./img/training_accuracy.png)
+
+![ValidationAccuracy](./img/validation_accuracy.png)
+
+
+## Training performance results
+
+The following results were obtained by running
+`python benchmark.py -n 1,4,8 -b 208 --dtype float16 -o benchmark_report_fp16.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 25600` for FP16, and
+`python benchmark.py -n 1,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 12800` for FP32
+in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with V100 16G GPUs.
+Training performance reported as Total IPS (data + compute time taken into account).
+Weak scaling is calculated as a ratio of speed for given number of GPUs to speed for 1 GPU.
+
+| **number of GPUs** | **FP16 img/s** | **FP32 img/s** | **FP16 speedup** | **FP16 weak scaling** | **FP32 weak scaling** |
+|:------------------:|:--------------:|:--------------:|:----------------:|:---------------------:|:---------------------:|
+| 1                  | 1442.6         | 400.2          | 3.60             | 1.00                  | 1.00                  |
+| 4                  | 5391.8         | 1558.6         | 3.46             | 3.74                  | 3.89                  |
+| 8                  | 10263.2        | 2957.4         | 3.47             | 7.11                  | 7.39                  |
+
+
+## Inference performance results
+
+The following results were obtained by running
+`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96,128,192,208 --dtype float16 -o inferbenchmark_report_fp16.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP16, and
+`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96 --dtype float32 -o inferbenchmark_report_fp32.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP32
+in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 using one V100 16G GPU.
+Inference performance reported as Total IPS (data + compute time taken into account).
+
+| **batch size** | **FP16 img/s** | **FP32 img/s** |
+|:--------------:|:--------------:|:--------------:|
+|              1 |  314           | 252            |
+|              2 |  555           | 393            |
+|              4 |  1024          | 601            |
+|              8 |  1642          | 824            |
+|             16 |  2144          | 1028           |
+|             32 |  2954          | 1138           |
+|             64 |  3428          | 1236           |
+|             96 |  3546          | 1282           |
+|            128 |  3690          |                |
+|            192 |  3828          |                |
+|            208 |  3832          |                |
+
+
+# Changelog
+
+1. Dec 19, 2018
+  * Initial release (based on https://github.com/apache/incubator-mxnet/tree/master/example/image-classification)
+
+
+# Known Issues
+
+There are no known issues with this model.

+ 92 - 0
MxNet/Classification/RN50v1.5/benchmark.py

@@ -0,0 +1,92 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Benchmark driver for the ResNet50 v1.5 runner.

Launches ``./runner`` (or ``--executable``) once per (ngpus, batch size)
combination, reads the per-epoch throughput metrics each run writes to its
JSON report, aggregates them (harmonic mean over the epochs left after
discarding warmup), prints summary tables to stderr, and writes the final
JSON report to ``--output``.
"""

import argparse
import json
import sys
from collections import OrderedDict
from subprocess import Popen

parser = argparse.ArgumentParser(description='Benchmark')
parser.add_argument('--executable', default='./runner', help='path to runner')
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]',
                    required=True, help='numbers of gpus separated by comma')
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]',
                    required=True, help='batch sizes separated by comma')
parser.add_argument('-i', '--benchmark-iters', metavar='I',
                    type=int, default=100, help='iterations')
parser.add_argument('-e', '--epochs', metavar='E',
                    type=int, default=1, help='number of epochs')
parser.add_argument('-w', '--warmup', metavar='N',
                    type=int, default=0, help='warmup epochs')
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('--only-inference', action='store_true', help="benchmark inference only")
# Options not recognized here are forwarded verbatim to the runner.
args, other_args = parser.parse_known_args()

ngpus = list(map(int, args.ngpus.split(',')))
batch_sizes = list(map(int, args.batch_sizes.split(',')))


res = OrderedDict()
res['model'] = ''
res['ngpus'] = ngpus
res['bs'] = batch_sizes
if args.only_inference:
    res['metric_keys'] = ['val.total_ips']
else:
    res['metric_keys'] = ['train.total_ips', 'val.total_ips']
res['metrics'] = OrderedDict()

for n in ngpus:
    res['metrics'][str(n)] = OrderedDict()
    for bs in batch_sizes:
        res['metrics'][str(n)][str(bs)] = OrderedDict()

        # Each sub-run writes its own report file so runs stay distinguishable.
        report_file = args.output + '-{},{}'.format(n, bs)
        ret = Popen([args.executable, '-n', str(n), '-b', str(bs),
                     '--benchmark-iters', str(args.benchmark_iters),
                     '-e', str(args.epochs), '--report', report_file,
                     *([] if not args.only_inference else ['--only-inference']),
                     '--no-metrics'] + other_args, stdout=sys.stderr).wait()
        if ret != 0:
            # Fail loudly instead of crashing later on a missing/garbled report.
            raise RuntimeError('benchmark run (ngpus={}, batch size={}) '
                               'exited with code {}'.format(n, bs, ret))

        with open(report_file, 'r') as f:
            report = json.load(f)

        for metric in res['metric_keys']:
            data = report['metrics'][metric][args.warmup:]
            if not data:
                raise ValueError('no epochs left after discarding {} warmup '
                                 'epochs'.format(args.warmup))
            # Harmonic mean: the appropriate average for throughput (img/s).
            avg = len(data) / sum(1 / x for x in data)
            res['metrics'][str(n)][str(bs)][metric] = avg


# Print one table per metric: batch sizes as columns, GPU counts as rows.
column_len = 7
for m in res['metric_keys']:
    print(m, file=sys.stderr)
    print(' ' * column_len, end='|', file=sys.stderr)
    for bs in batch_sizes:
        print(str(bs).center(column_len), end='|', file=sys.stderr)
    print(file=sys.stderr)
    print('-' * (len(batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
    for n in ngpus:
        print(str(n).center(column_len), end='|', file=sys.stderr)
        for bs in batch_sizes:
            print(str(round(res['metrics'][str(n)][str(bs)][m])).center(column_len), end='|', file=sys.stderr)
        print(file=sys.stderr)
    print(file=sys.stderr)


with open(args.output, 'w') as f:
    json.dump(res, f, indent=4)

+ 62 - 0
MxNet/Classification/RN50v1.5/benchmarking.py

@@ -0,0 +1,62 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from mxnet.io import DataIter
+import time
+
+class BenchmarkingDataIter:
+    def __init__(self, data_iter, benchmark_iters=None):
+        self.data_iter = data_iter
+        self.benchmark_iters = benchmark_iters
+        self.overall_time = 0
+        self.num = 0
+
+    def __iter__(self):
+        iter(self.data_iter)
+        return self
+
+    def next(self):
+        if self.benchmark_iters is not None and self.num >= self.benchmark_iters:
+            raise StopIteration
+        try:
+            start_time = time.time()
+            ret = self.data_iter.next()
+            end_time = time.time()
+        except StopIteration:
+            if self.benchmark_iters is None:
+                raise
+            self.data_iter.reset()
+            start_time = time.time()
+            ret = self.data_iter.next()
+            end_time = time.time()
+
+        if self.num != 0:
+            self.overall_time += end_time - start_time
+        self.num += 1
+        return ret
+
+    def __next__(self):
+        return self.next()
+
+    def __getattr__(self, attr):
+        return getattr(self.data_iter, attr)
+
+    def get_avg_time_and_clear(self):
+        if self.num <= 1:
+            avg = float('nan')
+        else:
+            avg = self.overall_time / (self.num - 1)
+        self.overall_time = 0
+        self.num = 0
+        return avg

+ 163 - 0
MxNet/Classification/RN50v1.5/dali.py

@@ -0,0 +1,163 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from nvidia import dali
+from nvidia.dali.pipeline import Pipeline
+import nvidia.dali.ops as ops
+import nvidia.dali.types as types
+from nvidia.dali.plugin.mxnet import DALIClassificationIterator
+
+
def add_dali_args(parser):
    """Register DALI pipeline command-line options on *parser*.

    Returns the parser so calls can be chained.
    """
    group = parser.add_argument_group('DALI', 'pipeline and augmentation')
    group.add_argument('--use-dali', action='store_true',
                       help='use DALI pipeline and augmentation')
    group.add_argument('--separ-val', action='store_true',
                       help='each process will perform independent validation on whole val-set')
    # NOTE: the help strings below were previously concatenated without a
    # separating space ("threadsper GPU"); fixed along with spelling typos.
    group.add_argument('--dali-threads', type=int, default=3,
                       help='number of threads per GPU for DALI')
    group.add_argument('--validation-dali-threads', type=int, default=10,
                       help='number of threads per GPU for DALI for validation')
    group.add_argument('--dali-prefetch-queue', type=int, default=3,
                       help='DALI prefetch queue depth')
    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=16,
                       help='Memory padding value for nvJPEG (in MB)')
    return parser
+
+
# ImageNet per-channel mean/std (RGB order), scaled from [0,1] fractions to
# the 0-255 pixel range expected by CropMirrorNormalize.
_mean_pixel = [255 * x for x in (0.485, 0.456, 0.406)]
_std_pixel  = [255 * x for x in (0.229, 0.224, 0.225)]
+
class HybridTrainPipe(Pipeline):
    """DALI training pipeline: sharded RecordIO reading, hybrid JPEG decode,
    random resized crop, random horizontal flip, and normalization on GPU."""

    def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
                 shard_id, num_shards, crop_shape,
                 nvjpeg_padding, prefetch_queue=3,
                 output_layout=types.NCHW, pad_output=True, dtype='float16'):
        super(HybridTrainPipe, self).__init__(
            batch_size, num_threads, device_id,
            seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
        out_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT
        # Shuffled, sharded reader over the MXNet RecordIO training set.
        self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
                                     random_shuffle=True,
                                     shard_id=shard_id, num_shards=num_shards)
        # Hybrid CPU/GPU JPEG decoding with pre-allocated nvJPEG buffers.
        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
                                        device_memory_padding=nvjpeg_padding,
                                        host_memory_padding=nvjpeg_padding)
        self.rrc = ops.RandomResizedCrop(device="gpu", size=crop_shape)
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_dtype=out_dtype,
                                            output_layout=output_layout,
                                            crop=crop_shape,
                                            pad_output=pad_output,
                                            image_type=types.RGB,
                                            mean=_mean_pixel,
                                            std=_std_pixel)
        # Per-sample coin flip driving the random horizontal mirror.
        self.coin = ops.CoinFlip(probability=0.5)

    def define_graph(self):
        mirror_flag = self.coin()
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        images = self.rrc(images)
        output = self.cmnp(images, mirror=mirror_flag)
        return [output, self.labels]
+
+
class HybridValPipe(Pipeline):
    """DALI validation pipeline: sequential RecordIO reading, hybrid JPEG
    decode, optional shorter-edge resize, then center crop + normalization."""

    def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
                 shard_id, num_shards, crop_shape,
                 nvjpeg_padding, prefetch_queue=3,
                 resize_shp=None,
                 output_layout=types.NCHW, pad_output=True, dtype='float16'):
        super(HybridValPipe, self).__init__(
            batch_size, num_threads, device_id,
            seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
        out_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT
        # Deterministic (unshuffled) reader so validation order is stable.
        self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
                                     random_shuffle=False,
                                     shard_id=shard_id, num_shards=num_shards)
        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
                                        device_memory_padding=nvjpeg_padding,
                                        host_memory_padding=nvjpeg_padding)
        # Resize the shorter edge only when a target size was requested.
        self.resize = ops.Resize(device="gpu", resize_shorter=resize_shp) if resize_shp else None
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_dtype=out_dtype,
                                            output_layout=output_layout,
                                            crop=crop_shape,
                                            pad_output=pad_output,
                                            image_type=types.RGB,
                                            mean=_mean_pixel,
                                            std=_std_pixel)

    def define_graph(self):
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        if self.resize:
            images = self.resize(images)
        return [self.cmnp(images), self.labels]
+
+
def get_rec_iter(args, kv=None):
    """Build DALI train/validation iterators from parsed CLI arguments.

    Returns ``(train_iter, val_iter)``; ``val_iter`` is None when no
    validation record file was configured.
    """
    # Shorter-edge length every validation image is resized to.
    resize = int(args.resize)
    # Final (C,H,W) shape fed to the network; images are cropped to H,W.
    target_shape = tuple(int(l) for l in args.image_shape.split(','))
    pad_output = target_shape[0] == 4
    # Drop empty strings a stray comma in --gpus would otherwise produce.
    gpus = [int(g) for g in args.gpus.split(',') if g]
    batch_size = args.batch_size // len(gpus)

    # The input_layout w.r.t. the model is the output_layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    rank = kv.rank if kv else 0
    nWrk = kv.num_workers if kv else 1

    # Settings shared by the training and validation pipelines.
    common = dict(batch_size     = batch_size,
                  crop_shape     = target_shape[1:],
                  output_layout  = output_layout,
                  pad_output     = pad_output,
                  dtype          = args.dtype,
                  nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
                  prefetch_queue = args.dali_prefetch_queue)

    trainpipes = [HybridTrainPipe(num_threads = args.dali_threads,
                                  device_id   = gpu_id,
                                  rec_path    = args.data_train,
                                  idx_path    = args.data_train_idx,
                                  shard_id    = gpus.index(gpu_id) + len(gpus) * rank,
                                  num_shards  = len(gpus) * nWrk,
                                  **common)
                  for gpu_id in gpus]

    if args.data_val:
        valpipes = [HybridValPipe(num_threads = args.validation_dali_threads,
                                  device_id   = gpu_id,
                                  rec_path    = args.data_val,
                                  idx_path    = args.data_val_idx,
                                  shard_id    = 0 if args.separ_val
                                                  else gpus.index(gpu_id) + len(gpus) * rank,
                                  num_shards  = 1 if args.separ_val else len(gpus) * nWrk,
                                  resize_shp  = resize,
                                  **common)
                    for gpu_id in gpus]
    else:
        valpipes = None

    trainpipes[0].build()
    if args.data_val:
        valpipes[0].build()

    if args.num_examples < trainpipes[0].epoch_size("Reader"):
        warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader")))

    dali_train_iter = DALIClassificationIterator(trainpipes, args.num_examples // nWrk)
    if args.data_val:
        dali_val_iter = DALIClassificationIterator(
            valpipes,
            valpipes[0].epoch_size("Reader") // (1 if args.separ_val else nWrk),
            fill_last_batch = False)
    else:
        dali_val_iter = None
    return dali_train_iter, dali_val_iter
+

+ 283 - 0
MxNet/Classification/RN50v1.5/data.py

@@ -0,0 +1,283 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mxnet as mx
+import random
+import argparse
+from mxnet.io import DataBatch, DataIter
+import numpy as np
+
def add_data_args(parser):
    """Register input-data, layout and kernel-tuning options on *parser*.

    Returns the created argument group.
    """
    data = parser.add_argument_group('Data', 'the input images')
    # (flag, kwargs) pairs, kept in display order.
    options = [
        ('--data-train', dict(type=str, help='the training data')),
        ('--data-train-idx', dict(type=str, default='', help='the index of training data')),
        ('--data-val', dict(type=str, help='the validation data')),
        ('--data-val-idx', dict(type=str, default='', help='the index of validation data')),
        ('--rgb-mean', dict(type=str, default='123.68,116.779,103.939',
                            help='a tuple of size 3 for the mean rgb')),
        ('--rgb-std', dict(type=str, default='1,1,1',
                           help='a tuple of size 3 for the std rgb')),
        ('--pad-size', dict(type=int, default=0,
                            help='padding the input image')),
        ('--fill-value', dict(type=int, default=127,
                              help='Set the padding pixels value to fill_value')),
        ('--image-shape', dict(type=str,
                               help='the image shape feed into the network, e.g. (3,224,224)')),
        ('--num-classes', dict(type=int, help='the number of classes')),
        ('--num-examples', dict(type=int, help='the number of training examples')),
        ('--data-nthreads', dict(type=int, default=4,
                                 help='number of threads for data decoding')),
        ('--benchmark-iters', dict(type=int, default=None,
                                   help='run only benchmark-iters iterations from each epoch')),
        ('--input-layout', dict(type=str, default='NCHW',
                                help='the layout of the input data (e.g. NCHW)')),
        ('--conv-layout', dict(type=str, default='NCHW',
                               help='the layout of the data assumed by the conv operation (e.g. NCHW)')),
        ('--conv-algo', dict(type=int, default=-1,
                             help='set the convolution algos (fwd, dgrad, wgrad)')),
        ('--batchnorm-layout', dict(type=str, default='NCHW',
                                    help='the layout of the data assumed by the batchnorm operation (e.g. NCHW)')),
        ('--batchnorm-eps', dict(type=float, default=2e-5,
                                 help='the amount added to the batchnorm variance to prevent output explosion.')),
        ('--batchnorm-mom', dict(type=float, default=0.9,
                                 help='the leaky-integrator factor controling the batchnorm mean and variance.')),
        ('--pooling-layout', dict(type=str, default='NCHW',
                                  help='the layout of the data assumed by the pooling operation (e.g. NCHW)')),
        ('--verbose', dict(type=int, default=0,
                           help='turn on reporting of chosen algos for convolution, etc.')),
        ('--seed', dict(type=int, default=None,
                        help='set the seed for python, nd and mxnet rngs')),
        ('--custom-bn-off', dict(type=int, default=0,
                                 help='disable use of custom batchnorm kernel')),
        ('--fuse-bn-relu', dict(type=int, default=0,
                                help='have batchnorm kernel perform activation relu')),
        ('--fuse-bn-add-relu', dict(type=int, default=0,
                                    help='have batchnorm kernel perform add followed by activation relu')),
        ('--force-tensor-core', dict(type=int, default=0,
                                     help='require conv algos to be tensor core')),
    ]
    for flag, kwargs in options:
        data.add_argument(flag, **kwargs)
    return data
+
# Action to translate --set-resnet-aug flag to its component settings.
class SetResnetAugAction(argparse.Action):
    """Argparse action that expands the zero-argument ``--set-resnet-aug``
    flag into the individual standard resnet-training augmentation settings.
    """
    def __init__(self, nargs=0, **kwargs):
        if nargs != 0:
            raise ValueError('nargs for SetResnetAug must be 0.')
        super(SetResnetAugAction, self).__init__(nargs=nargs, **kwargs)
    def __call__(self, parser, namespace, values, option_string=None):
        # standard data augmentation setting for resnet training;
        # random_crop is disabled because random_resized_crop supersedes it —
        # this keeps the action consistent with the set_resnet_aug() helper,
        # which sets random_crop=0, random_resized_crop=1.
        setattr(namespace, 'random_crop', 0)
        setattr(namespace, 'random_resized_crop', 1)
        setattr(namespace, 'random_mirror', 1)
        setattr(namespace, 'min_random_area', 0.08)
        setattr(namespace, 'max_random_aspect_ratio', 4./3.)
        setattr(namespace, 'min_random_aspect_ratio', 3./4.)
        setattr(namespace, 'brightness', 0.4)
        setattr(namespace, 'contrast', 0.4)
        setattr(namespace, 'saturation', 0.4)
        setattr(namespace, 'pca_noise', 0.1)
        # record that this --set-resnet-aug 'macro arg' has been invoked
        setattr(namespace, self.dest, 1)
+
# Counterpart of SetResnetAugAction for use inside a training script:
# installs the standard resnet augmentation settings as parser defaults.
def set_resnet_aug(aug):
    defaults = dict(
        random_crop=0,
        random_resized_crop=1,
        random_mirror=1,
        min_random_area=0.08,
        min_random_aspect_ratio=3./4.,
        max_random_aspect_ratio=4./3.,
        brightness=0.4,
        contrast=0.4,
        saturation=0.4,
        pca_noise=0.1,
    )
    aug.set_defaults(**defaults)
+
# Action to translate --set-data-aug-level <N> arg to its component settings.
class SetDataAugLevelAction(argparse.Action):
    """Argparse action that expands ``--set-data-aug-level <N>`` into the
    augmentation settings enabled at level N (levels are cumulative)."""
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")
        super(SetDataAugLevelAction, self).__init__(option_strings, dest, **kwargs)
    def __call__(self, parser, namespace, values, option_string=None):
        level = values
        # record that this --set-data-aug-level <N> 'macro arg' has been invoked
        setattr(namespace, self.dest, level)
        settings = []
        if level >= 1:
            settings += [('random_crop', 1), ('random_mirror', 1)]
        if level >= 2:
            settings += [('max_random_h', 36), ('max_random_s', 50),
                         ('max_random_l', 50)]
        if level >= 3:
            settings += [('max_random_rotate_angle', 10),
                         ('max_random_shear_ratio', 0.1),
                         ('max_random_aspect_ratio', 0.25)]
        for name, value in settings:
            setattr(namespace, name, value)
+
# Counterpart of SetDataAugLevelAction for use inside a training script:
# installs the level-dependent augmentation settings as parser defaults
# (levels are cumulative).
def set_data_aug_level(aug, level):
    per_level = (
        dict(random_crop=1, random_mirror=1),
        dict(max_random_h=36, max_random_s=50, max_random_l=50),
        dict(max_random_rotate_angle=10, max_random_shear_ratio=0.1,
             max_random_aspect_ratio=0.25),
    )
    for threshold, defaults in enumerate(per_level, start=1):
        if level >= threshold:
            aug.set_defaults(**defaults)
+
def add_data_aug_args(parser):
    """Register image-augmentation options (backed by src/io/image_aug_default.cc).

    Returns the created argument group.
    """
    aug = parser.add_argument_group(
        'Image augmentations', 'implemented in src/io/image_aug_default.cc')
    # (flag, kwargs) pairs, kept in display order.
    options = [
        ('--random-crop',
         dict(type=int, default=0, help='if or not randomly crop the image')),
        ('--random-mirror',
         dict(type=int, default=0, help='if or not randomly flip horizontally')),
        ('--max-random-h',
         dict(type=int, default=0, help='max change of hue, whose range is [0, 180]')),
        ('--max-random-s',
         dict(type=int, default=0, help='max change of saturation, whose range is [0, 255]')),
        ('--max-random-l',
         dict(type=int, default=0, help='max change of intensity, whose range is [0, 255]')),
        ('--min-random-aspect-ratio',
         dict(type=float, default=None,
              help='min value of aspect ratio, whose value is either None or a positive value.')),
        ('--max-random-aspect-ratio',
         dict(type=float, default=0,
              help='max value of aspect ratio. If min_random_aspect_ratio is None, '
                   'the aspect ratio range is [1-max_random_aspect_ratio, '
                   '1+max_random_aspect_ratio], otherwise it is '
                   '[min_random_aspect_ratio, max_random_aspect_ratio].')),
        ('--max-random-rotate-angle',
         dict(type=int, default=0, help='max angle to rotate, whose range is [0, 360]')),
        ('--max-random-shear-ratio',
         dict(type=float, default=0, help='max ratio to shear, whose range is [0, 1]')),
        ('--max-random-scale',
         dict(type=float, default=1, help='max ratio to scale')),
        ('--min-random-scale',
         dict(type=float, default=1,
              help='min ratio to scale, should >= img_size/input_shape. '
                   'otherwise use --pad-size')),
        ('--max-random-area',
         dict(type=float, default=1,
              help='max area to crop in random resized crop, whose range is [0, 1]')),
        ('--min-random-area',
         dict(type=float, default=1,
              help='min area to crop in random resized crop, whose range is [0, 1]')),
        ('--min-crop-size',
         dict(type=int, default=-1,
              help='Crop both width and height into a random size in '
                   '[min_crop_size, max_crop_size]')),
        ('--max-crop-size',
         dict(type=int, default=-1,
              help='Crop both width and height into a random size in '
                   '[min_crop_size, max_crop_size]')),
        ('--brightness',
         dict(type=float, default=0, help='brightness jittering, whose range is [0, 1]')),
        ('--contrast',
         dict(type=float, default=0, help='contrast jittering, whose range is [0, 1]')),
        ('--saturation',
         dict(type=float, default=0, help='saturation jittering, whose range is [0, 1]')),
        ('--pca-noise',
         dict(type=float, default=0, help='pca noise, whose range is [0, 1]')),
        ('--random-resized-crop',
         dict(type=int, default=0, help='whether to use random resized crop')),
        ('--set-resnet-aug',
         dict(action=SetResnetAugAction,
              help='whether to employ standard resnet augmentations (see data.py)')),
        ('--set-data-aug-level',
         dict(type=int, default=None, action=SetDataAugLevelAction,
              help='set multiple data augmentations based on a `level` (see data.py)')),
    ]
    for flag, kwargs in options:
        aug.add_argument(flag, **kwargs)
    return aug
+
def get_rec_iter(args, kv=None):
    """Create MXNet ImageRecordIter iterators for training and validation.

    Returns ``(train, val)``; ``val`` is None when --data-val was not given.
    Raises ValueError for NHWC input layout, which ImageRecordIter cannot
    produce.
    """
    image_shape = tuple(int(l) for l in args.image_shape.split(','))
    if args.input_layout == 'NHWC':
        # Permute (C,H,W) -> (H,W,C) to match the requested layout.
        image_shape = image_shape[1:] + (image_shape[0],)
    rank, nworker = (kv.rank, kv.num_workers) if kv else (0, 1)
    rgb_mean = [float(i) for i in args.rgb_mean.split(',')]
    rgb_std = [float(i) for i in args.rgb_std.split(',')]
    if args.input_layout == 'NHWC':
        raise ValueError('ImageRecordIter cannot handle layout {}'.format(args.input_layout))

    # Settings shared by the training and validation iterators.
    common = dict(
        label_width        = 1,
        mean_r             = rgb_mean[0],
        mean_g             = rgb_mean[1],
        mean_b             = rgb_mean[2],
        std_r              = rgb_std[0],
        std_g              = rgb_std[1],
        std_b              = rgb_std[2],
        data_name          = 'data',
        label_name         = 'softmax_label',
        data_shape         = image_shape,
        batch_size         = args.batch_size,
        preprocess_threads = args.data_nthreads,
        num_parts          = nworker,
        part_index         = rank,
    )

    train = mx.io.ImageRecordIter(
        path_imgrec         = args.data_train,
        path_imgidx         = args.data_train_idx,
        rand_crop           = args.random_crop,
        max_random_scale    = args.max_random_scale,
        pad                 = args.pad_size,
        fill_value          = args.fill_value,
        random_resized_crop = args.random_resized_crop,
        min_random_scale    = args.min_random_scale,
        max_aspect_ratio    = args.max_random_aspect_ratio,
        min_aspect_ratio    = args.min_random_aspect_ratio,
        max_random_area     = args.max_random_area,
        min_random_area     = args.min_random_area,
        min_crop_size       = args.min_crop_size,
        max_crop_size       = args.max_crop_size,
        brightness          = args.brightness,
        contrast            = args.contrast,
        saturation          = args.saturation,
        pca_noise           = args.pca_noise,
        random_h            = args.max_random_h,
        random_s            = args.max_random_s,
        random_l            = args.max_random_l,
        max_rotate_angle    = args.max_random_rotate_angle,
        max_shear_ratio     = args.max_random_shear_ratio,
        rand_mirror         = args.random_mirror,
        shuffle             = True,
        **common)
    if args.data_val is None:
        return (train, None)
    # Validation: deterministic decode, no augmentation, keep the last
    # (possibly short) batch.
    val = mx.io.ImageRecordIter(
        path_imgrec         = args.data_val,
        path_imgidx         = args.data_val_idx,
        round_batch         = False,
        rand_crop           = False,
        rand_mirror         = False,
        **common)
    return (train, val)

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 benchmark in FP16 on 1,4,8 GPUs with 64,128,192,208 batch size
# Usage ./BENCHMARK_FP16.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1,4,8 -b 64,128,192,208 -e 2 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 benchmark in FP32 on 1,4,8 GPUs with 32,64,96 batch size
# Usage ./BENCHMARK_FP32.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1,4,8 -b 32,64,96 -e 2 -w 1 -i 100 --dtype float32 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 inference benchmark in FP16 on 1 GPU with 1,2,4,64,128,192,208 batch size
# Usage ./INFER_BENCHMARK_FP16.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1 -b 1,2,4,64,128,192,208 --only-inference -e 3 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 inference benchmark in FP32 on 1 GPU with 1,2,4,32,64,96 batch size
# Usage ./INFER_BENCHMARK_FP32.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1 -b 1,2,4,32,64,96 --only-inference -e 3 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_1GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 1 GPUs using 208 batch size (208 per GPU)
# Usage ./RN50_FP16_1GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 1 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_4GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 4 GPUs using 832 batch size (208 per GPU)
# Usage ./RN50_FP16_4GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 4 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_8GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 8 GPUs using 1664 batch size (208 per GPU)
# Usage ./RN50_FP16_8GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 8 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_1GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 1 GPU using 96 batch size (96 per GPU)
+# Usage: ./RN50_FP32_1GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 1 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_4GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 4 GPUs using 384 batch size (96 per GPU)
+# Usage: ./RN50_FP32_4GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 4 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_8GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 8 GPUs using 768 batch size (96 per GPU)
+# Usage: ./RN50_FP32_8GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 8 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/SCORE_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script scores a ResNet50 checkpoint in FP16 on 1 GPU using batch size 128
+# Usage: ./SCORE_FP16.sh <model prefix> <epoch> <additional flags>
+
+# NOTE(review): $1/$2 are unquoted — prefixes containing spaces will break; confirm and quote if needed.
+./runner -n 1 -b 128 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/SCORE_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script scores a ResNet50 checkpoint in FP32 on 1 GPU using batch size 64
+# Usage: ./SCORE_FP32.sh <model prefix> <epoch> <additional flags>
+
+# NOTE(review): $1/$2 are unquoted — prefixes containing spaces will break; confirm and quote if needed.
+./runner -n 1 -b 64 --dtype float32 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

+ 463 - 0
MxNet/Classification/RN50v1.5/fit.py

@@ -0,0 +1,463 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" example train fit utility """
+import logging
+import os
+import time
+import re
+import math
+import sys
+import mxnet as mx
+from report import Report
+from benchmarking import BenchmarkingDataIter
+
+def get_epoch_size(args, kv):
+    """Return the number of batches per epoch for one worker.
+
+    Examples are divided evenly across ``kv.num_workers``; the trailing
+    partial batch still counts as a full step (hence the ceil).
+    """
+    return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
+
+def _get_lr_scheduler(args, kv):
+    """Build the learning-rate schedule from CLI arguments.
+
+    Returns a ``(base_lr, scheduler)`` pair. ``scheduler`` is None when no
+    decay is requested (``lr_factor >= 1``) or every decay step already lies
+    in the past relative to the resume epoch.
+    """
+    if 'lr_factor' not in args or args.lr_factor >= 1:
+        return (args.lr, None)
+    epoch_size = get_epoch_size(args, kv)
+    begin_epoch = args.load_epoch if args.load_epoch else 0
+    # A value like "pow2" in --lr-step-epochs selects polynomial decay with
+    # the given power instead of step decay.
+    if 'pow' in args.lr_step_epochs:
+        lr = args.lr
+        max_up = args.num_epochs * epoch_size
+        pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs))
+        poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
+        return (lr, poly_sched)
+    step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
+    lr = args.lr
+    # When resuming from a checkpoint, pre-apply the decay for every step
+    # boundary that has already been passed.
+    for s in step_epochs:
+        if begin_epoch >= s:
+            lr *= args.lr_factor
+    if lr != args.lr:
+        logging.info('Adjust learning rate to %e for epoch %d',
+                     lr, begin_epoch)
+
+    # Remaining decay points, expressed in iterations from the resume point.
+    steps = [epoch_size * (x - begin_epoch)
+             for x in step_epochs if x - begin_epoch > 0]
+    if steps:
+        if kv:
+            num_workers = kv.num_workers
+        else:
+            num_workers = 1
+        # NOTE(review): this recomputes what get_epoch_size() already returned
+        # above (kv is always truthy here) — presumably redundant; confirm.
+        epoch_size = math.ceil(int(args.num_examples/num_workers)/args.batch_size)
+        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
+                                                         base_lr=args.lr, warmup_steps=epoch_size * args.warmup_epochs,
+                                                         warmup_mode=args.warmup_strategy))
+    else:
+        return (lr, None)
+
+def _load_model(args, rank=0):
+    """Load a checkpoint as ``(symbol, arg_params, aux_params)``.
+
+    Returns ``(None, None, None)`` when --load-epoch was not given. A
+    non-zero rank prefers its own per-rank checkpoint ("<prefix>-<rank>")
+    when one exists on disk, matching how _save_model names them.
+    """
+    if 'load_epoch' not in args or args.load_epoch is None:
+        return (None, None, None)
+    assert args.model_prefix is not None
+    model_prefix = args.model_prefix
+    if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
+        model_prefix += "-%d" % (rank)
+    sym, arg_params, aux_params = mx.model.load_checkpoint(
+        model_prefix, args.load_epoch)
+    logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch)
+    return (sym, arg_params, aux_params)
+
+
+def _save_model(args, rank=0):
+    """Return an epoch-end checkpoint callback, or None if --model-prefix is unset.
+
+    Rank 0 saves under the bare prefix; other ranks append "-<rank>" so
+    workers never clobber each other's files.
+    """
+    if args.model_prefix is None:
+        return None
+    return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
+        args.model_prefix, rank), period=args.save_period)
+
+
+def add_fit_args(parser):
+    """Register all training-related CLI options on *parser*.
+
+    parser : argparse.ArgumentParser
+    Returns the "Training" argument group the options were added to.
+    """
+    train = parser.add_argument_group('Training', 'model training')
+    train.add_argument('--num-layers', type=int,
+                       help='number of layers in the neural network, \
+                             required by some networks such as resnet')
+    train.add_argument('--gpus', type=str,
+                       help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
+    train.add_argument('--kv-store', type=str, default='device',
+                       help='key-value store type')
+    train.add_argument('--num-epochs', type=int, default=100,
+                       help='max num of epochs')
+    train.add_argument('--lr', type=float, default=0.1,
+                       help='initial learning rate')
+    train.add_argument('--lr-factor', type=float, default=0.1,
+                       help='the ratio to reduce lr on each step')
+    train.add_argument('--lr-step-epochs', type=str,
+                       help='the epochs to reduce the lr, e.g. 30,60')
+    train.add_argument('--initializer', type=str, default='default',
+                       help='the initializer type')
+    train.add_argument('--optimizer', type=str, default='sgd',
+                       help='the optimizer type')
+    train.add_argument('--mom', type=float, default=0.9,
+                       help='momentum for sgd')
+    train.add_argument('--wd', type=float, default=0.0001,
+                       help='weight decay for sgd')
+    train.add_argument('--batch-size', type=int, default=208,
+                       help='the batch size')
+    train.add_argument('--disp-batches', type=int, default=20,
+                       help='show progress for every n batches')
+    train.add_argument('--model-prefix', type=str,
+                       help='model prefix')
+    train.add_argument('--save-period', type=int, default=1, help='params saving period')
+    # NOTE(review): --monitor is registered on the bare parser instead of the
+    # "Training" group like every other option — looks unintentional; confirm.
+    parser.add_argument('--monitor', dest='monitor', type=int, default=0,
+                        help='log network parameters every N iters if larger than 0')
+    train.add_argument('--load-epoch', type=int,
+                       help='load the model on an epoch using the model-load-prefix')
+    train.add_argument('--loss', type=str, default='',
+                       help='show the cross-entropy or nll loss. ce strands for cross-entropy, nll-loss stands for likelihood loss')
+    train.add_argument('--test-io', type=int, default=0,
+                       help='1 means test reading speed without training')
+    train.add_argument('--dtype', type=str, default='float16',
+                       help='precision: float32 or float16')
+    train.add_argument('--gc-type', type=str, default='none',
+                       help='type of gradient compression to use, \
+                             takes `2bit` or `none` for now')
+    train.add_argument('--gc-threshold', type=float, default=0.5,
+                       help='threshold for 2bit gradient compression')
+    # additional parameters for large batch sgd
+    train.add_argument('--macrobatch-size', type=int, default=0,
+                       help='distributed effective batch size')
+    train.add_argument('--warmup-epochs', type=int, default=5,
+                       help='the epochs to ramp-up lr to scaled large-batch value')
+    train.add_argument('--warmup-strategy', type=str, default='linear',
+                       help='the ramping-up strategy for large batch sgd')
+    train.add_argument('--logging-dir', type=str, default='logs')
+    train.add_argument('--log', type=str, default='')
+    train.add_argument('--bn-gamma-init0', action='store_true')
+    train.add_argument('--epoch-size',type=int, default=0,
+                       help='set number of batches in an epoch. useful for debugging')
+    #train.add_argument('--tensorboard', type=str, default='',
+    #                   help='log parameters to visualize in tensorboard every epoch. takes name to specify as tensorboard run. Empty means tensorboard logging is disabled')
+    train.add_argument('--profile-worker-suffix', type=str, default='',
+                       help='profile workers actions into this file. During distributed training\
+                             filename saved will be rank1_ followed by this suffix')
+    train.add_argument('--profile-server-suffix', type=str, default='',
+                       help='profile server actions into a file with name like rank1_ followed by this suffix \
+                             during distributed training')
+    train.add_argument('--report', type=str, help='file where to save report')
+    train.add_argument('--only-inference', action='store_true', help='do not train, only inference (for benchmarking)')
+    train.add_argument('--no-metrics', action='store_true', help='do not calculate evaluation metrics (for benchmarking)')
+    return train
+
+
+def fit(args, network, data_loader, **kwargs):
+    """
+    train a model
+    args : argparse returns
+    network : the symbol definition of the neural network
+    data_loader : function that returns the train and val data iterators
+
+    Optional kwargs: 'arg_params'/'aux_params' (pre-loaded weights) and
+    'batch_end_callback' (extra per-batch callbacks). Note: args.use_dali
+    and args.benchmark_iters are read here but registered by sibling
+    modules (dali/benchmarking argument groups) — presumably; confirm.
+    """
+
+    start_time = time.time()
+
+    # kvstore
+    kv = mx.kvstore.create(args.kv_store)
+    if args.gc_type != 'none':
+        kv.set_gradient_compression({'type': args.gc_type,
+                                     'threshold': args.gc_threshold})
+    if args.profile_server_suffix:
+        mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server')
+        mx.profiler.set_state(state='run', profile_process='server')
+
+    if args.profile_worker_suffix:
+        # In distributed runs each worker profiles into its own rank-prefixed file.
+        if kv.num_workers > 1:
+            filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
+        else:
+            filename = args.profile_worker_suffix
+        mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
+        mx.profiler.set_state(state='run', profile_process='worker')
+
+    # logging
+    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    logging.info('start with arguments %s', args)
+
+    epoch_size = get_epoch_size(args, kv)
+
+    # data iterators
+    (train, val) = data_loader(args, kv)
+    if 'dist' in args.kv_store and not 'async' in args.kv_store:
+        logging.info('Resizing training data to %d batches per machine', epoch_size)
+        # resize train iter to ensure each machine has same number of batches per epoch
+        # if not, dist_sync can hang at the end with one machine waiting for other machines
+        if not args.use_dali:
+            train = mx.io.ResizeIter(train, epoch_size)
+
+    if args.test_io:
+        # Data-reading benchmark mode: drain the train iterator, print the
+        # I/O throughput, and return without training.
+        tic = time.time()
+        for i, batch in enumerate(train):
+            if isinstance(batch, list):
+                for b in batch:
+                    for j in b.data:
+                        j.wait_to_read()
+            else:
+                for j in batch.data:
+                    j.wait_to_read()
+            if (i + 1) % args.disp_batches == 0:
+                logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
+                             args.disp_batches * args.batch_size / (time.time() - tic))
+                tic = time.time()
+        return
+
+    # load model: caller-supplied weights take precedence over --load-epoch.
+    if 'arg_params' in kwargs and 'aux_params' in kwargs:
+        arg_params = kwargs['arg_params']
+        aux_params = kwargs['aux_params']
+    else:
+        sym, arg_params, aux_params = _load_model(args, kv.rank)
+
+    # save model
+    checkpoint = _save_model(args, kv.rank)
+    epoch_end_callbacks = []
+    if checkpoint:
+        epoch_end_callbacks.append(checkpoint)
+
+    # devices for training
+    devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
+        mx.gpu(int(i)) for i in args.gpus.split(',')]
+
+    # learning rate
+    lr, lr_scheduler = _get_lr_scheduler(args, kv)
+
+    # create model
+    model = mx.mod.Module(
+        context=devs,
+        symbol=network
+    )
+
+    # multi_precision keeps an FP32 master copy of weights for FP16 training.
+    optimizer_params = {
+        'learning_rate': lr,
+        'wd': args.wd,
+        'lr_scheduler': lr_scheduler,
+        'multi_precision': True}
+
+    # Only a limited number of optimizers have 'momentum' property
+    has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
+    if args.optimizer in has_momentum:
+        optimizer_params['momentum'] = args.mom
+
+    monitor = mx.mon.Monitor(
+        args.monitor, pattern=".*") if args.monitor > 0 else None
+
+    # A limited number of optimizers have a warmup period
+    has_warmup = {'lbsgd', 'lbnag'}
+    if args.optimizer in has_warmup:
+        if 'dist' in args.kv_store:
+            nworkers = kv.num_workers
+        else:
+            nworkers = 1
+        epoch_size = args.num_examples / args.batch_size / nworkers
+
+        if epoch_size < 1:
+            epoch_size = 1
+        macrobatch_size = args.macrobatch_size
+        if macrobatch_size < args.batch_size * nworkers:
+            macrobatch_size = args.batch_size * nworkers
+        # batch_scale = how many real batches make up one macro-batch update.
+        batch_scale = math.ceil(
+            float(macrobatch_size) / args.batch_size / nworkers)
+        optimizer_params['updates_per_epoch'] = epoch_size
+        optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
+        optimizer_params['batch_scale'] = batch_scale
+        optimizer_params['warmup_strategy'] = args.warmup_strategy
+        optimizer_params['warmup_epochs'] = args.warmup_epochs
+        optimizer_params['num_epochs'] = args.num_epochs
+
+    if args.initializer == 'default':
+        initializer = mx.init.Xavier(
+            rnd_type='gaussian', factor_type="in", magnitude=2)
+    elif args.initializer == 'xavier':
+        initializer = mx.init.Xavier()
+    elif args.initializer == 'msra':
+        initializer = mx.init.MSRAPrelu()
+    elif args.initializer == 'orthogonal':
+        initializer = mx.init.Orthogonal()
+    elif args.initializer == 'normal':
+        initializer = mx.init.Normal()
+    elif args.initializer == 'uniform':
+        initializer = mx.init.Uniform()
+    elif args.initializer == 'one':
+        initializer = mx.init.One()
+    elif args.initializer == 'zero':
+        initializer = mx.init.Zero()
+
+    # evaluation metrics (skipped entirely with --no-metrics for benchmarking)
+    if not args.no_metrics:
+        eval_metrics = ['crossentropy', 'accuracy']
+        eval_metrics.append(mx.metric.create(
+            'top_k_accuracy', top_k=5))
+    else:
+        eval_metrics = []
+
+    supported_loss = ['ce', 'nll_loss']
+    if len(args.loss) > 0:
+        # ce or nll loss is only applicable to softmax output
+        loss_type_list = args.loss.split(',')
+        if 'softmax_output' in network.list_outputs():
+            for loss_type in loss_type_list:
+                loss_type = loss_type.strip()
+                if loss_type == 'nll':
+                    loss_type = 'nll_loss'
+                if loss_type not in supported_loss:
+                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
+                                    'negative likelihood loss is supported!')
+                else:
+                    eval_metrics.append(mx.metric.create(loss_type))
+        else:
+            logging.warning("The output is not softmax_output, loss argument will be skipped!")
+
+    # callbacks that run after each batch
+    batch_end_callbacks = []
+    batch_end_callbacks.append(mx.callback.Speedometer(
+        args.batch_size, args.disp_batches))
+
+    if 'batch_end_callback' in kwargs:
+        cbs = kwargs['batch_end_callback']
+        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
+
+
+    report = Report('resnet{}'.format(args.num_layers), len(args.gpus.split(',')), sys.argv)
+
+    # Wrap iterators to measure per-batch data-loading time (and optionally
+    # cap the number of iterations for benchmarking runs).
+    train = BenchmarkingDataIter(train, args.benchmark_iters)
+    val = BenchmarkingDataIter(val, args.benchmark_iters)
+
+    class Gatherer:
+        """Accumulates per-epoch timing/metric values and pushes them into
+        the report at epoch end. The first batch of each epoch is excluded
+        from the timing average (warm-up / initialization cost)."""
+        def __init__(self, report, mode, data_iter, total_bs=None):
+            self.report = report
+            self.mode = mode
+            self.total_bs = total_bs
+            self.data_iter = data_iter
+            self.clear()
+
+        def clear(self):
+            # Reset all accumulators; called at construction and after each epoch.
+            self.num = 0
+            self.top1 = 0
+            self.top5 = 0
+            self.loss = 0
+            self.time = 0
+            self.tic = 0
+
+        def gather_metrics(self, data):
+            # Batch-end callback: snapshot the current global metric values
+            # and accumulate inter-batch wall time (first batch skipped).
+            params = dict(data.eval_metric.get_global_name_value())
+
+            if self.num != 0:
+                self.time += time.time() - self.tic
+            self.num += 1
+            if not args.no_metrics:
+                self.top1 = params['accuracy']
+                self.top5 = params['top_k_accuracy_5']
+                self.loss = params['cross-entropy']
+
+            self.tic = time.time()
+
+        def add_metrics(self, *a, **k):
+            # Epoch-end callback: flush averaged values into the report and reset.
+            top1 = self.top1 * 100
+            top5 = self.top5 * 100
+            loss = self.loss
+            if self.num <= 1:
+                time = float('nan')
+            else:
+                time = self.time / (self.num - 1)
+            data = self.data_iter.get_avg_time_and_clear()
+            if self.total_bs is not None:
+                compute_ips = self.total_bs / (time - data)
+                total_ips = self.total_bs / time
+
+            if not args.no_metrics:
+                self.report.add_value('{}.top1'.format(self.mode), top1)
+                self.report.add_value('{}.top5'.format(self.mode), top5)
+                self.report.add_value('{}.loss'.format(self.mode), loss)
+            self.report.add_value('{}.time'.format(self.mode), time)
+            if self.total_bs is not None:
+                self.report.add_value('{}.total_ips'.format(self.mode), total_ips)
+            self.clear()
+
+    def save_report(*a, **k):
+        # Persist the report after every epoch so partial results survive crashes.
+        report.set_total_duration(time.time() - start_time)
+        if args.report:
+            report.save(args.report)
+
+    train_gatherer = Gatherer(report, 'train', train, args.batch_size)
+    eval_gatherer = Gatherer(report, 'val', val, args.batch_size)
+
+    batch_end_callbacks = [train_gatherer.gather_metrics] + batch_end_callbacks
+    epoch_end_callbacks = [train_gatherer.add_metrics, save_report] + epoch_end_callbacks
+
+    eval_batch_end_callbacks = [eval_gatherer.gather_metrics]
+    eval_end_callbacks = [eval_gatherer.add_metrics, save_report]
+
+    # run (with --only-inference num_epoch=0, so fit() only binds/initializes
+    # the module and the scoring loop below does the actual work)
+    model.fit(train,
+              begin_epoch=args.load_epoch if args.load_epoch else 0,
+              num_epoch=args.num_epochs if not args.only_inference else 0,
+              eval_data=val,
+              eval_metric=eval_metrics,
+              kvstore=kv,
+              optimizer=args.optimizer,
+              optimizer_params=optimizer_params,
+              initializer=initializer,
+              arg_params=arg_params,
+              aux_params=aux_params,
+              batch_end_callback=batch_end_callbacks,
+              epoch_end_callback=epoch_end_callbacks,
+              eval_batch_end_callback=eval_batch_end_callbacks,
+              eval_end_callback=eval_end_callbacks,
+              allow_missing=True,
+              monitor=monitor)
+
+    if args.only_inference:
+        for epoch in range(args.num_epochs):
+            score = model.score(val, eval_metrics, batch_end_callback=eval_batch_end_callbacks, score_end_callback=eval_end_callbacks, epoch=epoch)
+            print('-------------')
+            for name, value in score:
+                print('{}: {}'.format(name, value))
+
+    # NOTE(review): these were presumably meant to STOP the profiler at
+    # shutdown, but state='run' re-enables it — looks like it should be
+    # state='stop'; confirm against mx.profiler.set_state docs.
+    if args.profile_server_suffix:
+        mx.profiler.set_state(state='run', profile_process='server')
+    if args.profile_worker_suffix:
+        mx.profiler.set_state(state='run', profile_process='worker')
+
+    save_report()
+
+    print('Experiment took: {} sec'.format(report.total_duration))

BIN
MxNet/Classification/RN50v1.5/img/training_accuracy.png


BIN
MxNet/Classification/RN50v1.5/img/training_loss.png


BIN
MxNet/Classification/RN50v1.5/img/validation_accuracy.png


+ 57 - 0
MxNet/Classification/RN50v1.5/report.py

@@ -0,0 +1,57 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Report JSON file structure:
+# - "model"          : architecture of the model (e.g. "resnet50").
+# - "ngpus"          : number of gpus on which training was performed.
+# - "total_duration" : total duration of training in seconds.
+# - "cmd"            : list of application arguments.
+# - "metrics"        : per epoch metrics for train and validation
+#                      (some of below metrics may not exist in the report,
+#                       depending on application arguments)
+#       - "train.top1"      : training top1 accuracy in epoch.
+#       - "train.top5"      : training top5 accuracy in epoch.
+#       - "train.loss"      : training loss in epoch.
+#       - "train.time"      : average training time of iteration in seconds.
+#       - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
+#       - "val.top1", "val.top5", "val.loss", "val.time", "val.total_ips" : the same but for validation.
+
+import json
+from collections import defaultdict, OrderedDict
+
+class Report:
+    """Collects per-epoch training/validation metrics and serializes them
+    to the JSON structure documented at the top of this file."""
+
+    def __init__(self, model_name, ngpus, cmd):
+        # model_name: architecture label (e.g. "resnet50")
+        # ngpus: number of GPUs used for the run
+        # cmd: the application's argv, stored verbatim for reproducibility
+        self.model_name = model_name
+        self.ngpus = ngpus
+        self.cmd = cmd
+        self.total_duration = 0
+        # metric name -> list of per-epoch values (auto-created on first use)
+        self.metrics = defaultdict(lambda: [])
+
+    def add_value(self, metric, value):
+        """Append one epoch's *value* under the given *metric* name."""
+        self.metrics[metric].append(value)
+
+    def set_total_duration(self, duration):
+        """Record the total run duration in seconds."""
+        self.total_duration = duration
+
+    def save(self, filename):
+        """Write the report as indented JSON to *filename* (overwrites)."""
+        report = OrderedDict([
+            ('model', self.model_name),
+            ('ngpus', self.ngpus),
+            ('total_duration', self.total_duration),
+            ('cmd', self.cmd),
+            ('metrics', self.metrics),
+        ])
+        with open(filename, 'w') as f:
+            json.dump(report, f, indent=4)

+ 376 - 0
MxNet/Classification/RN50v1.5/resnet.py

@@ -0,0 +1,376 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+(Original author Wei Wu) by Antti-Pekka Hynninen
+
+"Flexible Layout" (fl) version created by Dick Carter.
+
+Implementing the original resnet ILSVRC 2015 winning network from:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
+'''
+import mxnet as mx
+import numpy as np
+import random
+
+# Transform a symbol from one layout to another, or do nothing if they have the same layout
def transform_layout(data, from_layout, to_layout):
    """Return `data` converted from `from_layout` to `to_layout`.

    Supported layouts are 'NCHW' and 'NHWC'.  When the two layouts are the
    same the symbol is returned untouched; otherwise a transpose symbol is
    inserted.  Raises ValueError for any unsupported layout string.
    """
    supported = ('NCHW', 'NHWC')
    # Validate in argument order so the error message matches the bad input.
    for layout in (from_layout, to_layout):
        if layout not in supported:
            raise ValueError('Not prepared to handle layout: {}'.format(layout))

    if from_layout == to_layout:
        return data
    # Only two supported layouts exist, so a mismatch is NCHW <-> NHWC.
    axes = (0, 2, 3, 1) if from_layout == 'NCHW' else (0, 3, 1, 2)
    return mx.sym.transpose(data, axes=axes)
+
+# A BatchNorm wrapper that responds to the input layout
def batchnorm(data, io_layout, batchnorm_layout, **kwargs):
    """BatchNorm wrapper that runs the kernel in `batchnorm_layout`.

    The input is transposed from `io_layout` to `batchnorm_layout` when they
    differ, normalized over the channel axis of that layout, and transposed
    back so callers always see `io_layout` data.
    """
    bn_input = transform_layout(data, io_layout, batchnorm_layout)
    channel_axis = 1 if batchnorm_layout == 'NCHW' else 3
    normalized = mx.sym.BatchNorm(data=bn_input, axis=channel_axis, **kwargs)
    return transform_layout(normalized, batchnorm_layout, io_layout)
+
+# A BatchNormAddRelu wrapper that responds to the input layout
def batchnorm_add_relu(data, addend, io_layout, batchnorm_layout, **kwargs):
    """Fused BatchNorm+Add+ReLU wrapper that runs in `batchnorm_layout`.

    Both the main input and the residual `addend` are transposed from
    `io_layout` to `batchnorm_layout` as needed; the fused result is
    transposed back to `io_layout` before being returned.
    """
    main_input = transform_layout(data, io_layout, batchnorm_layout)
    residual_input = transform_layout(addend, io_layout, batchnorm_layout)
    channel_axis = 1 if batchnorm_layout == 'NCHW' else 3
    fused = mx.sym.BatchNormAddRelu(data=main_input,
                                    addend=residual_input,
                                    axis=channel_axis, **kwargs)
    return transform_layout(fused, batchnorm_layout, io_layout)
+
+# A Pooling wrapper that responds to the input layout
def pooling(data, io_layout, pooling_layout, **kwargs):
    """Pooling wrapper that runs the kernel in `pooling_layout`.

    Transposes the input from `io_layout` when the two layouts conflict and
    transposes the pooled result back, so callers always see `io_layout`.
    """
    pool_input = transform_layout(data, io_layout, pooling_layout)
    pooled = mx.sym.Pooling(data=pool_input, layout=pooling_layout, **kwargs)
    return transform_layout(pooled, pooling_layout, io_layout)
+
+# Assumption is that data comes in and out in the 'conv_layout' format.
+# If this format is different from the 'batchnorm_layout' format, then the batchnorm() routine
+# will introduce transposes on both sides of the mx.sym.BatchNorm symbol
def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True,
                  workspace=256, memonger=False, conv_layout='NCHW', batchnorm_layout='NCHW',
                  verbose=False, cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
                  fuse_bn_relu=False, fuse_bn_add_relu=False, cudnn_tensor_core_only=False):
    """Return ResNet Unit symbol for building ResNet
    Parameters
    ----------
    data : str
        Input data
    num_filter : int
        Number of output channels
    bnf : int
        Bottle neck channels factor with regard to num_filter
    stride : tuple
        Stride used in convolution
    dim_match : Boolean
        True means channel number between input and output is the same, otherwise means differ
    name : str
        Base name of the operators
    workspace : int
        Workspace used in convolution operator
    bottle_neck : Boolean
        True builds the 3-conv (1x1 reduce, 3x3, 1x1 expand) bottleneck unit;
        False builds the 2-conv (3x3, 3x3) basic unit
    memonger : Boolean
        If True, tag the shortcut with mirror_stage for the memory-monger pass
    conv_layout, batchnorm_layout : str
        Layouts ('NCHW'/'NHWC') for the convolution and batchnorm kernels;
        mismatches introduce transposes inside the batchnorm() wrapper
    verbose : Boolean
        Ask cuDNN/cuBLAS ops to print the algorithms they select
    cudnn_bn_off : Boolean
        Disable the cuDNN BatchNorm implementation
    bn_eps, bn_mom : float
        BatchNorm epsilon and momentum
    conv_algo : int
        Fixed cuDNN convolution algo number (-1 lets cuDNN choose)
    fuse_bn_relu, fuse_bn_add_relu : Boolean
        Use fused BatchNorm+ReLU / BatchNorm+Add+ReLU kernels
    cudnn_tensor_core_only : Boolean
        Restrict cuDNN algo choice to Tensor Core implementations
    """

    # When ReLU is fused into BatchNorm, the BN op applies the activation
    # itself and the separate Activation symbols below are skipped.
    act = 'relu' if fuse_bn_relu else None
    if bottle_neck:
        # 1x1 "reduce" convolution: shrinks channels to num_filter/4.
        conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
                                   no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
        # 3x3 convolution; carries the unit's spatial stride (downsampling).
        conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
                                   no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn2', cudnn_off=cudnn_bn_off, act_type=act)
        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') if not fuse_bn_relu else bn2
        # 1x1 "expand" convolution: restores the full num_filter channels.
        conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
                                   workspace=workspace, name=name + '_conv3', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)

        # Identity shortcut when shapes match; otherwise a strided 1x1
        # "projection" conv + BN (no activation on the shortcut branch).
        if dim_match:
            shortcut = data
        else:
            conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                                            workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
                                         cudnn_algo_verbose=verbose,
                                         cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                         cudnn_tensor_core_only=cudnn_tensor_core_only)
            shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_sc', cudnn_off=cudnn_bn_off)
        if memonger:
            shortcut._set_attr(mirror_stage='True')

        # Final BN + residual add + ReLU, either as one fused kernel or as
        # separate BN / add / Activation symbols.
        if fuse_bn_add_relu:
            return batchnorm_add_relu(data=conv3, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
        else:
            bn3 = batchnorm(data=conv3, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
            return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3')

    else:
        # Basic (non-bottleneck) unit: two 3x3 convolutions; the first one
        # carries the unit's stride.
        conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
                                      no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
        conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
                                      no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)

        # Shortcut branch: identity, or strided 1x1 projection conv + BN.
        if dim_match:
            shortcut = data
        else:
            conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                                            workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
                                         cudnn_algo_verbose=verbose,
                                         cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                         cudnn_tensor_core_only=cudnn_tensor_core_only)
            shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_sc', cudnn_off=cudnn_bn_off)
        if memonger:
            shortcut._set_attr(mirror_stage='True')

        if fuse_bn_add_relu:
            return batchnorm_add_relu(data=conv2, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
        else:
            bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
            return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu2')
+
def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, workspace=256, dtype='float32', memonger=False,
           input_layout='NCHW', conv_layout='NCHW',  batchnorm_layout='NCHW', pooling_layout='NCHW', verbose=False,
           cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
           fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True):
    """Return ResNet symbol of
    Parameters
    ----------
    units : list
        Number of units in each stage
    num_stages : int
        Number of stage
    filter_list : list
        Channel size of each stage
    num_classes : int
        Ouput size of symbol
    dataset : str
        Dataset type, only cifar10 and imagenet supports
    workspace : int
        Workspace used in convolution operator
    dtype : str
        Precision (float32 or float16)
    memonger : boolean
        Activates "memory monger" to reduce the model's memory footprint
    input_layout : str
        interpretation (e.g. NCHW vs NHWC) of data provided by the i/o pipeline (may introduce transposes
        if in conflict with 'layout' above)
    conv_layout : str
        interpretation (e.g. NCHW vs NHWC) of data for convolution operation.
    batchnorm_layout : str
        directs which kernel performs the batchnorm (may introduce transposes if in conflict with 'conv_layout' above)
    pooling_layout : str
        directs which kernel performs the pooling (may introduce transposes if in conflict with 'conv_layout' above)
    use_dali : boolean
        when False an identity/cast symbol is inserted on the input
        (see comment below); DALI pipelines feed data directly
    """

    act = 'relu' if fuse_bn_relu else None
    num_unit = len(units)
    assert(num_unit == num_stages)
    data = mx.sym.Variable(name='data')
    if not use_dali:
        # double buffering of data
        # (fp32 gets a no-op identity; fp16 gets a cast from the fp32 input)
        if dtype == 'float32':
            data = mx.sym.identity(data=data, name='id')
        else:
            if dtype == 'float16':
                data = mx.sym.Cast(data=data, dtype=np.float16)
    (nchannel, height, width) = image_shape

    # Insert transpose as needed to get the input layout to match the desired processing layout
    data = transform_layout(data, input_layout, conv_layout)

    if height <= 32:            # such as cifar10
        # Small-image stem: single 3x3 stride-1 conv, no max-pool.
        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
                                  no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
                                  cudnn_algo_verbose=verbose,
                                  cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                  cudnn_tensor_core_only=force_tensor_core)
        # Is this BatchNorm supposed to be here?
        body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                         fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off)
    else:                       # often expected to be 224 such as imagenet
        # ImageNet stem: 7x7 stride-2 conv, BN(+ReLU), 3x3 stride-2 max-pool.
        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
                                  no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
                                  cudnn_algo_verbose=verbose,
                                  cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                  cudnn_tensor_core_only=force_tensor_core)
        body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                         fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off, act_type=act)
        if not fuse_bn_relu:
            body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
        body = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
                       kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')

    # Stage i: one (possibly strided) projection unit followed by
    # units[i]-1 identity units.  Stage 1 keeps stride 1; later stages
    # downsample with stride 2 in their first unit.
    for i in range(num_stages):
        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
                             name='stage%d_unit%d' % (i + 1, 1),
                             bottle_neck=bottle_neck, workspace=workspace,
                             memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                             verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps=bn_eps, bn_mom=bn_mom,
                             conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
                             cudnn_tensor_core_only=force_tensor_core)
        for j in range(units[i]-1):
            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
                                 bottle_neck=bottle_neck, workspace=workspace,
                                 memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps = bn_eps, bn_mom=bn_mom,
                                 conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
                                 cudnn_tensor_core_only=force_tensor_core)
    # bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
    # relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
    # Although kernel is not used here when global_pool=True, we should put one
    pool1 = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
                    global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
    flat = mx.sym.Flatten(data=pool1)
    fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', cublas_algo_verbose=verbose)
    # Classifier output is computed in fp32 even for fp16 models (softmax
    # numerical stability).
    if dtype == 'float16':
        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
    return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+
def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32',
               input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW',
               verbose=False, seed=None, cudnn_bn_off=False, batchnorm_eps=2e-5, batchnorm_mom=0.9,
               conv_algo=-1, fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True, **kwargs):
    """
    Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
    (Original author Wei Wu) by Antti-Pekka Hynninen
    Implementing the original resnet ILSVRC 2015 winning network from:
    Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"

    Maps `num_layers` to the per-stage unit counts and channel widths of the
    corresponding ResNet variant, then builds the symbol via resnet().
    `image_shape` is a comma-separated "C,H,W" string; `seed`, when given,
    seeds Python, NumPy and MXNet RNGs for reproducibility.  Extra **kwargs
    from the argument parser are accepted and ignored.
    """
    if seed is not None:
        print('Setting seeds to %s' % (seed,))
        random.seed(seed)
        np.random.seed(seed)
        mx.random.seed(seed)

    image_shape = [int(l) for l in image_shape.split(',')]
    (nchannel, height, width) = image_shape
    # Small images (CIFAR-style): 3 stages with a uniform unit count derived
    # from num_layers; bottleneck units for deep (>=164-layer) variants.
    if height <= 28:
        num_stages = 3
        if (num_layers-2) % 9 == 0 and num_layers >= 164:
            per_unit = [(num_layers-2)//9]
            filter_list = [16, 64, 128, 256]
            bottle_neck = True
        elif (num_layers-2) % 6 == 0 and num_layers < 164:
            per_unit = [(num_layers-2)//6]
            filter_list = [16, 16, 32, 64]
            bottle_neck = False
        else:
            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
        units = per_unit * num_stages
    else:
        # ImageNet-style: 4 stages; >=50 layers use bottleneck units with the
        # wider channel progression.
        if num_layers >= 50:
            filter_list = [64, 256, 512, 1024, 2048]
            bottle_neck = True
        else:
            filter_list = [64, 64, 128, 256, 512]
            bottle_neck = False
        num_stages = 4
        if num_layers == 18:
            units = [2, 2, 2, 2]
        elif num_layers == 34:
            units = [3, 4, 6, 3]
        elif num_layers == 50:
            units = [3, 4, 6, 3]
        elif num_layers == 101:
            units = [3, 4, 23, 3]
        elif num_layers == 152:
            units = [3, 8, 36, 3]
        elif num_layers == 200:
            units = [3, 24, 36, 3]
        elif num_layers == 269:
            units = [3, 30, 48, 8]
        else:
            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))

    return resnet(units             = units,
                  num_stages        = num_stages,
                  filter_list       = filter_list,
                  num_classes       = num_classes,
                  image_shape       = image_shape,
                  bottle_neck       = bottle_neck,
                  workspace         = conv_workspace,
                  dtype             = dtype,
                  input_layout      = input_layout,
                  conv_layout       = conv_layout,
                  batchnorm_layout  = batchnorm_layout,
                  pooling_layout    = pooling_layout,
                  verbose           = verbose,
                  cudnn_bn_off      = cudnn_bn_off,
                  bn_eps            = batchnorm_eps,
                  bn_mom            = batchnorm_mom,
                  conv_algo         = conv_algo,
                  fuse_bn_relu      = fuse_bn_relu,
                  fuse_bn_add_relu  = fuse_bn_add_relu,
                  force_tensor_core = force_tensor_core,
                  use_dali          = use_dali)

+ 96 - 0
MxNet/Classification/RN50v1.5/runner

@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os, socket
from argparse import ArgumentParser
import warnings


# Thin launcher around train.py: exposes a few high-level knobs (GPU count,
# per-GPU batch size, epochs, base LR, precision, DALI on/off) and expands
# them into the full train.py command line.  Any arguments this parser does
# not recognize are forwarded to train.py untouched.
optparser = ArgumentParser(description="train resnet50 with MXNet")
optparser.add_argument("-n", "--n-GPUs", type=int, default=8, help="number of GPUs to use; " +\
                       "default = 8")
optparser.add_argument("-b", "--batch-size", type=int, default=208, help="batch size per GPU; " +\
                       "default = 208")
optparser.add_argument("-e", "--num-epochs", type=int, default=90, help="number of epochs; " +\
                       "default = 90")
optparser.add_argument("-l", "--lr", type=float, default=0.1, help="learning rate; default = 0.1; " +\
                       "IMPORTANT: true learning rate will be calculated as `lr * batch_size/256`")
optparser.add_argument("--no-val", action="store_true",
                       help="if set no validation will be performed")
optparser.add_argument("--no-dali", action="store_true", default=False,
                       help="use default MXNet pipeline instead of DALI")
optparser.add_argument("--data-root", type=str, help="Directory with RecordIO data files", default="/data/imagenet/train-val-recordio-passthrough")
optparser.add_argument("--data-nthreads", type=int, help="number of threads for data loading; default = 40", default=40)
optparser.add_argument("--dtype", type=str, help="Precision, float16 or float32", default="float16")

opts, args = optparser.parse_known_args()

# Number of input channels: float16 with DALI uses a 4-channel input
# (presumably channel padding for NHWC/Tensor Core efficiency — confirm
# against the DALI pipeline); everything else uses plain 3-channel RGB.
if opts.dtype == "float16":
    n_ch = str(4 - int(opts.no_dali))
else:
    n_ch = str(3)

# Scale global batch size with GPU count, then scale LR linearly with the
# global batch size (the reference LR is defined for batch 256).
opts.batch_size *= opts.n_GPUs

opts.lr *= opts.batch_size/256

# Assemble the train.py command line.
command = ""
command += "python "+os.path.dirname(__file__)+"/train.py"
command += " --num-layers 50"
command += " --data-train " + opts.data_root + "/train.rec"
command += " --data-train-idx " + opts.data_root + "/train.idx"
if not opts.no_val:
    command += " --data-val " + opts.data_root + "/val.rec"
    command += " --data-val-idx " + opts.data_root + "/val.idx"
command += " --data-nthreads " + str(opts.data_nthreads)
command += " --optimizer sgd --dtype " + opts.dtype
command += " --lr-step-epochs 30,60,80 --max-random-area 1"
command += " --min-random-area 0.05 --max-random-scale 1"
command += " --min-random-scale 1 --min-random-aspect-ratio 0.75"
command += " --max-random-aspect-ratio 1.33 --max-random-shear-ratio 0"
command += " --max-random-rotate-angle 0 --random-resized-crop 1"
command += " --random-crop 0 --random-mirror 1"
command += " --image-shape "+n_ch+",224,224 --warmup-epochs 5"
command += " --disp-batches 20"
command += " --batchnorm-mom 0.9 --batchnorm-eps 1e-5"
# float16 runs enable NHWC layouts, fused BN kernels and Tensor Core algos.
if opts.dtype == 'float16':
    command += " --fuse-bn-relu 1"
    command += " --input-layout NHWC --conv-layout NHWC"
    command += " --batchnorm-layout NHWC --pooling-layout NHWC"
    command += " --conv-algo 1 --force-tensor-core 1"
    command += " --fuse-bn-add-relu 1"

command += " --kv-store device"
if not opts.no_dali:
    command += " --use-dali"
    command += " --dali-prefetch-queue 2 --dali-nvjpeg-memory-padding 64"
command += " --lr "+str(opts.lr)
command += " --gpus " + str(list(range(opts.n_GPUs))).replace(' ', '').replace('[', '').replace(']', '')
command += " --batch-size " + str(opts.batch_size)
command += " --num-epochs " + str(opts.num_epochs)


# Pass through any extra, unrecognized arguments verbatim.
for arg in args:
    command += " " + arg

# MXNet performance-tuning environment for the child process.
os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
os.environ['MXNET_USE_TENSORRT'] = "0"
os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"

# os.system() returns a raw 16-bit wait status (e.g. 256 when the child
# exits with code 1).  Passing that straight to exit() is truncated modulo
# 256 on POSIX, so a failed training run could be reported as success.
# Collapse the status to a plain 0/1 exit code instead.
status = os.system('/bin/bash -c "'+command+'"')
exit(0 if status == 0 else 1)

+ 91 - 0
MxNet/Classification/RN50v1.5/train.py

@@ -0,0 +1,91 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+import logging
+logging.basicConfig(level=logging.DEBUG)
+import data, dali, fit
+import mxnet as mx
+import numpy as np
+
def set_imagenet_aug(aug):
    """Install the standard ImageNet training-augmentation defaults on *aug*
    (an argparse parser): per-channel mean/std normalization, random resized
    crop with horizontal mirroring, and color/PCA-lighting jitter."""
    imagenet_defaults = {
        'rgb_mean': '123.68,116.779,103.939',
        'rgb_std': '58.393,57.12,57.375',
        'random_crop': 0,
        'random_resized_crop': 1,
        'random_mirror': 1,
        'min_random_area': 0.08,
        'max_random_aspect_ratio': 4./3.,
        'min_random_aspect_ratio': 3./4.,
        'brightness': 0.4,
        'contrast': 0.4,
        'saturation': 0.4,
        'pca_noise': 0.1,
    }
    aug.set_defaults(**imagenet_defaults)
+
if __name__ == '__main__':
    # parse args
    # Each project module contributes its own argument group to one parser.
    parser = argparse.ArgumentParser(description="train resnet on imagenet",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    fit.add_fit_args(parser)
    data.add_data_args(parser)
    dali.add_dali_args(parser)
    data.add_data_aug_args(parser)
    
    # Instead, to get standard resnet augmentation on a per-use basis, invoke as in:
    # train_imagenet.py --set-resnet-aug ...
    # Finally, to get the legacy MXNet v1.2 training settings on a per-use basis, invoke as in:
    # train_imagenet.py --set-data-aug-level 3
    # Defaults below are the standard ResNet-50/ImageNet training recipe.
    parser.set_defaults(
        # network
        num_layers       = 50,

        # data
        resize           = 256,
        num_classes      = 1000,
        num_examples     = 1281167,
        image_shape      = '3,224,224',
        min_random_scale = 1, # if input image has min size k, suggest to use
                              # 256.0/x, e.g. 0.533 for 480
        # train
        num_epochs       = 90,
        lr_step_epochs   = '30,60,80',
        dtype            = 'float32'
    )
    args = parser.parse_args()

    # NOTE(review): without DALI, MXNet-pipeline augmentation is reset to
    # level 0 — presumably because augmentation is configured elsewhere in
    # that path; confirm against data.set_data_aug_level().
    if not args.use_dali:
        data.set_data_aug_level(parser, 0)

    # load network
    import resnet as net
    sym = net.get_symbol(**vars(args))

    # train
    fit.fit(args, sym, dali.get_rec_iter)