Sfoglia il codice sorgente

Adding ResNet50v1.5 to MxNet/Classification

Przemek Strzelczyk 7 anni fa
parent
commit
a8e1d1cd83
26 ha cambiato i file con 2348 aggiunte e 0 eliminazioni
  1. 202 0
      MxNet/Classification/RN50v1.5/LICENSE
  2. 235 0
      MxNet/Classification/RN50v1.5/README.md
  3. 92 0
      MxNet/Classification/RN50v1.5/benchmark.py
  4. 62 0
      MxNet/Classification/RN50v1.5/benchmarking.py
  5. 163 0
      MxNet/Classification/RN50v1.5/dali.py
  6. 283 0
      MxNet/Classification/RN50v1.5/data.py
  7. 19 0
      MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP16.sh
  8. 19 0
      MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP32.sh
  9. 19 0
      MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP16.sh
  10. 19 0
      MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP32.sh
  11. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_1GPU.sh
  12. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_4GPU.sh
  13. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP16_8GPU.sh
  14. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_1GPU.sh
  15. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_4GPU.sh
  16. 19 0
      MxNet/Classification/RN50v1.5/examples/RN50_FP32_8GPU.sh
  17. 19 0
      MxNet/Classification/RN50v1.5/examples/SCORE_FP16.sh
  18. 19 0
      MxNet/Classification/RN50v1.5/examples/SCORE_FP32.sh
  19. 463 0
      MxNet/Classification/RN50v1.5/fit.py
  20. BIN
      MxNet/Classification/RN50v1.5/img/training_accuracy.png
  21. BIN
      MxNet/Classification/RN50v1.5/img/training_loss.png
  22. BIN
      MxNet/Classification/RN50v1.5/img/validation_accuracy.png
  23. 57 0
      MxNet/Classification/RN50v1.5/report.py
  24. 376 0
      MxNet/Classification/RN50v1.5/resnet.py
  25. 96 0
      MxNet/Classification/RN50v1.5/runner
  26. 91 0
      MxNet/Classification/RN50v1.5/train.py

+ 202 - 0
MxNet/Classification/RN50v1.5/LICENSE

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 235 - 0
MxNet/Classification/RN50v1.5/README.md

@@ -0,0 +1,235 @@
+# ResNet50 v1.5 For MXNet
+
+## The model
+The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
+
+The difference between v1 and v1.5 is in the bottleneck blocks which require
+downsampling. ResNet v1 has stride = 2 in the first 1x1 convolution, whereas
+v1.5 has stride = 2 in the 3x3 convolution.
+
+This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
+
+## Training procedure
+
+### Optimizer
+
+This model trains for 90 epochs, with the standard ResNet v1.5 setup:
+
+* SGD with momentum (0.9)
+
+* Learning rate = 0.1 for 256 batch size, for other batch sizes we linearly
+scale the learning rate.
+
+* Learning rate decay - multiply by 0.1 after 30, 60, and 80 epochs
+
+* Linear warmup of the learning rate during first 5 epochs
+according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
+
+* Weight decay: 1e-4
+
+### Data Augmentation
+
+During training, we perform the following augmentation techniques:
+* Normalization
+* Random resized crop to 224x224
+* Scale from 5% to 100%
+* Aspect ratio from 3/4 to 4/3
+* Random horizontal flip
+
+During inference, we perform the following augmentation techniques:
+* Normalization
+* Scale to 256x256
+* Center crop to 224x224
+
+See `data.py` for more info.
+
+# Setup
+
+## Requirements
+
+Ensure your environment meets the following requirements:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [MXNet 18.12-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet) or newer
+* [NVIDIA-DALI 0.5.0](https://github.com/NVIDIA/DALI) -- included in the MXNet container
+* [Python 3.5](https://www.python.org) -- included in the MXNet container
+* [CUDA 10](https://developer.nvidia.com/cuda-toolkit) -- included in the MXNet container
+* [cuDNN 7.4.1](https://developer.nvidia.com/cudnn) -- included in the MXNet container
+* (optional) NVIDIA Volta or Turing GPU (see section below) -- for best training performance using FP16
+
+For more information about how to get started with NGC containers, see the
+following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
+* [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
+
+## Training using mixed precision with Tensor Cores
+
+### Hardware requirements
+Training with mixed precision on NVIDIA Tensor Cores, requires an NVIDIA Volta-based or Turing-based GPU.
+
+
+### Software changes
+
+For information about how to train using mixed precision, see the
+[Mixed Precision Training paper](https://arxiv.org/abs/1710.03740)
+and
+[Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
+
+
+# Quick start guide
+
+## Docker
+
+To run the MXNet Docker container, run:
+
+`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path to prepared dataset>:/data/imagenet/train-val-recordio-passthrough nvcr.io/nvidia/mxnet:18.12-py3`
+
+It will also automatically start downloading the MXNet container if you haven't downloaded it yet. You can also download it manually by running:
+
+`nvidia-docker pull nvcr.io/nvidia/mxnet:18.12-py3`
+
+If you haven't prepared the dataset yet, download the raw ImageNet dataset (see section below), and run:
+
+`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path where prepared dataset should be created>:/data/imagenet/train-val-recordio-passthrough -v <path to raw dataset>:/data/imagenet/raw nvcr.io/nvidia/mxnet:18.12-py3`
+
+and follow the steps in the Prepare Dataset section.
+
+## Prepare Dataset
+
+The MXNet ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from ILSVRC challenge.
+You can download the images from http://image-net.org/download-images
+
+The recommended data format is
+[RecordIO](http://mxnet.io/architecture/note_data_loading.html), which
+concatenates multiple examples into seekable binary files for better read
+efficiency. MXNet provides a tool called `im2rec.py` located in the `/opt/mxnet/tools/` directory.
+The tool converts individual images into `.rec` files.
+
+To prepare a RecordIO file containing ImageNet data, we first need to create .lst files
+which consist of the labels and image paths. We assume that the original images were
+downloaded to `/data/imagenet/raw/train-jpeg` and `/data/imagenet/raw/val-jpeg`.
+
+```bash
+python /opt/mxnet/tools/im2rec.py --list --recursive train /data/imagenet/raw/train-jpeg
+python /opt/mxnet/tools/im2rec.py --list --recursive val /data/imagenet/raw/val-jpeg
+```
+
+Then we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
+to speed up data loading) files. To obtain the best training accuracy
+we do not preprocess the images when creating RecordIO file.
+
+```bash
+python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train /data/imagenet/raw/train-jpeg
+python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val /data/imagenet/raw/val-jpeg
+```
+
+## Running training
+
+To run training for a standard configuration (1/4/8 GPUs, FP16/FP32),
+run one of the scripts in the `./examples` directory
+called `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh`.
+By default the training scripts run the validation and save checkpoint after each epoch.
+Checkpoints will be stored in `model-symbol.json` and `model-<number of epoch>.params` files.
+
+If imagenet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify `--data-root` flag.
+
+To run a non standard configuration use:
+
+`./runner -n <number of gpus> -b <batch size per gpu> --data-root <path to imagenet> --dtype <float32 or float16> --model-prefix <model prefix>`
+
+Checkpoints will be stored in `<model prefix>-symbol.json` and `<model prefix>-<number of epoch>.params` files.
+To generate JSON report with performance and accuracy stats, use `--report <path to report>` flag (see `report.py` for info about JSON report file structure).
+Use `./runner -h` and `python ./train.py -h` to obtain the list of available options.
+
+## Running inference
+
+To run inference on a checkpointed model run:
+* For FP16
+    `./examples/SCORE_FP16.sh <model prefix> <epoch>`
+* For FP32
+    `./examples/SCORE_FP32.sh <model prefix> <epoch>`
+
+
+## Benchmark scripts
+
+To benchmark training and inference, run:
+
+`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per gpu separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
+
+To control benchmark length per epoch, use `-i` flag (defaults to 100 iterations).
+To control number of epochs, use `-e` flag.
+To control number of warmup epochs (epochs which are not taken into account), use `-w` flag.
+To limit length of dataset, use `--num-examples` flag.
+To benchmark only inference, use `--only-inference` flag.
+By default, the same parameters as in `./runner` will be used. Additional flags will be passed to `./runner`.
+
+
+## Training accuracy results
+
+The following results were obtained by running the `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh` scripts in the
+mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with 8 V100 16G GPUs.
+
+| **number of GPUs** | **FP16 top1** | **FP16 training time** | **FP32 top1** | **FP32 training time** |
+|:------------------:|:-------------:|:----------------------:|:-------------:|:----------------------:|
+| 1                  | 76.424        | 22.9h                  | 76.462        | 82.0h                  |
+| 4                  | 76.328        | 6.2h                   | 76.448        | 21.1h                  |
+| 8                  | 76.490        | 3.3h                   | 76.668        | 11.1h                  |
+
+Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
+
+![TrainingLoss](./img/training_loss.png)
+
+![TrainingAccuracy](./img/training_accuracy.png)
+
+![ValidationAccuracy](./img/validation_accuracy.png)
+
+
+## Training performance results
+
+The following results were obtained by running
+`python benchmark.py -n 1,4,8 -b 208 --dtype float16 -o benchmark_report_fp16.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 25600` for FP16, and
+`python benchmark.py -n 1,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 12800` for FP32
+in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with V100 16G GPUs.
+Training performance reported as Total IPS (data + compute time taken into account).
+Weak scaling is calculated as a ratio of speed for given number of GPUs to speed for 1 GPU.
+
+| **number of GPUs** | **FP16 img/s** | **FP32 img/s** | **FP16 speedup** | **FP16 weak scaling** | **FP32 weak scaling** |
+|:------------------:|:--------------:|:--------------:|:----------------:|:---------------------:|:---------------------:|
+| 1                  | 1442.6         | 400.2          | 3.60             | 1.00                  | 1.00                  |
+| 4                  | 5391.8         | 1558.6         | 3.46             | 3.74                  | 3.89                  |
+| 8                  | 10263.2        | 2957.4         | 3.47             | 7.11                  | 7.39                  |
+
+
+## Inference performance results
+
+The following results were obtained by running
+`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96,128,192,208 --dtype float16 -o inferbenchmark_report_fp16.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP16, and
+`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96 --dtype float32 -o inferbenchmark_report_fp32.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP32
+in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 using one V100 16G GPU.
+Inference performance reported as Total IPS (data + compute time taken into account).
+
+| **batch size** | **FP16 img/s** | **FP32 img/s** |
+|:--------------:|:--------------:|:--------------:|
+|              1 |  314           | 252            |
+|              2 |  555           | 393            |
+|              4 |  1024          | 601            |
+|              8 |  1642          | 824            |
+|             16 |  2144          | 1028           |
+|             32 |  2954          | 1138           |
+|             64 |  3428          | 1236           |
+|             96 |  3546          | 1282           |
+|            128 |  3690          |                |
+|            192 |  3828          |                |
+|            208 |  3832          |                |
+
+
+# Changelog
+
+1. Dec 19, 2018
+  * Initial release (based on https://github.com/apache/incubator-mxnet/tree/master/example/image-classification)
+
+
+# Known Issues
+
+There are no known issues with this model.

+ 92 - 0
MxNet/Classification/RN50v1.5/benchmark.py

@@ -0,0 +1,92 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Benchmark driver for the ResNet50 v1.5 runner.

Launches ``./runner`` (or ``--executable``) once per (ngpus, batch size)
combination, reads the per-epoch throughput metrics each run writes to its
JSON report, aggregates them (harmonic mean over the epochs left after
discarding warmup), prints summary tables to stderr, and writes the final
JSON report to ``--output``.
"""

import argparse
import json
import sys
from collections import OrderedDict
from subprocess import Popen

parser = argparse.ArgumentParser(description='Benchmark')
parser.add_argument('--executable', default='./runner', help='path to runner')
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]',
                    required=True, help='numbers of gpus separated by comma')
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]',
                    required=True, help='batch sizes separated by comma')
parser.add_argument('-i', '--benchmark-iters', metavar='I',
                    type=int, default=100, help='iterations')
parser.add_argument('-e', '--epochs', metavar='E',
                    type=int, default=1, help='number of epochs')
parser.add_argument('-w', '--warmup', metavar='N',
                    type=int, default=0, help='warmup epochs')
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('--only-inference', action='store_true', help="benchmark inference only")
# Options not recognized here are forwarded verbatim to the runner.
args, other_args = parser.parse_known_args()

ngpus = list(map(int, args.ngpus.split(',')))
batch_sizes = list(map(int, args.batch_sizes.split(',')))


res = OrderedDict()
res['model'] = ''
res['ngpus'] = ngpus
res['bs'] = batch_sizes
if args.only_inference:
    res['metric_keys'] = ['val.total_ips']
else:
    res['metric_keys'] = ['train.total_ips', 'val.total_ips']
res['metrics'] = OrderedDict()

for n in ngpus:
    res['metrics'][str(n)] = OrderedDict()
    for bs in batch_sizes:
        res['metrics'][str(n)][str(bs)] = OrderedDict()

        # Each sub-run writes its own report file so runs stay distinguishable.
        report_file = args.output + '-{},{}'.format(n, bs)
        ret = Popen([args.executable, '-n', str(n), '-b', str(bs),
                     '--benchmark-iters', str(args.benchmark_iters),
                     '-e', str(args.epochs), '--report', report_file,
                     *([] if not args.only_inference else ['--only-inference']),
                     '--no-metrics'] + other_args, stdout=sys.stderr).wait()
        if ret != 0:
            # Fail loudly instead of crashing later on a missing/garbled report.
            raise RuntimeError('benchmark run (ngpus={}, batch size={}) '
                               'exited with code {}'.format(n, bs, ret))

        with open(report_file, 'r') as f:
            report = json.load(f)

        for metric in res['metric_keys']:
            data = report['metrics'][metric][args.warmup:]
            if not data:
                raise ValueError('no epochs left after discarding {} warmup '
                                 'epochs'.format(args.warmup))
            # Harmonic mean: the appropriate average for throughput (img/s).
            avg = len(data) / sum(1 / x for x in data)
            res['metrics'][str(n)][str(bs)][metric] = avg


# Print one table per metric: batch sizes as columns, GPU counts as rows.
column_len = 7
for m in res['metric_keys']:
    print(m, file=sys.stderr)
    print(' ' * column_len, end='|', file=sys.stderr)
    for bs in batch_sizes:
        print(str(bs).center(column_len), end='|', file=sys.stderr)
    print(file=sys.stderr)
    print('-' * (len(batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
    for n in ngpus:
        print(str(n).center(column_len), end='|', file=sys.stderr)
        for bs in batch_sizes:
            print(str(round(res['metrics'][str(n)][str(bs)][m])).center(column_len), end='|', file=sys.stderr)
        print(file=sys.stderr)
    print(file=sys.stderr)


with open(args.output, 'w') as f:
    json.dump(res, f, indent=4)

+ 62 - 0
MxNet/Classification/RN50v1.5/benchmarking.py

@@ -0,0 +1,62 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from mxnet.io import DataIter
+import time
+
+class BenchmarkingDataIter:
+    def __init__(self, data_iter, benchmark_iters=None):
+        self.data_iter = data_iter
+        self.benchmark_iters = benchmark_iters
+        self.overall_time = 0
+        self.num = 0
+
+    def __iter__(self):
+        iter(self.data_iter)
+        return self
+
+    def next(self):
+        if self.benchmark_iters is not None and self.num >= self.benchmark_iters:
+            raise StopIteration
+        try:
+            start_time = time.time()
+            ret = self.data_iter.next()
+            end_time = time.time()
+        except StopIteration:
+            if self.benchmark_iters is None:
+                raise
+            self.data_iter.reset()
+            start_time = time.time()
+            ret = self.data_iter.next()
+            end_time = time.time()
+
+        if self.num != 0:
+            self.overall_time += end_time - start_time
+        self.num += 1
+        return ret
+
+    def __next__(self):
+        return self.next()
+
+    def __getattr__(self, attr):
+        return getattr(self.data_iter, attr)
+
+    def get_avg_time_and_clear(self):
+        if self.num <= 1:
+            avg = float('nan')
+        else:
+            avg = self.overall_time / (self.num - 1)
+        self.overall_time = 0
+        self.num = 0
+        return avg

+ 163 - 0
MxNet/Classification/RN50v1.5/dali.py

@@ -0,0 +1,163 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from nvidia import dali
+from nvidia.dali.pipeline import Pipeline
+import nvidia.dali.ops as ops
+import nvidia.dali.types as types
+from nvidia.dali.plugin.mxnet import DALIClassificationIterator
+
+
def add_dali_args(parser):
    """Register DALI pipeline command-line options on *parser*.

    Returns the parser so calls can be chained.
    """
    group = parser.add_argument_group('DALI', 'pipeline and augmentation')
    group.add_argument('--use-dali', action='store_true',
                       help='use DALI pipeline and augmentation')
    group.add_argument('--separ-val', action='store_true',
                       help='each process will perform independent validation on whole val-set')
    # NOTE: the help strings below were previously concatenated without a
    # separating space ("threadsper GPU"); fixed along with spelling typos.
    group.add_argument('--dali-threads', type=int, default=3,
                       help='number of threads per GPU for DALI')
    group.add_argument('--validation-dali-threads', type=int, default=10,
                       help='number of threads per GPU for DALI for validation')
    group.add_argument('--dali-prefetch-queue', type=int, default=3,
                       help='DALI prefetch queue depth')
    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=16,
                       help='Memory padding value for nvJPEG (in MB)')
    return parser
+
+
# ImageNet per-channel mean/std (RGB order), scaled from [0,1] fractions to
# the 0-255 pixel range expected by CropMirrorNormalize.
_mean_pixel = [255 * x for x in (0.485, 0.456, 0.406)]
_std_pixel  = [255 * x for x in (0.229, 0.224, 0.225)]
+
class HybridTrainPipe(Pipeline):
    """DALI training pipeline: sharded RecordIO reading, hybrid JPEG decode,
    random resized crop, random horizontal flip, and normalization on GPU."""

    def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
                 shard_id, num_shards, crop_shape,
                 nvjpeg_padding, prefetch_queue=3,
                 output_layout=types.NCHW, pad_output=True, dtype='float16'):
        super(HybridTrainPipe, self).__init__(
            batch_size, num_threads, device_id,
            seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
        out_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT
        # Shuffled, sharded reader over the MXNet RecordIO training set.
        self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
                                     random_shuffle=True,
                                     shard_id=shard_id, num_shards=num_shards)
        # Hybrid CPU/GPU JPEG decoding with pre-allocated nvJPEG buffers.
        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
                                        device_memory_padding=nvjpeg_padding,
                                        host_memory_padding=nvjpeg_padding)
        self.rrc = ops.RandomResizedCrop(device="gpu", size=crop_shape)
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_dtype=out_dtype,
                                            output_layout=output_layout,
                                            crop=crop_shape,
                                            pad_output=pad_output,
                                            image_type=types.RGB,
                                            mean=_mean_pixel,
                                            std=_std_pixel)
        # Per-sample coin flip driving the random horizontal mirror.
        self.coin = ops.CoinFlip(probability=0.5)

    def define_graph(self):
        mirror_flag = self.coin()
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        images = self.rrc(images)
        output = self.cmnp(images, mirror=mirror_flag)
        return [output, self.labels]
+
+
class HybridValPipe(Pipeline):
    """DALI validation pipeline: sequential RecordIO reading, hybrid JPEG
    decode, optional shorter-edge resize, then center crop + normalization."""

    def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
                 shard_id, num_shards, crop_shape,
                 nvjpeg_padding, prefetch_queue=3,
                 resize_shp=None,
                 output_layout=types.NCHW, pad_output=True, dtype='float16'):
        super(HybridValPipe, self).__init__(
            batch_size, num_threads, device_id,
            seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
        out_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT
        # Deterministic (unshuffled) reader so validation order is stable.
        self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
                                     random_shuffle=False,
                                     shard_id=shard_id, num_shards=num_shards)
        self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
                                        device_memory_padding=nvjpeg_padding,
                                        host_memory_padding=nvjpeg_padding)
        # Resize the shorter edge only when a target size was requested.
        self.resize = ops.Resize(device="gpu", resize_shorter=resize_shp) if resize_shp else None
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_dtype=out_dtype,
                                            output_layout=output_layout,
                                            crop=crop_shape,
                                            pad_output=pad_output,
                                            image_type=types.RGB,
                                            mean=_mean_pixel,
                                            std=_std_pixel)

    def define_graph(self):
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        if self.resize:
            images = self.resize(images)
        return [self.cmnp(images), self.labels]
+
+
def get_rec_iter(args, kv=None):
    """Build DALI train/validation iterators from parsed CLI arguments.

    Returns ``(train_iter, val_iter)``; ``val_iter`` is None when no
    validation record file was configured.
    """
    # Shorter-edge length every validation image is resized to.
    resize = int(args.resize)
    # Final (C,H,W) shape fed to the network; images are cropped to H,W.
    target_shape = tuple(int(l) for l in args.image_shape.split(','))
    pad_output = target_shape[0] == 4
    # Drop empty strings a stray comma in --gpus would otherwise produce.
    gpus = [int(g) for g in args.gpus.split(',') if g]
    batch_size = args.batch_size // len(gpus)

    # The input_layout w.r.t. the model is the output_layout of the image pipeline.
    output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW

    rank = kv.rank if kv else 0
    nWrk = kv.num_workers if kv else 1

    # Settings shared by the training and validation pipelines.
    common = dict(batch_size     = batch_size,
                  crop_shape     = target_shape[1:],
                  output_layout  = output_layout,
                  pad_output     = pad_output,
                  dtype          = args.dtype,
                  nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
                  prefetch_queue = args.dali_prefetch_queue)

    trainpipes = [HybridTrainPipe(num_threads = args.dali_threads,
                                  device_id   = gpu_id,
                                  rec_path    = args.data_train,
                                  idx_path    = args.data_train_idx,
                                  shard_id    = gpus.index(gpu_id) + len(gpus) * rank,
                                  num_shards  = len(gpus) * nWrk,
                                  **common)
                  for gpu_id in gpus]

    if args.data_val:
        valpipes = [HybridValPipe(num_threads = args.validation_dali_threads,
                                  device_id   = gpu_id,
                                  rec_path    = args.data_val,
                                  idx_path    = args.data_val_idx,
                                  shard_id    = 0 if args.separ_val
                                                  else gpus.index(gpu_id) + len(gpus) * rank,
                                  num_shards  = 1 if args.separ_val else len(gpus) * nWrk,
                                  resize_shp  = resize,
                                  **common)
                    for gpu_id in gpus]
    else:
        valpipes = None

    trainpipes[0].build()
    if args.data_val:
        valpipes[0].build()

    if args.num_examples < trainpipes[0].epoch_size("Reader"):
        warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader")))

    dali_train_iter = DALIClassificationIterator(trainpipes, args.num_examples // nWrk)
    if args.data_val:
        dali_val_iter = DALIClassificationIterator(
            valpipes,
            valpipes[0].epoch_size("Reader") // (1 if args.separ_val else nWrk),
            fill_last_batch = False)
    else:
        dali_val_iter = None
    return dali_train_iter, dali_val_iter
+

+ 283 - 0
MxNet/Classification/RN50v1.5/data.py

@@ -0,0 +1,283 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mxnet as mx
+import random
+import argparse
+from mxnet.io import DataBatch, DataIter
+import numpy as np
+
def add_data_args(parser):
    """Register input-data, layout and kernel-tuning options on *parser*.

    Returns the created argument group.
    """
    data = parser.add_argument_group('Data', 'the input images')
    # (flag, kwargs) pairs, kept in display order.
    options = [
        ('--data-train', dict(type=str, help='the training data')),
        ('--data-train-idx', dict(type=str, default='', help='the index of training data')),
        ('--data-val', dict(type=str, help='the validation data')),
        ('--data-val-idx', dict(type=str, default='', help='the index of validation data')),
        ('--rgb-mean', dict(type=str, default='123.68,116.779,103.939',
                            help='a tuple of size 3 for the mean rgb')),
        ('--rgb-std', dict(type=str, default='1,1,1',
                           help='a tuple of size 3 for the std rgb')),
        ('--pad-size', dict(type=int, default=0,
                            help='padding the input image')),
        ('--fill-value', dict(type=int, default=127,
                              help='Set the padding pixels value to fill_value')),
        ('--image-shape', dict(type=str,
                               help='the image shape feed into the network, e.g. (3,224,224)')),
        ('--num-classes', dict(type=int, help='the number of classes')),
        ('--num-examples', dict(type=int, help='the number of training examples')),
        ('--data-nthreads', dict(type=int, default=4,
                                 help='number of threads for data decoding')),
        ('--benchmark-iters', dict(type=int, default=None,
                                   help='run only benchmark-iters iterations from each epoch')),
        ('--input-layout', dict(type=str, default='NCHW',
                                help='the layout of the input data (e.g. NCHW)')),
        ('--conv-layout', dict(type=str, default='NCHW',
                               help='the layout of the data assumed by the conv operation (e.g. NCHW)')),
        ('--conv-algo', dict(type=int, default=-1,
                             help='set the convolution algos (fwd, dgrad, wgrad)')),
        ('--batchnorm-layout', dict(type=str, default='NCHW',
                                    help='the layout of the data assumed by the batchnorm operation (e.g. NCHW)')),
        ('--batchnorm-eps', dict(type=float, default=2e-5,
                                 help='the amount added to the batchnorm variance to prevent output explosion.')),
        ('--batchnorm-mom', dict(type=float, default=0.9,
                                 help='the leaky-integrator factor controling the batchnorm mean and variance.')),
        ('--pooling-layout', dict(type=str, default='NCHW',
                                  help='the layout of the data assumed by the pooling operation (e.g. NCHW)')),
        ('--verbose', dict(type=int, default=0,
                           help='turn on reporting of chosen algos for convolution, etc.')),
        ('--seed', dict(type=int, default=None,
                        help='set the seed for python, nd and mxnet rngs')),
        ('--custom-bn-off', dict(type=int, default=0,
                                 help='disable use of custom batchnorm kernel')),
        ('--fuse-bn-relu', dict(type=int, default=0,
                                help='have batchnorm kernel perform activation relu')),
        ('--fuse-bn-add-relu', dict(type=int, default=0,
                                    help='have batchnorm kernel perform add followed by activation relu')),
        ('--force-tensor-core', dict(type=int, default=0,
                                     help='require conv algos to be tensor core')),
    ]
    for flag, kwargs in options:
        data.add_argument(flag, **kwargs)
    return data
+
# Action to translate --set-resnet-aug flag to its component settings.
class SetResnetAugAction(argparse.Action):
    """Argparse action that expands the zero-argument ``--set-resnet-aug``
    flag into the individual standard resnet-training augmentation settings.
    """
    def __init__(self, nargs=0, **kwargs):
        if nargs != 0:
            raise ValueError('nargs for SetResnetAug must be 0.')
        super(SetResnetAugAction, self).__init__(nargs=nargs, **kwargs)
    def __call__(self, parser, namespace, values, option_string=None):
        # standard data augmentation setting for resnet training;
        # random_crop is disabled because random_resized_crop supersedes it —
        # this keeps the action consistent with the set_resnet_aug() helper,
        # which sets random_crop=0, random_resized_crop=1.
        setattr(namespace, 'random_crop', 0)
        setattr(namespace, 'random_resized_crop', 1)
        setattr(namespace, 'random_mirror', 1)
        setattr(namespace, 'min_random_area', 0.08)
        setattr(namespace, 'max_random_aspect_ratio', 4./3.)
        setattr(namespace, 'min_random_aspect_ratio', 3./4.)
        setattr(namespace, 'brightness', 0.4)
        setattr(namespace, 'contrast', 0.4)
        setattr(namespace, 'saturation', 0.4)
        setattr(namespace, 'pca_noise', 0.1)
        # record that this --set-resnet-aug 'macro arg' has been invoked
        setattr(namespace, self.dest, 1)
+
# Counterpart of SetResnetAugAction for use inside a training script:
# installs the standard resnet augmentation settings as parser defaults.
def set_resnet_aug(aug):
    defaults = dict(
        random_crop=0,
        random_resized_crop=1,
        random_mirror=1,
        min_random_area=0.08,
        min_random_aspect_ratio=3./4.,
        max_random_aspect_ratio=4./3.,
        brightness=0.4,
        contrast=0.4,
        saturation=0.4,
        pca_noise=0.1,
    )
    aug.set_defaults(**defaults)
+
# Action to translate --set-data-aug-level <N> arg to its component settings.
class SetDataAugLevelAction(argparse.Action):
    """Argparse action that expands ``--set-data-aug-level <N>`` into the
    augmentation settings enabled at level N (levels are cumulative)."""
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")
        super(SetDataAugLevelAction, self).__init__(option_strings, dest, **kwargs)
    def __call__(self, parser, namespace, values, option_string=None):
        level = values
        # record that this --set-data-aug-level <N> 'macro arg' has been invoked
        setattr(namespace, self.dest, level)
        settings = []
        if level >= 1:
            settings += [('random_crop', 1), ('random_mirror', 1)]
        if level >= 2:
            settings += [('max_random_h', 36), ('max_random_s', 50),
                         ('max_random_l', 50)]
        if level >= 3:
            settings += [('max_random_rotate_angle', 10),
                         ('max_random_shear_ratio', 0.1),
                         ('max_random_aspect_ratio', 0.25)]
        for name, value in settings:
            setattr(namespace, name, value)
+
# Counterpart of SetDataAugLevelAction for use inside a training script:
# installs the level-dependent augmentation settings as parser defaults
# (levels are cumulative).
def set_data_aug_level(aug, level):
    per_level = (
        dict(random_crop=1, random_mirror=1),
        dict(max_random_h=36, max_random_s=50, max_random_l=50),
        dict(max_random_rotate_angle=10, max_random_shear_ratio=0.1,
             max_random_aspect_ratio=0.25),
    )
    for threshold, defaults in enumerate(per_level, start=1):
        if level >= threshold:
            aug.set_defaults(**defaults)
+
def add_data_aug_args(parser):
    """Register image-augmentation options (backed by src/io/image_aug_default.cc).

    Returns the created argument group.
    """
    aug = parser.add_argument_group(
        'Image augmentations', 'implemented in src/io/image_aug_default.cc')
    # (flag, kwargs) pairs, kept in display order.
    options = [
        ('--random-crop',
         dict(type=int, default=0, help='if or not randomly crop the image')),
        ('--random-mirror',
         dict(type=int, default=0, help='if or not randomly flip horizontally')),
        ('--max-random-h',
         dict(type=int, default=0, help='max change of hue, whose range is [0, 180]')),
        ('--max-random-s',
         dict(type=int, default=0, help='max change of saturation, whose range is [0, 255]')),
        ('--max-random-l',
         dict(type=int, default=0, help='max change of intensity, whose range is [0, 255]')),
        ('--min-random-aspect-ratio',
         dict(type=float, default=None,
              help='min value of aspect ratio, whose value is either None or a positive value.')),
        ('--max-random-aspect-ratio',
         dict(type=float, default=0,
              help='max value of aspect ratio. If min_random_aspect_ratio is None, '
                   'the aspect ratio range is [1-max_random_aspect_ratio, '
                   '1+max_random_aspect_ratio], otherwise it is '
                   '[min_random_aspect_ratio, max_random_aspect_ratio].')),
        ('--max-random-rotate-angle',
         dict(type=int, default=0, help='max angle to rotate, whose range is [0, 360]')),
        ('--max-random-shear-ratio',
         dict(type=float, default=0, help='max ratio to shear, whose range is [0, 1]')),
        ('--max-random-scale',
         dict(type=float, default=1, help='max ratio to scale')),
        ('--min-random-scale',
         dict(type=float, default=1,
              help='min ratio to scale, should >= img_size/input_shape. '
                   'otherwise use --pad-size')),
        ('--max-random-area',
         dict(type=float, default=1,
              help='max area to crop in random resized crop, whose range is [0, 1]')),
        ('--min-random-area',
         dict(type=float, default=1,
              help='min area to crop in random resized crop, whose range is [0, 1]')),
        ('--min-crop-size',
         dict(type=int, default=-1,
              help='Crop both width and height into a random size in '
                   '[min_crop_size, max_crop_size]')),
        ('--max-crop-size',
         dict(type=int, default=-1,
              help='Crop both width and height into a random size in '
                   '[min_crop_size, max_crop_size]')),
        ('--brightness',
         dict(type=float, default=0, help='brightness jittering, whose range is [0, 1]')),
        ('--contrast',
         dict(type=float, default=0, help='contrast jittering, whose range is [0, 1]')),
        ('--saturation',
         dict(type=float, default=0, help='saturation jittering, whose range is [0, 1]')),
        ('--pca-noise',
         dict(type=float, default=0, help='pca noise, whose range is [0, 1]')),
        ('--random-resized-crop',
         dict(type=int, default=0, help='whether to use random resized crop')),
        ('--set-resnet-aug',
         dict(action=SetResnetAugAction,
              help='whether to employ standard resnet augmentations (see data.py)')),
        ('--set-data-aug-level',
         dict(type=int, default=None, action=SetDataAugLevelAction,
              help='set multiple data augmentations based on a `level` (see data.py)')),
    ]
    for flag, kwargs in options:
        aug.add_argument(flag, **kwargs)
    return aug
+
def get_rec_iter(args, kv=None):
    """Create MXNet ImageRecordIter iterators for training and validation.

    Returns ``(train, val)``; ``val`` is None when --data-val was not given.
    Raises ValueError for NHWC input layout, which ImageRecordIter cannot
    produce.
    """
    image_shape = tuple(int(l) for l in args.image_shape.split(','))
    if args.input_layout == 'NHWC':
        # Permute (C,H,W) -> (H,W,C) to match the requested layout.
        image_shape = image_shape[1:] + (image_shape[0],)
    rank, nworker = (kv.rank, kv.num_workers) if kv else (0, 1)
    rgb_mean = [float(i) for i in args.rgb_mean.split(',')]
    rgb_std = [float(i) for i in args.rgb_std.split(',')]
    if args.input_layout == 'NHWC':
        raise ValueError('ImageRecordIter cannot handle layout {}'.format(args.input_layout))

    # Settings shared by the training and validation iterators.
    common = dict(
        label_width        = 1,
        mean_r             = rgb_mean[0],
        mean_g             = rgb_mean[1],
        mean_b             = rgb_mean[2],
        std_r              = rgb_std[0],
        std_g              = rgb_std[1],
        std_b              = rgb_std[2],
        data_name          = 'data',
        label_name         = 'softmax_label',
        data_shape         = image_shape,
        batch_size         = args.batch_size,
        preprocess_threads = args.data_nthreads,
        num_parts          = nworker,
        part_index         = rank,
    )

    train = mx.io.ImageRecordIter(
        path_imgrec         = args.data_train,
        path_imgidx         = args.data_train_idx,
        rand_crop           = args.random_crop,
        max_random_scale    = args.max_random_scale,
        pad                 = args.pad_size,
        fill_value          = args.fill_value,
        random_resized_crop = args.random_resized_crop,
        min_random_scale    = args.min_random_scale,
        max_aspect_ratio    = args.max_random_aspect_ratio,
        min_aspect_ratio    = args.min_random_aspect_ratio,
        max_random_area     = args.max_random_area,
        min_random_area     = args.min_random_area,
        min_crop_size       = args.min_crop_size,
        max_crop_size       = args.max_crop_size,
        brightness          = args.brightness,
        contrast            = args.contrast,
        saturation          = args.saturation,
        pca_noise           = args.pca_noise,
        random_h            = args.max_random_h,
        random_s            = args.max_random_s,
        random_l            = args.max_random_l,
        max_rotate_angle    = args.max_random_rotate_angle,
        max_shear_ratio     = args.max_random_shear_ratio,
        rand_mirror         = args.random_mirror,
        shuffle             = True,
        **common)
    if args.data_val is None:
        return (train, None)
    # Validation: deterministic decode, no augmentation, keep the last
    # (possibly short) batch.
    val = mx.io.ImageRecordIter(
        path_imgrec         = args.data_val,
        path_imgidx         = args.data_val_idx,
        round_batch         = False,
        rand_crop           = False,
        rand_mirror         = False,
        **common)
    return (train, val)

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 benchmark in FP16 on 1,4,8 GPUs with 64,128,192,208 batch size
# Usage ./BENCHMARK_FP16.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1,4,8 -b 64,128,192,208 -e 2 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/BENCHMARK_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 benchmark in FP32 on 1,4,8 GPUs with 32,64,96 batch size
# Usage ./BENCHMARK_FP32.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1,4,8 -b 32,64,96 -e 2 -w 1 -i 100 --dtype float32 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 inference benchmark in FP16 on 1 GPU with 1,2,4,64,128,192,208 batch size
# Usage ./INFER_BENCHMARK_FP16.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1 -b 1,2,4,64,128,192,208 --only-inference -e 3 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/INFER_BENCHMARK_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 inference benchmark in FP32 on 1 GPU with 1,2,4,32,64,96 batch size
# Usage ./INFER_BENCHMARK_FP32.sh <additional flags>

# "$@" (quoted) forwards extra flags intact even if they contain spaces.
python benchmark.py -n 1 -b 1,2,4,32,64,96 --only-inference -e 3 -w 1 -i 100 -o report.json "$@"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_1GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 1 GPUs using 208 batch size (208 per GPU)
# Usage ./RN50_FP16_1GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 1 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_4GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 4 GPUs using 832 batch size (208 per GPU)
# Usage ./RN50_FP16_4GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 4 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP16_8GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
# This script launches ResNet50 training in FP16 on 8 GPUs using 1664 batch size (208 per GPU)
# Usage ./RN50_FP16_8GPU.sh <path to this repository> <additional flags>

# "${@:2}" (quoted) forwards the remaining flags intact even if they contain spaces.
"$1/runner" -n 8 -b 208 --model-prefix model "${@:2}"

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_1GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 1 GPU using 96 batch size (96 per GPU)
+# Usage: ./RN50_FP32_1GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 1 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_4GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 4 GPUs using 384 batch size (96 per GPU)
+# Usage: ./RN50_FP32_4GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 4 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/RN50_FP32_8GPU.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script launches ResNet50 training in FP32 on 8 GPUs using 768 batch size (96 per GPU)
+# Usage: ./RN50_FP32_8GPU.sh <path to this repository> <additional flags>
+
+# FP32 runs need a smaller per-GPU batch (96 vs 208 for FP16) to fit in memory.
+"$1/runner" -n 8 -b 96 --dtype float32 --model-prefix model ${@:2}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/SCORE_FP16.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script scores a ResNet50 checkpoint in FP16 on 1 GPU using batch size 128
+# Usage: ./SCORE_FP16.sh <model prefix> <epoch> <additional flags>
+
+# NOTE(review): $1/$2 are unquoted — prefixes containing spaces will break; confirm and quote if needed.
+./runner -n 1 -b 128 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

+ 19 - 0
MxNet/Classification/RN50v1.5/examples/SCORE_FP32.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script scores a ResNet50 checkpoint in FP32 on 1 GPU using batch size 64
+# Usage: ./SCORE_FP32.sh <model prefix> <epoch> <additional flags>
+
+# NOTE(review): $1/$2 are unquoted — prefixes containing spaces will break; confirm and quote if needed.
+./runner -n 1 -b 64 --dtype float32 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

+ 463 - 0
MxNet/Classification/RN50v1.5/fit.py

@@ -0,0 +1,463 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" example train fit utility """
+import logging
+import os
+import time
+import re
+import math
+import sys
+import mxnet as mx
+from report import Report
+from benchmarking import BenchmarkingDataIter
+
+def get_epoch_size(args, kv):
+    """Return the number of batches per epoch for one worker.
+
+    Examples are divided evenly across ``kv.num_workers``; the trailing
+    partial batch still counts as a full step (hence the ceil).
+    """
+    return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
+
+def _get_lr_scheduler(args, kv):
+    """Build the learning-rate schedule from CLI arguments.
+
+    Returns a ``(base_lr, scheduler)`` pair. ``scheduler`` is None when no
+    decay is requested (``lr_factor >= 1``) or every decay step already lies
+    in the past relative to the resume epoch.
+    """
+    if 'lr_factor' not in args or args.lr_factor >= 1:
+        return (args.lr, None)
+    epoch_size = get_epoch_size(args, kv)
+    begin_epoch = args.load_epoch if args.load_epoch else 0
+    # A value like "pow2" in --lr-step-epochs selects polynomial decay with
+    # the given power instead of step decay.
+    if 'pow' in args.lr_step_epochs:
+        lr = args.lr
+        max_up = args.num_epochs * epoch_size
+        pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs))
+        poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
+        return (lr, poly_sched)
+    step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
+    lr = args.lr
+    # When resuming from a checkpoint, pre-apply the decay for every step
+    # boundary that has already been passed.
+    for s in step_epochs:
+        if begin_epoch >= s:
+            lr *= args.lr_factor
+    if lr != args.lr:
+        logging.info('Adjust learning rate to %e for epoch %d',
+                     lr, begin_epoch)
+
+    # Remaining decay points, expressed in iterations from the resume point.
+    steps = [epoch_size * (x - begin_epoch)
+             for x in step_epochs if x - begin_epoch > 0]
+    if steps:
+        if kv:
+            num_workers = kv.num_workers
+        else:
+            num_workers = 1
+        # NOTE(review): this recomputes what get_epoch_size() already returned
+        # above (kv is always truthy here) — presumably redundant; confirm.
+        epoch_size = math.ceil(int(args.num_examples/num_workers)/args.batch_size)
+        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
+                                                         base_lr=args.lr, warmup_steps=epoch_size * args.warmup_epochs,
+                                                         warmup_mode=args.warmup_strategy))
+    else:
+        return (lr, None)
+
+def _load_model(args, rank=0):
+    """Load a checkpoint as ``(symbol, arg_params, aux_params)``.
+
+    Returns ``(None, None, None)`` when --load-epoch was not given. A
+    non-zero rank prefers its own per-rank checkpoint ("<prefix>-<rank>")
+    when one exists on disk, matching how _save_model names them.
+    """
+    if 'load_epoch' not in args or args.load_epoch is None:
+        return (None, None, None)
+    assert args.model_prefix is not None
+    model_prefix = args.model_prefix
+    if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
+        model_prefix += "-%d" % (rank)
+    sym, arg_params, aux_params = mx.model.load_checkpoint(
+        model_prefix, args.load_epoch)
+    logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch)
+    return (sym, arg_params, aux_params)
+
+
+def _save_model(args, rank=0):
+    """Return an epoch-end checkpoint callback, or None if --model-prefix is unset.
+
+    Rank 0 saves under the bare prefix; other ranks append "-<rank>" so
+    workers never clobber each other's files.
+    """
+    if args.model_prefix is None:
+        return None
+    return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
+        args.model_prefix, rank), period=args.save_period)
+
+
+def add_fit_args(parser):
+    """Register all training-related CLI options on *parser*.
+
+    parser : argparse.ArgumentParser
+    Returns the "Training" argument group the options were added to.
+    """
+    train = parser.add_argument_group('Training', 'model training')
+    train.add_argument('--num-layers', type=int,
+                       help='number of layers in the neural network, \
+                             required by some networks such as resnet')
+    train.add_argument('--gpus', type=str,
+                       help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
+    train.add_argument('--kv-store', type=str, default='device',
+                       help='key-value store type')
+    train.add_argument('--num-epochs', type=int, default=100,
+                       help='max num of epochs')
+    train.add_argument('--lr', type=float, default=0.1,
+                       help='initial learning rate')
+    train.add_argument('--lr-factor', type=float, default=0.1,
+                       help='the ratio to reduce lr on each step')
+    train.add_argument('--lr-step-epochs', type=str,
+                       help='the epochs to reduce the lr, e.g. 30,60')
+    train.add_argument('--initializer', type=str, default='default',
+                       help='the initializer type')
+    train.add_argument('--optimizer', type=str, default='sgd',
+                       help='the optimizer type')
+    train.add_argument('--mom', type=float, default=0.9,
+                       help='momentum for sgd')
+    train.add_argument('--wd', type=float, default=0.0001,
+                       help='weight decay for sgd')
+    train.add_argument('--batch-size', type=int, default=208,
+                       help='the batch size')
+    train.add_argument('--disp-batches', type=int, default=20,
+                       help='show progress for every n batches')
+    train.add_argument('--model-prefix', type=str,
+                       help='model prefix')
+    train.add_argument('--save-period', type=int, default=1, help='params saving period')
+    # NOTE(review): --monitor is registered on the bare parser instead of the
+    # "Training" group like every other option — looks unintentional; confirm.
+    parser.add_argument('--monitor', dest='monitor', type=int, default=0,
+                        help='log network parameters every N iters if larger than 0')
+    train.add_argument('--load-epoch', type=int,
+                       help='load the model on an epoch using the model-load-prefix')
+    train.add_argument('--loss', type=str, default='',
+                       help='show the cross-entropy or nll loss. ce strands for cross-entropy, nll-loss stands for likelihood loss')
+    train.add_argument('--test-io', type=int, default=0,
+                       help='1 means test reading speed without training')
+    train.add_argument('--dtype', type=str, default='float16',
+                       help='precision: float32 or float16')
+    train.add_argument('--gc-type', type=str, default='none',
+                       help='type of gradient compression to use, \
+                             takes `2bit` or `none` for now')
+    train.add_argument('--gc-threshold', type=float, default=0.5,
+                       help='threshold for 2bit gradient compression')
+    # additional parameters for large batch sgd
+    train.add_argument('--macrobatch-size', type=int, default=0,
+                       help='distributed effective batch size')
+    train.add_argument('--warmup-epochs', type=int, default=5,
+                       help='the epochs to ramp-up lr to scaled large-batch value')
+    train.add_argument('--warmup-strategy', type=str, default='linear',
+                       help='the ramping-up strategy for large batch sgd')
+    train.add_argument('--logging-dir', type=str, default='logs')
+    train.add_argument('--log', type=str, default='')
+    train.add_argument('--bn-gamma-init0', action='store_true')
+    train.add_argument('--epoch-size',type=int, default=0,
+                       help='set number of batches in an epoch. useful for debugging')
+    #train.add_argument('--tensorboard', type=str, default='',
+    #                   help='log parameters to visualize in tensorboard every epoch. takes name to specify as tensorboard run. Empty means tensorboard logging is disabled')
+    train.add_argument('--profile-worker-suffix', type=str, default='',
+                       help='profile workers actions into this file. During distributed training\
+                             filename saved will be rank1_ followed by this suffix')
+    train.add_argument('--profile-server-suffix', type=str, default='',
+                       help='profile server actions into a file with name like rank1_ followed by this suffix \
+                             during distributed training')
+    train.add_argument('--report', type=str, help='file where to save report')
+    train.add_argument('--only-inference', action='store_true', help='do not train, only inference (for benchmarking)')
+    train.add_argument('--no-metrics', action='store_true', help='do not calculate evaluation metrics (for benchmarking)')
+    return train
+
+
+def fit(args, network, data_loader, **kwargs):
+    """
+    train a model
+    args : argparse returns
+    network : the symbol definition of the neural network
+    data_loader : function that returns the train and val data iterators
+
+    Optional kwargs: 'arg_params'/'aux_params' (pre-loaded weights) and
+    'batch_end_callback' (extra per-batch callbacks). Note: args.use_dali
+    and args.benchmark_iters are read here but registered by sibling
+    modules (dali/benchmarking argument groups) — presumably; confirm.
+    """
+
+    start_time = time.time()
+
+    # kvstore
+    kv = mx.kvstore.create(args.kv_store)
+    if args.gc_type != 'none':
+        kv.set_gradient_compression({'type': args.gc_type,
+                                     'threshold': args.gc_threshold})
+    if args.profile_server_suffix:
+        mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server')
+        mx.profiler.set_state(state='run', profile_process='server')
+
+    if args.profile_worker_suffix:
+        # In distributed runs each worker profiles into its own rank-prefixed file.
+        if kv.num_workers > 1:
+            filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
+        else:
+            filename = args.profile_worker_suffix
+        mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
+        mx.profiler.set_state(state='run', profile_process='worker')
+
+    # logging
+    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    logging.info('start with arguments %s', args)
+
+    epoch_size = get_epoch_size(args, kv)
+
+    # data iterators
+    (train, val) = data_loader(args, kv)
+    if 'dist' in args.kv_store and not 'async' in args.kv_store:
+        logging.info('Resizing training data to %d batches per machine', epoch_size)
+        # resize train iter to ensure each machine has same number of batches per epoch
+        # if not, dist_sync can hang at the end with one machine waiting for other machines
+        if not args.use_dali:
+            train = mx.io.ResizeIter(train, epoch_size)
+
+    if args.test_io:
+        # Data-reading benchmark mode: drain the train iterator, print the
+        # I/O throughput, and return without training.
+        tic = time.time()
+        for i, batch in enumerate(train):
+            if isinstance(batch, list):
+                for b in batch:
+                    for j in b.data:
+                        j.wait_to_read()
+            else:
+                for j in batch.data:
+                    j.wait_to_read()
+            if (i + 1) % args.disp_batches == 0:
+                logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
+                             args.disp_batches * args.batch_size / (time.time() - tic))
+                tic = time.time()
+        return
+
+    # load model: caller-supplied weights take precedence over --load-epoch.
+    if 'arg_params' in kwargs and 'aux_params' in kwargs:
+        arg_params = kwargs['arg_params']
+        aux_params = kwargs['aux_params']
+    else:
+        sym, arg_params, aux_params = _load_model(args, kv.rank)
+
+    # save model
+    checkpoint = _save_model(args, kv.rank)
+    epoch_end_callbacks = []
+    if checkpoint:
+        epoch_end_callbacks.append(checkpoint)
+
+    # devices for training
+    devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
+        mx.gpu(int(i)) for i in args.gpus.split(',')]
+
+    # learning rate
+    lr, lr_scheduler = _get_lr_scheduler(args, kv)
+
+    # create model
+    model = mx.mod.Module(
+        context=devs,
+        symbol=network
+    )
+
+    # multi_precision keeps an FP32 master copy of weights for FP16 training.
+    optimizer_params = {
+        'learning_rate': lr,
+        'wd': args.wd,
+        'lr_scheduler': lr_scheduler,
+        'multi_precision': True}
+
+    # Only a limited number of optimizers have 'momentum' property
+    has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
+    if args.optimizer in has_momentum:
+        optimizer_params['momentum'] = args.mom
+
+    monitor = mx.mon.Monitor(
+        args.monitor, pattern=".*") if args.monitor > 0 else None
+
+    # A limited number of optimizers have a warmup period
+    has_warmup = {'lbsgd', 'lbnag'}
+    if args.optimizer in has_warmup:
+        if 'dist' in args.kv_store:
+            nworkers = kv.num_workers
+        else:
+            nworkers = 1
+        epoch_size = args.num_examples / args.batch_size / nworkers
+
+        if epoch_size < 1:
+            epoch_size = 1
+        macrobatch_size = args.macrobatch_size
+        if macrobatch_size < args.batch_size * nworkers:
+            macrobatch_size = args.batch_size * nworkers
+        # batch_scale = how many real batches make up one macro-batch update.
+        batch_scale = math.ceil(
+            float(macrobatch_size) / args.batch_size / nworkers)
+        optimizer_params['updates_per_epoch'] = epoch_size
+        optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
+        optimizer_params['batch_scale'] = batch_scale
+        optimizer_params['warmup_strategy'] = args.warmup_strategy
+        optimizer_params['warmup_epochs'] = args.warmup_epochs
+        optimizer_params['num_epochs'] = args.num_epochs
+
+    if args.initializer == 'default':
+        initializer = mx.init.Xavier(
+            rnd_type='gaussian', factor_type="in", magnitude=2)
+    elif args.initializer == 'xavier':
+        initializer = mx.init.Xavier()
+    elif args.initializer == 'msra':
+        initializer = mx.init.MSRAPrelu()
+    elif args.initializer == 'orthogonal':
+        initializer = mx.init.Orthogonal()
+    elif args.initializer == 'normal':
+        initializer = mx.init.Normal()
+    elif args.initializer == 'uniform':
+        initializer = mx.init.Uniform()
+    elif args.initializer == 'one':
+        initializer = mx.init.One()
+    elif args.initializer == 'zero':
+        initializer = mx.init.Zero()
+
+    # evaluation metrics (skipped entirely with --no-metrics for benchmarking)
+    if not args.no_metrics:
+        eval_metrics = ['crossentropy', 'accuracy']
+        eval_metrics.append(mx.metric.create(
+            'top_k_accuracy', top_k=5))
+    else:
+        eval_metrics = []
+
+    supported_loss = ['ce', 'nll_loss']
+    if len(args.loss) > 0:
+        # ce or nll loss is only applicable to softmax output
+        loss_type_list = args.loss.split(',')
+        if 'softmax_output' in network.list_outputs():
+            for loss_type in loss_type_list:
+                loss_type = loss_type.strip()
+                if loss_type == 'nll':
+                    loss_type = 'nll_loss'
+                if loss_type not in supported_loss:
+                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
+                                    'negative likelihood loss is supported!')
+                else:
+                    eval_metrics.append(mx.metric.create(loss_type))
+        else:
+            logging.warning("The output is not softmax_output, loss argument will be skipped!")
+
+    # callbacks that run after each batch
+    batch_end_callbacks = []
+    batch_end_callbacks.append(mx.callback.Speedometer(
+        args.batch_size, args.disp_batches))
+
+    if 'batch_end_callback' in kwargs:
+        cbs = kwargs['batch_end_callback']
+        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
+
+
+    report = Report('resnet{}'.format(args.num_layers), len(args.gpus.split(',')), sys.argv)
+
+    # Wrap iterators to measure per-batch data-loading time (and optionally
+    # cap the number of iterations for benchmarking runs).
+    train = BenchmarkingDataIter(train, args.benchmark_iters)
+    val = BenchmarkingDataIter(val, args.benchmark_iters)
+
+    class Gatherer:
+        """Accumulates per-epoch timing/metric values and pushes them into
+        the report at epoch end. The first batch of each epoch is excluded
+        from the timing average (warm-up / initialization cost)."""
+        def __init__(self, report, mode, data_iter, total_bs=None):
+            self.report = report
+            self.mode = mode
+            self.total_bs = total_bs
+            self.data_iter = data_iter
+            self.clear()
+
+        def clear(self):
+            # Reset all accumulators; called at construction and after each epoch.
+            self.num = 0
+            self.top1 = 0
+            self.top5 = 0
+            self.loss = 0
+            self.time = 0
+            self.tic = 0
+
+        def gather_metrics(self, data):
+            # Batch-end callback: snapshot the current global metric values
+            # and accumulate inter-batch wall time (first batch skipped).
+            params = dict(data.eval_metric.get_global_name_value())
+
+            if self.num != 0:
+                self.time += time.time() - self.tic
+            self.num += 1
+            if not args.no_metrics:
+                self.top1 = params['accuracy']
+                self.top5 = params['top_k_accuracy_5']
+                self.loss = params['cross-entropy']
+
+            self.tic = time.time()
+
+        def add_metrics(self, *a, **k):
+            # Epoch-end callback: flush averaged values into the report and reset.
+            top1 = self.top1 * 100
+            top5 = self.top5 * 100
+            loss = self.loss
+            if self.num <= 1:
+                time = float('nan')
+            else:
+                time = self.time / (self.num - 1)
+            data = self.data_iter.get_avg_time_and_clear()
+            if self.total_bs is not None:
+                compute_ips = self.total_bs / (time - data)
+                total_ips = self.total_bs / time
+
+            if not args.no_metrics:
+                self.report.add_value('{}.top1'.format(self.mode), top1)
+                self.report.add_value('{}.top5'.format(self.mode), top5)
+                self.report.add_value('{}.loss'.format(self.mode), loss)
+            self.report.add_value('{}.time'.format(self.mode), time)
+            if self.total_bs is not None:
+                self.report.add_value('{}.total_ips'.format(self.mode), total_ips)
+            self.clear()
+
+    def save_report(*a, **k):
+        # Persist the report after every epoch so partial results survive crashes.
+        report.set_total_duration(time.time() - start_time)
+        if args.report:
+            report.save(args.report)
+
+    train_gatherer = Gatherer(report, 'train', train, args.batch_size)
+    eval_gatherer = Gatherer(report, 'val', val, args.batch_size)
+
+    batch_end_callbacks = [train_gatherer.gather_metrics] + batch_end_callbacks
+    epoch_end_callbacks = [train_gatherer.add_metrics, save_report] + epoch_end_callbacks
+
+    eval_batch_end_callbacks = [eval_gatherer.gather_metrics]
+    eval_end_callbacks = [eval_gatherer.add_metrics, save_report]
+
+    # run (with --only-inference num_epoch=0, so fit() only binds/initializes
+    # the module and the scoring loop below does the actual work)
+    model.fit(train,
+              begin_epoch=args.load_epoch if args.load_epoch else 0,
+              num_epoch=args.num_epochs if not args.only_inference else 0,
+              eval_data=val,
+              eval_metric=eval_metrics,
+              kvstore=kv,
+              optimizer=args.optimizer,
+              optimizer_params=optimizer_params,
+              initializer=initializer,
+              arg_params=arg_params,
+              aux_params=aux_params,
+              batch_end_callback=batch_end_callbacks,
+              epoch_end_callback=epoch_end_callbacks,
+              eval_batch_end_callback=eval_batch_end_callbacks,
+              eval_end_callback=eval_end_callbacks,
+              allow_missing=True,
+              monitor=monitor)
+
+    if args.only_inference:
+        for epoch in range(args.num_epochs):
+            score = model.score(val, eval_metrics, batch_end_callback=eval_batch_end_callbacks, score_end_callback=eval_end_callbacks, epoch=epoch)
+            print('-------------')
+            for name, value in score:
+                print('{}: {}'.format(name, value))
+
+    # NOTE(review): these were presumably meant to STOP the profiler at
+    # shutdown, but state='run' re-enables it — looks like it should be
+    # state='stop'; confirm against mx.profiler.set_state docs.
+    if args.profile_server_suffix:
+        mx.profiler.set_state(state='run', profile_process='server')
+    if args.profile_worker_suffix:
+        mx.profiler.set_state(state='run', profile_process='worker')
+
+    save_report()
+
+    print('Experiment took: {} sec'.format(report.total_duration))

BIN
MxNet/Classification/RN50v1.5/img/training_accuracy.png


BIN
MxNet/Classification/RN50v1.5/img/training_loss.png


BIN
MxNet/Classification/RN50v1.5/img/validation_accuracy.png


+ 57 - 0
MxNet/Classification/RN50v1.5/report.py

@@ -0,0 +1,57 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Report JSON file structure:
+# - "model"          : architecture of the model (e.g. "resnet50").
+# - "ngpus"          : number of gpus on which training was performed.
+# - "total_duration" : total duration of training in seconds.
+# - "cmd"            : list of application arguments.
+# - "metrics"        : per epoch metrics for train and validation
+#                      (some of below metrics may not exist in the report,
+#                       depending on application arguments)
+#       - "train.top1"      : training top1 accuracy in epoch.
+#       - "train.top5"      : training top5 accuracy in epoch.
+#       - "train.loss"      : training loss in epoch.
+#       - "train.time"      : average training time of iteration in seconds.
+#       - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
+#       - "val.top1", "val.top5", "val.loss", "val.time", "val.total_ips" : the same but for validation.
+
+import json
+from collections import defaultdict, OrderedDict
+
+class Report:
+    """Collects per-epoch training/validation metrics and serializes them
+    to the JSON structure documented at the top of this file."""
+
+    def __init__(self, model_name, ngpus, cmd):
+        # model_name: architecture label (e.g. "resnet50")
+        # ngpus: number of GPUs used for the run
+        # cmd: the application's argv, stored verbatim for reproducibility
+        self.model_name = model_name
+        self.ngpus = ngpus
+        self.cmd = cmd
+        self.total_duration = 0
+        # metric name -> list of per-epoch values (auto-created on first use)
+        self.metrics = defaultdict(lambda: [])
+
+    def add_value(self, metric, value):
+        """Append one epoch's *value* under the given *metric* name."""
+        self.metrics[metric].append(value)
+
+    def set_total_duration(self, duration):
+        """Record the total run duration in seconds."""
+        self.total_duration = duration
+
+    def save(self, filename):
+        """Write the report as indented JSON to *filename* (overwrites)."""
+        report = OrderedDict([
+            ('model', self.model_name),
+            ('ngpus', self.ngpus),
+            ('total_duration', self.total_duration),
+            ('cmd', self.cmd),
+            ('metrics', self.metrics),
+        ])
+        with open(filename, 'w') as f:
+            json.dump(report, f, indent=4)

+ 376 - 0
MxNet/Classification/RN50v1.5/resnet.py

@@ -0,0 +1,376 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+(Original author Wei Wu) by Antti-Pekka Hynninen
+
+"Flexible Layout" (fl) version created by Dick Carter.
+
+Implementing the original resnet ILSVRC 2015 winning network from:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
+'''
+import mxnet as mx
+import numpy as np
+import random
+
+# Transform a symbol from one layout to another, or do nothing if they have the same layout
def transform_layout(data, from_layout, to_layout):
    """Return `data` converted from `from_layout` to `to_layout`.

    Supported layouts are 'NCHW' and 'NHWC'.  When the two layouts are the
    same the symbol is returned untouched; otherwise a transpose symbol is
    inserted.  Raises ValueError for any unsupported layout string.
    """
    supported = ('NCHW', 'NHWC')
    # Validate in argument order so the error message matches the bad input.
    for layout in (from_layout, to_layout):
        if layout not in supported:
            raise ValueError('Not prepared to handle layout: {}'.format(layout))

    if from_layout == to_layout:
        return data
    # Only two supported layouts exist, so a mismatch is NCHW <-> NHWC.
    axes = (0, 2, 3, 1) if from_layout == 'NCHW' else (0, 3, 1, 2)
    return mx.sym.transpose(data, axes=axes)
+
+# A BatchNorm wrapper that responds to the input layout
def batchnorm(data, io_layout, batchnorm_layout, **kwargs):
    """BatchNorm wrapper that runs the kernel in `batchnorm_layout`.

    The input is transposed from `io_layout` to `batchnorm_layout` when they
    differ, normalized over the channel axis of that layout, and transposed
    back so callers always see `io_layout` data.
    """
    bn_input = transform_layout(data, io_layout, batchnorm_layout)
    channel_axis = 1 if batchnorm_layout == 'NCHW' else 3
    normalized = mx.sym.BatchNorm(data=bn_input, axis=channel_axis, **kwargs)
    return transform_layout(normalized, batchnorm_layout, io_layout)
+
+# A BatchNormAddRelu wrapper that responds to the input layout
def batchnorm_add_relu(data, addend, io_layout, batchnorm_layout, **kwargs):
    """Fused BatchNorm+Add+ReLU wrapper that runs in `batchnorm_layout`.

    Both the main input and the residual `addend` are transposed from
    `io_layout` to `batchnorm_layout` as needed; the fused result is
    transposed back to `io_layout` before being returned.
    """
    main_input = transform_layout(data, io_layout, batchnorm_layout)
    residual_input = transform_layout(addend, io_layout, batchnorm_layout)
    channel_axis = 1 if batchnorm_layout == 'NCHW' else 3
    fused = mx.sym.BatchNormAddRelu(data=main_input,
                                    addend=residual_input,
                                    axis=channel_axis, **kwargs)
    return transform_layout(fused, batchnorm_layout, io_layout)
+
+# A Pooling wrapper that responds to the input layout
def pooling(data, io_layout, pooling_layout, **kwargs):
    """Pooling wrapper that runs the kernel in `pooling_layout`.

    Transposes the input from `io_layout` when the two layouts conflict and
    transposes the pooled result back, so callers always see `io_layout`.
    """
    pool_input = transform_layout(data, io_layout, pooling_layout)
    pooled = mx.sym.Pooling(data=pool_input, layout=pooling_layout, **kwargs)
    return transform_layout(pooled, pooling_layout, io_layout)
+
+# Assumption is that data comes in and out in the 'conv_layout' format.
+# If this format is different from the 'batchnorm_layout' format, then the batchnorm() routine
+# will introduce transposes on both sides of the mx.sym.BatchNorm symbol
def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True,
                  workspace=256, memonger=False, conv_layout='NCHW', batchnorm_layout='NCHW',
                  verbose=False, cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
                  fuse_bn_relu=False, fuse_bn_add_relu=False, cudnn_tensor_core_only=False):
    """Return ResNet Unit symbol for building ResNet
    Parameters
    ----------
    data : str
        Input data
    num_filter : int
        Number of output channels
    bnf : int
        Bottle neck channels factor with regard to num_filter
    stride : tuple
        Stride used in convolution
    dim_match : Boolean
        True means channel number between input and output is the same, otherwise means differ
    name : str
        Base name of the operators
    workspace : int
        Workspace used in convolution operator
    bottle_neck : Boolean
        True builds the 3-conv (1x1 reduce, 3x3, 1x1 expand) bottleneck unit;
        False builds the 2-conv (3x3, 3x3) basic unit
    memonger : Boolean
        If True, tag the shortcut with mirror_stage for the memory-monger pass
    conv_layout, batchnorm_layout : str
        Layouts ('NCHW'/'NHWC') for the convolution and batchnorm kernels;
        mismatches introduce transposes inside the batchnorm() wrapper
    verbose : Boolean
        Ask cuDNN/cuBLAS ops to print the algorithms they select
    cudnn_bn_off : Boolean
        Disable the cuDNN BatchNorm implementation
    bn_eps, bn_mom : float
        BatchNorm epsilon and momentum
    conv_algo : int
        Fixed cuDNN convolution algo number (-1 lets cuDNN choose)
    fuse_bn_relu, fuse_bn_add_relu : Boolean
        Use fused BatchNorm+ReLU / BatchNorm+Add+ReLU kernels
    cudnn_tensor_core_only : Boolean
        Restrict cuDNN algo choice to Tensor Core implementations
    """

    # When ReLU is fused into BatchNorm, the BN op applies the activation
    # itself and the separate Activation symbols below are skipped.
    act = 'relu' if fuse_bn_relu else None
    if bottle_neck:
        # 1x1 "reduce" convolution: shrinks channels to num_filter/4.
        conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
                                   no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
        # 3x3 convolution; carries the unit's spatial stride (downsampling).
        conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
                                   no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn2', cudnn_off=cudnn_bn_off, act_type=act)
        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') if not fuse_bn_relu else bn2
        # 1x1 "expand" convolution: restores the full num_filter channels.
        conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
                                   workspace=workspace, name=name + '_conv3', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)

        # Identity shortcut when shapes match; otherwise a strided 1x1
        # "projection" conv + BN (no activation on the shortcut branch).
        if dim_match:
            shortcut = data
        else:
            conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                                            workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
                                         cudnn_algo_verbose=verbose,
                                         cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                         cudnn_tensor_core_only=cudnn_tensor_core_only)
            shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_sc', cudnn_off=cudnn_bn_off)
        if memonger:
            shortcut._set_attr(mirror_stage='True')

        # Final BN + residual add + ReLU, either as one fused kernel or as
        # separate BN / add / Activation symbols.
        if fuse_bn_add_relu:
            return batchnorm_add_relu(data=conv3, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
        else:
            bn3 = batchnorm(data=conv3, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
            return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3')

    else:
        # Basic (non-bottleneck) unit: two 3x3 convolutions; the first one
        # carries the unit's stride.
        conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
                                      no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)
        bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                        fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
        conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
                                      no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
                                   cudnn_algo_verbose=verbose,
                                   cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                   cudnn_tensor_core_only=cudnn_tensor_core_only)

        # Shortcut branch: identity, or strided 1x1 projection conv + BN.
        if dim_match:
            shortcut = data
        else:
            conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                                            workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
                                         cudnn_algo_verbose=verbose,
                                         cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                         cudnn_tensor_core_only=cudnn_tensor_core_only)
            shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_sc', cudnn_off=cudnn_bn_off)
        if memonger:
            shortcut._set_attr(mirror_stage='True')

        if fuse_bn_add_relu:
            return batchnorm_add_relu(data=conv2, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
        else:
            bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                            fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
            return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu2')
+
def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, workspace=256, dtype='float32', memonger=False,
           input_layout='NCHW', conv_layout='NCHW',  batchnorm_layout='NCHW', pooling_layout='NCHW', verbose=False,
           cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
           fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True):
    """Return ResNet symbol of
    Parameters
    ----------
    units : list
        Number of units in each stage
    num_stages : int
        Number of stage
    filter_list : list
        Channel size of each stage
    num_classes : int
        Ouput size of symbol
    dataset : str
        Dataset type, only cifar10 and imagenet supports
    workspace : int
        Workspace used in convolution operator
    dtype : str
        Precision (float32 or float16)
    memonger : boolean
        Activates "memory monger" to reduce the model's memory footprint
    input_layout : str
        interpretation (e.g. NCHW vs NHWC) of data provided by the i/o pipeline (may introduce transposes
        if in conflict with 'layout' above)
    conv_layout : str
        interpretation (e.g. NCHW vs NHWC) of data for convolution operation.
    batchnorm_layout : str
        directs which kernel performs the batchnorm (may introduce transposes if in conflict with 'conv_layout' above)
    pooling_layout : str
        directs which kernel performs the pooling (may introduce transposes if in conflict with 'conv_layout' above)
    use_dali : boolean
        when False an identity/cast symbol is inserted on the input
        (see comment below); DALI pipelines feed data directly
    """

    act = 'relu' if fuse_bn_relu else None
    num_unit = len(units)
    assert(num_unit == num_stages)
    data = mx.sym.Variable(name='data')
    if not use_dali:
        # double buffering of data
        # (fp32 gets a no-op identity; fp16 gets a cast from the fp32 input)
        if dtype == 'float32':
            data = mx.sym.identity(data=data, name='id')
        else:
            if dtype == 'float16':
                data = mx.sym.Cast(data=data, dtype=np.float16)
    (nchannel, height, width) = image_shape

    # Insert transpose as needed to get the input layout to match the desired processing layout
    data = transform_layout(data, input_layout, conv_layout)

    if height <= 32:            # such as cifar10
        # Small-image stem: single 3x3 stride-1 conv, no max-pool.
        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
                                  no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
                                  cudnn_algo_verbose=verbose,
                                  cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                  cudnn_tensor_core_only=force_tensor_core)
        # Is this BatchNorm supposed to be here?
        body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                         fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off)
    else:                       # often expected to be 224 such as imagenet
        # ImageNet stem: 7x7 stride-2 conv, BN(+ReLU), 3x3 stride-2 max-pool.
        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
                                  no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
                                  cudnn_algo_verbose=verbose,
                                  cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
                                  cudnn_tensor_core_only=force_tensor_core)
        body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                         fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off, act_type=act)
        if not fuse_bn_relu:
            body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
        body = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
                       kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')

    # Stage i: one (possibly strided) projection unit followed by
    # units[i]-1 identity units.  Stage 1 keeps stride 1; later stages
    # downsample with stride 2 in their first unit.
    for i in range(num_stages):
        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
                             name='stage%d_unit%d' % (i + 1, 1),
                             bottle_neck=bottle_neck, workspace=workspace,
                             memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                             verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps=bn_eps, bn_mom=bn_mom,
                             conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
                             cudnn_tensor_core_only=force_tensor_core)
        for j in range(units[i]-1):
            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
                                 bottle_neck=bottle_neck, workspace=workspace,
                                 memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
                                 verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps = bn_eps, bn_mom=bn_mom,
                                 conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
                                 cudnn_tensor_core_only=force_tensor_core)
    # bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
    # relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
    # Although kernel is not used here when global_pool=True, we should put one
    pool1 = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
                    global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
    flat = mx.sym.Flatten(data=pool1)
    fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', cublas_algo_verbose=verbose)
    # Classifier output is computed in fp32 even for fp16 models (softmax
    # numerical stability).
    if dtype == 'float16':
        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
    return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+
def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32',
               input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW',
               verbose=False, seed=None, cudnn_bn_off=False, batchnorm_eps=2e-5, batchnorm_mom=0.9,
               conv_algo=-1, fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True, **kwargs):
    """
    Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
    (Original author Wei Wu) by Antti-Pekka Hynninen
    Implementing the original resnet ILSVRC 2015 winning network from:
    Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"

    Maps `num_layers` to the per-stage unit counts and channel widths of the
    corresponding ResNet variant, then builds the symbol via resnet().
    `image_shape` is a comma-separated "C,H,W" string; `seed`, when given,
    seeds Python, NumPy and MXNet RNGs for reproducibility.  Extra **kwargs
    from the argument parser are accepted and ignored.
    """
    if seed is not None:
        print('Setting seeds to %s' % (seed,))
        random.seed(seed)
        np.random.seed(seed)
        mx.random.seed(seed)

    image_shape = [int(l) for l in image_shape.split(',')]
    (nchannel, height, width) = image_shape
    # Small images (CIFAR-style): 3 stages with a uniform unit count derived
    # from num_layers; bottleneck units for deep (>=164-layer) variants.
    if height <= 28:
        num_stages = 3
        if (num_layers-2) % 9 == 0 and num_layers >= 164:
            per_unit = [(num_layers-2)//9]
            filter_list = [16, 64, 128, 256]
            bottle_neck = True
        elif (num_layers-2) % 6 == 0 and num_layers < 164:
            per_unit = [(num_layers-2)//6]
            filter_list = [16, 16, 32, 64]
            bottle_neck = False
        else:
            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
        units = per_unit * num_stages
    else:
        # ImageNet-style: 4 stages; >=50 layers use bottleneck units with the
        # wider channel progression.
        if num_layers >= 50:
            filter_list = [64, 256, 512, 1024, 2048]
            bottle_neck = True
        else:
            filter_list = [64, 64, 128, 256, 512]
            bottle_neck = False
        num_stages = 4
        if num_layers == 18:
            units = [2, 2, 2, 2]
        elif num_layers == 34:
            units = [3, 4, 6, 3]
        elif num_layers == 50:
            units = [3, 4, 6, 3]
        elif num_layers == 101:
            units = [3, 4, 23, 3]
        elif num_layers == 152:
            units = [3, 8, 36, 3]
        elif num_layers == 200:
            units = [3, 24, 36, 3]
        elif num_layers == 269:
            units = [3, 30, 48, 8]
        else:
            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))

    return resnet(units             = units,
                  num_stages        = num_stages,
                  filter_list       = filter_list,
                  num_classes       = num_classes,
                  image_shape       = image_shape,
                  bottle_neck       = bottle_neck,
                  workspace         = conv_workspace,
                  dtype             = dtype,
                  input_layout      = input_layout,
                  conv_layout       = conv_layout,
                  batchnorm_layout  = batchnorm_layout,
                  pooling_layout    = pooling_layout,
                  verbose           = verbose,
                  cudnn_bn_off      = cudnn_bn_off,
                  bn_eps            = batchnorm_eps,
                  bn_mom            = batchnorm_mom,
                  conv_algo         = conv_algo,
                  fuse_bn_relu      = fuse_bn_relu,
                  fuse_bn_add_relu  = fuse_bn_add_relu,
                  force_tensor_core = force_tensor_core,
                  use_dali          = use_dali)

+ 96 - 0
MxNet/Classification/RN50v1.5/runner

@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os, socket
from argparse import ArgumentParser
import warnings


# Thin launcher around train.py: exposes a few high-level knobs (GPU count,
# per-GPU batch size, epochs, base LR, precision, DALI on/off) and expands
# them into the full train.py command line.  Any arguments this parser does
# not recognize are forwarded to train.py untouched.
optparser = ArgumentParser(description="train resnet50 with MXNet")
optparser.add_argument("-n", "--n-GPUs", type=int, default=8, help="number of GPUs to use; " +\
                       "default = 8")
optparser.add_argument("-b", "--batch-size", type=int, default=208, help="batch size per GPU; " +\
                       "default = 208")
optparser.add_argument("-e", "--num-epochs", type=int, default=90, help="number of epochs; " +\
                       "default = 90")
optparser.add_argument("-l", "--lr", type=float, default=0.1, help="learning rate; default = 0.1; " +\
                       "IMPORTANT: true learning rate will be calculated as `lr * batch_size/256`")
optparser.add_argument("--no-val", action="store_true",
                       help="if set no validation will be performed")
optparser.add_argument("--no-dali", action="store_true", default=False,
                       help="use default MXNet pipeline instead of DALI")
optparser.add_argument("--data-root", type=str, help="Directory with RecordIO data files", default="/data/imagenet/train-val-recordio-passthrough")
optparser.add_argument("--data-nthreads", type=int, help="number of threads for data loading; default = 40", default=40)
optparser.add_argument("--dtype", type=str, help="Precision, float16 or float32", default="float16")

opts, args = optparser.parse_known_args()

# Number of input channels: float16 with DALI uses a 4-channel input
# (presumably channel padding for NHWC/Tensor Core efficiency — confirm
# against the DALI pipeline); everything else uses plain 3-channel RGB.
if opts.dtype == "float16":
    n_ch = str(4 - int(opts.no_dali))
else:
    n_ch = str(3)

# Scale global batch size with GPU count, then scale LR linearly with the
# global batch size (the reference LR is defined for batch 256).
opts.batch_size *= opts.n_GPUs

opts.lr *= opts.batch_size/256

# Assemble the train.py command line.
command = ""
command += "python "+os.path.dirname(__file__)+"/train.py"
command += " --num-layers 50"
command += " --data-train " + opts.data_root + "/train.rec"
command += " --data-train-idx " + opts.data_root + "/train.idx"
if not opts.no_val:
    command += " --data-val " + opts.data_root + "/val.rec"
    command += " --data-val-idx " + opts.data_root + "/val.idx"
command += " --data-nthreads " + str(opts.data_nthreads)
command += " --optimizer sgd --dtype " + opts.dtype
command += " --lr-step-epochs 30,60,80 --max-random-area 1"
command += " --min-random-area 0.05 --max-random-scale 1"
command += " --min-random-scale 1 --min-random-aspect-ratio 0.75"
command += " --max-random-aspect-ratio 1.33 --max-random-shear-ratio 0"
command += " --max-random-rotate-angle 0 --random-resized-crop 1"
command += " --random-crop 0 --random-mirror 1"
command += " --image-shape "+n_ch+",224,224 --warmup-epochs 5"
command += " --disp-batches 20"
command += " --batchnorm-mom 0.9 --batchnorm-eps 1e-5"
# float16 runs enable NHWC layouts, fused BN kernels and Tensor Core algos.
if opts.dtype == 'float16':
    command += " --fuse-bn-relu 1"
    command += " --input-layout NHWC --conv-layout NHWC"
    command += " --batchnorm-layout NHWC --pooling-layout NHWC"
    command += " --conv-algo 1 --force-tensor-core 1"
    command += " --fuse-bn-add-relu 1"

command += " --kv-store device"
if not opts.no_dali:
    command += " --use-dali"
    command += " --dali-prefetch-queue 2 --dali-nvjpeg-memory-padding 64"
command += " --lr "+str(opts.lr)
command += " --gpus " + str(list(range(opts.n_GPUs))).replace(' ', '').replace('[', '').replace(']', '')
command += " --batch-size " + str(opts.batch_size)
command += " --num-epochs " + str(opts.num_epochs)


# Pass through any extra, unrecognized arguments verbatim.
for arg in args:
    command += " " + arg

# MXNet performance-tuning environment for the child process.
os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
os.environ['MXNET_USE_TENSORRT'] = "0"
os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"

# os.system() returns a raw 16-bit wait status (e.g. 256 when the child
# exits with code 1).  Passing that straight to exit() is truncated modulo
# 256 on POSIX, so a failed training run could be reported as success.
# Collapse the status to a plain 0/1 exit code instead.
status = os.system('/bin/bash -c "'+command+'"')
exit(0 if status == 0 else 1)

+ 91 - 0
MxNet/Classification/RN50v1.5/train.py

@@ -0,0 +1,91 @@
+# Copyright 2017-2018 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+import logging
+logging.basicConfig(level=logging.DEBUG)
+import data, dali, fit
+import mxnet as mx
+import numpy as np
+
def set_imagenet_aug(aug):
    """Install the standard ImageNet training-augmentation defaults on *aug*
    (an argparse parser): per-channel mean/std normalization, random resized
    crop with horizontal mirroring, and color/PCA-lighting jitter."""
    imagenet_defaults = {
        'rgb_mean': '123.68,116.779,103.939',
        'rgb_std': '58.393,57.12,57.375',
        'random_crop': 0,
        'random_resized_crop': 1,
        'random_mirror': 1,
        'min_random_area': 0.08,
        'max_random_aspect_ratio': 4./3.,
        'min_random_aspect_ratio': 3./4.,
        'brightness': 0.4,
        'contrast': 0.4,
        'saturation': 0.4,
        'pca_noise': 0.1,
    }
    aug.set_defaults(**imagenet_defaults)
+
if __name__ == '__main__':
    # parse args
    # Each project module contributes its own argument group to one parser.
    parser = argparse.ArgumentParser(description="train resnet on imagenet",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    fit.add_fit_args(parser)
    data.add_data_args(parser)
    dali.add_dali_args(parser)
    data.add_data_aug_args(parser)
    
    # Instead, to get standard resnet augmentation on a per-use basis, invoke as in:
    # train_imagenet.py --set-resnet-aug ...
    # Finally, to get the legacy MXNet v1.2 training settings on a per-use basis, invoke as in:
    # train_imagenet.py --set-data-aug-level 3
    # Defaults below are the standard ResNet-50/ImageNet training recipe.
    parser.set_defaults(
        # network
        num_layers       = 50,

        # data
        resize           = 256,
        num_classes      = 1000,
        num_examples     = 1281167,
        image_shape      = '3,224,224',
        min_random_scale = 1, # if input image has min size k, suggest to use
                              # 256.0/x, e.g. 0.533 for 480
        # train
        num_epochs       = 90,
        lr_step_epochs   = '30,60,80',
        dtype            = 'float32'
    )
    args = parser.parse_args()

    # NOTE(review): without DALI, MXNet-pipeline augmentation is reset to
    # level 0 — presumably because augmentation is configured elsewhere in
    # that path; confirm against data.set_data_aug_level().
    if not args.use_dali:
        data.set_data_aug_level(parser, 0)

    # load network
    import resnet as net
    sym = net.get_symbol(**vars(args))

    # train
    fit.fit(args, sym, dali.get_rec_iter)