Browse Source

[ConvNets/PyT] Native AMP and update to 20.12

kkudrynski 5 years ago
parent
commit
097c7ff917
69 changed files with 857 additions and 710 deletions
  1. 1 1
      PyTorch/Classification/ConvNets/Dockerfile
  2. 23 37
      PyTorch/Classification/ConvNets/README.md
  3. 2 2
      PyTorch/Classification/ConvNets/checkpoint2model.py
  4. 14 23
      PyTorch/Classification/ConvNets/classify.py
  5. 183 0
      PyTorch/Classification/ConvNets/configs.yml
  6. 22 50
      PyTorch/Classification/ConvNets/image_classification/dataloaders.py
  7. 108 122
      PyTorch/Classification/ConvNets/image_classification/training.py
  8. 43 0
      PyTorch/Classification/ConvNets/image_classification/utils.py
  9. 50 0
      PyTorch/Classification/ConvNets/launch.py
  10. 30 37
      PyTorch/Classification/ConvNets/main.py
  11. 0 1
      PyTorch/Classification/ConvNets/requirements.txt
  12. 116 137
      PyTorch/Classification/ConvNets/resnet50v1.5/README.md
  13. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1V_resnet50_AMP_250E.sh
  14. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1V_resnet50_AMP_90E.sh
  15. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_250E.sh
  16. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_50E.sh
  17. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_90E.sh
  18. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2V_resnet50_AMP_250E.sh
  19. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2V_resnet50_AMP_90E.sh
  20. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_250E.sh
  21. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_50E.sh
  22. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_90E.sh
  23. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_RN50_AMP_90E.sh
  24. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_resnet50_AMP_250E.sh
  25. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_resnet50_AMP_90E.sh
  26. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1V_resnet50_FP32_250E.sh
  27. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1V_resnet50_FP32_90E.sh
  28. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_250E.sh
  29. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_50E.sh
  30. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_90E.sh
  31. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2V_resnet50_FP32_250E.sh
  32. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2V_resnet50_FP32_90E.sh
  33. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_250E.sh
  34. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_50E.sh
  35. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_90E.sh
  36. 0 1
      PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_RN50_TF32_90E.sh
  37. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_resnet50_TF32_250E.sh
  38. 1 0
      PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_resnet50_TF32_90E.sh
  39. 116 136
      PyTorch/Classification/ConvNets/resnext101-32x4d/README.md
  40. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1V_resnext101-32x4d_AMP_250E.sh
  41. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1V_resnext101-32x4d_AMP_90E.sh
  42. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1_RNXT101-32x4d_AMP_250E.sh
  43. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1_RNXT101-32x4d_AMP_90E.sh
  44. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_RNXT101-32x4d_AMP_90E.sh
  45. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_resnext101-32x4d_AMP_250E.sh
  46. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_resnext101-32x4d_AMP_90E.sh
  47. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1V_resnext101-32x4d_FP32_250E.sh
  48. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1V_resnext101-32x4d_FP32_90E.sh
  49. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1_RNXT101-32x4d_FP32_250E.sh
  50. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1_RNXT101-32x4d_FP32_90E.sh
  51. 0 1
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_RNXT101-32x4d_TF32_90E.sh
  52. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_resnext101-32x4d_TF32_250E.sh
  53. 1 0
      PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_resnext101-32x4d_TF32_90E.sh
  54. 120 138
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/README.md
  55. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1V_se-resnext101-32x4d_AMP_250E.sh
  56. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1V_se-resnext101-32x4d_AMP_90E.sh
  57. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1_SE-RNXT101-32x4d_AMP_250E.sh
  58. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1_SE-RNXT101-32x4d_AMP_90E.sh
  59. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_SE-RNXT101-32x4d_AMP_90E.sh
  60. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_se-resnext101-32x4d_AMP_250E.sh
  61. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_se-resnext101-32x4d_AMP_90E.sh
  62. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1V_se-resnext101-32x4d_FP32_250E.sh
  63. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1V_se-resnext101-32x4d_FP32_90E.sh
  64. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1_SE-RNXT101-32x4d_FP32_250E.sh
  65. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1_SE-RNXT101-32x4d_FP32_90E.sh
  66. 0 1
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_SE-RNXT101-32x4d_TF32_90E.sh
  67. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_se-resnext101-32x4d_TF32_250E.sh
  68. 1 0
      PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_se-resnext101-32x4d_TF32_90E.sh
  69. 1 0
      PyTorch/Classification/ConvNets/triton/deployer.py

+ 1 - 1
PyTorch/Classification/ConvNets/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.12-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt /workspace/

+ 23 - 37
PyTorch/Classification/ConvNets/README.md

@@ -1,4 +1,4 @@
-# Convolutional Networks for Image Classification in PyTorch
+# Convolutional Network for Image Classification in PyTorch
 
 In this repository you will find implementations of various image classification models.
 
@@ -9,7 +9,7 @@ Detailed information on each model can be found here:
 * [Models](#models)
 * [Validation accuracy results](#validation-accuracy-results)
 * [Training performance results](#training-performance-results)
-  * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+  * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
   * [Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)](#training-performance-nvidia-dgx-1-16gb-8x-v100-16gb)
   * [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
 * [Model comparison](#model-comparison)
@@ -38,22 +38,20 @@ in the corresponding model's README.
 The following table shows the validation accuracy results of the
 three classification models side-by-side.
 
-
-| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
-|:-:|:-:|:-:|:-:|:-:|
-| resnet50 | 78.46 | 94.15 | 78.50 | 94.11 |
-| resnext101-32x4d | 80.08 | 94.89 | 80.14 | 95.02 |
-| se-resnext101-32x4d | 81.01 | 95.52 | 81.12 | 95.54 |
-
+|      **Model**      | **Mixed Precision Top1** | **Mixed Precision Top5** | **32 bit Top1** | **32 bit Top5** |
+|:-------------------:|:------------------------:|:------------------------:|:---------------:|:---------------:|
+|      resnet50       |          78.60           |          94.19           |      78.69      |      94.16      |
+|  resnext101-32x4d   |          80.43           |          95.06           |      80.40      |      95.04      |
+| se-resnext101-32x4d |          81.00           |          95.48           |      81.09      |      95.45      |
 
 ## Training performance results
 
-### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+### Training performance: NVIDIA DGX A100 (8x A100 80GB)
 
 
 Our results were obtained by running the applicable
-training scripts in the pytorch-20.06 NGC container
-on NVIDIA DGX A100 with (8x A100 40GB) GPUs.
+training scripts in the pytorch-20.12 NGC container
+on NVIDIA DGX A100 with (8x A100 80GB) GPUs.
 Performance numbers (in images per second)
 were averaged over an entire training epoch.
 The specific training script that was run is documented
@@ -63,21 +61,16 @@ The following table shows the training performance results of the
 three classification models side-by-side.
 
 
-|      **arch**       | **Mixed Precision** |   **TF32**    | **Mixed Precision Speedup** |
-|:-------------------:|:-------------------:|:-------------:|:---------------------------:|
-|      resnet50       |    9488.39 img/s    | 5322.10 img/s |            1.78x            |
-|  resnext101-32x4d   |    6758.98 img/s    | 2353.25 img/s |            2.87x            |
-| se-resnext101-32x4d |    4670.72 img/s    | 2011.21 img/s |            2.32x            |
-
-ResNeXt and SE-ResNeXt use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision,
-which improves the model performance. We are currently working on adding it for ResNet.
-
+|      **Model**      | **Mixed Precision** |  **TF32**  | **Mixed Precision Speedup** |
+|:-------------------:|:-------------------:|:----------:|:---------------------------:|
+|      resnet50       |     15977 img/s     | 7365 img/s |           2.16 x            |
+|  resnext101-32x4d   |     7399 img/s      | 3193 img/s |           2.31 x            |
+| se-resnext101-32x4d |     5248 img/s      | 2665 img/s |           1.96 x            |
 
 ### Training performance: NVIDIA DGX-1 16G (8x V100 16GB)
 
-
 Our results were obtained by running the applicable
-training scripts in the pytorch-20.06 NGC container
+training scripts in the pytorch-20.12 NGC container
 on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
 Performance numbers (in images per second)
 were averaged over an entire training epoch.
@@ -87,16 +80,11 @@ in the corresponding model's README.
 The following table shows the training performance results of the
 three classification models side-by-side.
 
-
-|      **arch**       | **Mixed Precision** |   **FP32**    | **Mixed Precision Speedup** |
-|:-------------------:|:-------------------:|:-------------:|:---------------------------:|
-|      resnet50       |    6565.61 img/s    | 2869.19 img/s |            2.29x            |
-|  resnext101-32x4d   |    3922.74 img/s    | 1136.30 img/s |            3.45x            |
-| se-resnext101-32x4d |    2651.13 img/s    | 982.78 img/s  |            2.70x            |
-
-ResNeXt and SE-ResNeXt use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision,
-which improves the model performance. We are currently working on adding it for ResNet.
-
+|      **Model**      | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** |
+|:-------------------:|:-------------------:|:----------:|:---------------------------:|
+|      resnet50       |     7608 img/s      | 2851 img/s |           2.66 x            |
+|  resnext101-32x4d   |     3742 img/s      | 1117 img/s |           3.34 x            |
+| se-resnext101-32x4d |     2716 img/s      | 994 img/s  |           2.73 x            |
 
 ## Model Comparison
 
@@ -111,8 +99,6 @@ Dot size indicates number of trainable parameters.
 ### Latency vs Throughput on different batch sizes
 ![LATvsTHR](./img/LATvsTHR.png)
 
-Plot describes relationship between 
-inference latency, throughput and batch size 
+Plot describes relationship between
+inference latency, throughput and batch size
 for the implemented models.
-
-

+ 2 - 2
PyTorch/Classification/ConvNets/checkpoint2model.py

@@ -30,7 +30,7 @@ if __name__ == "__main__":
     add_parser_arguments(parser)
     args = parser.parse_args()
 
-    checkpoint = torch.load(args.checkpoint_path)
+    checkpoint = torch.load(args.checkpoint_path, map_location=torch.device('cpu'))
 
     model_state_dict = {
         k[len("module.") :] if "module." in k else k: v
@@ -39,4 +39,4 @@ if __name__ == "__main__":
 
     print(f"Loaded {checkpoint['arch']} : {checkpoint['best_prec1']}")
 
-    torch.save(model_state_dict, args.weight_path)
+    torch.save(model_state_dict, args.weight_path.format(arch=checkpoint['arch'][0], acc = checkpoint['best_prec1']))

+ 14 - 23
PyTorch/Classification/ConvNets/classify.py

@@ -16,19 +16,12 @@ import argparse
 import numpy as np
 import json
 import torch
+from torch.cuda.amp import autocast
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
 import image_classification.resnet as models
 from image_classification.dataloaders import load_jpeg_from_file
 
-try:
-    from apex.fp16_utils import *
-    from apex import amp
-except ImportError:
-    raise ImportError(
-        "Please install apex from https://www.github.com/nvidia/apex to run this example."
-    )
-
 
 def add_parser_arguments(parser):
     model_names = models.resnet_versions.keys()
@@ -52,7 +45,7 @@ def add_parser_arguments(parser):
     )
     parser.add_argument("--weights", metavar="<path>", help="file with model weights")
     parser.add_argument(
-        "--precision", metavar="PREC", default="FP16", choices=["AMP", "FP16", "FP32"]
+        "--precision", metavar="PREC", default="AMP", choices=["AMP", "FP32"]
     )
     parser.add_argument("--image", metavar="<path>", help="path to classified image")
 
@@ -63,30 +56,28 @@ def main(args):
 
     if args.weights is not None:
         weights = torch.load(args.weights)
-        #Temporary fix to allow NGC checkpoint loading
+        # Temporary fix to allow NGC checkpoint loading
         weights = {
             k.replace("module.", ""): v for k, v in weights.items()
         }
         model.load_state_dict(weights)
 
     model = model.cuda()
-
-    if args.precision in ["AMP", "FP16"]:
-        model = network_to_half()
-
     model.eval()
 
-    with torch.no_grad():
-        input = load_jpeg_from_file(
-            args.image, cuda=True, fp16=args.precision != "FP32"
-        )
+    input = load_jpeg_from_file(
+        args.image, cuda=True
+    )
+
+    with torch.no_grad(), autocast(enabled = args.precision == "AMP"):
+        output = torch.nn.functional.softmax(model(input), dim=1)
 
-        output = torch.nn.functional.softmax(model(input), dim=1).cpu().view(-1).numpy()
-        top5 = np.argsort(output)[-5:][::-1]
+    output = output.float().cpu().view(-1).numpy()
+    top5 = np.argsort(output)[-5:][::-1]
 
-        print(args.image)
-        for c, v in zip(imgnet_classes[top5], output[top5]):
-            print(f"{c}: {100*v:.1f}%")
+    print(args.image)
+    for c, v in zip(imgnet_classes[top5], output[top5]):
+        print(f"{c}: {100*v:.1f}%")
 
 
 if __name__ == "__main__":

+ 183 - 0
PyTorch/Classification/ConvNets/configs.yml

@@ -0,0 +1,183 @@
+precision:
+    AMP:
+        static_loss_scale: 128
+        amp: True
+    FP32:
+        amp: False
+    TF32:
+        amp: False
+
+platform:
+    T4:
+        workers: 8
+    DGX1V:
+        workers: 8
+    DGX2V:
+        workers: 8
+    DGXA100:
+        workers: 16
+
+mode:
+    benchmark_training: &benchmark_training
+        print_freq: 1
+        epochs: 3
+        training_only: True
+        evaluate: False
+        save_checkpoints: False
+    benchmark_training_short:
+        <<: *benchmark_training
+        epochs: 1
+        data_backend: syntetic
+        prof: 100
+    benchmark_inference: &benchmark_inference
+        print_freq: 1
+        epochs: 1
+        training_only: False
+        evaluate: True
+        save_checkpoints: False
+    convergence:
+        print_freq: 20
+        training_only: False
+        evaluate: False
+        save_checkpoints: True
+
+anchors:
+    # ResNet_like params: {{{
+    resnet_params: &resnet_params
+        model_config: fanin
+        label_smoothing: 0.1
+        mixup: 0.2
+        lr_schedule: cosine
+        momentum: 0.875
+        warmup: 8
+        epochs: 250
+        data_backend: pytorch
+        num_classes: 1000
+        image_size: 224
+    resnet_params_896: &resnet_params_896
+        <<: *resnet_params
+        optimizer_batch_size: 896
+        lr: 0.896
+        weight_decay: 6.103515625e-05
+    resnet_params_1k: &resnet_params_1k
+        <<: *resnet_params
+        optimizer_batch_size: 1024
+        lr: 1.024
+        weight_decay: 6.103515625e-05
+    resnet_params_2k: &resnet_params_2k
+        <<: *resnet_params
+        optimizer_batch_size: 2048
+        lr: 2.048
+        weight_decay: 3.0517578125e-05
+    resnet_params_4k: &resnet_params_4k
+        <<: *resnet_params
+        optimizer_batch_size: 4096
+        lr: 4.096
+        weight_decay: 3.0517578125e-05
+    # }}}
+
+models:
+    resnet50: # {{{
+        DGX1V:
+            AMP:
+                <<: *resnet_params_2k
+                arch: resnet50
+                batch_size: 256
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_2k
+                batch_size: 112
+        DGX2V:
+            AMP:
+                <<: *resnet_params_4k
+                arch: resnet50
+                batch_size: 256
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_4k
+                arch: resnet50
+                batch_size: 256
+        DGXA100:
+            AMP:
+                <<: *resnet_params_2k
+                arch: resnet50
+                batch_size: 256
+                memory_format: nhwc
+            TF32:
+                <<: *resnet_params_2k
+                arch: resnet50
+                batch_size: 256
+        T4:
+            AMP:
+                <<: *resnet_params_2k
+                arch: resnet50
+                batch_size: 256
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_2k
+                batch_size: 128
+    # }}}
+    resnext101-32x4d: # {{{
+        DGX1V:
+            AMP:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 128
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 64
+        DGXA100:
+            AMP:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 128
+                memory_format: nhwc
+            TF32:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 128
+        T4:
+            AMP:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 128
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_1k
+                arch: resnext101-32x4d
+                batch_size: 64
+    # }}}
+    se-resnext101-32x4d: # {{{
+        DGX1V:
+            AMP:
+                <<: *resnet_params_896
+                arch: se-resnext101-32x4d
+                batch_size: 112
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_1k
+                arch: se-resnext101-32x4d
+                batch_size: 64
+        DGXA100:
+            AMP:
+                <<: *resnet_params_1k
+                arch: se-resnext101-32x4d
+                batch_size: 128
+                memory_format: nhwc
+            TF32:
+                <<: *resnet_params_1k
+                arch: se-resnext101-32x4d
+                batch_size: 128
+        T4:
+            AMP:
+                <<: *resnet_params_1k
+                arch: se-resnext101-32x4d
+                batch_size: 128
+                memory_format: nhwc
+            FP32:
+                <<: *resnet_params_1k
+                arch: se-resnext101-32x4d
+                batch_size: 64
+# }}}

+ 22 - 50
PyTorch/Classification/ConvNets/image_classification/dataloaders.py

@@ -50,7 +50,7 @@ except ImportError:
     )
 
 
-def load_jpeg_from_file(path, cuda=True, fp16=False):
+def load_jpeg_from_file(path, cuda=True):
     img_transforms = transforms.Compose(
         [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
     )
@@ -67,12 +67,7 @@ def load_jpeg_from_file(path, cuda=True, fp16=False):
             mean = mean.cuda()
             std = std.cuda()
             img = img.cuda()
-        if fp16:
-            mean = mean.half()
-            std = std.half()
-            img = img.half()
-        else:
-            img = img.float()
+        img = img.float()
 
         input = img.unsqueeze(0).sub_(mean).div_(std)
 
@@ -98,6 +93,7 @@ class HybridTrainPipe(Pipeline):
             shard_id=rank,
             num_shards=world_size,
             random_shuffle=True,
+            pad_last_batch=True,
         )
 
         if dali_cpu:
@@ -125,10 +121,9 @@ class HybridTrainPipe(Pipeline):
 
         self.cmnp = ops.CropMirrorNormalize(
             device="gpu",
-            output_dtype=types.FLOAT,
+            dtype=types.FLOAT,
             output_layout=types.NCHW,
             crop=(crop, crop),
-            image_type=types.RGB,
             mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
             std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
         )
@@ -160,16 +155,16 @@ class HybridValPipe(Pipeline):
             shard_id=rank,
             num_shards=world_size,
             random_shuffle=False,
+            pad_last_batch=True,
         )
 
         self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
         self.res = ops.Resize(device="gpu", resize_shorter=size)
         self.cmnp = ops.CropMirrorNormalize(
             device="gpu",
-            output_dtype=types.FLOAT,
+            dtype=types.FLOAT,
             output_layout=types.NCHW,
             crop=(crop, crop),
-            image_type=types.RGB,
             mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
             std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
         )
@@ -213,7 +208,6 @@ def get_dali_train_loader(dali_cpu=False):
         start_epoch=0,
         workers=5,
         _worker_init_fn=None,
-        fp16=False,
         memory_format=torch.contiguous_format,
     ):
         if torch.distributed.is_initialized():
@@ -236,7 +230,7 @@ def get_dali_train_loader(dali_cpu=False):
 
         pipe.build()
         train_loader = DALIClassificationIterator(
-            pipe, size=int(pipe.epoch_size("Reader") / world_size)
+            pipe, reader_name="Reader", fill_last_batch=False
         )
 
         return (
@@ -255,7 +249,6 @@ def get_dali_val_loader():
         one_hot,
         workers=5,
         _worker_init_fn=None,
-        fp16=False,
         memory_format=torch.contiguous_format,
     ):
         if torch.distributed.is_initialized():
@@ -278,7 +271,7 @@ def get_dali_val_loader():
 
         pipe.build()
         val_loader = DALIClassificationIterator(
-            pipe, size=int(pipe.epoch_size("Reader") / world_size)
+            pipe, reader_name="Reader", fill_last_batch=False
         )
 
         return (
@@ -317,7 +310,7 @@ def expand(num_classes, dtype, tensor):
 
 
 class PrefetchedWrapper(object):
-    def prefetched_loader(loader, num_classes, fp16, one_hot):
+    def prefetched_loader(loader, num_classes, one_hot):
         mean = (
             torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
             .cuda()
@@ -328,9 +321,6 @@ class PrefetchedWrapper(object):
             .cuda()
             .view(1, 3, 1, 1)
         )
-        if fp16:
-            mean = mean.half()
-            std = std.half()
 
         stream = torch.cuda.Stream()
         first = True
@@ -339,14 +329,9 @@ class PrefetchedWrapper(object):
             with torch.cuda.stream(stream):
                 next_input = next_input.cuda(non_blocking=True)
                 next_target = next_target.cuda(non_blocking=True)
-                if fp16:
-                    next_input = next_input.half()
-                    if one_hot:
-                        next_target = expand(num_classes, torch.half, next_target)
-                else:
-                    next_input = next_input.float()
-                    if one_hot:
-                        next_target = expand(num_classes, torch.float, next_target)
+                next_input = next_input.float()
+                if one_hot:
+                    next_target = expand(num_classes, torch.float, next_target)
 
                 next_input = next_input.sub_(mean).div_(std)
 
@@ -361,9 +346,8 @@ class PrefetchedWrapper(object):
 
         yield input, target
 
-    def __init__(self, dataloader, start_epoch, num_classes, fp16, one_hot):
+    def __init__(self, dataloader, start_epoch, num_classes, one_hot):
         self.dataloader = dataloader
-        self.fp16 = fp16
         self.epoch = start_epoch
         self.one_hot = one_hot
         self.num_classes = num_classes
@@ -376,7 +360,7 @@ class PrefetchedWrapper(object):
             self.dataloader.sampler.set_epoch(self.epoch)
         self.epoch += 1
         return PrefetchedWrapper.prefetched_loader(
-            self.dataloader, self.num_classes, self.fp16, self.one_hot
+            self.dataloader, self.num_classes, self.one_hot
         )
 
     def __len__(self):
@@ -391,7 +375,6 @@ def get_pytorch_train_loader(
     start_epoch=0,
     workers=5,
     _worker_init_fn=None,
-    fp16=False,
     memory_format=torch.contiguous_format,
 ):
     traindir = os.path.join(data_path, "train")
@@ -403,24 +386,24 @@ def get_pytorch_train_loader(
     )
 
     if torch.distributed.is_initialized():
-        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True)
     else:
         train_sampler = None
 
     train_loader = torch.utils.data.DataLoader(
         train_dataset,
+        sampler=train_sampler,
         batch_size=batch_size,
         shuffle=(train_sampler is None),
         num_workers=workers,
         worker_init_fn=_worker_init_fn,
         pin_memory=True,
-        sampler=train_sampler,
         collate_fn=partial(fast_collate, memory_format),
         drop_last=True,
     )
 
     return (
-        PrefetchedWrapper(train_loader, start_epoch, num_classes, fp16, one_hot),
+        PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot),
         len(train_loader),
     )
 
@@ -432,7 +415,6 @@ def get_pytorch_val_loader(
     one_hot,
     workers=5,
     _worker_init_fn=None,
-    fp16=False,
     memory_format=torch.contiguous_format,
 ):
     valdir = os.path.join(data_path, "val")
@@ -441,7 +423,7 @@ def get_pytorch_val_loader(
     )
 
     if torch.distributed.is_initialized():
-        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False)
     else:
         val_sampler = None
 
@@ -449,20 +431,20 @@ def get_pytorch_val_loader(
         val_dataset,
         sampler=val_sampler,
         batch_size=batch_size,
-        shuffle=False,
+        shuffle=(val_sampler is None),
         num_workers=workers,
         worker_init_fn=_worker_init_fn,
         pin_memory=True,
         collate_fn=partial(fast_collate, memory_format),
+        drop_last=False,
     )
 
-    return PrefetchedWrapper(val_loader, 0, num_classes, fp16, one_hot), len(val_loader)
+    return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len(val_loader)
 
 
 class SynteticDataLoader(object):
     def __init__(
         self,
-        fp16,
         batch_size,
         num_classes,
         num_channels,
@@ -483,8 +465,6 @@ class SynteticDataLoader(object):
         else:
             input_target = torch.randint(0, num_classes, (batch_size,))
         input_target = input_target.cuda()
-        if fp16:
-            input_data = input_data.half()
 
         self.input_data = input_data
         self.input_target = input_target
@@ -502,19 +482,11 @@ def get_syntetic_loader(
     start_epoch=0,
     workers=None,
     _worker_init_fn=None,
-    fp16=False,
     memory_format=torch.contiguous_format,
 ):
     return (
         SynteticDataLoader(
-            fp16,
-            batch_size,
-            num_classes,
-            3,
-            224,
-            224,
-            one_hot,
-            memory_format=memory_format,
+            batch_size, num_classes, 3, 224, 224, one_hot, memory_format=memory_format
         ),
         -1,
     )

+ 108 - 122
PyTorch/Classification/ConvNets/image_classification/training.py

@@ -38,14 +38,8 @@ from . import resnet as models
 from . import utils
 import dllogger
 
-try:
-    from apex.parallel import DistributedDataParallel as DDP
-    from apex.fp16_utils import *
-    from apex import amp
-except ImportError:
-    raise ImportError(
-        "Please install apex from https://www.github.com/nvidia/apex to run this example."
-    )
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.cuda.amp import autocast
 
 ACC_METADATA = {"unit": "%", "format": ":.2f"}
 IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
@@ -60,7 +54,6 @@ class ModelAndLoss(nn.Module):
         loss,
         pretrained_weights=None,
         cuda=True,
-        fp16=False,
         memory_format=torch.contiguous_format,
     ):
         super(ModelAndLoss, self).__init__()
@@ -74,8 +67,6 @@ class ModelAndLoss(nn.Module):
 
         if cuda:
             model = model.cuda().to(memory_format=memory_format)
-        if fp16:
-            model = network_to_half(model)
 
         # define loss function (criterion) and optimizer
         criterion = loss()
@@ -92,8 +83,8 @@ class ModelAndLoss(nn.Module):
 
         return loss, output
 
-    def distributed(self):
-        self.model = DDP(self.model)
+    def distributed(self, gpu_id):
+        self.model = DDP(self.model, device_ids=[gpu_id], output_device=gpu_id)
 
     def load_model_state(self, state):
         if not state is None:
@@ -102,14 +93,11 @@ class ModelAndLoss(nn.Module):
 
 def get_optimizer(
     parameters,
-    fp16,
     lr,
     momentum,
     weight_decay,
     nesterov=False,
     state=None,
-    static_loss_scale=1.0,
-    dynamic_loss_scale=False,
     bn_weight_decay=False,
 ):
 
@@ -138,13 +126,6 @@ def get_optimizer(
             weight_decay=weight_decay,
             nesterov=nesterov,
         )
-    if fp16:
-        optimizer = FP16_Optimizer(
-            optimizer,
-            static_loss_scale=static_loss_scale,
-            dynamic_loss_scale=dynamic_loss_scale,
-            verbose=False,
-        )
 
     if not state is None:
         optimizer.load_state_dict(state)
@@ -227,36 +208,25 @@ def lr_exponential_policy(
 
 
 def get_train_step(
-    model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
+    model_and_loss, optimizer, scaler, use_amp=False, batch_size_multiplier=1
 ):
     def _step(input, target, optimizer_step=True):
         input_var = Variable(input)
         target_var = Variable(target)
-        loss, output = model_and_loss(input_var, target_var)
-        if torch.distributed.is_initialized():
-            reduced_loss = utils.reduce_tensor(loss.data)
-        else:
-            reduced_loss = loss.data
 
-        if fp16:
-            optimizer.backward(loss)
-        elif use_amp:
-            with amp.scale_loss(loss, optimizer) as scaled_loss:
-                scaled_loss.backward()
-        else:
-            loss.backward()
+        with autocast(enabled=use_amp):
+            loss, output = model_and_loss(input_var, target_var)
+            loss /= batch_size_multiplier
+            if torch.distributed.is_initialized():
+                reduced_loss = utils.reduce_tensor(loss.data)
+            else:
+                reduced_loss = loss.data
+
+        scaler.scale(loss).backward()
 
         if optimizer_step:
-            opt = (
-                optimizer.optimizer
-                if isinstance(optimizer, FP16_Optimizer)
-                else optimizer
-            )
-            for param_group in opt.param_groups:
-                for param in param_group["params"]:
-                    param.grad /= batch_size_multiplier
-
-            optimizer.step()
+            scaler.step(optimizer)
+            scaler.update()
             optimizer.zero_grad()
 
         torch.cuda.synchronize()
@@ -270,10 +240,11 @@ def train(
     train_loader,
     model_and_loss,
     optimizer,
+    scaler,
     lr_scheduler,
-    fp16,
     logger,
     epoch,
+    timeout_handler,
     use_amp=False,
     prof=-1,
     batch_size_multiplier=1,
@@ -315,7 +286,7 @@ def train(
     step = get_train_step(
         model_and_loss,
         optimizer,
-        fp16,
+        scaler=scaler,
         use_amp=use_amp,
         batch_size_multiplier=batch_size_multiplier,
     )
@@ -342,31 +313,33 @@ def train(
         it_time = time.time() - end
 
         if logger is not None:
-            logger.log_metric("train.loss", to_python_float(loss), bs)
+            logger.log_metric("train.loss", loss.item(), bs)
             logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
             logger.log_metric("train.total_ips", calc_ips(bs, it_time))
             logger.log_metric("train.data_time", data_time)
             logger.log_metric("train.compute_time", it_time - data_time)
 
         end = time.time()
+        if timeout_handler.interrupted:
+            break
 
 
-def get_val_step(model_and_loss):
+def get_val_step(model_and_loss, use_amp=False):
     def _step(input, target):
         input_var = Variable(input)
         target_var = Variable(target)
 
-        with torch.no_grad():
+        with torch.no_grad(), autocast(enabled=use_amp):
             loss, output = model_and_loss(input_var, target_var)
 
-        prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
+            prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
 
-        if torch.distributed.is_initialized():
-            reduced_loss = utils.reduce_tensor(loss.data)
-            prec1 = utils.reduce_tensor(prec1)
-            prec5 = utils.reduce_tensor(prec5)
-        else:
-            reduced_loss = loss.data
+            if torch.distributed.is_initialized():
+                reduced_loss = utils.reduce_tensor(loss.data)
+                prec1 = utils.reduce_tensor(prec1)
+                prec5 = utils.reduce_tensor(prec5)
+            else:
+                reduced_loss = loss.data
 
         torch.cuda.synchronize()
 
@@ -376,7 +349,13 @@ def get_val_step(model_and_loss):
 
 
 def validate(
-    val_loader, model_and_loss, fp16, logger, epoch, prof=-1, register_metrics=True
+    val_loader,
+    model_and_loss,
+    logger,
+    epoch,
+    use_amp=False,
+    prof=-1,
+    register_metrics=True,
 ):
     if register_metrics and logger is not None:
         logger.register_metric(
@@ -440,7 +419,7 @@ def validate(
             metadata=TIME_METADATA,
         )
 
-    step = get_val_step(model_and_loss)
+    step = get_val_step(model_and_loss, use_amp=use_amp)
 
     top1 = log.AverageMeter()
     # switch to evaluate mode
@@ -462,11 +441,11 @@ def validate(
 
         it_time = time.time() - end
 
-        top1.record(to_python_float(prec1), bs)
+        top1.record(prec1.item(), bs)
         if logger is not None:
-            logger.log_metric("val.top1", to_python_float(prec1), bs)
-            logger.log_metric("val.top5", to_python_float(prec5), bs)
-            logger.log_metric("val.loss", to_python_float(loss), bs)
+            logger.log_metric("val.top1", prec1.item(), bs)
+            logger.log_metric("val.top5", prec5.item(), bs)
+            logger.log_metric("val.loss", loss.item(), bs)
             logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
             logger.log_metric("val.total_ips", calc_ips(bs, it_time))
             logger.log_metric("val.data_time", data_time)
@@ -492,10 +471,10 @@ def calc_ips(batch_size, time):
 def train_loop(
     model_and_loss,
     optimizer,
+    scaler,
     lr_scheduler,
     train_loader,
     val_loader,
-    fp16,
     logger,
     should_backup_checkpoint,
     use_amp=False,
@@ -510,70 +489,77 @@ def train_loop(
     checkpoint_dir="./",
     checkpoint_filename="checkpoint.pth.tar",
 ):
-
     prec1 = -1
 
     print(f"RUNNING EPOCHS FROM {start_epoch} TO {end_epoch}")
-    for epoch in range(start_epoch, end_epoch):
-        if logger is not None:
-            logger.start_epoch()
-        if not skip_training:
-            train(
-                train_loader,
-                model_and_loss,
-                optimizer,
-                lr_scheduler,
-                fp16,
-                logger,
-                epoch,
-                use_amp=use_amp,
-                prof=prof,
-                register_metrics=epoch == start_epoch,
-                batch_size_multiplier=batch_size_multiplier,
-            )
-
-        if not skip_validation:
-            prec1, nimg = validate(
-                val_loader,
-                model_and_loss,
-                fp16,
-                logger,
-                epoch,
-                prof=prof,
-                register_metrics=epoch == start_epoch,
-            )
-        if logger is not None:
-            logger.end_epoch()
+    with utils.TimeoutHandler() as timeout_handler:
+        for epoch in range(start_epoch, end_epoch):
+            if logger is not None:
+                logger.start_epoch()
+            if not skip_training:
+                train(
+                    train_loader,
+                    model_and_loss,
+                    optimizer,
+                    scaler,
+                    lr_scheduler,
+                    logger,
+                    epoch,
+                    timeout_handler,
+                    use_amp=use_amp,
+                    prof=prof,
+                    register_metrics=epoch == start_epoch,
+                    batch_size_multiplier=batch_size_multiplier,
+                )
 
-        if save_checkpoints and (
-            not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
-        ):
             if not skip_validation:
-                is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
-                best_prec1 = max(
-                    logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
+                prec1, nimg = validate(
+                    val_loader,
+                    model_and_loss,
+                    logger,
+                    epoch,
+                    use_amp=use_amp,
+                    prof=prof,
+                    register_metrics=epoch == start_epoch,
                 )
-            else:
-                is_best = False
-                best_prec1 = 0
+            if logger is not None:
+                logger.end_epoch()
+
+            if save_checkpoints and (
+                not torch.distributed.is_initialized()
+                or torch.distributed.get_rank() == 0
+            ):
+                if not skip_validation:
+                    is_best = (
+                        logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
+                    )
+                    best_prec1 = max(
+                        logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
+                    )
+                else:
+                    is_best = False
+                    best_prec1 = 0
+
+                if should_backup_checkpoint(epoch):
+                    backup_filename = "checkpoint-{}.pth.tar".format(epoch + 1)
+                else:
+                    backup_filename = None
+                utils.save_checkpoint(
+                    {
+                        "epoch": epoch + 1,
+                        "arch": model_and_loss.arch,
+                        "state_dict": model_and_loss.model.state_dict(),
+                        "best_prec1": best_prec1,
+                        "optimizer": optimizer.state_dict(),
+                    },
+                    is_best,
+                    checkpoint_dir=checkpoint_dir,
+                    backup_filename=backup_filename,
+                    filename=checkpoint_filename,
+                )
+            if timeout_handler.interrupted:
+                break
 
-            if should_backup_checkpoint(epoch):
-                backup_filename = "checkpoint-{}.pth.tar".format(epoch + 1)
-            else:
-                backup_filename = None
-            utils.save_checkpoint(
-                {
-                    "epoch": epoch + 1,
-                    "arch": model_and_loss.arch,
-                    "state_dict": model_and_loss.model.state_dict(),
-                    "best_prec1": best_prec1,
-                    "optimizer": optimizer.state_dict(),
-                },
-                is_best,
-                checkpoint_dir=checkpoint_dir,
-                backup_filename=backup_filename,
-                filename=checkpoint_filename,
-            )
 
 
 # }}}

+ 43 - 0
PyTorch/Classification/ConvNets/image_classification/utils.py

@@ -31,6 +31,7 @@ import os
 import numpy as np
 import torch
 import shutil
+import signal
 import torch.distributed as dist
 
 
@@ -106,3 +107,45 @@ def reduce_tensor(tensor):
 def first_n(n, generator):
     for i, d in zip(range(n), generator):
         yield d
+
+
+class TimeoutHandler:
+    def __init__(self, sig=signal.SIGTERM):
+        self.sig = sig
+        rank = dist.get_rank() if dist.is_initialized() else 0
+        self.device = f'cuda:{rank}'
+    @property
+    def interrupted(self):
+        if not dist.is_initialized():
+            return self._interrupted
+
+        interrupted = torch.tensor(self._interrupted).int().to(self.device)
+        dist.broadcast(interrupted, 0)
+        interrupted = bool(interrupted.item())
+        return interrupted
+    def __enter__(self):
+        self._interrupted = False
+        self.released = False
+        self.original_handler = signal.getsignal(self.sig)
+        def master_handler(signum, frame):
+            self.release()
+            self._interrupted = True
+            print('Received SIGTERM')
+        def ignoring_handler(signum, frame):
+            self.release()
+            print('Received SIGTERM, ignoring')
+
+        rank = dist.get_rank() if dist.is_initialized() else 0
+        if rank == 0:
+            signal.signal(self.sig, master_handler)
+        else:
+            signal.signal(self.sig, ignoring_handler)
+        return self
+    def __exit__(self, type, value, tb):
+        self.release()
+    def release(self):
+        if self.released:
+            return False
+        signal.signal(self.sig, self.original_handler)
+        self.released = True
+        return True

+ 50 - 0
PyTorch/Classification/ConvNets/launch.py

@@ -0,0 +1,50 @@
+import os
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Dict, Any
+import yaml
+
+from main import main, add_parser_arguments
+import torch.backends.cudnn as cudnn
+
+import argparse
+
+
+def get_config_path():
+    return Path(os.path.dirname(os.path.abspath(__file__))) / "configs.yml"
+
+
+if __name__ == "__main__":
+    yaml_cfg_parser = argparse.ArgumentParser(add_help=False)
+    yaml_cfg_parser.add_argument(
+        "--cfg_file",
+        default=get_config_path(),
+        type=str,
+        help="path to yaml config file",
+    )
+    yaml_cfg_parser.add_argument("--model", default=None, type=str, required=True)
+    yaml_cfg_parser.add_argument("--mode", default=None, type=str, required=True)
+    yaml_cfg_parser.add_argument("--precision", default=None, type=str, required=True)
+    yaml_cfg_parser.add_argument("--platform", default=None, type=str, required=True)
+
+    yaml_args, rest = yaml_cfg_parser.parse_known_args()
+
+    with open(yaml_args.cfg_file, "r") as cfg_file:
+        config = yaml.load(cfg_file, Loader=yaml.FullLoader)
+
+    cfg = {
+        **config["precision"][yaml_args.precision],
+        **config["platform"][yaml_args.platform],
+        **config["models"][yaml_args.model][yaml_args.platform][yaml_args.precision],
+        **config["mode"][yaml_args.mode],
+    }
+    print(cfg)
+
+    parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
+    add_parser_arguments(parser)
+    parser.set_defaults(**cfg)
+    args = parser.parse_args(rest)
+    print(args)
+    cudnn.benchmark = True
+
+    main(args)

+ 30 - 37
PyTorch/Classification/ConvNets/main.py

@@ -32,6 +32,7 @@ import os
 import shutil
 import time
 import random
+import signal
 
 import numpy as np
 import torch
@@ -45,15 +46,7 @@ import torch.utils.data
 import torch.utils.data.distributed
 import torchvision.transforms as transforms
 import torchvision.datasets as datasets
-
-try:
-    from apex.parallel import DistributedDataParallel as DDP
-    from apex.fp16_utils import *
-    from apex import amp
-except ImportError:
-    raise ImportError(
-        "Please install apex from https://www.github.com/nvidia/apex to run this example."
-    )
+from torch.nn.parallel import DistributedDataParallel as DDP
 
 import image_classification.resnet as models
 import image_classification.logger as log
@@ -224,12 +217,11 @@ def add_parser_arguments(parser):
         help="load weights from here",
     )
 
-    parser.add_argument("--fp16", action="store_true", help="Run model fp16 mode.")
     parser.add_argument(
         "--static-loss-scale",
         type=float,
         default=1,
-        help="Static loss scale, positive power of 2 values can improve fp16 convergence.",
+        help="Static loss scale, positive power of 2 values can improve amp convergence.",
     )
     parser.add_argument(
         "--dynamic-loss-scale",
@@ -312,10 +304,6 @@ def main(args):
         dist.init_process_group(backend="nccl", init_method="env://")
         args.world_size = torch.distributed.get_world_size()
 
-    if args.amp and args.fp16:
-        print("Please use only one of the --fp16/--amp flags")
-        exit(1)
-
     if args.seed is not None:
         print("Using seed = {}".format(args.seed))
         torch.manual_seed(args.seed + args.local_rank)
@@ -324,22 +312,25 @@ def main(args):
         random.seed(args.seed + args.local_rank)
 
         def _worker_init_fn(id):
+            def handler(signum, frame):
+                print(f"Worker {id} received signal {signum}")
+
+            signal.signal(signal.SIGTERM, handler)
+
             np.random.seed(seed=args.seed + args.local_rank + id)
             random.seed(args.seed + args.local_rank + id)
 
     else:
 
         def _worker_init_fn(id):
-            pass
+            def handler(signum, frame):
+                print(f"Worker {id} received signal {signum}")
 
-    if args.fp16:
-        assert (
-            torch.backends.cudnn.enabled
-        ), "fp16 mode requires cudnn backend to be enabled."
+            signal.signal(signal.SIGTERM, handler)
 
     if args.static_loss_scale != 1.0:
-        if not args.fp16:
-            print("Warning:  if --fp16 is not used, static_loss_scale will be ignored.")
+        if not args.amp:
+            print("Warning: if --amp is not used, static_loss_scale will be ignored.")
 
     if args.optimizer_batch_size < 0:
         batch_size_multiplier = 1
@@ -387,6 +378,11 @@ def main(args):
                     args.resume, checkpoint["epoch"]
                 )
             )
+            if start_epoch >= args.epochs:
+                print(
+                    f"Launched training for {args.epochs} epochs, but checkpoint has already run {start_epoch} epochs"
+                )
+                exit(1)
         else:
             print("=> no checkpoint found at '{}'".format(args.resume))
             model_state = None
@@ -410,7 +406,6 @@ def main(args):
         loss,
         pretrained_weights=pretrained_weights,
         cuda=True,
-        fp16=args.fp16,
         memory_format=memory_format,
     )
 
@@ -427,6 +422,9 @@ def main(args):
     elif args.data_backend == "syntetic":
         get_val_loader = get_syntetic_loader
         get_train_loader = get_syntetic_loader
+    else:
+        print("Bad data backend picked")
+        exit(1)
 
     train_loader, train_loader_len = get_train_loader(
         args.data,
@@ -435,7 +433,6 @@ def main(args):
         args.mixup > 0.0,
         start_epoch=start_epoch,
         workers=args.workers,
-        fp16=args.fp16,
         memory_format=memory_format,
     )
     if args.mixup != 0.0:
@@ -447,7 +444,6 @@ def main(args):
         args.num_classes,
         False,
         workers=args.workers,
-        fp16=args.fp16,
         memory_format=memory_format,
     )
 
@@ -473,15 +469,12 @@ def main(args):
 
     optimizer = get_optimizer(
         list(model_and_loss.model.named_parameters()),
-        args.fp16,
         args.lr,
         args.momentum,
         args.weight_decay,
         nesterov=args.nesterov,
         bn_weight_decay=args.bn_weight_decay,
         state=optimizer_state,
-        static_loss_scale=args.static_loss_scale,
-        dynamic_loss_scale=args.dynamic_loss_scale,
     )
 
     if args.lr_schedule == "step":
@@ -493,26 +486,26 @@ def main(args):
     elif args.lr_schedule == "linear":
         lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
 
-    if args.amp:
-        model_and_loss, optimizer = amp.initialize(
-            model_and_loss,
-            optimizer,
-            opt_level="O1",
-            loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
-        )
+    scaler = torch.cuda.amp.GradScaler(
+        init_scale=args.static_loss_scale,
+        growth_factor=2,
+        backoff_factor=0.5,
+        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
+        enabled=args.amp,
+    )
 
     if args.distributed:
-        model_and_loss.distributed()
+        model_and_loss.distributed(args.gpu)
 
     model_and_loss.load_model_state(model_state)
 
     train_loop(
         model_and_loss,
         optimizer,
+        scaler,
         lr_policy,
         train_loader,
         val_loader,
-        args.fp16,
         logger,
         should_backup_checkpoint(args),
         use_amp=args.amp,

+ 0 - 1
PyTorch/Classification/ConvNets/requirements.txt

@@ -1,2 +1 @@
-pytorch-ignite
 git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

+ 116 - 137
PyTorch/Classification/ConvNets/resnet50v1.5/README.md

@@ -30,12 +30,12 @@ achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
     * [Inference performance benchmark](#inference-performance-benchmark)
   * [Results](#results)
     * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
       * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
       * [Training accuracy: NVIDIA DGX-2 (16x V100 32GB)](#training-accuracy-nvidia-dgx-2-16x-v100-32gb)
       * [Example plots](#example-plots)
     * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
       * [Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)](#training-performance-nvidia-dgx-1-16gb-8x-v100-16gb)
       * [Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)](#training-performance-nvidia-dgx-1-32gb-8x-v100-32gb)
   * [Inference performance results](#inference-performance-results)
@@ -119,6 +119,8 @@ and this recipe keeps the original assumption that validation is done on 224px i
 
 Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
 
+
+
 ### Feature support matrix
 
 The following features are supported by this model:
@@ -204,7 +206,7 @@ The following section lists the requirements that you need to meet in order to s
 This repository contains Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
+* [PyTorch 20.12-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
 * Supported GPUs:
     * [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
     * [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
@@ -256,28 +258,28 @@ For the specifics concerning training and inference, see the [Advanced](#advance
 
 The directory in which the `train/` and `val/` directories are placed, is referred to as `<path to imagenet>` in this document.
 
-### 3. Build the RN50v1.5 PyTorch NGC container.
+### 3. Build the ResNet50 PyTorch NGC container.
 
 ```
-docker build . -t nvidia_rn50
+docker build . -t nvidia_resnet50
 ```
 
 ### 4. Start an interactive session in the NGC container to run training/inference.
 ```
-nvidia-docker run --rm -it -v <path to imagenet>:/data/imagenet --ipc=host nvidia_rn50
+nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_resnet50
 ```
 
 
 ### 5. Start training
 
-To run training for a standard configuration (DGXA100/DGX1/DGX2, AMP/TF32/FP32, 50/90/250 Epochs),
+To run training for a standard configuration (DGXA100/DGX1V/DGX2V, AMP/TF32/FP32, 90/250 Epochs),
 run one of the scripts in the `./resnet50v1.5/training` directory
-called `./resnet50v1.5/training/{AMP, TF32, FP32}/{DGXA100, DGX1, DGX2}_RN50_{AMP, TF32, FP32}_{50,90,250}E.sh`.
+called `./resnet50v1.5/training/{AMP, TF32, FP32}/{ DGXA100, DGX1V, DGX2V }_resnet50_{AMP, TF32, FP32}_{ 90, 250 }E.sh`.
 
-Ensure ImageNet is mounted in the `/data/imagenet` directory.
+Ensure ImageNet is mounted in the `/imagenet` directory.
 
 Example:
-    `bash ./resnet50v1.5/training/AMP/DGX1_RN50_AMP_250E.sh <path were to store checkpoints and logs>`
+    `bash ./resnet50v1.5/training/AMP/DGX1V_resnet50_AMP_250E.sh <path where to store checkpoints and logs>`
 
 ### 6. Start inference
 
@@ -295,7 +297,7 @@ To run inference on ImageNet, run:
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch resnet50 -c fanin --weights nvidia_resnet50_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch resnet50 -c fanin --weights nvidia_resnet50_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Advanced
@@ -334,7 +336,7 @@ usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
                [--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
                [--mixup ALPHA] [--momentum M] [--weight-decay W]
                [--bn-weight-decay] [--nesterov] [--print-freq N]
-               [--resume PATH] [--pretrained-weights PATH] [--fp16]
+               [--resume PATH] [--pretrained-weights PATH]
                [--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
                [--prof N] [--amp] [--seed SEED] [--gather-checkpoints]
                [--raport-file RAPORT_FILE] [--evaluate] [--training-only]
@@ -353,8 +355,10 @@ optional arguments:
                         data backend: pytorch | syntetic | dali-gpu | dali-cpu
                         (default: dali-cpu)
   --arch ARCH, -a ARCH  model architecture: resnet18 | resnet34 | resnet50 |
-                        resnet101 | resnet152 | resnext101-32x4d | se-
-                        resnext101-32x4d (default: resnet50)
+                        resnet101 | resnet152 | resnext50-32x4d |
+                        resnext101-32x4d | resnext101-32x8d |
+                        resnext101-32x8d-basic | se-resnext101-32x4d (default:
+                        resnet50)
   --model-config CONF, -c CONF
                         model configs: classic | fanin | grp-fanin | grp-
                         fanout(default: classic)
@@ -383,10 +387,9 @@ optional arguments:
   --resume PATH         path to latest checkpoint (default: none)
   --pretrained-weights PATH
                         load weights from here
-  --fp16                Run model fp16 mode.
   --static-loss-scale STATIC_LOSS_SCALE
                         Static loss scale, positive power of 2 values can
-                        improve fp16 convergence.
+                        improve amp convergence.
   --dynamic-loss-scale  Use dynamic loss scaling. If supplied, this argument
                         supersedes --static-loss-scale.
   --prof N              Run only N iterations
@@ -404,6 +407,7 @@ optional arguments:
   --workspace DIR       path to directory where checkpoints will be stored
   --memory-format {nchw,nhwc}
                         memory layout, nchw or nhwc
+
 ```
 
 
@@ -414,24 +418,7 @@ To use your own dataset, divide it in directories as in the following scheme:
  - Training images - `train/<class id>/<image>`
  - Validation images - `val/<class id>/<image>`
 
-If your dataset's has number of classes different than 1000, you need to add a custom config
-in the `image_classification/resnet.py` file.
-
-```python
-resnet_versions = {
-    ...
-    'resnet50-custom' : {
-       'net' : ResNet,
-       'block' : Bottleneck,
-       'layers' : [3, 4, 6, 3],
-       'widths' : [64, 128, 256, 512],
-       'expansion' : 4,
-       'num_classes' : <custom number of classes>,
-       }
-}
-```
-
-After adding the config, run the training script with `--arch resnet50-custom` flag.
+If your dataset has a number of classes different than 1000, you need to pass the `--num-classes N` flag to the training script.
 
 ### Training process
 
@@ -454,7 +441,7 @@ To restart training from checkpoint use `--resume` option.
 To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-weights` option.
 
 The difference between those two is that the pretrained weights contain only model weights,
-and checkpoints, apart from model weights, contain optimizer state, LR scheduler state, RNG state.
+and checkpoints, apart from model weights, contain optimizer state, LR scheduler state.
 
 Checkpoints are suitable for dividing the training into parts, for example in order
 to divide the training job into shorter stages, or restart training after infrastructure fail.
@@ -500,14 +487,13 @@ wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/resnet50_
 
 unzip resnet50_pyt_amp_20.06.0.zip
 ```
-
 To run inference on ImageNet, run:
 
 `python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch resnet50 -c fanin --weights nvidia_resnet50_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch resnet50 --weights nvidia_resnet50_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Performance
@@ -521,72 +507,63 @@ The following section shows how to run benchmarks measuring the model performanc
 To benchmark training, run:
 
 * For 1 GPU
-    * FP32
-`python ./main.py --arch resnet50 -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./launch.py --model resnet50 --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./launch.py --model resnet50 --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./main.py --arch resnet50 -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
+        `python ./launch.py --model resnet50 --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 * For multiple GPUs
-    * FP32
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnet50 --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnet50 --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -b <batch_size> --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnet50 --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
 Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
-Batch size should be picked appropriately depending on the hardware configuration.
-
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 256          |
-| DGXA100    | TF32        | 256          |
-| DGX-1      | AMP         | 256          |
-| DGX-1      | FP32        | 128          |
-
 #### Inference performance benchmark
 
 To benchmark inference, run:
 
-* FP32
+* FP32 (V100 GPUs only)
 
-`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
+`python ./launch.py --model resnet50 --precision FP32 --mode benchmark_inference --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-* AMP
+* TF32 (A100 GPUs only)
 
-`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
+`python ./launch.py --model resnet50 --precision TF32 --mode benchmark_inference --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
+* AMP
 
-Batch size should be picked appropriately depending on the hardware configuration.
+`python ./launch.py --model resnet50 --precision AMP --mode benchmark_inference --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 256          |
-| DGXA100    | TF32        | 256          |
-| DGX-1      | AMP         | 256          |
-| DGX-1      | FP32        | 128          |
+Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
 ### Results
 
-Our results were obtained by running the applicable training script     in the pytorch-20.06 NGC container.
+Our results were obtained by running the applicable training script in the pytorch-20.12 NGC container.
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
 
-| **epochs** | **Mixed Precision Top1** | **TF32 Top1** |
-|:------:|:--------------------:|:--------------:|
-|     90 |    76.93 +/- 0.23    | 76.85 +/- 0.30 |
+| **Epochs** | **Mixed Precision Top1** | **TF32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      77.12 +/- 0.11      | 76.95 +/- 0.18 |
+|    250     |      78.43 +/- 0.11      | 78.38 +/- 0.17 |
 
 
 ##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
-| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
-|:-:|:-:|:-:|
-| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
-|     90 |    77.09 +/- 0.10    | 77.01 +/- 0.16 |
-| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
+| **Epochs** | **Mixed Precision Top1** | **FP32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      76.88 +/- 0.16      | 77.01 +/- 0.16 |
+|    250     |      78.25 +/- 0.12      | 78.30 +/- 0.16 |
+
 
 ##### Training accuracy: NVIDIA DGX-2 (16x V100 32GB)
 
@@ -610,26 +587,28 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 #### Training performance results
 
-##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
+
+| **GPUs** | **Mixed Precision** |  **TF32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **TF32 Strong Scaling** | **TF32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |     2461 img/s      | 945 img/s  |            2.6 x            |               1.0 x                |                ~14 hours                |          1.0 x          |          ~36 hours           |
+|    8     |     15977 img/s     | 7365 img/s |           2.16 x            |               6.49 x               |                ~3 hours                 |         7.78 x          |           ~5 hours           |
 
-|**GPUs**|**Mixed Precision**|  **TF32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**TF32 Strong Scaling**|**TF32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   1240.81 img/s   |680.15 img/s |           1.82x           |              1.00x               |               ~27 hours               |         1.00x         |         ~49 hours          |
-|   8    |   9604.92 img/s   |5379.82 img/s|           1.79x           |              7.74x               |               ~4 hours                |         7.91x         |          ~6 hours          |
 
 ##### Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   856.52 img/s    |373.21 img/s |           2.30x           |              1.00x               |               ~39 hours               |         1.00x         |         ~89 hours          |
-|   8    |   6635.90 img/s   |2899.62 img/s|           2.29x           |              7.75x               |               ~5 hours                |         7.77x         |         ~12 hours          |
+| **GPUs** | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |     1180 img/s      | 371 img/s  |           3.17 x            |               1.0 x                |                ~29 hours                |          1.0 x          |          ~91 hours           |
+|    8     |     7608 img/s      | 2851 img/s |           2.66 x            |               6.44 x               |                ~5 hours                 |         7.66 x          |          ~12 hours           |
+
 
 ##### Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   816.00 img/s    |359.76 img/s |           2.27x           |              1.00x               |               ~41 hours               |         1.00x         |         ~93 hours          |
-|   8    |   6347.26 img/s   |2813.23 img/s|           2.26x           |              7.78x               |               ~5 hours                |         7.82x         |         ~12 hours          |
+| **GPUs** | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |     1115 img/s      | 365 img/s  |           3.04 x            |               1.0 x                |                ~31 hours                |          1.0 x          |          ~92 hours           |
+|    8     |     7375 img/s      | 2811 img/s |           2.62 x            |               6.61 x               |                ~5 hours                 |         7.68 x          |          ~12 hours           |
 
 
 #### Inference performance results
@@ -638,66 +617,66 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 136.82 img/s | 7.12ms | 7.25ms | 8.36ms | 10.92ms |
-| 2 | 266.86 img/s | 7.27ms | 7.41ms | 7.85ms | 9.11ms |
-| 4 | 521.76 img/s | 7.44ms | 7.58ms | 8.14ms | 10.09ms |
-| 8 | 766.22 img/s | 10.18ms | 10.46ms | 10.97ms | 12.75ms |
-| 16 | 976.36 img/s | 15.79ms | 15.88ms | 15.95ms | 16.63ms |
-| 32 | 1092.27 img/s | 28.63ms | 28.71ms | 28.76ms | 29.30ms |
-| 64 | 1161.55 img/s | 53.69ms | 53.86ms | 53.90ms | 54.23ms |
-| 128 | 1209.12 img/s | 104.24ms | 104.68ms | 104.80ms | 105.00ms |
-| 256 | N/A | N/A | N/A | N/A | N/A |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      99 img/s      |    10.38 ms     |    11.24 ms     |    12.32 ms     |
+|       2        |     190 img/s      |    10.87 ms     |    12.18 ms     |    14.27 ms     |
+|       4        |     403 img/s      |    10.26 ms     |    11.02 ms     |    13.28 ms     |
+|       8        |     754 img/s      |    10.96 ms     |    11.99 ms     |    13.89 ms     |
+|       16       |     960 img/s      |    17.16 ms     |    16.74 ms     |    18.18 ms     |
+|       32       |     1057 img/s     |    31.39 ms     |     30.4 ms     |    30.55 ms     |
+|       64       |     1168 img/s     |     57.1 ms     |    55.01 ms     |    56.19 ms     |
+|      112       |     1166 img/s     |    100.78 ms    |    95.98 ms     |    97.43 ms     |
+|      128       |     1215 img/s     |    111.11 ms    |    105.52 ms    |    106.38 ms    |
+|      256       |     1253 img/s     |    217.03 ms    |    203.78 ms    |    208.68 ms    |
 
-###### Mixed Precision Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 114.97 img/s | 8.56ms | 9.32ms | 11.43ms | 12.79ms |
-| 2 | 238.70 img/s | 8.20ms | 8.75ms | 9.49ms | 12.31ms |
-| 4 | 448.69 img/s | 8.67ms | 9.20ms | 9.97ms | 10.60ms |
-| 8 | 875.00 img/s | 8.88ms | 9.31ms | 9.80ms | 10.82ms |
-| 16 | 1746.07 img/s | 8.89ms | 9.05ms | 9.56ms | 12.81ms |
-| 32 | 2004.28 img/s | 14.07ms | 14.14ms | 14.31ms | 14.92ms |
-| 64 | 2254.60 img/s | 25.93ms | 26.05ms | 26.07ms | 26.17ms |
-| 128 | 2360.14 img/s | 50.14ms | 50.28ms | 50.34ms | 50.68ms |
-| 256 | 2342.13 img/s | 96.74ms | 96.91ms | 96.99ms | 97.14ms |
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      82 img/s      |    12.43 ms     |    13.29 ms     |    14.89 ms     |
+|       2        |     157 img/s      |    13.04 ms     |    13.84 ms     |    16.79 ms     |
+|       4        |     310 img/s      |    13.26 ms     |    14.42 ms     |    15.63 ms     |
+|       8        |     646 img/s      |    12.69 ms     |    13.65 ms     |    15.48 ms     |
+|       16       |     1188 img/s     |    14.01 ms     |    15.56 ms     |    18.34 ms     |
+|       32       |     2093 img/s     |    16.41 ms     |    18.25 ms     |     19.9 ms     |
+|       64       |     2899 img/s     |    24.12 ms     |    22.14 ms     |    22.55 ms     |
+|      128       |     3142 img/s     |    45.28 ms     |    40.77 ms     |    42.89 ms     |
+|      256       |     3276 img/s     |    88.44 ms     |     77.8 ms     |    79.01 ms     |
+|      256       |     3276 img/s     |     88.6 ms     |    77.74 ms     |    79.11 ms     |
 
 
 ##### Inference performance: NVIDIA T4
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 179.85 img/s | 5.51ms | 5.65ms | 7.34ms | 10.97ms |
-| 2 | 348.12 img/s | 5.67ms | 5.95ms | 6.33ms | 9.81ms |
-| 4 | 556.27 img/s | 7.03ms | 7.34ms | 8.13ms | 9.65ms |
-| 8 | 740.43 img/s | 10.32ms | 10.33ms | 10.60ms | 13.87ms |
-| 16 | 909.17 img/s | 17.19ms | 17.15ms | 18.13ms | 21.06ms |
-| 32 | 999.07 img/s | 31.07ms | 31.12ms | 31.17ms | 32.41ms |
-| 64 | 1090.47 img/s | 57.62ms | 57.84ms | 57.91ms | 58.05ms |
-| 128 | 1142.46 img/s | 110.94ms | 111.15ms | 111.23ms | 112.16ms |
-| 256 | N/A | N/A | N/A | N/A | N/A |
-
-###### Mixed Precision Inference Latency
-
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 163.78 img/s | 6.05ms | 5.92ms | 7.98ms | 11.58ms |
-| 2 | 333.43 img/s | 5.91ms | 6.05ms | 6.63ms | 11.52ms |
-| 4 | 645.45 img/s | 6.04ms | 6.33ms | 7.01ms | 8.90ms |
-| 8 | 1164.15 img/s | 6.73ms | 7.31ms | 8.04ms | 12.41ms |
-| 16 | 1606.42 img/s | 9.53ms | 9.86ms | 10.52ms | 17.01ms |
-| 32 | 1857.29 img/s | 15.67ms | 15.61ms | 16.14ms | 18.66ms |
-| 64 | 2011.62 img/s | 28.64ms | 28.69ms | 28.82ms | 31.06ms |
-| 128 | 2083.90 img/s | 54.87ms | 54.96ms | 54.99ms | 55.27ms |
-| 256 | 2043.72 img/s | 106.51ms | 106.62ms | 106.68ms | 107.03ms |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |     147 img/s      |     7.28 ms     |     8.48 ms     |     9.79 ms     |
+|       2        |     251 img/s      |     8.48 ms     |    10.23 ms     |    14.01 ms     |
+|       4        |     303 img/s      |    13.57 ms     |    13.61 ms     |    15.42 ms     |
+|       8        |     329 img/s      |     24.7 ms     |    24.74 ms     |     25.0 ms     |
+|       16       |     371 img/s      |    43.73 ms     |    43.74 ms     |    44.03 ms     |
+|       32       |     395 img/s      |    82.36 ms     |    82.13 ms     |    82.58 ms     |
+|       64       |     421 img/s      |    155.37 ms    |    153.07 ms    |    153.55 ms    |
+|      128       |     426 img/s      |    309.06 ms    |    303.0 ms     |    307.42 ms    |
+|      256       |     419 img/s      |    631.43 ms    |    612.42 ms    |    614.82 ms    |
 
 
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |     112 img/s      |     9.25 ms     |     9.87 ms     |    10.62 ms     |
+|       2        |     223 img/s      |     9.4 ms      |    10.62 ms     |     13.9 ms     |
+|       4        |     468 img/s      |     9.06 ms     |    11.15 ms     |     15.5 ms     |
+|       8        |     844 img/s      |    10.05 ms     |    12.67 ms     |    17.86 ms     |
+|       16       |     1037 img/s     |    16.01 ms     |    15.66 ms     |    15.86 ms     |
+|       32       |     1103 img/s     |    30.27 ms     |    29.45 ms     |    29.74 ms     |
+|       64       |     1154 img/s     |    57.96 ms     |    56.33 ms     |    56.96 ms     |
+|      128       |     1177 img/s     |    114.95 ms    |    110.4 ms     |    111.1 ms     |
+|      256       |     1184 img/s     |    229.61 ms    |    217.84 ms    |    224.75 ms    |
 
 
 ## Release notes
@@ -720,9 +699,9 @@ The following images show a 250 epochs configuration on a DGX-1V.
 5. July 2020
   * Added A100 scripts
   * Updated README
-
+6. February 2021
+  * Moved from APEX AMP to Native AMP
+
 ### Known issues
 
 There are no known issues with this model.
 
-

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1V_resnet50_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1V_resnet50_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_50E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX1_RN50_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2V_resnet50_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGX2V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2V_resnet50_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGX2V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_50E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGX2_RN50_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_RN50_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j16 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_resnet50_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/AMP/DGXA100_resnet50_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision AMP --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1V_resnet50_FP32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision FP32 --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1V_resnet50_FP32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision FP32 --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_50E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX1_RN50_FP32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j8 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2V_resnet50_FP32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision FP32 --mode convergence --platform DGX2V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2V_resnet50_FP32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision FP32 --mode convergence --platform DGX2V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_50E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/FP32/DGX2_RN50_FP32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j8 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

+ 0 - 1
PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_RN50_TF32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j16 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --epochs 90

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_resnet50_TF32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnet50v1.5/training/TF32/DGXA100_resnet50_TF32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnet50 --precision TF32 --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 116 - 136
PyTorch/Classification/ConvNets/resnext101-32x4d/README.md

@@ -31,11 +31,11 @@ achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
     * [Inference performance benchmark](#inference-performance-benchmark)
   * [Results](#results)
     * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
       * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
       * [Example plots](#example-plots)
     * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
       * [Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)](#training-performance-nvidia-dgx-1-16gb-8x-v100-16gb)
       * [Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)](#training-performance-nvidia-dgx-1-32gb-8x-v100-32gb)
   * [Inference performance results](#inference-performance-results)
@@ -111,7 +111,7 @@ The following features are supported by this model:
 
 | Feature               | ResNeXt101-32x4d
 |-----------------------|--------------------------
-|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html)   |   Yes
+|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html)   |   Yes
 |[APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
 
 #### Features
@@ -128,11 +128,11 @@ which speeds up data loading when CPU becomes a bottleneck.
 DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
 
 Run training with `--data-backends dali-gpu` or `--data-backends dali-cpu` to enable DALI.
-For ResNeXt101-32x4d, for DGXA100, DGX1 and DGX2 we recommend `--data-backends dali-cpu`.
+For DGXA100 and DGX1 we recommend `--data-backends dali-cpu`.
 
 ### Mixed precision training
 
-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
 1.  Porting the model to use the FP16 data type where appropriate.
 2.  Adding loss scaling to preserve small gradient values.
 
@@ -190,7 +190,7 @@ The following section lists the requirements that you need to meet in order to s
 This repository contains Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
+* [PyTorch 20.12-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
 * Supported GPUs:
     * [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
     * [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
@@ -242,27 +242,27 @@ For the specifics concerning training and inference, see the [Advanced](#advance
 
 The directory in which the `train/` and `val/` directories are placed, is referred to as `<path to imagenet>` in this document.
 
-### 3. Build the RNXT101-32x4d PyTorch NGC container.
+### 3. Build the ResNeXt101-32x4d PyTorch NGC container.
 
 ```
-docker build . -t nvidia_rnxt101-32x4d
+docker build . -t nvidia_resnext101-32x4d
 ```
 
 ### 4. Start an interactive session in the NGC container to run training/inference.
 ```
-nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_rnxt101-32x4d
+nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_resnext101-32x4d
 ```
 
 ### 5. Start training
 
-To run training for a standard configuration (DGXA100/DGX1/DGX2, AMP/TF32/FP32, 90/250 Epochs),
+To run training for a standard configuration (DGXA100/DGX1V, AMP/TF32/FP32, 90/250 Epochs),
 run one of the scripts in the `./resnext101-32x4d/training` directory
-called `./resnext101-32x4d/training/{AMP, TF32, FP32}/{DGXA100, DGX1, DGX2}_RNXT101-32x4d_{AMP, TF32, FP32}_{90,250}E.sh`.
+called `./resnext101-32x4d/training/{AMP, TF32, FP32}/{ DGXA100, DGX1V }_resnext101-32x4d_{AMP, TF32, FP32}_{ 90, 250 }E.sh`.
 
 Ensure ImageNet is mounted in the `/imagenet` directory.
 
 Example:
-    `bash ./resnext101-32x4d/training/AMP/DGX1_RNXT101-32x4d_AMP_250E.sh <path were to store checkpoints and logs>`
+    `bash ./resnext101-32x4d/training/AMP/DGX1V_resnext101-32x4d_AMP_250E.sh <path where to store checkpoints and logs>`
 
 ### 6. Start inference
 
@@ -280,7 +280,7 @@ To run inference on ImageNet, run:
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch resnext101-32x4d -c fanin --weights nvidia_resnext101-32x4d_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch resnext101-32x4d -c fanin --weights nvidia_resnext101-32x4d_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Advanced
@@ -319,7 +319,7 @@ usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
                [--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
                [--mixup ALPHA] [--momentum M] [--weight-decay W]
                [--bn-weight-decay] [--nesterov] [--print-freq N]
-               [--resume PATH] [--pretrained-weights PATH] [--fp16]
+               [--resume PATH] [--pretrained-weights PATH]
                [--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
                [--prof N] [--amp] [--seed SEED] [--gather-checkpoints]
                [--raport-file RAPORT_FILE] [--evaluate] [--training-only]
@@ -338,8 +338,10 @@ optional arguments:
                         data backend: pytorch | syntetic | dali-gpu | dali-cpu
                         (default: dali-cpu)
   --arch ARCH, -a ARCH  model architecture: resnet18 | resnet34 | resnet50 |
-                        resnet101 | resnet152 | resnext101-32x4d | se-
-                        resnext101-32x4d (default: resnet50)
+                        resnet101 | resnet152 | resnext50-32x4d |
+                        resnext101-32x4d | resnext101-32x8d |
+                        resnext101-32x8d-basic | se-resnext101-32x4d (default:
+                        resnet50)
   --model-config CONF, -c CONF
                         model configs: classic | fanin | grp-fanin | grp-
                         fanout(default: classic)
@@ -368,10 +370,9 @@ optional arguments:
   --resume PATH         path to latest checkpoint (default: none)
   --pretrained-weights PATH
                         load weights from here
-  --fp16                Run model fp16 mode.
   --static-loss-scale STATIC_LOSS_SCALE
                         Static loss scale, positive power of 2 values can
-                        improve fp16 convergence.
+                        improve amp convergence.
   --dynamic-loss-scale  Use dynamic loss scaling. If supplied, this argument
                         supersedes --static-loss-scale.
   --prof N              Run only N iterations
@@ -399,25 +400,7 @@ To use your own dataset, divide it in directories as in the following scheme:
  - Training images - `train/<class id>/<image>`
  - Validation images - `val/<class id>/<image>`
 
-If your dataset's has number of classes different than 1000, you need to add a custom config
-in the `image_classification/resnet.py` file.
-
-```python
-resnet_versions = {
-    ...
-    'resnext101-32x4d-custom' : {
-        'net' : ResNet,
-        'block' : Bottleneck,
-        'cardinality' : 32,
-        'layers' : [3, 4, 23, 3],
-        'widths' : [128, 256, 512, 1024],
-        'expansion' : 2,
-        'num_classes' : <custom number of classes>,
-    }
-}
-```
-
-After adding the config, run the training script with `--arch resnext101-32x4d-custom` flag.
+If your dataset has a number of classes different than 1000, you need to pass the `--num-classes N` flag to the training script.
 
 ### Training process
 
@@ -440,7 +423,7 @@ To restart training from checkpoint use `--resume` option.
 To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-weights` option.
 
 The difference between those two is that the pretrained weights contain only model weights,
-and checkpoints, apart from model weights, contain optimizer state, LR scheduler state, RNG state.
+and checkpoints, apart from model weights, contain optimizer state, LR scheduler state.
 
 Checkpoints are suitable for dividing the training into parts, for example in order
 to divide the training job into shorter stages, or restart training after infrastructure fail.
@@ -482,9 +465,9 @@ You can also run ImageNet validation on pretrained weights:
 Pretrained weights can be downloaded from NGC:
 
 ```bash
-wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/resnext101-32x4d_pyt_amp/versions/20.06.0/zip -O resnext101-32x4d_pyt_amp_20.06.0.zip
+wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/resnext101_32x4d_pyt_amp/versions/20.06.0/zip -O resnext101_32x4d_pyt_amp_20.06.0.zip
 
-unzip resnext101-32x4d_pyt_amp_20.06.0.zip
+unzip resnext101_32x4d_pyt_amp_20.06.0.zip
 ```
 
 To run inference on ImageNet, run:
@@ -493,7 +476,7 @@ To run inference on ImageNet, run:
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch resnext101-32x4d -c fanin --weights nvidia_resnext101-32x4d_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch resnext101-32x4d --weights nvidia_resnext101-32x4d_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Performance
@@ -507,71 +490,62 @@ The following section shows how to run benchmarks measuring the model performanc
 To benchmark training, run:
 
 * For 1 GPU
-    * FP32
-`python ./main.py --arch resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./launch.py --model resnext101-32x4d --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./launch.py --model resnext101-32x4d --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./main.py --arch resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 --memory-format nhwc <path to imagenet>`
+        `python ./launch.py --model resnext101-32x4d --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 * For multiple GPUs
-    * FP32
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnext101-32x4d --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnext101-32x4d --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 --memory-format nhwc <path to imagenet>`
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model resnext101-32x4d --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
 Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
-Batch size should be picked appropriately depending on the hardware configuration.
-
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 128          |
-| DGXA100    | TF32        | 128          |
-| DGX-1      | AMP         | 128          |
-| DGX-1      | FP32        | 64           |
-
 #### Inference performance benchmark
 
 To benchmark inference, run:
 
-* FP32
+* FP32 (V100 GPUs only)
 
-`python ./main.py --arch resnext101-32x4d -b <batch_size> -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
+`python ./launch.py --model resnext101-32x4d --precision FP32 --mode benchmark_inference --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-* AMP
+* TF32 (A100 GPUs only)
 
-`python ./main.py --arch resnext101-32x4d -b <batch_size> -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp --memory-format nhwc <path to imagenet>`
+`python ./launch.py --model resnext101-32x4d --precision TF32 --mode benchmark_inference --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
+* AMP
 
-Batch size should be picked appropriately depending on the hardware configuration.
+`python ./launch.py --model resnext101-32x4d --precision AMP --mode benchmark_inference --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 128          |
-| DGXA100    | TF32        | 128          |
-| DGX-1      | AMP         | 128          |
-| DGX-1      | FP32        | 64           |
+Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
 ### Results
 
-Our results were obtained by running the applicable training script     in the pytorch-20.06 NGC container.
+Our results were obtained by running the applicable training script in the pytorch-20.12 NGC container.
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
 
-| **epochs** | **Mixed Precision Top1** | **TF32 Top1** |
-|:------:|:--------------------:|:--------------:|
-|   90   |    79.37 +/- 0.13    | 79.38 +/- 0.13 |
+| **Epochs** | **Mixed Precision Top1** | **TF32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      79.47 +/- 0.03      | 79.38 +/- 0.07 |
+|    250     |      80.19 +/- 0.08      | 80.27 +/- 0.1  |
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
-| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
-|:-:|:-:|:-:|
-|   90   |    79.43 +/- 0.04    | 79.40 +/- 0.10 |
-| 250 | 79.92 +/- 0.13 | 80.06 +/- 0.06 |
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
+| **Epochs** | **Mixed Precision Top1** | **FP32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      79.49 +/- 0.05      | 79.40 +/- 0.10 |
+|    250     |      80.26 +/- 0.11      | 80.06 +/- 0.06 |
 
 
 ##### Example plots
@@ -586,26 +560,29 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 #### Training performance results
 
-##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
+
+| **GPUs** | **Mixed Precision** |  **TF32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **TF32 Strong Scaling** | **TF32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |     1169 img/s      | 420 img/s  |           2.77 x            |               1.0 x                |                ~29 hours                |          1.0 x          |          ~80 hours           |
+|    8     |     7399 img/s      | 3193 img/s |           2.31 x            |               6.32 x               |                ~5 hours                 |         7.58 x          |          ~11 hours           |
 
-|**GPUs**|**Mixed Precision**|  **TF32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**TF32 Strong Scaling**|**TF32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   908.40 img/s    |300.42 img/s |           3.02x           |              1.00x               |               ~37 hours               |         1.00x         |         ~111 hours         |
-|   8    |   6887.59 img/s   |2380.51 img/s|           2.89x           |              7.58x               |               ~5 hours                |         7.92x         |         ~14 hours          |
 
 ##### Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   534.91 img/s    |150.05 img/s |           3.56x           |              1.00x               |               ~62 hours               |         1.00x         |         ~222 hours         |
-|   8    |   4000.79 img/s   |1151.01 img/s|           3.48x           |              7.48x               |               ~9 hours                |         7.67x         |         ~29 hours          |
+| **GPUs** | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |      578 img/s      | 149 img/s  |           3.86 x            |               1.0 x                |                ~59 hours                |          1.0 x          |          ~225 hours          |
+|    8     |     3742 img/s      | 1117 img/s |           3.34 x            |               6.46 x               |                ~9 hours                 |         7.45 x          |          ~31 hours           |
+
 
 ##### Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   516.07 img/s    |139.80 img/s |           3.69x           |              1.00x               |               ~65 hours               |         1.00x         |         ~238 hours         |
-|   8    |   3861.95 img/s   |1070.94 img/s|           3.61x           |              7.48x               |               ~9 hours                |         7.66x         |         ~31 hours          |
+| **GPUs** | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |      556 img/s      | 151 img/s  |           3.68 x            |               1.0 x                |                ~61 hours                |          1.0 x          |          ~223 hours          |
+|    8     |     3595 img/s      | 1102 img/s |           3.26 x            |               6.45 x               |                ~10 hours                |         7.28 x          |          ~31 hours           |
+
 
 #### Inference performance results
 
@@ -613,62 +590,64 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 47.34 img/s | 21.02ms | 23.41ms | 24.55ms | 26.00ms |
-| 2 | 89.68 img/s | 22.14ms | 22.90ms | 24.86ms | 26.59ms |
-| 4 | 175.92 img/s | 22.57ms | 24.96ms | 25.53ms | 26.03ms |
-| 8 | 325.69 img/s | 24.35ms | 25.17ms | 25.80ms | 28.52ms |
-| 16 | 397.04 img/s | 40.04ms | 40.01ms | 40.08ms | 40.32ms |
-| 32 | 431.77 img/s | 73.71ms | 74.05ms | 74.09ms | 74.26ms |
-| 64 | 485.70 img/s | 131.04ms | 131.38ms | 131.53ms | 131.81ms |
-| 128 | N/A | N/A | N/A | N/A | N/A |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      55 img/s      |    18.48 ms     |    18.88 ms     |    20.74 ms     |
+|       2        |     116 img/s      |    17.54 ms     |    18.15 ms     |    21.32 ms     |
+|       4        |     214 img/s      |    19.07 ms     |    20.44 ms     |    22.69 ms     |
+|       8        |     291 img/s      |     27.8 ms     |    27.99 ms     |    28.47 ms     |
+|       16       |     354 img/s      |    45.78 ms     |     45.4 ms     |    45.73 ms     |
+|       32       |     423 img/s      |    77.13 ms     |    75.96 ms     |    76.21 ms     |
+|       64       |     486 img/s      |    134.92 ms    |    132.17 ms    |    132.51 ms    |
+|      128       |     523 img/s      |    252.11 ms    |    244.5 ms     |    244.99 ms    |
+|      256       |     530 img/s      |    499.64 ms    |    479.83 ms    |    481.41 ms    |
 
-###### Mixed Precision Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 43.11 img/s | 23.05ms | 25.19ms | 25.41ms | 26.63ms |
-| 2 | 83.29 img/s | 23.82ms | 25.11ms | 26.25ms | 27.29ms |
-| 4 | 173.67 img/s | 22.82ms | 24.38ms | 25.26ms | 25.92ms |
-| 8 | 330.18 img/s | 24.05ms | 26.45ms | 27.37ms | 27.74ms |
-| 16 | 634.82 img/s | 25.00ms | 26.93ms | 28.12ms | 28.73ms |
-| 32 | 884.91 img/s | 35.71ms | 35.96ms | 36.01ms | 36.13ms |
-| 64 | 998.40 img/s | 63.43ms | 63.63ms | 63.75ms | 63.96ms |
-| 128 | 1079.10 img/s | 117.74ms | 118.02ms | 118.11ms | 118.35ms |
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      40 img/s      |    25.17 ms     |     28.4 ms     |    30.66 ms     |
+|       2        |      89 img/s      |    22.64 ms     |    24.29 ms     |    25.99 ms     |
+|       4        |     165 img/s      |    24.54 ms     |    26.23 ms     |    28.61 ms     |
+|       8        |     334 img/s      |    24.31 ms     |    28.46 ms     |    29.91 ms     |
+|       16       |     632 img/s      |     25.8 ms     |    27.76 ms     |    29.53 ms     |
+|       32       |     1219 img/s     |    27.35 ms     |    29.86 ms     |     31.6 ms     |
+|       64       |     1525 img/s     |    43.97 ms     |    42.01 ms     |    42.96 ms     |
+|      128       |     1647 img/s     |    82.22 ms     |    77.65 ms     |    79.56 ms     |
+|      256       |     1689 img/s     |    161.53 ms    |    151.25 ms    |    152.01 ms    |
 
 
 ##### Inference performance: NVIDIA T4
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 55.64 img/s | 17.88ms | 19.21ms | 20.35ms | 22.29ms |
-| 2 | 109.22 img/s | 18.24ms | 19.00ms | 20.43ms | 22.51ms |
-| 4 | 217.27 img/s | 18.26ms | 18.88ms | 19.51ms | 21.74ms |
-| 8 | 294.55 img/s | 26.74ms | 27.35ms | 27.62ms | 28.93ms |
-| 16 | 351.30 img/s | 45.34ms | 45.72ms | 46.10ms | 47.43ms |
-| 32 | 401.97 img/s | 79.10ms | 79.37ms | 79.44ms | 81.83ms |
-| 64 | 449.30 img/s | 140.30ms | 140.73ms | 141.26ms | 143.57ms |
-| 128 | N/A | N/A | N/A | N/A | N/A |
-
-###### Mixed Precision Inference Latency
-
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 51.14 img/s | 19.48ms | 20.16ms | 21.40ms | 26.21ms |
-| 2 | 102.29 img/s | 19.44ms | 19.77ms | 20.42ms | 24.51ms |
-| 4 | 209.44 img/s | 18.93ms | 19.52ms | 20.23ms | 21.95ms |
-| 8 | 408.69 img/s | 19.47ms | 21.12ms | 23.15ms | 25.77ms |
-| 16 | 641.78 img/s | 24.54ms | 25.19ms | 25.64ms | 27.31ms |
-| 32 | 800.26 img/s | 39.28ms | 39.43ms | 39.54ms | 41.96ms |
-| 64 | 883.66 img/s | 71.76ms | 71.87ms | 71.94ms | 72.78ms |
-| 128 | 948.27 img/s | 134.19ms | 134.40ms | 134.58ms | 134.81ms |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      79 img/s      |    13.07 ms     |    14.66 ms     |    15.59 ms     |
+|       2        |     119 img/s      |    17.21 ms     |    18.07 ms     |    19.78 ms     |
+|       4        |     141 img/s      |    28.65 ms     |    28.62 ms     |    28.77 ms     |
+|       8        |     139 img/s      |    57.84 ms     |    58.29 ms     |    58.62 ms     |
+|       16       |     153 img/s      |    104.8 ms     |    105.65 ms    |    106.2 ms     |
+|       32       |     178 img/s      |    181.24 ms    |    180.96 ms    |    181.57 ms    |
+|       64       |     179 img/s      |    360.93 ms    |    358.22 ms    |    359.11 ms    |
+|      128       |     177 img/s      |    735.99 ms    |    726.15 ms    |    727.81 ms    |
+|      256       |     167 img/s      |   1561.91 ms    |   1523.52 ms    |   1525.96 ms    |
 
 
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      65 img/s      |    15.69 ms     |    16.95 ms     |    17.97 ms     |
+|       2        |     126 img/s      |     16.2 ms     |    16.78 ms     |     18.6 ms     |
+|       4        |     245 img/s      |    16.77 ms     |    18.35 ms     |    25.88 ms     |
+|       8        |     488 img/s      |    16.82 ms     |    17.86 ms     |    25.45 ms     |
+|       16       |     541 img/s      |    30.16 ms     |    29.95 ms     |    30.18 ms     |
+|       32       |     566 img/s      |    57.79 ms     |    57.11 ms     |    57.29 ms     |
+|       64       |     580 img/s      |    112.84 ms    |    111.07 ms    |    111.56 ms    |
+|      128       |     586 img/s      |    224.75 ms    |    219.12 ms    |    219.64 ms    |
+|      256       |     589 img/s      |    447.25 ms    |    434.18 ms    |    439.22 ms    |
 
 
 ## Release notes
@@ -680,9 +659,10 @@ The following images show a 250 epochs configuration on a DGX-1V.
 2. July 2020
   * Added A100 scripts
   * Updated README
+3. February 2021
+  * Moved from APEX AMP to Native AMP
 
 ### Known issues
 
 There are no known issues with this model.
 
-

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1V_resnext101-32x4d_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision AMP --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1V_resnext101-32x4d_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision AMP --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1_RNXT101-32x4d_AMP_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2 --memory-format nhwc

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGX1_RNXT101-32x4d_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05 --memory-format nhwc

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_RNXT101-32x4d_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j16 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05 --memory-format nhwc

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_resnext101-32x4d_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision AMP --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/AMP/DGXA100_resnext101-32x4d_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision AMP --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1V_resnext101-32x4d_FP32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision FP32 --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1V_resnext101-32x4d_FP32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision FP32 --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1_RNXT101-32x4d_FP32_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/FP32/DGX1_RNXT101-32x4d_FP32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05

+ 0 - 1
PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_RNXT101-32x4d_TF32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j16 -p 100 --data-backend dali-cpu --arch resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_resnext101-32x4d_TF32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/resnext101-32x4d/training/TF32/DGXA100_resnext101-32x4d_TF32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model resnext101-32x4d --precision TF32 --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 120 - 138
PyTorch/Classification/ConvNets/se-resnext101-32x4d/README.md

@@ -31,11 +31,11 @@ achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
     * [Inference performance benchmark](#inference-performance-benchmark)
   * [Results](#results)
     * [Training accuracy results](#training-accuracy-results)
-      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
       * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
       * [Example plots](#example-plots)
     * [Training performance results](#training-performance-results)
-      * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+      * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
       * [Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)](#training-performance-nvidia-dgx-1-16gb-8x-v100-16gb)
       * [Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)](#training-performance-nvidia-dgx-1-32gb-8x-v100-32gb)
   * [Inference performance results](#inference-performance-results)
@@ -45,7 +45,6 @@ achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
   * [Changelog](#changelog)
   * [Known issues](#known-issues)
 
-
 ## Model overview
 
 The SE-ResNeXt101-32x4d is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)
@@ -106,13 +105,14 @@ This model uses the following data augmentation:
   * Scale to 256x256
   * Center crop to 224x224
 
+
 ### Feature support matrix
 
 The following features are supported by this model:
 
-| Feature               | ResNeXt101-32x4d
+| Feature               | SE-ResNeXt101-32x4d
 |-----------------------|--------------------------
-|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html)   |   Yes
+|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html)   |   Yes
 |[APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
 
 #### Features
@@ -129,11 +129,11 @@ which speeds up data loading when CPU becomes a bottleneck.
 DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
 
 Run training with `--data-backends dali-gpu` or `--data-backends dali-cpu` to enable DALI.
-For ResNeXt101-32x4d, for DGX1 and DGX2 we recommend `--data-backends dali-cpu`.
+For DGXA100 and DGX1 we recommend `--data-backends dali-cpu`.
 
 ### Mixed precision training
 
-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
 1.  Porting the model to use the FP16 data type where appropriate.
 2.  Adding loss scaling to preserve small gradient values.
 
@@ -191,7 +191,7 @@ The following section lists the requirements that you need to meet in order to s
 This repository contains Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
+* [PyTorch 20.12-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
 * Supported GPUs:
     * [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
     * [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
@@ -216,7 +216,7 @@ cd DeepLearningExamples/PyTorch/Classification/
 
 ### 2. Download and preprocess the dataset.
 
-The ResNeXt101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
+The SE-ResNeXt101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
 
 PyTorch can work directly on JPEGs, therefore, preprocessing/augmentation is not needed.
 
@@ -243,27 +243,28 @@ For the specifics concerning training and inference, see the [Advanced](#advance
 
 The directory in which the `train/` and `val/` directories are placed, is referred to as `<path to imagenet>` in this document.
 
-### 3. Build the SE-RNXT101-32x4d PyTorch NGC container.
+### 3. Build the SE-ResNeXt101-32x4d PyTorch NGC container.
 
 ```
-docker build . -t nvidia_se-rnxt101-32x4d
+docker build . -t nvidia_se-resnext101-32x4d
 ```
 
 ### 4. Start an interactive session in the NGC container to run training/inference.
 ```
-nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_se-rnxt101-32x4d
+nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_se-resnext101-32x4d
 ```
 
+
 ### 5. Start training
 
-To run training for a standard configuration (DGXA100/DGX1/DGX2, AMP/TF32/FP32, 90/250 Epochs),
+To run training for a standard configuration (DGXA100/DGX1V, AMP/TF32/FP32, 90/250 Epochs),
 run one of the scripts in the `./se-resnext101-32x4d/training` directory
-called `./se-resnext101-32x4d/training/{AMP, TF32, FP32}/{DGXA100, DGX1, DGX2}_SE-RNXT101-32x4d_{AMP, TF32, FP32}_{90,250}E.sh`.
+called `./se-resnext101-32x4d/training/{AMP, TF32, FP32}/{ DGXA100, DGX1V }_se-resnext101-32x4d_{AMP, TF32, FP32}_{ 90, 250 }E.sh`.
 
 Ensure ImageNet is mounted in the `/imagenet` directory.
 
 Example:
-    `bash ./se-resnext101-32x4d/training/AMP/DGX1_SE-RNXT101-32x4d_AMP_250E.sh <path were to store checkpoints and logs>`
+    `bash ./se-resnext101-32x4d/training/AMP/DGX1_se-resnext101-32x4d_AMP_250E.sh <path where to store checkpoints and logs>`
 
 ### 6. Start inference
 
@@ -281,7 +282,7 @@ To run inference on ImageNet, run:
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch se-resnext101-32x4d -c fanin --weights nvidia_se-resnext101-32x4d_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch se-resnext101-32x4d -c fanin --weights nvidia_se-resnext101-32x4d_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Advanced
@@ -320,7 +321,7 @@ usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
                [--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
                [--mixup ALPHA] [--momentum M] [--weight-decay W]
                [--bn-weight-decay] [--nesterov] [--print-freq N]
-               [--resume PATH] [--pretrained-weights PATH] [--fp16]
+               [--resume PATH] [--pretrained-weights PATH]
                [--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
                [--prof N] [--amp] [--seed SEED] [--gather-checkpoints]
                [--raport-file RAPORT_FILE] [--evaluate] [--training-only]
@@ -339,8 +340,10 @@ optional arguments:
                         data backend: pytorch | syntetic | dali-gpu | dali-cpu
                         (default: dali-cpu)
   --arch ARCH, -a ARCH  model architecture: resnet18 | resnet34 | resnet50 |
-                        resnet101 | resnet152 | resnext101-32x4d | se-
-                        resnext101-32x4d (default: resnet50)
+                        resnet101 | resnet152 | resnext50-32x4d |
+                        resnext101-32x4d | resnext101-32x8d |
+                        resnext101-32x8d-basic | se-resnext101-32x4d (default:
+                        resnet50)
   --model-config CONF, -c CONF
                         model configs: classic | fanin | grp-fanin | grp-
                         fanout(default: classic)
@@ -369,10 +372,9 @@ optional arguments:
   --resume PATH         path to latest checkpoint (default: none)
   --pretrained-weights PATH
                         load weights from here
-  --fp16                Run model fp16 mode.
   --static-loss-scale STATIC_LOSS_SCALE
                         Static loss scale, positive power of 2 values can
-                        improve fp16 convergence.
+                        improve amp convergence.
   --dynamic-loss-scale  Use dynamic loss scaling. If supplied, this argument
                         supersedes --static-loss-scale.
   --prof N              Run only N iterations
@@ -390,6 +392,7 @@ optional arguments:
   --workspace DIR       path to directory where checkpoints will be stored
   --memory-format {nchw,nhwc}
                         memory layout, nchw or nhwc
+
 ```
 
 
@@ -400,25 +403,7 @@ To use your own dataset, divide it in directories as in the following scheme:
  - Training images - `train/<class id>/<image>`
  - Validation images - `val/<class id>/<image>`
 
-If your dataset's has number of classes different than 1000, you need to add a custom config
-in the `image_classification/resnet.py` file.
-
-```python
-resnet_versions = {
-    ...
-    'se-resnext101-32x4d-custom' : {
-        'net' : ResNet,
-        'block' : SEBottleneck,
-        'cardinality' : 32,
-        'layers' : [3, 4, 23, 3],
-        'widths' : [128, 256, 512, 1024],
-        'expansion' : 2,
-        'num_classes' : <custom number of classes>,
-    }
-}
-```
-
-After adding the config, run the training script with `--arch resnext101-32x4d-custom` flag.
+If your dataset has a number of classes different from 1000, you need to pass the `--num-classes N` flag to the training script.
 
 ### Training process
 
@@ -441,7 +426,7 @@ To restart training from checkpoint use `--resume` option.
 To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-weights` option.
 
 The difference between those two is that the pretrained weights contain only model weights,
-and checkpoints, apart from model weights, contain optimizer state, LR scheduler state, RNG state.
+and checkpoints, apart from model weights, contain optimizer state and LR scheduler state.
 
 Checkpoints are suitable for dividing the training into parts, for example in order
 to divide the training job into shorter stages, or restart training after infrastructure failure.
@@ -487,14 +472,13 @@ wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/seresnext
 
 unzip seresnext101_32x4d_pyt_amp_20.06.0.zip
 ```
-
 To run inference on ImageNet, run:
 
 `python ./main.py --arch se-resnext101-32x4d --evaluate --epochs 1 --pretrained-weights nvidia_se-resnext101-32x4d_200821.pth.tar -b <batch size> <path to imagenet>`
 
 To run inference on JPEG image using pretrained weights:
 
-`python classify.py --arch se-resnext101-32x4d -c fanin --weights nvidia_se-resnext101-32x4d_200821.pth.tar  --precision AMP|FP32 --image <path to JPEG image>`
+`python classify.py --arch se-resnext101-32x4d --weights nvidia_se-resnext101-32x4d_200821.pth.tar --precision AMP|FP32 --image <path to JPEG image>`
 
 
 ## Performance
@@ -508,71 +492,62 @@ The following section shows how to run benchmarks measuring the model performanc
 To benchmark training, run:
 
 * For 1 GPU
-    * FP32
-`python ./main.py --arch se-resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./launch.py --model se-resnext101-32x4d --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./main.py --arch se-resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 --memory-format nhwc <path to imagenet>`
+        `python ./launch.py --model se-resnext101-32x4d --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 * For multiple GPUs
-    * FP32
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
+    * FP32 (V100 GPUs only)
+        `python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode benchmark_training --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
+    * TF32 (A100 GPUs only)
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model se-resnext101-32x4d --precision TF32 --mode benchmark_training --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
     * AMP
-`python ./multiproc.py --nproc_per_node 8 ./main.py --arch se-resnext101-32x4d -b <batch_size> --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --memory-format nhwc --epochs 1 --prof 100 <path to imagenet>`
+        `python ./multiproc.py --nproc_per_node 8 ./launch.py --model se-resnext101-32x4d --precision AMP --mode benchmark_training --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
 Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
-Batch size should be picked appropriately depending on the hardware configuration.
-
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 128          |
-| DGXA100    | TF32        | 128          |
-| DGX-1      | AMP         | 128          |
-| DGX-1      | FP32        | 64           |
-
 #### Inference performance benchmark
 
 To benchmark inference, run:
 
-* FP32
+* FP32 (V100 GPUs only)
 
-`python ./main.py --arch se-resnext101-32x4d -b <batch_size> -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
+`python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode benchmark_inference --platform DGX1V <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-* AMP
+* TF32 (A100 GPUs only)
 
-`python ./main.py --arch se-resnext101-32x4d -b <batch_size> -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp --memory-format nhwc <path to imagenet>`
+`python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode benchmark_inference --platform DGXA100 <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
+* AMP
 
-Batch size should be picked appropriately depending on the hardware configuration.
+`python ./launch.py --model se-resnext101-32x4d --precision AMP --mode benchmark_inference --platform <DGX1V|DGXA100> <path to imagenet> --raport-file benchmark.json --epochs 1 --prof 100`
 
-| *Platform* | *Precision* | *Batch Size* |
-|:----------:|:-----------:|:------------:|
-| DGXA100    | AMP         | 128          |
-| DGXA100    | TF32        | 128          |
-| DGX-1      | AMP         | 128          |
-| DGX-1      | FP32        | 64           |
+Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
 
 ### Results
 
-Our results were obtained by running the applicable training script     in the pytorch-20.06 NGC container.
+Our results were obtained by running the applicable training script in the pytorch-20.12 NGC container.
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
 
-| **epochs** | **Mixed Precision Top1** | **TF32 Top1** |
-|:------:|:--------------------:|:--------------:|
-|   90   |    79.95 +/- 0.09    | 79.97 +/- 0.08 |
+| **Epochs** | **Mixed Precision Top1** | **TF32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      80.03 +/- 0.11      | 79.92 +/- 0.07 |
+|    250     |      80.9 +/- 0.08       | 80.98 +/- 0.07 |
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
-| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
-|:-:|:-:|:-:|
-|   90   |    80.04 +/- 0.10    | 79.93 +/- 0.10 |
-| 250 | 80.96 +/- 0.04 | 80.97 +/- 0.09 |
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
+| **Epochs** | **Mixed Precision Top1** | **FP32 Top1**  |
+|:----------:|:------------------------:|:--------------:|
+|     90     |      80.04 +/- 0.07      | 79.93 +/- 0.10 |
+|    250     |      80.92 +/- 0.09      | 80.97 +/- 0.09 |
 
 
 ##### Example plots
@@ -587,26 +562,29 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 #### Training performance results
 
-##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
+
+| **GPUs** | **Mixed Precision** |  **TF32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **TF32 Strong Scaling** | **TF32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |      804 img/s      | 360 img/s  |           2.22 x            |               1.0 x                |                ~42 hours                |          1.0 x          |          ~94 hours           |
+|    8     |     5248 img/s      | 2665 img/s |           1.96 x            |               6.52 x               |                ~7 hours                 |         7.38 x          |          ~13 hours           |
 
-|**GPUs**|**Mixed Precision**|  **TF32**   |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**TF32 Strong Scaling**|**TF32 Training Time (90E)**|
-|:------:|:-----------------:|:-----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   641.57 img/s    |258.75 img/s |           2.48x           |              1.00x               |               ~52 hours               |         1.00x         |         ~129 hours         |
-|   8    |   4758.40 img/s   |2038.03 img/s|           2.33x           |              7.42x               |               ~7 hours                |         7.88x         |         ~17 hours          |
 
 ##### Training performance: NVIDIA DGX-1 16GB (8x V100 16GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**  |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   383.15 img/s    |130.48 img/s|           2.94x           |              1.00x               |               ~87 hours               |         1.00x         |         ~255 hours         |
-|   8    |   2695.10 img/s   |996.04 img/s|           2.71x           |              7.03x               |               ~13 hours               |         7.63x         |         ~34 hours          |
+| **GPUs** | **Mixed Precision** | **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:---------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |      430 img/s      | 133 img/s |           3.21 x            |               1.0 x                |                ~79 hours                |          1.0 x          |          ~252 hours          |
+|    8     |     2716 img/s      | 994 img/s |           2.73 x            |               6.31 x               |                ~13 hours                |         7.42 x          |          ~34 hours           |
+
 
 ##### Training performance: NVIDIA DGX-1 32GB (8x V100 32GB)
 
-|**GPUs**|**Mixed Precision**|  **FP32**  |**Mixed Precision Speedup**|**Mixed Precision Strong Scaling**|**Mixed Precision Training Time (90E)**|**FP32 Strong Scaling**|**FP32 Training Time (90E)**|
-|:------:|:-----------------:|:----------:|:-------------------------:|:--------------------------------:|:-------------------------------------:|:---------------------:|:--------------------------:|
-|   1    |   364.65 img/s    |123.46 img/s|           2.95x           |              1.00x               |               ~92 hours               |         1.00x         |         ~270 hours         |
-|   8    |   2540.49 img/s   |959.94 img/s|           2.65x           |              6.97x               |               ~13 hours               |         7.78x         |         ~35 hours          |
+| **GPUs** | **Mixed Precision** |  **FP32**  | **Mixed Precision Speedup** | **Mixed Precision Strong Scaling** | **Mixed Precision Training Time (90E)** | **FP32 Strong Scaling** | **FP32 Training Time (90E)** |
+|:--------:|:-------------------:|:----------:|:---------------------------:|:----------------------------------:|:---------------------------------------:|:-----------------------:|:----------------------------:|
+|    1     |      413 img/s      | 134 img/s  |           3.08 x            |               1.0 x                |                ~82 hours                |          1.0 x          |          ~251 hours          |
+|    8     |     2572 img/s      | 1011 img/s |           2.54 x            |               6.22 x               |                ~14 hours                |         7.54 x          |          ~34 hours           |
+
 
 #### Inference performance results
 
@@ -614,62 +592,65 @@ The following images show a 250 epochs configuration on a DGX-1V.
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 33.58 img/s | 29.72ms | 30.92ms | 31.77ms | 34.65ms |
-| 2 | 66.47 img/s | 29.94ms | 31.30ms | 32.74ms | 34.79ms |
-| 4 | 135.31 img/s | 29.36ms | 29.78ms | 32.61ms | 33.90ms |
-| 8 | 261.52 img/s | 30.42ms | 32.73ms | 33.99ms | 35.61ms |
-| 16 | 356.05 img/s | 44.61ms | 44.93ms | 45.17ms | 46.90ms |
-| 32 | 391.83 img/s | 80.91ms | 81.28ms | 81.64ms | 82.69ms |
-| 64 | 443.91 img/s | 142.70ms | 142.99ms | 143.46ms | 145.01ms |
-| 128 | N/A | N/A | N/A | N/A | N/A |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      37 img/s      |    26.81 ms     |    27.89 ms     |    31.44 ms     |
+|       2        |      75 img/s      |    27.01 ms     |    28.89 ms     |    31.17 ms     |
+|       4        |     144 img/s      |    28.09 ms     |    30.14 ms     |    32.47 ms     |
+|       8        |     259 img/s      |    31.23 ms     |    33.65 ms     |     38.4 ms     |
+|       16       |     332 img/s      |     48.7 ms     |    48.35 ms     |     48.8 ms     |
+|       32       |     394 img/s      |    83.02 ms     |    81.55 ms     |     81.9 ms     |
+|       64       |     471 img/s      |    138.88 ms    |    136.24 ms    |    136.54 ms    |
+|      128       |     505 img/s      |    261.4 ms     |    253.07 ms    |    254.29 ms    |
+|      256       |     513 img/s      |    516.66 ms    |    496.06 ms    |    497.05 ms    |
 
-###### Mixed Precision Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 35.08 img/s | 28.40ms | 29.75ms | 31.77ms | 35.85ms |
-| 2 | 68.85 img/s | 28.92ms | 30.24ms | 31.46ms | 37.07ms |
-| 4 | 131.78 img/s | 30.17ms | 31.39ms | 32.66ms | 37.17ms |
-| 8 | 260.21 img/s | 30.52ms | 31.20ms | 32.92ms | 34.46ms |
-| 16 | 506.62 img/s | 31.36ms | 32.48ms | 34.13ms | 36.49ms |
-| 32 | 778.92 img/s | 40.69ms | 40.90ms | 41.07ms | 43.67ms |
-| 64 | 880.49 img/s | 72.10ms | 72.29ms | 72.34ms | 76.46ms |
-| 128 | 977.86 img/s | 130.19ms | 130.34ms | 130.41ms | 131.12ms |
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      29 img/s      |    34.24 ms     |    36.67 ms     |     39.4 ms     |
+|       2        |      53 img/s      |    37.81 ms     |    43.03 ms     |     45.1 ms     |
+|       4        |     103 img/s      |     39.1 ms     |    43.05 ms     |    46.16 ms     |
+|       8        |     226 img/s      |    35.66 ms     |    38.39 ms     |    41.13 ms     |
+|       16       |     458 img/s      |     35.4 ms     |    37.38 ms     |    39.97 ms     |
+|       32       |     882 img/s      |    37.37 ms     |    40.12 ms     |    42.64 ms     |
+|       64       |     1356 img/s     |    49.31 ms     |    47.21 ms     |    49.87 ms     |
+|      112       |     1448 img/s     |    81.27 ms     |    77.35 ms     |    78.28 ms     |
+|      128       |     1486 img/s     |    90.59 ms     |    86.15 ms     |    87.04 ms     |
+|      256       |     1534 img/s     |    176.72 ms    |    166.2 ms     |    167.53 ms    |
 
 
 ##### Inference performance: NVIDIA T4
 
 ###### FP32 Inference Latency
 
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 40.47 img/s | 24.72ms | 26.94ms | 29.33ms | 33.03ms |
-| 2 | 84.16 img/s | 23.66ms | 24.53ms | 25.96ms | 29.42ms |
-| 4 | 165.10 img/s | 24.08ms | 24.59ms | 25.75ms | 27.57ms |
-| 8 | 266.04 img/s | 29.90ms | 30.51ms | 30.84ms | 33.07ms |
-| 16 | 325.89 img/s | 48.57ms | 48.91ms | 49.02ms | 51.01ms |
-| 32 | 365.99 img/s | 86.94ms | 87.15ms | 87.41ms | 90.74ms |
-| 64 | 410.43 img/s | 155.30ms | 156.07ms | 156.36ms | 164.74ms |
-| 128 | N/A | N/A | N/A | N/A | N/A |
-
-###### Mixed Precision Inference Latency
-
-| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| 1 | 38.80 img/s | 25.74ms | 26.10ms | 29.28ms | 31.72ms |
-| 2 | 78.79 img/s | 25.29ms | 25.83ms | 27.18ms | 33.07ms |
-| 4 | 160.22 img/s | 24.81ms | 25.58ms | 26.25ms | 27.93ms |
-| 8 | 298.01 img/s | 26.69ms | 27.59ms | 29.13ms | 32.69ms |
-| 16 | 567.48 img/s | 28.05ms | 28.36ms | 31.28ms | 34.44ms |
-| 32 | 709.56 img/s | 44.58ms | 44.69ms | 44.98ms | 47.99ms |
-| 64 | 799.72 img/s | 79.32ms | 79.40ms | 79.49ms | 84.34ms |
-| 128 | 856.19 img/s | 147.92ms | 149.02ms | 149.13ms | 151.90ms |
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      52 img/s      |    19.39 ms     |    20.39 ms     |    21.18 ms     |
+|       2        |     102 img/s      |    19.98 ms     |     21.4 ms     |    23.75 ms     |
+|       4        |     134 img/s      |    30.12 ms     |    30.14 ms     |    30.54 ms     |
+|       8        |     136 img/s      |    59.07 ms     |    60.63 ms     |    61.49 ms     |
+|       16       |     154 img/s      |    104.38 ms    |    105.21 ms    |    105.81 ms    |
+|       32       |     169 img/s      |    190.12 ms    |    189.64 ms    |    190.24 ms    |
+|       64       |     171 img/s      |    376.19 ms    |    374.16 ms    |    375.6 ms     |
+|      128       |     168 img/s      |    771.4 ms     |    761.64 ms    |    764.7 ms     |
+|      256       |     159 img/s      |   1639.15 ms    |   1603.45 ms    |   1605.47 ms    |
 
 
+###### Mixed Precision Inference Latency
 
+| **Batch Size** | **Throughput Avg** | **Latency Avg** | **Latency 95%** | **Latency 99%** |
+|:--------------:|:------------------:|:---------------:|:---------------:|:---------------:|
+|       1        |      42 img/s      |    24.17 ms     |    27.26 ms     |    29.98 ms     |
+|       2        |      87 img/s      |    23.24 ms     |    24.66 ms     |    26.77 ms     |
+|       4        |     170 img/s      |    23.87 ms     |    24.89 ms     |    29.59 ms     |
+|       8        |     334 img/s      |    24.49 ms     |    27.92 ms     |    35.66 ms     |
+|       16       |     472 img/s      |    34.45 ms     |    34.29 ms     |    35.72 ms     |
+|       32       |     502 img/s      |    64.93 ms     |    64.47 ms     |    65.16 ms     |
+|       64       |     517 img/s      |    126.24 ms    |    125.03 ms    |    125.86 ms    |
+|      128       |     522 img/s      |    250.99 ms    |    245.87 ms    |    247.1 ms     |
+|      256       |     523 img/s      |    502.41 ms    |    487.58 ms    |    489.69 ms    |
 
 
 ## Release notes
@@ -681,9 +662,10 @@ The following images show a 250 epochs configuration on a DGX-1V.
 2. July 2020
   * Added A100 scripts
   * Updated README
+3. February 2021
+  * Moved from APEX AMP to Native AMP
 
 ### Known issues
 
 There are no known issues with this model.
 
-

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1V_se-resnext101-32x4d_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision AMP --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1V_se-resnext101-32x4d_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision AMP --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1_SE-RNXT101-32x4d_AMP_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2 --memory-format nhwc

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGX1_SE-RNXT101-32x4d_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05 --memory-format nhwc

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_SE-RNXT101-32x4d_AMP_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j16 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --amp --static-loss-scale 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05 --memory-format nhwc

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_se-resnext101-32x4d_AMP_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision AMP --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/AMP/DGXA100_se-resnext101-32x4d_AMP_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision AMP --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1V_se-resnext101-32x4d_FP32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode convergence --platform DGX1V /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1V_se-resnext101-32x4d_FP32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision FP32 --mode convergence --platform DGX1V /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1_SE-RNXT101-32x4d_FP32_250E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs 250 --warmup 8 --wd 6.103515625e-05 --mixup 0.2

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/FP32/DGX1_SE-RNXT101-32x4d_FP32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j8 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 64 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05

+ 0 - 1
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_SE-RNXT101-32x4d_TF32_90E.sh

@@ -1 +0,0 @@
-python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --raport-file raport.json -j16 -p 100 --data-backend pytorch --arch se-resnext101-32x4d -c fanin --label-smoothing 0.1 --workspace $1 -b 128 --optimizer-batch-size 1024 --lr 1.024 --mom 0.875 --lr-schedule cosine --epochs  90 --warmup 8 --wd 6.103515625e-05

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_se-resnext101-32x4d_TF32_250E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/se-resnext101-32x4d/training/TF32/DGXA100_se-resnext101-32x4d_TF32_90E.sh

@@ -0,0 +1 @@
+python ./launch.py --model se-resnext101-32x4d --precision TF32 --mode convergence --platform DGXA100 /imagenet --epochs 90 --mixup 0.0 --workspace ${1:-./} --raport-file raport.json

+ 1 - 0
PyTorch/Classification/ConvNets/triton/deployer.py

@@ -61,6 +61,7 @@ def initialize_model(args):
         model.load_state_dict(
             {k.replace("module.", ""): v for k, v in state_dict.items()}
         )
+        model.load_state_dict(state_dict)
     return model.half() if args.fp16 else model