@@ -303,7 +303,7 @@ Example:
bash scripts/training/train_resnet50_TF32_90E_DGXA100.sh

# For AMP and 8 GPUs training in 90 epochs
-bash scripts/training/train_resnet50_TF32_90E_DGXA100.sh
+bash scripts/training/train_resnet50_AMP_90E_DGXA100.sh
```

Or you can manually launch training with `paddle.distributed.launch`. `paddle.distributed.launch` is a built-in PaddlePaddle module that spawns multiple distributed training processes on each of the training nodes.
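For example, a manual launch on a single node might look like the sketch below. This is illustrative only, not the repository's exact command: the `train.py` entry point and its flags are assumptions, while `--gpus` is `paddle.distributed.launch`'s standard device-selection option.

```bash
# Spawn one training process per listed GPU on this node.
# NOTE: `train.py` and its flags are assumed names for illustration.
python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" \
    train.py --epochs 90 --amp
```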
@@ -497,6 +497,7 @@ Advanced Training:
  --use-dynamic-loss-scaling
                        Enable dynamic loss scaling in AMP training, only be applied when --amp is set. (default: False)
  --use-pure-fp16       Enable pure FP16 training, only be applied when --amp is set. (default: False)
+  --fuse-resunit        Enable CUDNNv8 ResUnit fusion, only be applied when --amp is set. (default: False)
  --asp                 Enable automatic sparse training (ASP). (default: False)
  --prune-model         Prune model to 2:4 sparse pattern, only be applied when --asp is set. (default: False)
  --mask-algo {mask_1d,mask_2d_greedy,mask_2d_best}
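In practice these switches are combined on a single command line. The sketch below is a hedged example: the flag names are taken from the help text above, but the `train.py` entry point is an assumed name.

```bash
# AMP training with dynamic loss scaling and the CUDNNv8 ResUnit fusion.
# NOTE: `train.py` is an assumed entry-point name for illustration.
python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" \
    train.py --amp --use-dynamic-loss-scaling --fuse-resunit
```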
@@ -827,8 +828,8 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
| **GPUs** | **Throughput - TF32** | **Throughput - mixed precision** | **Throughput speedup (TF32 to mixed precision)** | **TF32 Scaling** | **Mixed Precision Scaling** | **Mixed Precision Training Time (90E)** | **TF32 Training Time (90E)** |
|:--------:|:------------:|:-------------:|:------------:|:------:|:--------:|:--------:|:--------:|
-| 1 | 993 img/s | 2711 img/s | 2.73 x | 1.0 x | 1.0 x | ~13 hours| ~40 hours|
-| 8 | 7955 img/s | 20267 img/s | 2.54 x | 8.01 x | 7.47 x | ~2 hours | ~4 hours |
+| 1 | 1024 img/s | 2897 img/s | 2.83 x | 1.0 x | 1.0 x | ~13 hours| ~40 hours|
+| 8 | 8013 img/s | 23874 img/s | 2.98 x | 7.83 x | 8.24 x | ~2 hours | ~4 hours |

##### Training performance of Automatic SParsity: NVIDIA DGX A100 (8x A100 80GB)

| **GPUs** | **Throughput - mixed precision** | **Throughput - mixed precision+ASP** | **Overhead** |