[ResNet/Paddle] Add CUDNNv8 ResUnit fusion

Tian Zheng (Engrg-Hardware 1) 2 years ago
parent
commit
fc9c09b08d

+ 1 - 1
PaddlePaddle/Classification/RN50v1.5/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.02-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt /workspace/

+ 4 - 3
PaddlePaddle/Classification/RN50v1.5/README.md

@@ -303,7 +303,7 @@ Example:
 bash scripts/training/train_resnet50_TF32_90E_DGXA100.sh
 
 # For AMP and 8 GPUs training in 90 epochs
-bash scripts/training/train_resnet50_TF32_90E_DGXA100.sh
+bash scripts/training/train_resnet50_AMP_90E_DGXA100.sh
 ```
 
 Or you can manually launch training by `paddle.distributed.launch`. `paddle.distributed.launch` is a built-in module in PaddlePaddle that spawns up multiple distributed training processes on each of the training nodes.
@@ -497,6 +497,7 @@ Advanced Training:
   --use-dynamic-loss-scaling
                         Enable dynamic loss scaling in AMP training, only be applied when --amp is set. (default: False)
   --use-pure-fp16       Enable pure FP16 training, only be applied when --amp is set. (default: False)
+  --fuse-resunit        Enable CUDNNv8 ResUnit fusion, only be applied when --amp is set. (default: False)
   --asp                 Enable automatic sparse training (ASP). (default: False)
   --prune-model         Prune model to 2:4 sparse pattern, only be applied when --asp is set. (default: False)
   --mask-algo {mask_1d,mask_2d_greedy,mask_2d_best}
@@ -827,8 +828,8 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 | **GPUs** |  **Throughput - TF32**  | **Throughput - mixed precision** | **Throughput speedup (TF32 to mixed precision)** | **TF32 Scaling** | **Mixed Precision Scaling** | **Mixed Precision Training Time (90E)** | **TF32 Training Time (90E)** |
 |:--------:|:------------:|:-------------:|:------------:|:------:|:--------:|:--------:|:--------:|
-|    1     |    993 img/s |  2711 img/s   |    2.73 x    | 1.0 x  |  1.0 x   | ~13 hours| ~40 hours|
-|    8     |  7955 img/s  |   20267 img/s |    2.54 x    | 8.01 x | 7.47 x   | ~2 hours | ~4 hours |
+|    1     |  1024 img/s  |  2897 img/s   |    2.83 x    | 1.0 x  |  1.0 x   | ~13 hours| ~40 hours|
+|    8     |  8013 img/s  |   23874 img/s |    2.98 x    | 7.83 x | 8.24 x   | ~2 hours | ~4 hours |
 
 ##### Training performance of Automatic SParsity: NVIDIA DGX A100 (8x A100 80GB)
 | **GPUs** |  **Throughput - mixed precision** | **Throughput - mixed precision+ASP** | **Overhead** |

+ 2 - 0
PaddlePaddle/Classification/RN50v1.5/program.py

@@ -143,6 +143,8 @@ def create_strategy(args, is_train=True):
         build_strategy.fuse_elewise_add_act_ops = True
         build_strategy.fuse_bn_add_act_ops = True
         build_strategy.enable_addto = True
+        if args.fuse_resunit and is_train:
+            build_strategy.fuse_resunit = True
 
     return build_strategy, exec_strategy
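The gating in the hunk above only turns the fusion pass on when the user asked for it *and* the program being built is the training program. A minimal standalone sketch of that logic (no PaddlePaddle required; `select_fusion_flags` and the plain-dict return value are hypothetical stand-ins for `create_strategy` and `paddle.static.BuildStrategy`):

```python
from types import SimpleNamespace


def select_fusion_flags(args, is_train=True):
    """Mirror the gating logic from create_strategy: ResUnit fusion is
    enabled only for training programs that explicitly request it.
    `args` is a plain namespace standing in for the parsed CLI args."""
    return {
        # Fusions that are always on for this model:
        "fuse_elewise_add_act_ops": True,
        "fuse_bn_add_act_ops": True,
        "enable_addto": True,
        # The new flag: requires both --fuse-resunit and a training program.
        "fuse_resunit": bool(getattr(args, "fuse_resunit", False) and is_train),
    }


# Fusion stays off for the eval program even when the flag is set.
train_flags = select_fusion_flags(SimpleNamespace(fuse_resunit=True), is_train=True)
eval_flags = select_fusion_flags(SimpleNamespace(fuse_resunit=True), is_train=False)
```

Keeping the fusion out of the eval program is a reasonable design choice here: the README ties `--fuse-resunit` to AMP training, and the eval pass gains nothing from a training-oriented fused ResUnit kernel.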
 

+ 2 - 1
PaddlePaddle/Classification/RN50v1.5/scripts/training/train_resnet50_AMP_90E_DGXA100.sh

@@ -17,4 +17,5 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 train.py \
     --amp \
     --scale-loss 128.0 \
     --use-dynamic-loss-scaling \
-    --data-layout NHWC
+    --data-layout NHWC \
+    --fuse-resunit

+ 4 - 1
PaddlePaddle/Classification/RN50v1.5/utils/config.py

@@ -276,7 +276,10 @@ def add_advance_args(parser):
         '--use-pure-fp16',
         action='store_true',
         help='Enable pure FP16 training, only be applied when --amp is set.')
-
+    group.add_argument(
+        '--fuse-resunit',
+        action='store_true',
+        help='Enable CUDNNv8 ResUnit fusion, only be applied when --amp is set.')
     # ASP
     group.add_argument(
         '--asp',
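The new option added to `utils/config.py` is a standard `store_true` argparse flag, so it defaults to `False` and flips to `True` only when passed on the command line. A self-contained sketch of the two interacting flags (a hypothetical minimal parser, not the repo's full `add_advance_args` group):

```python
import argparse

# Minimal parser reproducing just the two flags that interact:
# --fuse-resunit is documented to take effect only when --amp is set.
parser = argparse.ArgumentParser()
parser.add_argument('--amp', action='store_true')
parser.add_argument(
    '--fuse-resunit',
    action='store_true',
    help='Enable CUDNNv8 ResUnit fusion, only applied when --amp is set.')

# As in train_resnet50_AMP_90E_DGXA100.sh, both flags are passed together.
args = parser.parse_args(['--amp', '--fuse-resunit'])

# argparse maps the dashed flag name to an underscored attribute.
assert args.amp is True and args.fuse_resunit is True

# Without either flag, both default to False.
defaults = parser.parse_args([])
assert defaults.fuse_resunit is False
```

Note that argparse itself does not enforce the "`--amp` must be set" dependency stated in the help text; that gating happens downstream in `create_strategy`, which only enables the fusion on the training build strategy.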