@@ -504,6 +504,8 @@ Advanced Training:
be applied when --asp and --prune-model is set. (default: mask_1d)

Paddle-TRT:
+ --device DEVICE_ID
+ The GPU device id for Paddle-TRT inference. (default: 0)
--trt-inference-dir TRT_INFERENCE_DIR
A path to store/load inference models. export_model.py would export models to this folder, then inference.py
would load from here. (default: ./inference)
@@ -521,7 +523,7 @@ Paddle-TRT:
A file in which to store JSON model exporting report. (default: ./export.json)
--trt-log-path TRT_LOG_PATH
A file in which to store JSON inference report. (default: ./inference.json)
- --trt-use-synthat TRT_USE_SYNTHAT
+ --trt-use-synthetic TRT_USE_SYNTHETIC
Apply synthetic data for benchmark. (default: False)
```
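As a usage sketch (not from the repository scripts), the new `--device` option composes with the existing Paddle-TRT flags documented above, e.g. to run inference on the second GPU from the default export directory:

```bash
python inference.py --trt-inference-dir ./inference --device 1
```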
@@ -672,17 +674,27 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 train.py \
```

#### Inference with TensorRT
-To run inference with TensorRT for the best performance, you can apply the scripts in `scripts/inference`.
+For inference with TensorRT, we provide two benchmark modes: with and without data preprocessing.
+
+The default scripts in `scripts/inference` use synthetic input to run inference without data preprocessing.

For example,
1. Run `bash scripts/inference/export_resnet50_AMP.sh <your_checkpoint>` to export an inference model.
- - The default path of checkpoint is `./output/ResNet50/89`.
+ - The default path of the checkpoint is `./output/ResNet50/89`.
2. Run `bash scripts/inference/infer_resnet50_AMP.sh` to infer with TensorRT.

Or you could manually run `export_model.py` and `inference.py` with specific arguments; refer to [Command-line options](#command-line-options).

Note that the arguments passed to `export_model.py` and `inference.py` should be the same as those used in training.
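The two-step flow above can be sketched manually as follows; `--trt-inference-dir` comes from the options listed earlier, while `<same-arguments-as-training>` stands for whatever flags your training run used and is a placeholder, not a verified option list:

```bash
# Export the trained model to the inference folder (sketch).
python export_model.py --trt-inference-dir ./inference <same-arguments-as-training>
# Load the exported model from the same folder and run Paddle-TRT inference.
python inference.py --trt-inference-dir ./inference <same-arguments-as-training>
```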
+To run inference with data preprocessing, set `--trt-use-synthetic` to `False` and `--image-root` to the path of your own dataset. For example,
+
+```bash
+python inference.py --trt-inference-dir <path_to_model> \
+ --image-root <your_own_data_set> \
+ --trt-use-synthetic False
+```
+

## Performance

The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
@@ -748,7 +760,7 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 train.py \

##### Benchmark with TensorRT

-To benchmark the inference performance with TensorRT on a specific batch size, run:
+To benchmark the inference performance with TensorRT on a specific batch size, run `inference.py` with `--trt-use-synthetic True`. The benchmark uses synthetic input without data preprocessing.

* FP32 / TF32
```bash
@@ -757,7 +769,8 @@ python inference.py \
--trt-precision FP32 \
--batch-size <batch_size> \
--benchmark-steps 1024 \
- --benchmark-warmup-steps 16
+ --benchmark-warmup-steps 16 \
+ --trt-use-synthetic True
```

* FP16
@@ -767,13 +780,12 @@ python inference.py \
--trt-precision FP16 \
--batch-size <batch_size> \
--benchmark-steps 1024 \
- --benchmark-warmup-steps 16
+ --benchmark-warmup-steps 16 \
+ --trt-use-synthetic True
```

Note that the arguments passed to `inference.py` should be the same as those used in training.

-The benchmark uses the validation dataset by default, which should be put in `--image-root/val`.
-For the performance benchmark of the raw model, a synthetic dataset can be used. To use synthetic dataset, add `--trt-use-synthat True` as a command line option.
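The tables that follow report average throughput plus average and tail latencies per batch size. As a rough cross-check (an illustrative sketch, not repository code), average throughput closely tracks batch size divided by average latency, and the tail columns are per-step latency percentiles:

```python
import math

def percentile(samples, q):
    """Nearest-rank percentile: smallest sample with at least q% of samples at or below it."""
    ordered = sorted(samples)
    rank = max(1, math.ceil(q / 100.0 * len(ordered)))
    return ordered[rank - 1]

def benchmark_summary(latencies_s, batch_size):
    """Aggregate per-step latencies (in seconds) into the metrics the tables report."""
    avg = sum(latencies_s) / len(latencies_s)
    return {
        "avg_throughput_img_per_s": batch_size / avg,
        "avg_latency_ms": 1e3 * avg,
        "p90_ms": 1e3 * percentile(latencies_s, 90),
        "p95_ms": 1e3 * percentile(latencies_s, 95),
        "p99_ms": 1e3 * percentile(latencies_s, 99),
    }

# With a constant 32.64 ms step latency at batch size 256, the average throughput
# lands near the ~7843 img/s reported in the DGX A100 TF32 table below.
summary = benchmark_summary([0.03264] * 1024, 256)
print(round(summary["avg_throughput_img_per_s"]))
```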
### Results

@@ -866,96 +878,103 @@ Our results were obtained by running the applicable training script with `--run-
#### Paddle-TRT performance: NVIDIA DGX A100 (1x A100 80GB)
Our results for Paddle-TRT were obtained by running the `inference.py` script on an NVIDIA DGX A100 with 1x A100 80GB GPU.

+Note that the benchmark does not include data preprocessing. Refer to [Benchmark with TensorRT](#benchmark-with-tensorrt).
+
**TF32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 716.49 img/s | 1.40 ms | 1.96 ms | 2.20 ms | 3.01 ms |
-| 2 | 1219.98 img/s | 1.64 ms | 2.26 ms | 2.90 ms | 5.04 ms |
-| 4 | 1880.12 img/s | 2.13 ms | 3.39 ms | 4.44 ms | 7.32 ms |
-| 8 | 2404.10 img/s | 3.33 ms | 4.51 ms | 5.90 ms | 10.39 ms |
-| 16 | 3101.28 img/s | 5.16 ms | 7.06 ms | 9.13 ms | 15.18 ms |
-| 32 | 3294.11 img/s | 9.71 ms | 21.42 ms | 26.94 ms | 35.79 ms |
-| 64 | 4327.38 img/s | 14.79 ms | 25.59 ms | 30.45 ms | 45.34 ms |
-| 128 | 4956.59 img/s | 25.82 ms | 33.74 ms | 40.36 ms | 56.06 ms |
-| 256 | 5244.29 img/s | 48.81 ms | 62.11 ms | 67.56 ms | 88.38 ms |
+| 1 | 915.48 img/s | 1.09 ms | 1.09 ms | 1.18 ms | 1.19 ms |
+| 2 | 1662.70 img/s | 1.20 ms | 1.21 ms | 1.29 ms | 1.30 ms |
+| 4 | 2856.25 img/s | 1.40 ms | 1.40 ms | 1.49 ms | 1.55 ms |
+| 8 | 3988.80 img/s | 2.01 ms | 2.01 ms | 2.10 ms | 2.18 ms |
+| 16 | 5409.55 img/s | 2.96 ms | 2.96 ms | 3.05 ms | 3.07 ms |
+| 32 | 6406.13 img/s | 4.99 ms | 5.00 ms | 5.08 ms | 5.12 ms |
+| 64 | 7169.75 img/s | 8.93 ms | 8.94 ms | 9.01 ms | 9.04 ms |
+| 128 | 7616.79 img/s | 16.80 ms | 16.89 ms | 16.90 ms | 16.99 ms |
+| 256 | 7843.26 img/s | 32.64 ms | 32.85 ms | 32.88 ms | 32.93 ms |
**FP16 Inference Latency**

|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 860.90 img/s | 1.16 ms | 1.81 ms | 2.06 ms | 2.98 ms |
-| 2 | 1464.06 img/s | 1.37 ms | 2.13 ms | 2.73 ms | 4.76 ms |
-| 4 | 2246.24 img/s | 1.78 ms | 3.17 ms | 4.20 ms | 7.39 ms |
-| 8 | 2457.44 img/s | 3.25 ms | 4.35 ms | 5.50 ms | 9.98 ms |
-| 16 | 3928.83 img/s | 4.07 ms | 6.26 ms | 8.50 ms | 15.10 ms |
-| 32 | 3853.13 img/s | 8.30 ms | 19.87 ms | 25.51 ms | 34.99 ms |
-| 64 | 5581.89 img/s | 11.46 ms | 22.32 ms | 30.75 ms | 43.35 ms |
-| 128 | 6846.77 img/s | 18.69 ms | 25.43 ms | 35.03 ms | 50.04 ms |
-| 256 | 7481.19 img/s | 34.22 ms | 40.92 ms | 51.10 ms | 65.68 ms |
+| 1 | 1265.67 img/s | 0.79 ms | 0.79 ms | 0.88 ms | 0.89 ms |
+| 2 | 2339.59 img/s | 0.85 ms | 0.86 ms | 0.94 ms | 0.96 ms |
+| 4 | 4271.30 img/s | 0.94 ms | 0.94 ms | 1.03 ms | 1.04 ms |
+| 8 | 7053.76 img/s | 1.13 ms | 1.14 ms | 1.22 ms | 1.31 ms |
+| 16 | 10225.85 img/s | 1.56 ms | 1.57 ms | 1.65 ms | 1.67 ms |
+| 32 | 12802.53 img/s | 2.50 ms | 2.50 ms | 2.59 ms | 2.61 ms |
+| 64 | 14723.56 img/s | 4.35 ms | 4.35 ms | 4.43 ms | 4.45 ms |
+| 128 | 16157.12 img/s | 7.92 ms | 7.96 ms | 8.00 ms | 8.06 ms |
+| 256 | 17054.80 img/s | 15.01 ms | 15.06 ms | 15.07 ms | 15.16 ms |
+
#### Paddle-TRT performance: NVIDIA A30 (1x A30 24GB)
Our results for Paddle-TRT were obtained by running the `inference.py` script on an NVIDIA A30 with 1x A30 24GB GPU.

+Note that the benchmark does not include data preprocessing. Refer to [Benchmark with TensorRT](#benchmark-with-tensorrt).
+
**TF32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 672.79 img/s | 1.49 ms | 2.01 ms | 2.29 ms | 3.04 ms |
-| 2 | 1041.47 img/s | 1.92 ms | 2.49 ms | 2.87 ms | 4.13 ms |
-| 4 | 1505.64 img/s | 2.66 ms | 3.43 ms | 4.06 ms | 6.85 ms |
-| 8 | 2001.13 img/s | 4.00 ms | 4.72 ms | 5.54 ms | 9.51 ms |
-| 16 | 2462.80 img/s | 6.50 ms | 7.71 ms | 9.32 ms | 15.54 ms |
-| 32 | 2474.34 img/s | 12.93 ms | 21.61 ms | 25.76 ms | 34.69 ms |
-| 64 | 2949.38 img/s | 21.70 ms | 29.58 ms | 34.63 ms | 47.11 ms |
-| 128 | 3278.67 img/s | 39.04 ms | 43.34 ms | 52.72 ms | 66.78 ms |
-| 256 | 3293.10 img/s | 77.74 ms | 90.51 ms | 99.71 ms | 110.80 ms |
+| 1 | 781.87 img/s | 1.28 ms | 1.29 ms | 1.38 ms | 1.45 ms |
+| 2 | 1290.14 img/s | 1.55 ms | 1.55 ms | 1.65 ms | 1.67 ms |
+| 4 | 1876.48 img/s | 2.13 ms | 2.13 ms | 2.23 ms | 2.25 ms |
+| 8 | 2451.23 img/s | 3.26 ms | 3.27 ms | 3.37 ms | 3.42 ms |
+| 16 | 2974.77 img/s | 5.38 ms | 5.42 ms | 5.47 ms | 5.53 ms |
+| 32 | 3359.63 img/s | 9.52 ms | 9.62 ms | 9.66 ms | 9.72 ms |
+| 64 | 3585.82 img/s | 17.85 ms | 18.03 ms | 18.09 ms | 18.20 ms |
+| 128 | 3718.44 img/s | 34.42 ms | 34.71 ms | 34.75 ms | 34.91 ms |
+| 256 | 3806.11 img/s | 67.26 ms | 67.61 ms | 67.71 ms | 67.86 ms |
**FP16 Inference Latency**

|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 804.56 img/s | 1.24 ms | 1.81 ms | 2.15 ms | 3.07 ms |
-| 2 | 1435.74 img/s | 1.39 ms | 2.05 ms | 2.48 ms | 3.86 ms |
-| 4 | 2169.87 img/s | 1.84 ms | 2.72 ms | 3.39 ms | 5.94 ms |
-| 8 | 2395.13 img/s | 3.34 ms | 4.46 ms | 5.11 ms | 9.49 ms |
-| 16 | 3779.82 img/s | 4.23 ms | 5.83 ms | 7.66 ms | 14.44 ms |
-| 32 | 3620.18 img/s | 8.84 ms | 17.90 ms | 22.31 ms | 30.91 ms |
-| 64 | 4592.08 img/s | 13.94 ms | 24.00 ms | 29.38 ms | 41.41 ms |
-| 128 | 5064.06 img/s | 25.28 ms | 31.73 ms | 37.79 ms | 53.01 ms |
-| 256 | 4774.61 img/s | 53.62 ms | 59.04 ms | 67.29 ms | 80.51 ms |
+| 1 | 1133.80 img/s | 0.88 ms | 0.89 ms | 0.98 ms | 0.99 ms |
+| 2 | 2068.18 img/s | 0.97 ms | 0.97 ms | 1.06 ms | 1.08 ms |
+| 4 | 3181.06 img/s | 1.26 ms | 1.27 ms | 1.35 ms | 1.38 ms |
+| 8 | 5078.30 img/s | 1.57 ms | 1.58 ms | 1.68 ms | 1.74 ms |
+| 16 | 6240.02 img/s | 2.56 ms | 2.58 ms | 2.67 ms | 2.86 ms |
+| 32 | 7000.86 img/s | 4.57 ms | 4.66 ms | 4.69 ms | 4.76 ms |
+| 64 | 7523.45 img/s | 8.51 ms | 8.62 ms | 8.73 ms | 8.86 ms |
+| 128 | 7914.47 img/s | 16.17 ms | 16.31 ms | 16.34 ms | 16.46 ms |
+| 256 | 8225.56 img/s | 31.12 ms | 31.29 ms | 31.38 ms | 31.50 ms |

#### Paddle-TRT performance: NVIDIA A10 (1x A10 24GB)
Our results for Paddle-TRT were obtained by running the `inference.py` script on an NVIDIA A10 with 1x A10 24GB GPU.

+Note that the benchmark does not include data preprocessing. Refer to [Benchmark with TensorRT](#benchmark-with-tensorrt).
+
**TF32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 372.04 img/s | 2.69 ms | 3.64 ms | 4.20 ms | 5.28 ms |
-| 2 | 615.93 img/s | 3.25 ms | 4.08 ms | 4.59 ms | 6.42 ms |
-| 4 | 1070.02 img/s | 3.74 ms | 3.90 ms | 4.35 ms | 7.48 ms |
-| 8 | 1396.88 img/s | 5.73 ms | 6.87 ms | 7.52 ms | 10.63 ms |
-| 16 | 1522.20 img/s | 10.51 ms | 12.73 ms | 13.84 ms | 17.84 ms |
-| 32 | 1674.39 img/s | 19.11 ms | 23.23 ms | 24.63 ms | 29.55 ms |
-| 64 | 1782.14 img/s | 35.91 ms | 41.84 ms | 44.53 ms | 48.94 ms |
-| 128 | 1722.33 img/s | 74.32 ms | 85.37 ms | 89.27 ms | 94.85 ms |
-| 256 | 1576.89 img/s | 162.34 ms | 181.01 ms | 185.92 ms | 194.42 ms |
+| 1 | 563.63 img/s | 1.77 ms | 1.79 ms | 1.87 ms | 1.89 ms |
+| 2 | 777.13 img/s | 2.57 ms | 2.63 ms | 2.68 ms | 2.89 ms |
+| 4 | 1171.93 img/s | 3.41 ms | 3.43 ms | 3.51 ms | 3.55 ms |
+| 8 | 1627.81 img/s | 4.91 ms | 4.97 ms | 5.02 ms | 5.09 ms |
+| 16 | 1986.40 img/s | 8.05 ms | 8.11 ms | 8.19 ms | 8.37 ms |
+| 32 | 2246.04 img/s | 14.25 ms | 14.33 ms | 14.40 ms | 14.57 ms |
+| 64 | 2398.07 img/s | 26.69 ms | 26.87 ms | 26.91 ms | 27.06 ms |
+| 128 | 2489.96 img/s | 51.41 ms | 51.74 ms | 51.80 ms | 51.94 ms |
+| 256 | 2523.22 img/s | 101.46 ms | 102.13 ms | 102.35 ms | 102.77 ms |
**FP16 Inference Latency**

|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
-| 1 | 365.38 img/s | 2.74 ms | 3.94 ms | 4.35 ms | 5.64 ms |
-| 2 | 612.52 img/s | 3.26 ms | 4.34 ms | 4.80 ms | 6.97 ms |
-| 4 | 1018.15 img/s | 3.93 ms | 4.95 ms | 5.55 ms | 9.16 ms |
-| 8 | 1924.26 img/s | 4.16 ms | 5.44 ms | 6.20 ms | 11.89 ms |
-| 16 | 2477.49 img/s | 6.46 ms | 8.07 ms | 9.21 ms | 15.05 ms |
-| 32 | 2896.01 img/s | 11.05 ms | 13.56 ms | 15.32 ms | 21.76 ms |
-| 64 | 3165.27 img/s | 20.22 ms | 24.20 ms | 25.94 ms | 33.18 ms |
-| 128 | 3176.46 img/s | 40.29 ms | 46.36 ms | 49.15 ms | 54.95 ms |
-| 256 | 3110.01 img/s | 82.31 ms | 93.21 ms | 96.06 ms | 99.97 ms |
+| 1 | 1296.81 img/s | 0.77 ms | 0.77 ms | 0.87 ms | 0.88 ms |
+| 2 | 2224.06 img/s | 0.90 ms | 0.90 ms | 1.00 ms | 1.01 ms |
+| 4 | 2845.61 img/s | 1.41 ms | 1.43 ms | 1.51 ms | 1.53 ms |
+| 8 | 3793.35 img/s | 2.11 ms | 2.19 ms | 2.22 ms | 2.30 ms |
+| 16 | 4315.53 img/s | 3.71 ms | 3.80 ms | 3.86 ms | 3.98 ms |
+| 32 | 4815.26 img/s | 6.64 ms | 6.74 ms | 6.79 ms | 7.15 ms |
+| 64 | 5103.27 img/s | 12.54 ms | 12.66 ms | 12.70 ms | 13.01 ms |
+| 128 | 5393.20 img/s | 23.73 ms | 23.98 ms | 24.05 ms | 24.20 ms |
+| 256 | 5505.24 img/s | 46.50 ms | 46.82 ms | 46.92 ms | 47.17 ms |
## Release notes