Răsfoiți Sursa

[RN50/Paddle] Fix 2308 compatibility issue

Shijie Wang 2 ani în urmă
părinte
comite
4fca54fb89

+ 1 - 1
PaddlePaddle/Classification/RN50v1.5/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.09-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt /workspace/

+ 2 - 2
PaddlePaddle/Classification/RN50v1.5/README.md

@@ -233,7 +233,7 @@ The following section lists the requirements you need to meet to start training
 This repository contains a Dockerfile that extends the CUDA NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PaddlePaddle 22.05-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
+* [PaddlePaddle 23.09-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
 * Supported GPUs:
     * [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
@@ -289,7 +289,7 @@ docker build . -t nvidia_resnet50
 
 ### 4. Start an interactive session in the NGC container to run training/inference.
 ```bash
-nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_resnet50
+nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host -e FLAGS_apply_pass_to_program=1 nvidia_resnet50
 ```
 
 ### 5. Start training

+ 83 - 66
PaddlePaddle/Classification/RN50v1.5/program.py

@@ -12,26 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
 import logging
-
+import time
 from profile import Profiler
+
+import dllogger
+import models
 import numpy as np
-from optimizer import build_optimizer
 from lr_scheduler import build_lr_scheduler
+from optimizer import build_optimizer
 from utils.misc import AverageMeter
 from utils.mode import Mode, RunScope
 from utils.utility import get_num_trainers
-import models
-
-import dllogger
 
 import paddle
 import paddle.nn.functional as F
 from paddle.distributed import fleet
 from paddle.distributed.fleet import DistributedStrategy
-from paddle.static import sparsity
 from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
+from paddle.incubate import asp as sparsity
 
 
 def create_feeds(image_shape):
@@ -45,11 +44,13 @@ def create_feeds(image_shape):
                      key (string): Name of variable to feed.
                      Value (tuple): paddle.static.data.
     """
-    feeds = dict()
+    feeds = {}
     feeds['data'] = paddle.static.data(
-        name="data", shape=[None] + image_shape, dtype="float32")
+        name="data", shape=[None] + image_shape, dtype="float32"
+    )
     feeds['label'] = paddle.static.data(
-        name="label", shape=[None, 1], dtype="int64")
+        name="label", shape=[None, 1], dtype="int64"
+    )
 
     return feeds
 
@@ -70,7 +71,7 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
                       key (string): Name of variable to fetch.
                       Value (tuple): (variable, AverageMeter).
     """
-    fetchs = dict()
+    fetchs = {}
     target = paddle.reshape(feeds['label'], [-1, 1])
 
     if mode == Mode.TRAIN:
@@ -78,8 +79,7 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
             loss = F.cross_entropy(out, target)
         else:
             label_one_hot = F.one_hot(target, class_num)
-            soft_target = F.label_smooth(
-                label_one_hot, epsilon=label_smoothing)
+            soft_target = F.label_smooth(label_one_hot, epsilon=label_smoothing)
             soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
             log_softmax = -F.log_softmax(out, axis=-1)
             loss = paddle.sum(log_softmax * soft_target, axis=-1)
@@ -94,19 +94,23 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
 
     acc_top1 = paddle.metric.accuracy(input=out, label=target, k=1)
     acc_top5 = paddle.metric.accuracy(input=out, label=target, k=5)
-    metric_dict = dict()
+    metric_dict = {}
     metric_dict["top1"] = acc_top1
     metric_dict["top5"] = acc_top5
 
     for key in metric_dict:
         if mode != Mode.TRAIN and paddle.distributed.get_world_size() > 1:
             paddle.distributed.all_reduce(
-                metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
-            metric_dict[key] = metric_dict[
-                key] / paddle.distributed.get_world_size()
+                metric_dict[key], op=paddle.distributed.ReduceOp.SUM
+            )
+            metric_dict[key] = (
+                metric_dict[key] / paddle.distributed.get_world_size()
+            )
 
-        fetchs[key] = (metric_dict[key], AverageMeter(
-            key, '7.4f', need_avg=True))
+        fetchs[key] = (
+            metric_dict[key],
+            AverageMeter(key, '7.4f', need_avg=True),
+        )
 
     return fetchs
 
@@ -127,13 +131,16 @@ def create_strategy(args, is_train=True):
     exec_strategy = paddle.static.ExecutionStrategy()
 
     exec_strategy.num_threads = 1
-    exec_strategy.num_iteration_per_drop_scope = (10000 if args.amp and
-                                                  args.use_pure_fp16 else 10)
-
-    paddle.set_flags({
-        'FLAGS_cudnn_exhaustive_search': True,
-        'FLAGS_conv_workspace_size_limit': 4096
-    })
+    exec_strategy.num_iteration_per_drop_scope = (
+        10000 if args.amp and args.use_pure_fp16 else 10
+    )
+
+    paddle.set_flags(
+        {
+            'FLAGS_cudnn_exhaustive_search': True,
+            'FLAGS_conv_workspace_size_limit': 4096,
+        }
+    )
 
     if not is_train:
         build_strategy.fix_op_run_order = True
@@ -177,7 +184,7 @@ def dist_optimizer(args, optimizer):
         dist_strategy.amp_configs = {
             "init_loss_scaling": args.scale_loss,
             "use_dynamic_loss_scaling": args.use_dynamic_loss_scaling,
-            "use_pure_fp16": args.use_pure_fp16
+            "use_pure_fp16": args.use_pure_fp16,
         }
 
     dist_strategy.asp = args.asp
@@ -223,14 +230,16 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
                 input_image_channel=input_image_channel,
                 data_format=data_format,
                 use_pure_fp16=use_pure_fp16,
-                bn_weight_decay=bn_weight_decay)
+                bn_weight_decay=bn_weight_decay,
+            )
             out = model(feeds["data"])
 
             fetchs = create_fetchs(
-                out, feeds, class_num, args.label_smoothing, mode=mode)
+                out, feeds, class_num, args.label_smoothing, mode=mode
+            )
 
             if args.asp:
-                sparsity.set_excluded_layers(main_prog, [model.fc.weight.name])
+                sparsity.set_excluded_layers(main_program=main_prog, param_names=[model.fc.weight.name])
 
             lr_scheduler = None
             optimizer = None
@@ -244,10 +253,13 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
     # This is a workaround to "Communicator of ring id 0 has not been initialized.".
     # Since Paddle's design, the initialization would be done inside train program,
     # eval_only need to manually call initialization.
-    if args.run_scope == RunScope.EVAL_ONLY and \
-       paddle.distributed.get_world_size() > 1:
+    if (
+        args.run_scope == RunScope.EVAL_ONLY
+        and paddle.distributed.get_world_size() > 1
+    ):
         collective_helper = CollectiveHelper(
-            role_maker=fleet.PaddleCloudRoleMaker(is_collective=True))
+            role_maker=fleet.PaddleCloudRoleMaker(is_collective=True)
+        )
         collective_helper.update_startup_program(startup_prog)
 
     return fetchs, lr_scheduler, feeds, optimizer
@@ -270,22 +282,22 @@ def compile_prog(args, program, loss_name=None, is_train=True):
     build_strategy, exec_strategy = create_strategy(args, is_train)
 
     compiled_program = paddle.static.CompiledProgram(
-        program).with_data_parallel(
-            loss_name=loss_name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
+        program, build_strategy=build_strategy
+    )
 
     return compiled_program
 
 
-def run(args,
-        dataloader,
-        exe,
-        program,
-        fetchs,
-        epoch,
-        mode=Mode.TRAIN,
-        lr_scheduler=None):
+def run(
+    args,
+    dataloader,
+    exe,
+    program,
+    fetchs,
+    epoch,
+    mode=Mode.TRAIN,
+    lr_scheduler=None,
+):
     """
     Execute program.
 
@@ -312,11 +324,11 @@ def run(args,
         if fetchs[k][1] is not None:
             metric_dict[k] = fetchs[k][1]
 
-    metric_dict["batch_time"] = AverageMeter(
-        'batch_time', '.5f', postfix=" s,")
+    metric_dict["batch_time"] = AverageMeter('batch_time', '.5f', postfix=" s,")
     metric_dict["data_time"] = AverageMeter('data_time', '.5f', postfix=" s,")
     metric_dict["compute_time"] = AverageMeter(
-        'compute_time', '.5f', postfix=" s,")
+        'compute_time', '.5f', postfix=" s,"
+    )
 
     for m in metric_dict.values():
         m.reset()
@@ -328,8 +340,7 @@ def run(args,
     batch_size = None
     latency = []
 
-    total_benchmark_steps = \
-        args.benchmark_steps + args.benchmark_warmup_steps
+    total_benchmark_steps = args.benchmark_steps + args.benchmark_warmup_steps
 
     dataloader.reset()
     while True:
@@ -361,11 +372,12 @@ def run(args,
         batch_size = batch[0]["data"].shape()[0]
         feed_dict = batch[0]
 
-        with profiler.profile_tag(idx, "Training"
-                                  if mode == Mode.TRAIN else "Evaluation"):
-            results = exe.run(program=program,
-                              feed=feed_dict,
-                              fetch_list=fetch_list)
+        with profiler.profile_tag(
+            idx, "Training" if mode == Mode.TRAIN else "Evaluation"
+        ):
+            results = exe.run(
+                program=program, feed=feed_dict, fetch_list=fetch_list
+            )
 
         for name, m in zip(fetchs.keys(), results):
             if name in metric_dict:
@@ -382,15 +394,16 @@ def run(args,
         tic = time.perf_counter()
 
         if idx % args.print_interval == 0:
-            log_msg = dict()
+            log_msg = {}
             log_msg['loss'] = metric_dict['loss'].val.item()
             log_msg['top1'] = metric_dict['top1'].val.item()
             log_msg['top5'] = metric_dict['top5'].val.item()
             log_msg['data_time'] = metric_dict['data_time'].val
             log_msg['compute_time'] = metric_dict['compute_time'].val
             log_msg['batch_time'] = metric_dict['batch_time'].val
-            log_msg['ips'] = \
+            log_msg['ips'] = (
                 batch_size * num_trainers / metric_dict['batch_time'].val
+            )
             if mode == Mode.TRAIN:
                 log_msg['lr'] = metric_dict['lr'].val
             log_info((epoch, idx), log_msg, mode)
@@ -406,10 +419,10 @@ def run(args,
                 logging.info("Begin benchmark at step %d", idx + 1)
 
             if idx == total_benchmark_steps:
-                benchmark_data = dict()
-                benchmark_data[
-                    'ips'] = batch_size * num_trainers / metric_dict[
-                        'batch_time'].avg
+                benchmark_data = {}
+                benchmark_data['ips'] = (
+                    batch_size * num_trainers / metric_dict['batch_time'].avg
+                )
                if mode == Mode.EVAL:
                     latency = np.array(latency) * 1000
                     quantile = np.quantile(latency, [0.9, 0.95, 0.99])
@@ -422,15 +435,19 @@ def run(args,
                 logging.info("End benchmark at epoch step %d", idx)
                 return benchmark_data
 
-    epoch_data = dict()
+    epoch_data = {}
     epoch_data['loss'] = metric_dict['loss'].avg.item()
     epoch_data['epoch_time'] = metric_dict['batch_time'].total
-    epoch_data['ips'] = batch_size * num_trainers * \
-            metric_dict["batch_time"].count / metric_dict["batch_time"].sum
+    epoch_data['ips'] = (
+        batch_size
+        * num_trainers
+        * metric_dict["batch_time"].count
+        / metric_dict["batch_time"].sum
+    )
     if mode == Mode.EVAL:
         epoch_data['top1'] = metric_dict['top1'].avg.item()
         epoch_data['top5'] = metric_dict['top5'].avg.item()
-    log_info((epoch, ), epoch_data, mode)
+    log_info((epoch,), epoch_data, mode)
 
     return epoch_data
 
@@ -445,7 +462,7 @@ def log_info(step, metrics, mode):
         mode(utils.Mode): Train or eval mode.
     """
     prefix = 'train' if mode == Mode.TRAIN else 'val'
-    dllogger_iter_data = dict()
+    dllogger_iter_data = {}
     for key in metrics:
         dllogger_iter_data[f"{prefix}.{key}"] = metrics[key]
     dllogger.log(step=step, data=dllogger_iter_data)

+ 58 - 29
PaddlePaddle/Classification/RN50v1.5/train.py

@@ -12,20 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import logging
-import paddle
-from paddle.distributed import fleet
-from paddle.static import sparsity
-from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program
-from paddle.fluid.contrib.mixed_precision.fp16_lists import AutoMixedPrecisionLists
+import os
+
 from dali import build_dataloader
+from utils.affinity import set_cpu_affinity
 from utils.config import parse_args, print_args
 from utils.logger import setup_dllogger
-from utils.save_load import init_program, save_model
-from utils.affinity import set_cpu_affinity
 from utils.mode import Mode, RunScope
+from utils.save_load import init_program, save_model
+
+import paddle
 import program
+from paddle.distributed import fleet
+from paddle.static.amp.fp16_lists import AutoMixedPrecisionLists
+from paddle.static.amp.fp16_utils import cast_model_to_fp16
+from paddle.incubate import asp as sparsity
 
 
 class MetricSummary:
@@ -35,18 +37,24 @@ class MetricSummary:
 
     def update(self, new_metrics):
         if not self.is_updated:
-            self.metric_dict = dict()
+            self.metric_dict = {}
 
         for key in new_metrics:
             if key in self.metric_dict:
                 # top1, top5 and ips are "larger is better"
                 if key in ['top1', 'top5', 'ips']:
-                    self.metric_dict[key] = new_metrics[key] if new_metrics[
-                        key] > self.metric_dict[key] else self.metric_dict[key]
+                    self.metric_dict[key] = (
+                        new_metrics[key]
+                        if new_metrics[key] > self.metric_dict[key]
+                        else self.metric_dict[key]
+                    )
                 # Others are "Smaller is better"
                 else:
-                    self.metric_dict[key] = new_metrics[key] if new_metrics[
-                        key] < self.metric_dict[key] else self.metric_dict[key]
+                    self.metric_dict[key] = (
+                        new_metrics[key]
+                        if new_metrics[key] < self.metric_dict[key]
+                        else self.metric_dict[key]
+                    )
             else:
                 self.metric_dict[key] = new_metrics[key]
 
@@ -89,7 +97,8 @@ def main(args):
             train_prog,
             startup_prog,
             step_each_epoch=train_step_each_epoch,
-            is_train=True)
+            is_train=True,
+        )
 
     eval_dataloader = None
     eval_prog = None
@@ -103,7 +112,8 @@ def main(args):
             eval_prog,
             startup_prog,
             step_each_epoch=eval_step_each_epoch,
-            is_train=False)
+            is_train=False,
+        )
         # clone to prune some content which is irrelevant in eval_prog
         eval_prog = eval_prog.clone(for_test=True)
 
@@ -113,17 +123,24 @@ def main(args):
     init_program(
         args,
         exe=exe,
-        program=train_prog if train_prog is not None else eval_prog)
+        program=train_prog if train_prog is not None else eval_prog,
+    )
 
     if args.amp:
         if args.run_scope == RunScope.EVAL_ONLY:
-            rewrite_program(eval_prog, amp_lists=AutoMixedPrecisionLists())
+            cast_model_to_fp16(
+                eval_prog,
+                AutoMixedPrecisionLists(),
+                use_fp16_guard=False,
+                level='O1',
+            )
         else:
             optimizer.amp_init(
                 device,
                 scope=paddle.static.global_scope(),
                 test_program=eval_prog,
-                use_fp16_test=True)
+                use_fp16_test=True,
+            )
 
     if args.asp and args.prune_model:
         logging.info("Pruning model to 2:4 sparse pattern...")
@@ -138,28 +155,40 @@ def main(args):
     for epoch_id in range(args.start_epoch, args.epochs):
         # Training
         if train_prog is not None:
-            metric_summary = program.run(args, train_dataloader, exe,
-                                         train_prog, train_fetchs, epoch_id,
-                                         Mode.TRAIN, lr_scheduler)
+            metric_summary = program.run(
+                args,
+                train_dataloader,
+                exe,
+                train_prog,
+                train_fetchs,
+                epoch_id,
+                Mode.TRAIN,
+                lr_scheduler,
+            )
             train_summary.update(metric_summary)
 
             # Save a checkpoint
             if epoch_id % args.save_interval == 0:
-                model_path = os.path.join(args.output_dir,
-                                          args.model_arch_name)
+                model_path = os.path.join(args.output_dir, args.model_arch_name)
                 save_model(train_prog, model_path, epoch_id, args.model_prefix)
 
         # Evaluation
-        if (eval_prog is not None) and \
-            (epoch_id % args.eval_interval == 0):
-            metric_summary = program.run(args, eval_dataloader, exe, eval_prog,
-                                         eval_fetchs, epoch_id, Mode.EVAL)
+        if (eval_prog is not None) and (epoch_id % args.eval_interval == 0):
+            metric_summary = program.run(
+                args,
+                eval_dataloader,
+                exe,
+                eval_prog,
+                eval_fetchs,
+                epoch_id,
+                Mode.EVAL,
+            )
             eval_summary.update(metric_summary)
 
     if train_summary.is_updated:
-        program.log_info(tuple(), train_summary.metric_dict, Mode.TRAIN)
+        program.log_info((), train_summary.metric_dict, Mode.TRAIN)
     if eval_summary.is_updated:
-        program.log_info(tuple(), eval_summary.metric_dict, Mode.EVAL)
+        program.log_info((), eval_summary.metric_dict, Mode.EVAL)
 
 
 if __name__ == '__main__':