Browse source

[EffDet/PyT] Invoking CUDA synchronize() before Timing

Ao Tang, 3 years ago
parent
commit
f613b7c0a8

+ 4 - 0
PyTorch/Detection/Efficientdet/train.py

@@ -521,12 +521,14 @@ def train_epoch(
 
     model.train()
 
+    torch.cuda.synchronize()
     end = time.time()
     last_idx = steps_per_epoch - 1
     num_updates = epoch * steps_per_epoch
     for batch_idx in range(steps_per_epoch):
         input, target = next(loader_iter)
         last_batch = batch_idx == last_idx
+        torch.cuda.synchronize()
         data_time_m.update(time.time() - end)
 
         with torch.cuda.amp.autocast(enabled=use_amp):
@@ -575,6 +577,7 @@ def train_epoch(
         if lr_scheduler is not None:
             lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg)
 
+        torch.cuda.synchronize()
         end = time.time()
         if args.benchmark:
             if batch_idx >= args.benchmark_steps:
@@ -597,6 +600,7 @@ def validate(model, loader, args, evaluator=None, epoch=0, log_suffix=''):
 
     model.eval()
 
+    torch.cuda.synchronize()
     end = time.time()
     last_idx = len(loader) - 1
     with torch.no_grad():

+ 4 - 0
PyTorch/Detection/Efficientdet/validate.py

@@ -208,12 +208,14 @@ def validate(args):
     bench.eval()
     batch_time = AverageMeter()
     throughput = AverageMeter()
+    torch.cuda.synchronize()
     end = time.time()
     total_time_start = time.time()
     with torch.no_grad():
         for i, (input, target) in enumerate(loader):
             with torch.cuda.amp.autocast(enabled=args.amp):
                 output = bench(input, target['img_scale'], target['img_size'])
+            torch.cuda.synchronize()
             batch_time.update(time.time() - end)
             throughput.update(input.size(0) / batch_time.val)
             evaluator.add_predictions(output, target)
@@ -235,6 +237,7 @@ def validate(args):
                 )
             end = time.time()
 
+    torch.cuda.synchronize()
     dllogger_metric['total_inference_time'] = time.time() - total_time_start
     dllogger_metric['inference_throughput'] = throughput.avg
     dllogger_metric['inference_time'] = 1000 / throughput.avg
@@ -245,6 +248,7 @@ def validate(args):
             mean_ap = evaluator.evaluate()
         else:
             evaluator.save_predictions(args.results)
+        torch.cuda.synchronize()
         dllogger_metric['map'] = mean_ap
         dllogger_metric['total_eval_time'] = time.time() - total_time_start
     else: