Browse Source

[Convents/MX] Logging and suspend-resume fixes

Lukasz Pierscieniewski 3 năm trước cách đây
mục cha
commit
2fff359539

+ 23 - 7
MxNet/Classification/RN50v1.5/fit.py

@@ -83,6 +83,7 @@ class PartitionSignalHandler():
         return bool(self.t[0] > 0)
 
     def _signal_handler(self, signum, frame):
+        print("Signal reveived")
         self.t[0] = 1
 
 
@@ -152,6 +153,8 @@ def add_fit_args(parser):
                        help='file where to save the dllogger log from the experiment')
     train.add_argument('--workspace', type=str, default='./',
                        help='path to directory where results will be stored')
+    train.add_argument('--logdir', type=str, default=None,
+                       help="path to directory where logs will be stored")
     train.add_argument('--no-metrics', action='store_true',
                        help='do not calculate evaluation metrics (for benchmarking)')
     train.add_argument('--benchmark-iters', type=int, default=None,
@@ -199,13 +202,18 @@ def load_model(args, model):
     file = list(glob.glob(
         f"{args.workspace}/{args.model_prefix}_*.params"))
     if len(file) == 0:
-        return 0
+        return -1
+
+    file = [x for x in sorted(file) if "best.params" not in x]
+
+    if len(file) == 0:
+        return -1
 
-    file = [x for x in sorted(file) if "best.params" not in x][-1]
+    file = file[-1]
 
     epoch = re.match(f".*{args.model_prefix}_([0-9]*)\.params", file)
     if epoch is None:
-        return 0
+        return -1
 
     epoch = int(epoch.group(1))
     model.load_parameters(file)
@@ -427,6 +435,8 @@ def model_fit(args, net, train_data, eval_metric, optimizer,
 
                 durations.append(time.time() - tic)
                 tic = time.time()
+        else:
+            break
 
         durations = durations[min(len(durations) // 10, 100):]
         dllogger_epoch_data = {
@@ -453,8 +463,8 @@ def model_fit(args, net, train_data, eval_metric, optimizer,
             accuracy = score.get('accuracy', -1)
             save_checkpoint(net, epoch, accuracy, best_accuracy,
                             model_prefix, args.workspace,
-                            args.save_frequency, kvstore,
-                            force_save=should_break)
+                            args.save_frequency if args.mode == "train_val" else -1,
+                            kvstore, force_save=should_break)
             best_accuracy = max(best_accuracy, accuracy)
         global_metrics.update_dict(dllogger_epoch_data)
         dllogger.log(step=(epoch,), data=dllogger_epoch_data)
@@ -473,6 +483,11 @@ def fit(args, model, data_loader):
     # select gpu for horovod process
     if 'horovod' in args.kv_store:
         args.gpus = [args.gpus[hvd.local_rank()]]
+        ctx = mx.gpu(hvd.local_rank())
+
+        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor1, tensor2 = hvd.grouped_allreduce([tensor1,tensor2])
 
     if args.amp:
         amp.init()
@@ -516,11 +531,12 @@ def fit(args, model, data_loader):
                 tic = time.time()
         return
 
-    start_epoch = load_model(args, model)
+    start_epoch = load_model(args, model) + 1
     if start_epoch == 0:
         # all initializers should be specified in the model definition.
         # if not, this will raise an error
         model.initialize(mx.init.Initializer())
+    logging.info(f"starting epoch {start_epoch}")
 
     # devices for training
     devs = list(map(mx.gpu, args.gpus))
@@ -598,7 +614,7 @@ def fit(args, model, data_loader):
             optimizer=args.optimizer,
             optimizer_params=optimizer_params,
             lr_scheduler=lr_scheduler,
-            model_prefix=os.path.join(args.workspace, args.model_prefix),
+            model_prefix=args.model_prefix,
         )
     elif args.mode == 'val':
         for epoch in range(args.num_epochs):  # loop for benchmarking

+ 2 - 1
MxNet/Classification/RN50v1.5/log_utils.py

@@ -24,10 +24,11 @@ def format_step(step):
 def setup_logging(args):
     logging.basicConfig(level=logging.DEBUG, format='{asctime}:{levelname}: {message}', style='{')
     if hvd.rank() == 0:
+        logging_dir = args.logdir if args.logdir is not None else args.workspace
         dllogger.init(backends=[
             dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT, step_format=format_step),
             dllogger.JSONStreamBackend(
-                dllogger.Verbosity.VERBOSE, os.path.join(args.workspace, args.dllogger_log)),
+                dllogger.Verbosity.VERBOSE, os.path.join(logging_dir, args.dllogger_log)),
         ])
     else:
         dllogger.init([])