소스 검색

[MaskRCNN/TF2] Update

Jan Golda 4 년 전
부모
커밋
5afd63dde0

+ 0 - 1
TensorFlow2/Segmentation/MaskRCNN/README.md

@@ -323,7 +323,6 @@ NVIDIA implementation of MastRCNN for TensorFlow 2.x
 Runtime:
   MODE                                    One of supported execution modes:
                                                 train - run in training mode
-                                                train_and_eval - run training followed by evaluation
                                                 eval - run evaluation on eval data split
                                                 infer - run inference on eval data split
   --data_dir DIR                          Input directory containing the dataset (default: /data)

+ 1 - 1
TensorFlow2/Segmentation/MaskRCNN/main.py

@@ -57,7 +57,7 @@ def main():
     # setup dataset
     dataset = Dataset(params)
 
-    if params.mode in ['train', 'train_and_eval']:
+    if params.mode == 'train':
         run_training(dataset, params)
     if params.mode == 'eval':
         run_evaluation(dataset, params)

+ 1 - 2
TensorFlow2/Segmentation/MaskRCNN/mrcnn_tf2/arguments.py

@@ -45,12 +45,11 @@ RUNTIME_GROUP.add_argument(
     help=(
         'One of supported execution modes:'
         '\n\ttrain - run in training mode'
-        '\n\ttrain_and_eval - run training followed by evaluation'
         '\n\teval - run evaluation on eval data split'
         '\n\tinfer - run inference on eval data split'
     ),
     choices=[
-        'train', 'train_and_eval', 'eval', 'infer'
+        'train', 'eval', 'infer'
     ]
 )
 

+ 0 - 19
TensorFlow2/Segmentation/MaskRCNN/mrcnn_tf2/runtime/run.py

@@ -59,25 +59,6 @@ def run_training(dataset, params):
         verbose=0
     )
 
-    if 'eval' not in params.mode:
-        return
-
-    predictions = mask_rcnn_model.predict(
-        x=dataset.eval_fn(params.eval_batch_size * params.replicas),
-        callbacks=list(create_callbacks(params))
-    )
-
-    eval_results = evaluate(
-        predictions=predictions,
-        eval_file=params.eval_file,
-        include_mask=params.include_mask
-    )
-
-    dllogger.log(
-        step=tuple(),
-        data={k: float(v) for k, v in eval_results.items()}
-    )
-
 
 def run_evaluation(dataset, params):
     setup(params)

+ 1 - 1
TensorFlow2/Segmentation/MaskRCNN/scripts/benchmark_inference.py

@@ -74,4 +74,4 @@ if __name__ == '__main__':
     print(line, cmd, line, sep='\n', flush=True)
 
     # run model
-    subprocess.call(cmd, shell=True)
+    exit(subprocess.call(cmd, shell=True))

+ 16 - 1
TensorFlow2/Segmentation/MaskRCNN/scripts/benchmark_training.py

@@ -17,6 +17,9 @@ import argparse
 import os
 import shutil
 import subprocess
+from pathlib import Path
+
+LOCK_FILE = Path('/tmp/mrcnn_tf2.lock')
 
 
 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
@@ -45,6 +48,8 @@ if __name__ == '__main__':
                         help='Input directory containing the dataset')
     parser.add_argument('--weights_dir', type=str, metavar='DIR', default='/weights',
                         help='Directory containing pre-trained resnet weights')
+    parser.add_argument('--slurm_lock', action='store_true',
+                        help='Prevent this script from being launched multiple times when used in multi-gpu slurm setup')
 
     flags, remainder = parser.parse_known_args()
 
@@ -76,5 +81,15 @@ if __name__ == '__main__':
     line = '-' * shutil.get_terminal_size()[0]
     print(line, cmd, line, sep='\n', flush=True)
 
+    # acquire lock if --slurm_lock is provided
+    try:
+        flags.slurm_lock and LOCK_FILE.touch(exist_ok=False)
+    except FileExistsError:
+        print(f'Failed to acquire lock ({LOCK_FILE}) - skipping')
+        exit(0)
+
     # run model
-    subprocess.call(cmd, shell=True)
+    code = subprocess.call(cmd, shell=True)
+
+    flags.slurm_lock and LOCK_FILE.unlink()
+    exit(code)

+ 1 - 1
TensorFlow2/Segmentation/MaskRCNN/scripts/evaluate.py

@@ -71,4 +71,4 @@ if __name__ == '__main__':
     print(line, cmd, line, sep='\n', flush=True)
 
     # run model
-    subprocess.call(cmd, shell=True)
+    exit(subprocess.call(cmd, shell=True))

+ 1 - 1
TensorFlow2/Segmentation/MaskRCNN/scripts/inference.py

@@ -70,4 +70,4 @@ if __name__ == '__main__':
     print(line, cmd, line, sep='\n', flush=True)
 
     # run model
-    subprocess.call(cmd, shell=True)
+    exit(subprocess.call(cmd, shell=True))

+ 39 - 11
TensorFlow2/Segmentation/MaskRCNN/scripts/train.py

@@ -17,6 +17,9 @@ import argparse
 import os
 import shutil
 import subprocess
+from pathlib import Path
+
+LOCK_FILE = Path('/tmp/mrcnn_tf2.lock')
 
 
 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
@@ -45,6 +48,8 @@ if __name__ == '__main__':
                         help='Input directory containing the dataset')
     parser.add_argument('--weights_dir', type=str, metavar='DIR', default='/weights',
                         help='Directory containing pre-trained resnet weights')
+    parser.add_argument('--slurm_lock', action='store_true',
+                        help='Prevent this script from being launched multiple times when used in multi-gpu slurm setup')
     parser.add_argument('--no_eval', action='store_true', help='Disables evaluation after training.')
 
     flags, remainder = parser.parse_known_args()
@@ -52,28 +57,51 @@ if __name__ == '__main__':
     main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../main.py'))
     checkpoint_path = os.path.join(flags.weights_dir, "rn50_tf_amp_ckpt_v20.06.0/nvidia_rn50_tf_amp")
 
-    # build command
-    cmd = (
+    # build commands
+    cmd_train = (
         f'python {main_path}'
-        f' train_and_eval'
+        f' train'
         f' --data_dir "{flags.data_dir}"'
-        f' --eval_file "{os.path.join(flags.data_dir, "annotations/instances_val2017.json")}"'
         f' --backbone_checkpoint "{checkpoint_path}"'
         f' --train_batch_size {flags.batch_size}'
     )
+    cmd_eval = (
+        f'python {main_path}'
+        f' eval'
+        f' --data_dir "{flags.data_dir}"'
+        f' --eval_file "{os.path.join(flags.data_dir, "annotations/instances_val2017.json")}"'
+    )
 
     if not flags.no_xla:
-        cmd += ' --xla'
+        cmd_train += ' --xla'
+        cmd_eval += ' --xla'
     if flags.amp:
-        cmd += ' --amp'
+        cmd_train += ' --amp'
+        cmd_eval += ' --amp'
     if remainder:
-        cmd += ' ' + ' '.join(remainder)
+        cmd_train += ' ' + ' '.join(remainder)
+        cmd_eval += ' ' + ' '.join(remainder)
     if flags.gpus is not None:
-        cmd = f'CUDA_VISIBLE_DEVICES={",".join(map(str, range(flags.gpus)))} ' + cmd
+        cmd_train = f'CUDA_VISIBLE_DEVICES={",".join(map(str, range(flags.gpus)))} ' + cmd_train
 
     # print command
     line = '-' * shutil.get_terminal_size()[0]
-    print(line, cmd, line, sep='\n', flush=True)
+    print(line, cmd_train, line, sep='\n', flush=True)
+
+    # acquire lock if --slurm_lock is provided
+    try:
+        flags.slurm_lock and LOCK_FILE.touch(exist_ok=False)
+    except FileExistsError:
+        print(f'Failed to acquire lock ({LOCK_FILE}) - skipping')
+        exit(0)
+
+    # run training
+    code = subprocess.call(cmd_train, shell=True)
+
+    # evaluation
+    if not code and not flags.no_eval:
+        print(line, cmd_eval, line, sep='\n', flush=True)
+        code = subprocess.call(cmd_eval, shell=True)
 
-    # run model
-    subprocess.call(cmd, shell=True)
+    flags.slurm_lock and LOCK_FILE.unlink()
+    exit(code)