|
|
@@ -17,6 +17,9 @@ import argparse
|
|
|
import os
|
|
|
import shutil
|
|
|
import subprocess
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+LOCK_FILE = Path('/tmp/mrcnn_tf2.lock')
|
|
|
|
|
|
|
|
|
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
|
|
|
@@ -45,6 +48,8 @@ if __name__ == '__main__':
|
|
|
help='Input directory containing the dataset')
|
|
|
parser.add_argument('--weights_dir', type=str, metavar='DIR', default='/weights',
|
|
|
help='Directory containing pre-trained resnet weights')
|
|
|
+ parser.add_argument('--slurm_lock', action='store_true',
|
|
|
+ help='Prevent this script from being launched multiple times when used in multi-gpu slurm setup')
|
|
|
parser.add_argument('--no_eval', action='store_true', help='Disables evaluation after training.')
|
|
|
|
|
|
flags, remainder = parser.parse_known_args()
|
|
|
@@ -52,28 +57,51 @@ if __name__ == '__main__':
|
|
|
main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../main.py'))
|
|
|
checkpoint_path = os.path.join(flags.weights_dir, "rn50_tf_amp_ckpt_v20.06.0/nvidia_rn50_tf_amp")
|
|
|
|
|
|
- # build command
|
|
|
- cmd = (
|
|
|
+ # build commands
|
|
|
+ cmd_train = (
|
|
|
f'python {main_path}'
|
|
|
- f' train_and_eval'
|
|
|
+ f' train'
|
|
|
f' --data_dir "{flags.data_dir}"'
|
|
|
- f' --eval_file "{os.path.join(flags.data_dir, "annotations/instances_val2017.json")}"'
|
|
|
f' --backbone_checkpoint "{checkpoint_path}"'
|
|
|
f' --train_batch_size {flags.batch_size}'
|
|
|
)
|
|
|
+ cmd_eval = (
|
|
|
+ f'python {main_path}'
|
|
|
+ f' eval'
|
|
|
+ f' --data_dir "{flags.data_dir}"'
|
|
|
+ f' --eval_file "{os.path.join(flags.data_dir, "annotations/instances_val2017.json")}"'
|
|
|
+ )
|
|
|
|
|
|
if not flags.no_xla:
|
|
|
- cmd += ' --xla'
|
|
|
+ cmd_train += ' --xla'
|
|
|
+ cmd_eval += ' --xla'
|
|
|
if flags.amp:
|
|
|
- cmd += ' --amp'
|
|
|
+ cmd_train += ' --amp'
|
|
|
+ cmd_eval += ' --amp'
|
|
|
if remainder:
|
|
|
- cmd += ' ' + ' '.join(remainder)
|
|
|
+ cmd_train += ' ' + ' '.join(remainder)
|
|
|
+ cmd_eval += ' ' + ' '.join(remainder)
|
|
|
if flags.gpus is not None:
|
|
|
- cmd = f'CUDA_VISIBLE_DEVICES={",".join(map(str, range(flags.gpus)))} ' + cmd
|
|
|
+ cmd_train = f'CUDA_VISIBLE_DEVICES={",".join(map(str, range(flags.gpus)))} ' + cmd_train
|
|
|
|
|
|
# print command
|
|
|
line = '-' * shutil.get_terminal_size()[0]
|
|
|
- print(line, cmd, line, sep='\n', flush=True)
|
|
|
+ print(line, cmd_train, line, sep='\n', flush=True)
|
|
|
+
|
|
|
+ # acquire lock if --slurm_lock is provided
|
|
|
+ try:
|
|
|
+ flags.slurm_lock and LOCK_FILE.touch(exist_ok=False)
|
|
|
+ except FileExistsError:
|
|
|
+ print(f'Failed to acquire lock ({LOCK_FILE}) - skipping')
|
|
|
+ exit(0)
|
|
|
+
|
|
|
+ # run training
|
|
|
+ code = subprocess.call(cmd_train, shell=True)
|
|
|
+
|
|
|
+ # evaluation
|
|
|
+ if not code and not flags.no_eval:
|
|
|
+ print(line, cmd_eval, line, sep='\n', flush=True)
|
|
|
+ code = subprocess.call(cmd_eval, shell=True)
|
|
|
|
|
|
- # run model
|
|
|
- subprocess.call(cmd, shell=True)
|
|
|
+ flags.slurm_lock and LOCK_FILE.unlink()
|
|
|
+ exit(code)
|