|
|
@@ -51,6 +51,10 @@ from apex.parallel import DistributedDataParallel as DDP
|
|
|
from apex import amp
|
|
|
|
|
|
|
|
|
+def synchronized_timestamp():  # Return the current wall-clock time, read only after CUDA has been synchronized.
|
|
|
+ torch.cuda.synchronize()  # wait for outstanding GPU work first, so asynchronously launched kernels are counted in the interval being timed
|
|
|
+ return time.time()  # float seconds since the epoch; callers subtract two of these values to get a duration
|
|
|
+
|
|
|
def parse_args():
|
|
|
parser = ArgumentParser(description="Train a Neural Collaborative"
|
|
|
" Filtering model")
|
|
|
@@ -218,7 +222,7 @@ def main():
|
|
|
torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
|
|
|
torch.cuda.synchronize()
|
|
|
|
|
|
- main_start_time = time.time()
|
|
|
+ main_start_time = synchronized_timestamp()
|
|
|
|
|
|
feature_spec_path = os.path.join(args.data, args.feature_spec_file)
|
|
|
feature_spec = FeatureSpec.from_yaml(feature_spec_path)
|
|
|
@@ -268,10 +272,10 @@ def main():
|
|
|
model.load_state_dict(state_dict)
|
|
|
|
|
|
if args.mode == 'test':
|
|
|
- start = time.time()
|
|
|
+ start = synchronized_timestamp()
|
|
|
hr, ndcg, val_loss = val_epoch(model, test_loader, args.topk,
|
|
|
distributed=args.distributed, world_size=args.world_size)
|
|
|
- val_time = time.time() - start
|
|
|
+ val_time = synchronized_timestamp() - start
|
|
|
eval_size = test_loader.raw_dataset_length
|
|
|
eval_throughput = eval_size / val_time
|
|
|
|
|
|
@@ -285,12 +289,12 @@ def main():
|
|
|
# to an uninitialized variable.
|
|
|
max_hr = 0
|
|
|
best_epoch = 0
|
|
|
- best_model_timestamp = time.time()
|
|
|
+ best_model_timestamp = synchronized_timestamp()
|
|
|
train_throughputs, eval_throughputs = [], []
|
|
|
|
|
|
for epoch in range(args.epochs):
|
|
|
|
|
|
- begin = time.time()
|
|
|
+ begin = synchronized_timestamp()
|
|
|
batch_dict_list = train_loader.get_epoch_data()
|
|
|
num_batches = len(batch_dict_list)
|
|
|
for i in range(num_batches // args.grads_accumulated):
|
|
|
@@ -322,8 +326,8 @@ def main():
|
|
|
p.grad = None
|
|
|
|
|
|
del batch_dict_list
|
|
|
- train_time = time.time() - begin
|
|
|
- begin = time.time()
|
|
|
+ train_time = synchronized_timestamp() - begin
|
|
|
+ begin = synchronized_timestamp()
|
|
|
|
|
|
epoch_samples = train_loader.length_after_augmentation
|
|
|
train_throughput = epoch_samples / train_time
|
|
|
@@ -332,7 +336,7 @@ def main():
|
|
|
hr, ndcg, val_loss = val_epoch(model, test_loader, args.topk,
|
|
|
distributed=args.distributed, world_size=args.world_size)
|
|
|
|
|
|
- val_time = time.time() - begin
|
|
|
+ val_time = synchronized_timestamp() - begin
|
|
|
eval_size = test_loader.raw_dataset_length
|
|
|
eval_throughput = eval_size / val_time
|
|
|
eval_throughputs.append(eval_throughput)
|
|
|
@@ -358,7 +362,7 @@ def main():
|
|
|
save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
|
|
|
print("Saving the model to: ", save_checkpoint_path)
|
|
|
torch.save(model.state_dict(), save_checkpoint_path)
|
|
|
- best_model_timestamp = time.time()
|
|
|
+ best_model_timestamp = synchronized_timestamp()
|
|
|
|
|
|
if args.threshold is not None:
|
|
|
if hr >= args.threshold:
|
|
|
@@ -372,7 +376,7 @@ def main():
|
|
|
'mean_eval_throughput': np.mean(eval_throughputs),
|
|
|
'best_accuracy': max_hr,
|
|
|
'best_epoch': best_epoch,
|
|
|
- 'time_to_target': time.time() - main_start_time,
|
|
|
+ 'time_to_target': synchronized_timestamp() - main_start_time,
|
|
|
'time_to_best_model': best_model_timestamp - main_start_time,
|
|
|
'validation_loss': float(val_loss.item()),
|
|
|
'train_loss': float(loss.item())},
|