@@ -236,18 +236,38 @@ def validate(model, criterion, valset, epoch, batch_iter, batch_size,
                                 collate_fn=collate_fn)
 
         val_loss = 0.0
+        num_iters = 0
+        val_items_per_sec = 0.0
         for i, batch in enumerate(val_loader):
-            x, y, len_x = batch_to_gpu(batch)
+            torch.cuda.synchronize()
+            iter_start_time = time.perf_counter()
+
+            x, y, num_items = batch_to_gpu(batch)
             y_pred = model(x)
             loss = criterion(y_pred, y)
             if distributed_run:
                 reduced_val_loss = reduce_tensor(loss.data, world_size).item()
+                reduced_num_items = reduce_tensor(num_items.data, 1).item()
             else:
                 reduced_val_loss = loss.item()
+                reduced_num_items = num_items.item()
             val_loss += reduced_val_loss
-        val_loss = val_loss / (i + 1)
 
-        DLLogger.log(step=(epoch, batch_iter, epoch), data={'val_iter_loss': val_loss})
+            torch.cuda.synchronize()
+            iter_stop_time = time.perf_counter()
+            iter_time = iter_stop_time - iter_start_time
+
+            items_per_sec = reduced_num_items/iter_time
+            DLLogger.log(step=(epoch, batch_iter, i), data={'val_items_per_sec': items_per_sec})
+            val_items_per_sec += items_per_sec
+            num_iters += 1
+
+        val_loss = val_loss/(i + 1)
+
+    DLLogger.log(step=(epoch,), data={'val_loss': val_loss})
+    DLLogger.log(step=(epoch,), data={'val_items_per_sec':
+                 (val_items_per_sec/num_iters if num_iters > 0 else 0.0)})
+
     return val_loss
 
 def adjust_learning_rate(iteration, epoch, optimizer, learning_rate,
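
A note on the reduce_tensor(num_items.data, 1) call added above: the helper is
defined elsewhere in this script, and the divisor argument does real work. A
minimal sketch, assuming the helper follows the usual all-reduce-then-divide
pattern (the exact body may differ):

    import torch.distributed as dist

    def reduce_tensor(tensor, num_gpus):
        # Sum the tensor across all ranks, then divide by num_gpus.
        rt = tensor.clone()
        dist.all_reduce(rt, op=dist.ReduceOp.SUM)
        return rt / num_gpus

Dividing by world_size turns the all-reduced loss into a cross-rank mean;
dividing by 1 leaves the item count as a global sum, which is what a
throughput figure needs.
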
@@ -307,8 +327,8 @@ def main():
     if distributed_run:
         init_distributed(args, world_size, local_rank, args.group_name)
 
-    run_start_time = time.time()
-    DLLogger.log(step=tuple(), data={'run_start': run_start_time})
+    torch.cuda.synchronize()
+    run_start_time = time.perf_counter()
 
     model_config = models.get_model_config(model_name, args)
     model = models.get_model(model_name, model_config,
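
The timing idiom introduced here recurs throughout the rest of the diff, so it
is worth spelling out once. CUDA kernels launch asynchronously: reading a host
clock right after queueing GPU work measures launch overhead, not execution,
and torch.cuda.synchronize() blocks until the queue has drained. Separately,
time.perf_counter() beats time.time() for intervals because it is monotonic
and higher-resolution. A standalone sketch of the pattern (assumes a CUDA
device is present, as this script does):

    import time
    import torch

    def gpu_timed(fn, *args):
        # Drain already-queued kernels so the interval covers only fn.
        torch.cuda.synchronize()
        start = time.perf_counter()
        result = fn(*args)
        torch.cuda.synchronize()  # wait for fn's kernels to finish
        return result, time.perf_counter() - start
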
@@ -350,8 +370,14 @@ def main():
         model_name, n_frames_per_step)
     trainset = data_functions.get_data_loader(
         model_name, args.dataset_path, args.training_files, args)
-    train_sampler = DistributedSampler(trainset) if distributed_run else None
-    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
+    if distributed_run:
+        train_sampler = DistributedSampler(trainset)
+        shuffle = False
+    else:
+        train_sampler = None
+        shuffle = True
+
+    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                               sampler=train_sampler,
                               batch_size=args.batch_size, pin_memory=False,
                               drop_last=True, collate_fn=collate_fn)
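
This hunk fixes behavior, not just style: the old one-liner passed
shuffle=False unconditionally, so single-GPU runs iterated the dataset in file
order every epoch. DataLoader also refuses shuffle=True combined with a custom
sampler; with a DistributedSampler, shuffling happens inside the sampler and
is reseeded by the set_epoch call that appears later in the training loop. A
minimal sketch of that interplay (stand-in dataset; num_replicas and rank are
passed explicitly so the sketch runs without init_process_group):

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    dataset = TensorDataset(torch.arange(100))
    sampler = DistributedSampler(dataset, num_replicas=1, rank=0)  # shuffles by default
    loader = DataLoader(dataset, sampler=sampler, shuffle=False, batch_size=8)

    for epoch in range(3):
        sampler.set_epoch(epoch)  # new permutation each epoch; omit it and the order repeats
        for batch in loader:
            pass  # training step goes here
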
@@ -362,21 +388,21 @@ def main():
     batch_to_gpu = data_functions.get_batch_to_gpu(model_name)
 
     iteration = 0
-    train_epoch_avg_items_per_sec = 0.0
+    train_epoch_items_per_sec = 0.0
     val_loss = 0.0
     num_iters = 0
 
     model.train()
 
     for epoch in range(start_epoch, args.epochs):
-        epoch_start_time = time.time()
-        DLLogger.log(step=(epoch,) , data={'train_epoch_start': epoch_start_time})
+        torch.cuda.synchronize()
+        epoch_start_time = time.perf_counter()
         # used to calculate avg items/sec over epoch
         reduced_num_items_epoch = 0
 
         # used to calculate avg loss over epoch
         train_epoch_avg_loss = 0.0
-        train_epoch_avg_items_per_sec = 0.0
+        train_epoch_items_per_sec = 0.0
 
         num_iters = 0
 
@@ -387,12 +413,11 @@ def main():
             train_loader.sampler.set_epoch(epoch)
 
         for i, batch in enumerate(train_loader):
-            iter_start_time = time.time()
+            torch.cuda.synchronize()
+            iter_start_time = time.perf_counter()
             DLLogger.log(step=(epoch, i),
                          data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))})
-            DLLogger.log(step=(epoch, i), data={'train_iter_start': iter_start_time})
 
-            start = time.perf_counter()
             adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
                                  args.anneal_steps, args.anneal_factor, local_rank)
 
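
Also worth noting: the removed "start = time.perf_counter()" was a leftover
timer whose value was never read. The synchronize-then-read bracket adopted
here stalls the host at every measurement; an alternative (not what this patch
does) is CUDA events, which record timestamps on the GPU stream and force only
one synchronization when the elapsed time is read:

    import torch

    start = torch.cuda.Event(enable_timing=True)
    stop = torch.cuda.Event(enable_timing=True)

    start.record()
    # ... queue the GPU work to be timed ...
    stop.record()
    torch.cuda.synchronize()               # settle once, when the number is needed
    elapsed_ms = start.elapsed_time(stop)  # milliseconds between the two events
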
@@ -411,7 +436,7 @@ def main():
             if np.isnan(reduced_loss):
                 raise Exception("loss is NaN")
 
-            DLLogger.log(step=(epoch,i), data={'train_iter_loss': reduced_loss})
+            DLLogger.log(step=(epoch,i), data={'train_loss': reduced_loss})
 
             train_epoch_avg_loss += reduced_loss
             num_iters += 1
@@ -431,25 +456,24 @@ def main():
 
             optimizer.step()
 
-            iter_stop_time = time.time()
+            torch.cuda.synchronize()
+            iter_stop_time = time.perf_counter()
             iter_time = iter_stop_time - iter_start_time
             items_per_sec = reduced_num_items/iter_time
-            train_epoch_avg_items_per_sec += items_per_sec
+            train_epoch_items_per_sec += items_per_sec
 
-            DLLogger.log(step=(epoch, i), data={'train_iter_items/sec': items_per_sec})
-            DLLogger.log(step=(epoch, i), data={'train_iter_stop': iter_stop_time})
+            DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec})
             DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
             iteration += 1
 
-
-        epoch_stop_time = time.time()
+        torch.cuda.synchronize()
+        epoch_stop_time = time.perf_counter()
         epoch_time = epoch_stop_time - epoch_start_time
 
-        DLLogger.log(step=(epoch,), data={'train_epoch_items/sec': reduced_num_items_epoch/epoch_time})
-        DLLogger.log(step=(epoch,), data={'train_epoch_avg_items/sec':
-                     (train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0)})
-        DLLogger.log(step=(epoch,), data={'train_epoch_avg_loss': (train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0)})
-        DLLogger.log(step=(epoch,), data={'epoch_time': epoch_time})
+        DLLogger.log(step=(epoch,), data={'train_items_per_sec':
+                     (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
+        DLLogger.log(step=(epoch,), data={'train_loss': (train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0)})
+        DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})
 
         val_loss = validate(model, criterion, valset, epoch, i,
                             args.batch_size, world_size, collate_fn,
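
One semantic change hides in the epoch summary above: the deleted
'train_epoch_items/sec' was total items divided by wall-clock epoch time,
while the surviving 'train_items_per_sec' is an unweighted mean of
per-iteration rates, which gives a short iteration the same vote as a long
one. The two disagree whenever iteration times vary; a small worked example:

    # Two iterations: 100 items in 1 s, then 100 items in 4 s.
    rates = [100 / 1, 100 / 4]               # per-iteration items/sec: 100.0 and 25.0
    mean_of_rates = sum(rates) / len(rates)  # 62.5 -> what the kept metric reports
    overall_rate = (100 + 100) / (1 + 4)     # 40.0 -> what the deleted metric reported
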
@@ -463,14 +487,13 @@ def main():
         if local_rank == 0:
             DLLogger.flush()
 
-
-    run_stop_time = time.time()
-    DLLogger.log(step=tuple(), data={'run_stop': run_start_time})
+    torch.cuda.synchronize()
+    run_stop_time = time.perf_counter()
     run_time = run_stop_time - run_start_time
     DLLogger.log(step=tuple(), data={'run_time': run_time})
-    DLLogger.log(step=tuple(), data={'train_items_per_sec':
-                 (train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0)})
     DLLogger.log(step=tuple(), data={'val_loss': val_loss})
+    DLLogger.log(step=tuple(), data={'train_items_per_sec':
+                 (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
 
     if local_rank == 0:
         DLLogger.flush()