| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- # BSD 3-Clause License
- # Copyright (c) 2018-2020, NVIDIA Corporation
- # All rights reserved.
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright notice, this
- # list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
- # * Neither the name of the copyright holder nor the names of its
- # contributors may be used to endorse or promote products derived from
- # this software without specific prior written permission.
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """https://github.com/NVIDIA/tacotron2"""
- import os
- from numpy import finfo
- import torch
- from tacotron2.distributed import apply_gradient_allreduce
- import torch.distributed as dist
- from torch.utils.data.distributed import DistributedSampler
- from torch.utils.data import DataLoader
- from tacotron2.model import Tacotron2
# Module-level default device: run on GPU when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def reduce_tensor(tensor, n_gpus):
    """Average ``tensor`` across all distributed workers.

    Args:
        tensor: a tensor local to this worker (typically a loss value).
        n_gpus: total number of participating workers (world size).

    Returns:
        A new tensor holding the element-wise mean over all workers.
    """
    rt = tensor.clone()
    # FIX: ``dist.reduce_op`` is deprecated and removed in modern PyTorch;
    # ``dist.ReduceOp`` is the supported spelling of the same enum.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt
def init_distributed(hparams, n_gpus, rank, group_name):
    """Join the multi-GPU process group and pin this process to one GPU.

    Args:
        hparams: must provide ``dist_backend`` and ``dist_url``.
        n_gpus: world size of the process group.
        rank: this process's rank within the group.
        group_name: name passed through to ``init_process_group``.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Bind this process to its own GPU before any CUDA work happens.
    local_device = rank % torch.cuda.device_count()
    torch.cuda.set_device(local_device)

    # Join the process group so collective ops (all_reduce etc.) work.
    dist.init_process_group(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name)

    print("Done initializing distributed")
def load_model(hparams):
    """Construct a Tacotron2 model on the default device per ``hparams``.

    Applies the fp16 attention-mask adjustment and wraps the model for
    distributed gradient all-reduce when the corresponding flags are set.
    """
    model = Tacotron2(hparams).to(device)

    if hparams.fp16_run:
        # The attention mask fill value must stay representable in
        # half precision, so use float16's most negative finite value.
        model.decoder.attention_layer.score_mask_value = finfo('float16').min

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    return model
def warm_start_model(checkpoint_path, model, ignore_layers):
    """Load weights from a checkpoint into ``model``, skipping some layers.

    Parameters named in ``ignore_layers`` keep the values ``model`` already
    has (useful when fine-tuning with, e.g., a different embedding layer).

    Returns:
        The model with the loaded weights.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))

    loaded = torch.load(checkpoint_path, map_location='cpu')
    state = loaded['state_dict']

    if len(ignore_layers) > 0:
        # Drop the ignored entries, then back-fill them from the model's
        # current weights so load_state_dict receives a complete dict.
        kept = {name: tensor for name, tensor in state.items()
                if name not in ignore_layers}
        merged = model.state_dict()
        merged.update(kept)
        state = merged

    model.load_state_dict(state)
    return model
def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model and optimizer state to resume an interrupted run.

    Returns:
        Tuple of ``(model, optimizer, learning_rate, iteration)`` where the
        last two come from the checkpoint dict.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    ckpt = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer'])
    learning_rate = ckpt['learning_rate']
    iteration = ckpt['iteration']

    print("Loaded checkpoint '{}' from iteration {}".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Serialize training state to ``filepath`` so training can resume.

    The saved dict carries the keys expected by ``load_checkpoint``:
    ``iteration``, ``state_dict``, ``optimizer`` and ``learning_rate``.
    """
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    state = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(state, filepath)
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing.

    Runs one full pass over ``valset``, averaging the (optionally
    all-reduced) loss across batches; rank 0 prints and logs the result
    together with the last batch's targets/predictions for visualization.
    """
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                # Average the loss across workers so every rank accumulates
                # the same value.
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    model.train()
    if rank == 0:
        # BUG FIX: report the batch-averaged loss (val_loss), not
        # reduced_val_loss, which only holds the LAST batch's loss.
        print("Validation loss {}: {:9f} ".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration)
|