|
@@ -0,0 +1,203 @@
|
|
|
|
|
+# *****************************************************************************
|
|
|
|
|
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
|
+#
|
|
|
|
|
+# Redistribution and use in source and binary forms, with or without
|
|
|
|
|
+# modification, are permitted provided that the following conditions are met:
|
|
|
|
|
+# * Redistributions of source code must retain the above copyright
|
|
|
|
|
+# notice, this list of conditions and the following disclaimer.
|
|
|
|
|
+# * Redistributions in binary form must reproduce the above copyright
|
|
|
|
|
+# notice, this list of conditions and the following disclaimer in the
|
|
|
|
|
+# documentation and/or other materials provided with the distribution.
|
|
|
|
|
+# * Neither the name of the NVIDIA CORPORATION nor the
|
|
|
|
|
+# names of its contributors may be used to endorse or promote products
|
|
|
|
|
+# derived from this software without specific prior written permission.
|
|
|
|
|
+#
|
|
|
|
|
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
|
|
|
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
|
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
|
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
|
|
|
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
|
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
|
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
|
|
|
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
|
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
|
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
+#
|
|
|
|
|
+# *****************************************************************************
|
|
|
|
|
+
|
|
|
|
|
+import urllib.request
|
|
|
|
|
+import torch
|
|
|
|
|
+import os
|
|
|
|
|
+import sys
|
|
|
|
|
+
|
|
|
|
|
# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def checkpoint_from_distributed(state_dict):
    """
    Check whether a checkpoint was generated by DistributedDataParallel.

    DDP wraps the model in an additional "module." prefix; it needs to be
    unwrapped for single-GPU inference.

    :param state_dict: model's state dict
    :return: True if any key contains the DDP "module." marker, else False
    """
    # any() short-circuits on the first DDP-prefixed key, matching the
    # original loop-and-break; '.find(...) != -1' is the substring test.
    return any('module.' in key for key in state_dict)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def unwrap_distributed(state_dict):
    """
    Unwrap a model state dict from DistributedDataParallel.

    DDP wraps the model in an additional "module." prefix; it has to be
    removed for single-GPU inference.

    :param state_dict: model's state dict
    :return: a new dict with the DDP prefixes stripped from every key
    """
    # Strip the numbered wrapper first ('module.1.foo' -> 'foo'), then any
    # remaining plain 'module.' occurrences; values are kept untouched.
    return {
        key.replace('module.1.', '').replace('module.', ''): value
        for key, value in state_dict.items()
    }
|
|
|
|
|
+
|
|
|
|
|
def _download_checkpoint(checkpoint, force_reload):
    """
    Download a checkpoint into the torch hub cache directory if needed.

    :param checkpoint: URL of the checkpoint file
    :param force_reload: if True, re-download even when a cached copy exists
    :return: local filesystem path of the cached checkpoint file
    """
    model_dir = os.path.join(torch.hub._get_torch_home(), 'checkpoints')
    # exist_ok avoids the race where another process creates the directory
    # between an existence check and makedirs (TOCTOU).
    os.makedirs(model_dir, exist_ok=True)
    ckpt_file = os.path.join(model_dir, os.path.basename(checkpoint))
    if not os.path.exists(ckpt_file) or force_reload:
        sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
        urllib.request.urlretrieve(checkpoint, ckpt_file)
    return ckpt_file
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def nvidia_fastpitch(pretrained=True, **kwargs):
    """
    Construct a FastPitch text-to-mel model, optionally with pretrained
    weights downloaded from NGC.

    :param pretrained: if True, download and load the NGC checkpoint (its
        stored config is used); if False, build the model from the default
        hyper-parameters below
    :param kwargs: optional settings —
        force_reload (bool): re-download the checkpoint even if cached;
        model_math (str): "fp16" converts the model to half precision;
        any default-config key: overrides that hyper-parameter (only
        honored when pretrained is False)
    :return: (model, train_setup) tuple; train_setup is the checkpoint's
        training metadata dict, or {} when unavailable
    """
    from fastpitch import model as fastpitch

    force_reload = bool(kwargs.get("force_reload", False))
    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        checkpoint = 'https://api.ngc.nvidia.com/v2/models/nvidia/dle/fastpitch__pyt_ckpt/versions/21.12.1_amp/files/nvidia_fastpitch_210824+cfg.pt'
        ckpt_file = _download_checkpoint(checkpoint, force_reload)
        # map_location='cpu' lets a GPU-saved checkpoint load on CPU-only
        # hosts; load_state_dict keeps parameters on the model's device.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
        train_setup = ckpt.get('train_setup', {})
    else:
        # Default FastPitch hyper-parameters (matching the published model).
        config = {'n_mel_channels': 80, 'n_symbols': 148, 'padding_idx': 0, 'symbols_embedding_dim': 384,
                  'in_fft_n_layers': 6, 'in_fft_n_heads': 1, 'in_fft_d_head': 64, 'in_fft_conv1d_kernel_size': 3,
                  'in_fft_conv1d_filter_size': 1536, 'in_fft_output_size': 384, 'p_in_fft_dropout': 0.1,
                  'p_in_fft_dropatt': 0.1, 'p_in_fft_dropemb': 0.0, 'out_fft_n_layers': 6, 'out_fft_n_heads': 1,
                  'out_fft_d_head': 64, 'out_fft_conv1d_kernel_size': 3, 'out_fft_conv1d_filter_size': 1536,
                  'out_fft_output_size': 384, 'p_out_fft_dropout': 0.1, 'p_out_fft_dropatt': 0.1, 'p_out_fft_dropemb': 0.0,
                  'dur_predictor_kernel_size': 3, 'dur_predictor_filter_size': 256, 'p_dur_predictor_dropout': 0.1,
                  'dur_predictor_n_layers': 2, 'pitch_predictor_kernel_size': 3, 'pitch_predictor_filter_size': 256,
                  'p_pitch_predictor_dropout': 0.1, 'pitch_predictor_n_layers': 2, 'pitch_embedding_kernel_size': 3,
                  'n_speakers': 1, 'speaker_emb_weight': 1.0, 'energy_predictor_kernel_size': 3,
                  'energy_predictor_filter_size': 256, 'p_energy_predictor_dropout': 0.1, 'energy_predictor_n_layers': 2,
                  'energy_conditioning': True, 'energy_embedding_kernel_size': 3}
        # Apply caller overrides for known hyper-parameters only.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
        train_setup = {}

    model = fastpitch.FastPitch(**config)

    if pretrained:
        model.load_state_dict(state_dict)

    if fp16:
        model.half()

    # Route forward() to infer() so hub users get inference semantics directly.
    model.forward = model.infer

    return model, train_setup
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def nvidia_textprocessing_utils(cmudict_path, heteronyms_path, **kwargs):
    """
    Build a text pre-processing helper for FastPitch inference.

    :param cmudict_path: path to the CMU pronouncing dictionary file
    :param heteronyms_path: path to the heteronyms list file
    :return: object exposing prepare_input_sequence(texts, batch_size, device)
    """
    from common.text.text_processing import TextProcessing
    import numpy as np
    from torch.nn.utils.rnn import pad_sequence
    from common.text import cmudict

    class TextPreProcessing:
        @staticmethod
        def prepare_input_sequence(texts, batch_size=1, device='cpu'):
            """Encode texts, sort longest-first, and return padded batches."""
            cmudict.initialize(cmudict_path, heteronyms_path)
            tp = TextProcessing(symbol_set='english_basic', cleaner_names=['english_cleaners_v2'], p_arpabet=1.0)

            encoded = [torch.LongTensor(tp.encode_text(line)) for line in texts]
            # Sort longest-first so padding waste inside each batch is minimal.
            order = np.argsort([-seq.size(0) for seq in encoded])
            sorted_text = [encoded[idx] for idx in order]

            fields = {}
            fields['text'] = sorted_text
            fields['text_lens'] = torch.LongTensor([seq.size(0) for seq in sorted_text])

            # Echo the decoded sequences for inspection.
            for seq in fields['text']:
                print(tp.sequence_to_text(seq.numpy()))

            # Slice into batches, pad the text field, move tensors to device.
            batches = []
            for start in range(0, len(order), batch_size):
                batch = {name: values[start:start + batch_size]
                         for name, values in fields.items()}
                for name in batch:
                    if name == 'text':
                        batch[name] = pad_sequence(batch[name], batch_first=True)

                    if type(batch[name]) is torch.Tensor:
                        batch[name] = batch[name].to(device)
                batches.append(batch)

            return batches

    return TextPreProcessing()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# # from tacotron2.text import text_to_sequence
|
|
|
|
|
+
|
|
|
|
|
+# @staticmethod
|
|
|
|
|
+# def pad_sequences(batch):
|
|
|
|
|
+# # Right zero-pad all one-hot text sequences to max input length
|
|
|
|
|
+# input_lengths, ids_sorted_decreasing = torch.sort(
|
|
|
|
|
+# torch.LongTensor([len(x) for x in batch]),
|
|
|
|
|
+# dim=0, descending=True)
|
|
|
|
|
+# max_input_len = input_lengths[0]
|
|
|
|
|
+
|
|
|
|
|
+# text_padded = torch.LongTensor(len(batch), max_input_len)
|
|
|
|
|
+# text_padded.zero_()
|
|
|
|
|
+# for i in range(len(ids_sorted_decreasing)):
|
|
|
|
|
+# text = batch[ids_sorted_decreasing[i]]
|
|
|
|
|
+# text_padded[i, :text.size(0)] = text
|
|
|
|
|
+
|
|
|
|
|
+# return text_padded, input_lengths
|
|
|
|
|
+
|
|
|
|
|
+# @staticmethod
|
|
|
|
|
+# def prepare_input_sequence(texts, cpu_run=False):
|
|
|
|
|
+
|
|
|
|
|
+# d = []
|
|
|
|
|
+# # for i,text in enumerate(texts):
|
|
|
|
|
+# # d.append(torch.IntTensor(
|
|
|
|
|
+# # Processing.text_to_sequence(text, ['english_cleaners'])[:]))
|
|
|
|
|
+
|
|
|
|
|
+# text_padded, input_lengths = Processing.pad_sequences(d)
|
|
|
|
|
+# if not cpu_run:
|
|
|
|
|
+# text_padded = text_padded.cuda().long()
|
|
|
|
|
+# input_lengths = input_lengths.cuda().long()
|
|
|
|
|
+# else:
|
|
|
|
|
+# text_padded = text_padded.long()
|
|
|
|
|
+# input_lengths = input_lengths.long()
|
|
|
|
|
+
|
|
|
|
|
+# return text_padded, input_lengths
|
|
|
|
|
+
|
|
|
|
|
+# return Processing()
|