# Mirror of https://github.com/NVIDIA/DeepLearningExamples.git (SunnyMirror / DeepLearningExamples)


			
# *****************************************************************************
#  Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

import urllib.request
import torch
import os
import sys

#from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def checkpoint_from_distributed(state_dict):
    """
    Tells whether a state dict looks like it came from DistributedDataParallel.
    DDP prefixes parameter names with "module."; any key containing that
    substring marks the checkpoint as distributed.
    :param state_dict: model's state dict
    :return: True if at least one key contains "module.", False otherwise
    """
    return any('module.' in name for name in state_dict.keys())


# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def unwrap_distributed(state_dict):
    """
    Strips the DistributedDataParallel naming from a state dict.
    Every occurrence of "module.1." and then of "module." is removed from
    each key; values are passed through untouched.
    :param state_dict: model's state dict
    :return: new dict with the renamed keys, in the original iteration order
    """
    return {
        name.replace('module.1.', '').replace('module.', ''): tensor
        for name, tensor in state_dict.items()
    }

def _download_checkpoint(checkpoint, force_reload):
    """
    Fetches a checkpoint into the torch hub cache and returns its local path.
    The file is stored as <torch home>/checkpoints/<basename of the URL>. A
    file already present there is reused unless force_reload is truthy.
    :param checkpoint: URL of the checkpoint file
    :param force_reload: when truthy, download even if a cached copy exists
    :return: path of the local checkpoint file
    """
    model_dir = os.path.join(torch.hub._get_torch_home(), 'checkpoints')
    # exist_ok avoids a crash if another process creates the directory
    # between an existence check and the creation.
    os.makedirs(model_dir, exist_ok=True)
    ckpt_file = os.path.join(model_dir, os.path.basename(checkpoint))
    if force_reload or not os.path.exists(ckpt_file):
        sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
        # Download next to the target and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that later calls would treat
        # as a valid cached checkpoint.
        partial_file = ckpt_file + '.part'
        try:
            urllib.request.urlretrieve(checkpoint, partial_file)
            os.replace(partial_file, ckpt_file)
        finally:
            # After a successful rename the partial file is gone; this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(partial_file):
                os.remove(partial_file)
    return ckpt_file


def nvidia_fastpitch(pretrained=True, **kwargs):
    """
    Builds a FastPitch model whose forward pass is its ``infer`` method.

    With ``pretrained=True`` the checkpoint is downloaded (or taken from the
    local cache), and both the model configuration and the weights come from
    it. With ``pretrained=False`` the model is built from the default
    configuration below with untrained weights; any keyword argument whose
    name matches a configuration key overrides that default.

    :param pretrained: load configuration and weights from the published
        checkpoint when True
    :param kwargs: optional settings:
        ``force_reload`` - truthy to re-download the checkpoint even if cached;
        ``model_math`` - ``"fp16"`` to convert the model with ``half()``;
        configuration keys (only used when ``pretrained`` is False)
    :return: tuple ``(model, train_setup)``; ``train_setup`` is the
        checkpoint's ``'train_setup'`` entry, or ``{}`` when the checkpoint
        has none or ``pretrained`` is False
    """

    from fastpitch import model as fastpitch

    force_reload = kwargs.get("force_reload", False)
    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        checkpoint = 'https://api.ngc.nvidia.com/v2/models/nvidia/dle/fastpitch__pyt_ckpt/versions/21.12.1_amp/files/nvidia_fastpitch_210824+cfg.pt'
        ckpt_file = _download_checkpoint(checkpoint, force_reload)
        # Deserialize onto the CPU so that a checkpoint saved from GPU tensors
        # can also be loaded on hosts without CUDA. load_state_dict() below
        # copies the values into the model's own parameters, so the storage
        # location of the loaded tensors does not affect the returned model.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
        train_setup = ckpt.get('train_setup', {})
    else:
        config = {'n_mel_channels': 80, 'n_symbols': 148, 'padding_idx': 0, 'symbols_embedding_dim': 384,
                  'in_fft_n_layers': 6, 'in_fft_n_heads': 1, 'in_fft_d_head': 64, 'in_fft_conv1d_kernel_size': 3,
                  'in_fft_conv1d_filter_size': 1536, 'in_fft_output_size': 384, 'p_in_fft_dropout': 0.1,
                  'p_in_fft_dropatt': 0.1, 'p_in_fft_dropemb': 0.0, 'out_fft_n_layers': 6, 'out_fft_n_heads': 1,
                  'out_fft_d_head': 64, 'out_fft_conv1d_kernel_size': 3, 'out_fft_conv1d_filter_size': 1536,
                  'out_fft_output_size': 384, 'p_out_fft_dropout': 0.1, 'p_out_fft_dropatt': 0.1, 'p_out_fft_dropemb': 0.0,
                  'dur_predictor_kernel_size': 3, 'dur_predictor_filter_size': 256, 'p_dur_predictor_dropout': 0.1,
                  'dur_predictor_n_layers': 2, 'pitch_predictor_kernel_size': 3, 'pitch_predictor_filter_size': 256,
                  'p_pitch_predictor_dropout': 0.1, 'pitch_predictor_n_layers': 2, 'pitch_embedding_kernel_size': 3,
                  'n_speakers': 1, 'speaker_emb_weight': 1.0, 'energy_predictor_kernel_size': 3,
                  'energy_predictor_filter_size': 256, 'p_energy_predictor_dropout': 0.1, 'energy_predictor_n_layers': 2,
                  'energy_conditioning': True, 'energy_embedding_kernel_size': 3}
        # Only keys already present in the default config are overridden;
        # unrelated kwargs (e.g. force_reload, model_math) are ignored here.
        config.update({k: v for k, v in kwargs.items() if k in config})
        train_setup = {}

    model = fastpitch.FastPitch(**config)

    if pretrained:
        model.load_state_dict(state_dict)

    if fp16:
        model.half()

    # Calling the returned model runs inference rather than the training
    # forward pass.
    model.forward = model.infer

    return model, train_setup


def nvidia_textprocessing_utils(cmudict_path, heteronyms_path, **kwargs):
    """
    Returns a helper object that turns raw texts into padded input batches.

    :param cmudict_path: path handed to ``cmudict.initialize``
    :param heteronyms_path: path handed to ``cmudict.initialize``
    :param kwargs: accepted but not used by this function
    :return: ``TextPreProcessing`` instance exposing
        ``prepare_input_sequence(texts, batch_size=1, device='cpu')``
    """

    from common.text.text_processing import TextProcessing
    import numpy as np
    from torch.nn.utils.rnn import pad_sequence
    from common.text import cmudict

    class TextPreProcessing:
        @staticmethod
        def prepare_input_sequence(texts, batch_size=1, device='cpu'):
            """
            Encodes texts, sorts them by decreasing length and batches them.

            :param texts: iterable of input strings
            :param batch_size: number of sequences per batch
            :param device: device the batch tensors are moved to
            :return: list of dicts with keys ``'text'`` (zero-padded, batch
                first) and ``'text_lens'``
            """
            cmudict.initialize(cmudict_path, heteronyms_path)
            processor = TextProcessing(symbol_set='english_basic', cleaner_names=['english_cleaners_v2'], p_arpabet=1.0)

            encoded = [torch.LongTensor(processor.encode_text(text)) for text in texts]
            # Longest sequence first: argsort over the negated lengths.
            order = np.argsort([-seq.size(0) for seq in encoded])
            encoded = [encoded[idx] for idx in order]

            fields = {
                'text': encoded,
                'text_lens': torch.LongTensor([seq.size(0) for seq in encoded]),
            }

            # Echo the processed form of every input sequence.
            for seq in encoded:
                print(processor.sequence_to_text(seq.numpy()))

            # Slice every field into batch_size chunks; pad the text chunk.
            batches = []
            for start in range(0, len(order), batch_size):
                stop = start + batch_size
                batch = {}
                for name, values in fields.items():
                    chunk = values[start:stop]
                    if name == 'text':
                        chunk = pad_sequence(chunk, batch_first=True)
                    if type(chunk) is torch.Tensor:
                        chunk = chunk.to(device)
                    batch[name] = chunk
                batches.append(batch)

            return batches

    return TextPreProcessing()

    
#         # from tacotron2.text import text_to_sequence
        
#         @staticmethod
#         def pad_sequences(batch):
#             # Right zero-pad all one-hot text sequences to max input length
#             input_lengths, ids_sorted_decreasing = torch.sort(
#                 torch.LongTensor([len(x) for x in batch]),
#                 dim=0, descending=True)
#             max_input_len = input_lengths[0]

#             text_padded = torch.LongTensor(len(batch), max_input_len)
#             text_padded.zero_()
#             for i in range(len(ids_sorted_decreasing)):
#                 text = batch[ids_sorted_decreasing[i]]
#                 text_padded[i, :text.size(0)] = text

#             return text_padded, input_lengths
        
#         @staticmethod
#         def prepare_input_sequence(texts, cpu_run=False):

#             d = []
#             # for i,text in enumerate(texts):
#             #     d.append(torch.IntTensor(
#             #         Processing.text_to_sequence(text, ['english_cleaners'])[:]))

#             text_padded, input_lengths = Processing.pad_sequences(d)
#             if not cpu_run:
#                 text_padded = text_padded.cuda().long()
#                 input_lengths = input_lengths.cuda().long()
#             else:
#                 text_padded = text_padded.long()
#                 input_lengths = input_lengths.long()

#             return text_padded, input_lengths
    
#     return Processing()