Przeglądaj źródła

[TTS/Torchhub] Expose HiFiGAN and FastPitch via TorchHub

Krzysztof Kudrynski 3 lat temu
rodzic
commit
afea561ecf

+ 2 - 2
PyTorch/SpeechSynthesis/HiFiGAN/common/utils.py

@@ -51,8 +51,6 @@ import soundfile  # flac
 
 import matplotlib
 
-matplotlib.use("Agg")
-import matplotlib.pylab as plt
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -173,6 +171,8 @@ def print_once(*msg):
 
 
 def plot_spectrogram(spectrogram):
+    matplotlib.use("Agg")
+    import matplotlib.pylab as plt
     fig, ax = plt.subplots(figsize=(10, 2))
     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                    interpolation='none')

+ 1 - 0
PyTorch/SpeechSynthesis/HiFiGAN/fastpitch/__init__.py

@@ -0,0 +1 @@
+from .entrypoints import nvidia_fastpitch, nvidia_textprocessing_utils

+ 203 - 0
PyTorch/SpeechSynthesis/HiFiGAN/fastpitch/entrypoints.py

@@ -0,0 +1,203 @@
+# *****************************************************************************
+#  Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import urllib.request
+import torch
+import os
+import sys
+
+#from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def checkpoint_from_distributed(state_dict):
    """
    Checks whether a checkpoint was produced by DistributedDataParallel.

    DDP prefixes parameter names with "module.", which must be stripped
    before the weights can be loaded for single-GPU inference.

    :param state_dict: model's state dict
    :return: True if any key carries the DDP "module." prefix
    """
    return any('module.' in key for key in state_dict)
+
+
+# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def unwrap_distributed(state_dict):
    """
    Strips DistributedDataParallel name prefixes from a state dict.

    Removes the "module.1." / "module." prefixes that DDP adds, so the
    weights can be loaded into a plain (non-wrapped) model for single-GPU
    inference.

    :param state_dict: model's state dict
    :return: new state dict with DDP prefixes removed
    """
    return {
        key.replace('module.1.', '').replace('module.', ''): value
        for key, value in state_dict.items()
    }
+
+def _download_checkpoint(checkpoint, force_reload):
+    model_dir = os.path.join(torch.hub._get_torch_home(), 'checkpoints')
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    ckpt_file = os.path.join(model_dir, os.path.basename(checkpoint))
+    if not os.path.exists(ckpt_file) or force_reload:
+        sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
+        urllib.request.urlretrieve(checkpoint, ckpt_file)
+    return ckpt_file
+
+
def nvidia_fastpitch(pretrained=True, **kwargs):
    """
    TorchHub entry point for the FastPitch mel-spectrogram generator.

    :param pretrained: if True, download and load NVIDIA's pretrained
        checkpoint from NGC; otherwise build a randomly initialized model
        from the default configuration
    :param kwargs: optional settings:
        - force_reload (bool): re-download the checkpoint even if cached
        - model_math (str): "fp16" converts the model to half precision
        - any FastPitch config key (effective only when pretrained=False)
          overrides the corresponding default hyperparameter
    :return: (model, train_setup) tuple; ``train_setup`` is the training
        metadata stored in the checkpoint ({} when unavailable)
    """
    from fastpitch import model as fastpitch

    force_reload = kwargs.get('force_reload', False)
    fp16 = kwargs.get('model_math') == 'fp16'

    if pretrained:
        checkpoint = 'https://api.ngc.nvidia.com/v2/models/nvidia/dle/fastpitch__pyt_ckpt/versions/21.12.1_amp/files/nvidia_fastpitch_210824+cfg.pt'
        ckpt_file = _download_checkpoint(checkpoint, force_reload)
        # map_location lets a CUDA-saved checkpoint load on CPU-only hosts;
        # the original torch.load crashed there with a deserialization error.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
        train_setup = ckpt.get('train_setup', {})
    else:
        config = {'n_mel_channels': 80, 'n_symbols': 148, 'padding_idx': 0, 'symbols_embedding_dim': 384,
                  'in_fft_n_layers': 6, 'in_fft_n_heads': 1, 'in_fft_d_head': 64, 'in_fft_conv1d_kernel_size': 3,
                  'in_fft_conv1d_filter_size': 1536, 'in_fft_output_size': 384, 'p_in_fft_dropout': 0.1,
                  'p_in_fft_dropatt': 0.1, 'p_in_fft_dropemb': 0.0, 'out_fft_n_layers': 6, 'out_fft_n_heads': 1,
                  'out_fft_d_head': 64, 'out_fft_conv1d_kernel_size': 3, 'out_fft_conv1d_filter_size': 1536,
                  'out_fft_output_size': 384, 'p_out_fft_dropout': 0.1, 'p_out_fft_dropatt': 0.1, 'p_out_fft_dropemb': 0.0,
                  'dur_predictor_kernel_size': 3, 'dur_predictor_filter_size': 256, 'p_dur_predictor_dropout': 0.1,
                  'dur_predictor_n_layers': 2, 'pitch_predictor_kernel_size': 3, 'pitch_predictor_filter_size': 256,
                  'p_pitch_predictor_dropout': 0.1, 'pitch_predictor_n_layers': 2, 'pitch_embedding_kernel_size': 3,
                  'n_speakers': 1, 'speaker_emb_weight': 1.0, 'energy_predictor_kernel_size': 3,
                  'energy_predictor_filter_size': 256, 'p_energy_predictor_dropout': 0.1, 'energy_predictor_n_layers': 2,
                  'energy_conditioning': True, 'energy_embedding_kernel_size': 3}
        # Allow callers to override any default hyperparameter via kwargs.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
        train_setup = {}

    model = fastpitch.FastPitch(**config)

    if pretrained:
        model.load_state_dict(state_dict)

    if fp16:
        model.half()

    # TorchHub users call the model directly; route forward() to inference.
    model.forward = model.infer

    return model, train_setup
+
+
def nvidia_textprocessing_utils(cmudict_path, heteronyms_path, **kwargs):
    """
    TorchHub entry point returning a text pre-processing helper for FastPitch.

    The returned object turns raw text strings into length-sorted, padded,
    batched LongTensors of symbol ids ready for FastPitch inference.

    :param cmudict_path: path to the CMU pronouncing dictionary file
    :param heteronyms_path: path to the heteronyms list file
    :param kwargs: unused; accepted for TorchHub call-signature compatibility
    :return: TextPreProcessing instance exposing prepare_input_sequence()
    """

    from common.text.text_processing import TextProcessing
    import numpy as np
    from torch.nn.utils.rnn import pad_sequence
    from common.text import cmudict


    class TextPreProcessing:
        @staticmethod
        def prepare_input_sequence(texts, batch_size=1, device='cpu'):
            # Encode with full ARPAbet phonemes (p_arpabet=1.0) over the
            # basic English symbol set.
            cmudict.initialize(cmudict_path, heteronyms_path)
            tp = TextProcessing(symbol_set='english_basic', cleaner_names=['english_cleaners_v2'], p_arpabet=1.0)
            fields={}

            fields['text'] = [torch.LongTensor(tp.encode_text(text))
                            for text in texts]
            # Sort sequences longest-first (negated lengths) so each padded
            # batch wastes as little padding as possible.
            order = np.argsort([-t.size(0) for t in fields['text']])

            fields['text'] = [fields['text'][i] for i in order]
            fields['text_lens'] = torch.LongTensor([t.size(0) for t in fields['text']])

            # NOTE(review): prints the decoded sequence of every input to
            # stdout -- looks like leftover debug output; confirm and remove.
            for t in fields['text']:
                print(tp.sequence_to_text(t.numpy()))

            # cut into batches & pad
            batches = []
            for b in range(0, len(order), batch_size):
                batch = {f: values[b:b+batch_size] for f, values in fields.items()}
                for f in batch:
                    if f == 'text':
                        # Right-pad each batch to its own longest sequence.
                        batch[f] = pad_sequence(batch[f], batch_first=True)

                    if type(batch[f]) is torch.Tensor:
                        batch[f] = batch[f].to(device)
                batches.append(batch)

            return batches

    return TextPreProcessing()
+
+    
+        
+#         # from tacotron2.text import text_to_sequence
+        
+#         @staticmethod
+#         def pad_sequences(batch):
+#             # Right zero-pad all one-hot text sequences to max input length
+#             input_lengths, ids_sorted_decreasing = torch.sort(
+#                 torch.LongTensor([len(x) for x in batch]),
+#                 dim=0, descending=True)
+#             max_input_len = input_lengths[0]
+
+#             text_padded = torch.LongTensor(len(batch), max_input_len)
+#             text_padded.zero_()
+#             for i in range(len(ids_sorted_decreasing)):
+#                 text = batch[ids_sorted_decreasing[i]]
+#                 text_padded[i, :text.size(0)] = text
+
+#             return text_padded, input_lengths
+        
+#         @staticmethod
+#         def prepare_input_sequence(texts, cpu_run=False):
+
+#             d = []
+#             # for i,text in enumerate(texts):
+#             #     d.append(torch.IntTensor(
+#             #         Processing.text_to_sequence(text, ['english_cleaners'])[:]))
+
+#             text_padded, input_lengths = Processing.pad_sequences(d)
+#             if not cpu_run:
+#                 text_padded = text_padded.cuda().long()
+#                 input_lengths = input_lengths.cuda().long()
+#             else:
+#                 text_padded = text_padded.long()
+#                 input_lengths = input_lengths.long()
+
+#             return text_padded, input_lengths
+    
+#     return Processing()

+ 1 - 0
PyTorch/SpeechSynthesis/HiFiGAN/hifigan/__init__.py

@@ -0,0 +1 @@
+from .entrypoints import nvidia_hifigan

+ 112 - 0
PyTorch/SpeechSynthesis/HiFiGAN/hifigan/entrypoints.py

@@ -0,0 +1,112 @@
+# *****************************************************************************
+#  Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import urllib.request
+import torch
+import os
+import sys
+
+#from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def checkpoint_from_distributed(state_dict):
    """
    Checks whether a checkpoint was produced by DistributedDataParallel.

    DDP prefixes parameter names with "module.", which must be stripped
    before the weights can be loaded for single-GPU inference.

    :param state_dict: model's state dict
    :return: True if any key carries the DDP "module." prefix
    """
    return any('module.' in key for key in state_dict)
+
+
+# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
def unwrap_distributed(state_dict):
    """
    Strips DistributedDataParallel name prefixes from a state dict.

    Removes the "module.1." / "module." prefixes that DDP adds, so the
    weights can be loaded into a plain (non-wrapped) model for single-GPU
    inference.

    :param state_dict: model's state dict
    :return: new state dict with DDP prefixes removed
    """
    return {
        key.replace('module.1.', '').replace('module.', ''): value
        for key, value in state_dict.items()
    }
+
+def _download_checkpoint(checkpoint, force_reload):
+    model_dir = os.path.join(torch.hub._get_torch_home(), 'checkpoints')
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    ckpt_file = os.path.join(model_dir, os.path.basename(checkpoint))
+    if not os.path.exists(ckpt_file) or force_reload:
+        sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
+        urllib.request.urlretrieve(checkpoint, ckpt_file)
+    return ckpt_file
+
+
def nvidia_hifigan(pretrained=True, **kwargs):
    """
    TorchHub entry point for the HiFi-GAN vocoder.

    :param pretrained: if True, download and load NVIDIA's pretrained,
        finetuned checkpoint from NGC (and build a matching denoiser);
        otherwise build a randomly initialized generator
    :param kwargs: optional settings:
        - force_reload (bool): re-download the checkpoint even if cached
        - model_math (str): "fp16" converts the model(s) to half precision
        - any generator config key (effective only when pretrained=False)
          overrides the corresponding default hyperparameter
    :return: (hifigan, train_setup, denoiser) tuple; ``denoiser`` is None
        when ``pretrained`` is False
    """
    from hifigan import models as vocoder

    force_reload = kwargs.get('force_reload', False)
    fp16 = kwargs.get('model_math') == 'fp16'

    if pretrained:
        checkpoint = 'https://api.ngc.nvidia.com/v2/models/nvidia/dle/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz/versions/21.08.0_amp/files/hifigan_gen_checkpoint_10000_ft.pt'
        ckpt_file = _download_checkpoint(checkpoint, force_reload)
        # map_location lets a CUDA-saved checkpoint load on CPU-only hosts;
        # the original torch.load crashed there with a deserialization error.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['generator']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
        train_setup = ckpt.get('train_setup', {})
    else:
        config = {'upsample_rates': [8, 8, 2, 2], 'upsample_kernel_sizes': [16, 16, 4, 4],
                  'upsample_initial_channel': 512, 'resblock': '1', 'resblock_kernel_sizes': [3, 7, 11],
                  'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]]}
        # Allow callers to override any default hyperparameter via kwargs.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
        train_setup = {}

    hifigan = vocoder.Generator(config)
    denoiser = None
    if pretrained:
        hifigan.load_state_dict(state_dict)
        # Weight norm is a training-time device; fold it for inference.
        hifigan.remove_weight_norm()
        denoiser = vocoder.Denoiser(hifigan, win_length=1024)

    if fp16:
        hifigan.half()
        # BUG FIX: denoiser is None when pretrained=False; the original
        # called denoiser.half() unconditionally and raised AttributeError.
        if denoiser is not None:
            denoiser.half()

    return hifigan, train_setup, denoiser

+ 5 - 0
hubconf.py

@@ -25,5 +25,10 @@ from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import nvidia_tts_utils
 from PyTorch.SpeechSynthesis.Tacotron2.waveglow import nvidia_waveglow
 sys.path.append(os.path.join(sys.path[0], 'PyTorch/SpeechSynthesis/Tacotron2'))
 
+from PyTorch.SpeechSynthesis.HiFiGAN.fastpitch import nvidia_fastpitch
+from PyTorch.SpeechSynthesis.HiFiGAN.fastpitch import nvidia_textprocessing_utils
+from PyTorch.SpeechSynthesis.HiFiGAN.hifigan import nvidia_hifigan
+sys.path.append(os.path.join(sys.path[0], 'PyTorch/SpeechSynthesis/HiFiGAN'))
+
 from PyTorch.Forecasting.TFT.tft_torchhub import nvidia_tft, nvidia_tft_data_utils
 sys.path.append(os.path.join(sys.path[0], 'PyTorch/Forecasting/TFT'))