Explorar o código

[Speech] Librosa update

Grzegorz Karch hai 4 anos
pai
achega
0a52a93b5f

+ 3 - 2
PyTorch/SpeechRecognition/QuartzNet/common/audio.py

@@ -67,10 +67,11 @@ class AudioSegment(object):
 
         samples = self._convert_samples_to_float32(samples)
         if target_sr is not None and target_sr != sample_rate:
-            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            samples = librosa.resample(samples, orig_sr=sample_rate,
+                                       target_sr=target_sr)
             sample_rate = target_sr
         if trim:
-            samples, _ = librosa.effects.trim(samples, trim_db)
+            samples, _ = librosa.effects.trim(samples, top_db=trim_db)
         self._samples = samples
         self._sample_rate = sample_rate
         if self._samples.ndim >= 2:

+ 1 - 1
PyTorch/SpeechRecognition/QuartzNet/common/features.py

@@ -237,7 +237,7 @@ class FilterbankFeatures(BaseFeatures):
         window_tensor = window_fn(self.win_length,
                                   periodic=False) if window_fn else None
         filterbanks = torch.tensor(
-            librosa.filters.mel(sample_rate, self.n_fft, n_mels=n_filt,
+            librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=n_filt,
                                 fmin=lowfreq, fmax=highfreq),
             dtype=torch.float).unsqueeze(0)
         # torchscript

+ 2 - 2
PyTorch/SpeechRecognition/QuartzNet/requirements.txt

@@ -1,10 +1,10 @@
 inflect==5.3.0
 ipdb
-librosa==0.8.0
+librosa==0.9.0
 pandas==1.1.4
 pycuda==2020.1
 pyyaml>=5.4
 soundfile
 sox==1.4.1
 tqdm==4.53.0
-git+https://github.com/NVIDIA/[email protected]#egg=dllogger
+git+https://github.com/NVIDIA/[email protected]#egg=dllogger

+ 0 - 1
PyTorch/SpeechRecognition/QuartzNet/utils/preprocessing_utils.py

@@ -15,7 +15,6 @@
 #!/usr/bin/env python
 import os
 import multiprocessing
-import librosa
 import functools
 
 import sox

+ 1 - 1
PyTorch/SpeechSynthesis/FastPitch/common/audio_processing.py

@@ -74,7 +74,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
     # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
 
     # Fill the envelope
     for i in range(n_frames):

+ 7 - 1
PyTorch/SpeechSynthesis/FastPitch/common/layers.py

@@ -94,7 +94,13 @@ class TacotronSTFT(torch.nn.Module):
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 

+ 1 - 1
PyTorch/SpeechSynthesis/FastPitch/common/stft.py

@@ -64,7 +64,7 @@ class STFT(torch.nn.Module):
             assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
+            fft_window = pad_center(fft_window, size=filter_length)
             fft_window = torch.from_numpy(fft_window).float()
 
             # window the bases

+ 2 - 2
PyTorch/SpeechSynthesis/FastPitch/requirements.txt

@@ -1,7 +1,7 @@
 matplotlib
 numpy
 inflect
-librosa==0.8.0
+librosa==0.9.0
 scipy
 tensorboardX==2.0
-git+https://github.com/NVIDIA/[email protected]#egg=dllogger
+git+https://github.com/NVIDIA/[email protected]#egg=dllogger

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/audio_processing.py

@@ -74,7 +74,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
     # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
 
     # Fill the envelope
     for i in range(n_frames):

+ 7 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/layers.py

@@ -74,7 +74,13 @@ class TacotronSTFT(torch.nn.Module):
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/stft.py

@@ -64,7 +64,7 @@ class STFT(torch.nn.Module):
             assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
+            fft_window = pad_center(fft_window, size=filter_length)
             fft_window = torch.from_numpy(fft_window).float()
 
             # window the bases