Explorar o código

[Speech] Librosa update

Grzegorz Karch hai 4 anos
pai
achega
0a52a93b5f

+ 3 - 2
PyTorch/SpeechRecognition/QuartzNet/common/audio.py

@@ -67,10 +67,11 @@ class AudioSegment(object):
 
         samples = self._convert_samples_to_float32(samples)
         if target_sr is not None and target_sr != sample_rate:
-            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            samples = librosa.resample(samples, orig_sr=sample_rate,
+                                       target_sr=target_sr)
             sample_rate = target_sr
         if trim:
-            samples, _ = librosa.effects.trim(samples, trim_db)
+            samples, _ = librosa.effects.trim(samples, top_db=trim_db)
         self._samples = samples
         self._sample_rate = sample_rate
         if self._samples.ndim >= 2:

+ 1 - 1
PyTorch/SpeechRecognition/QuartzNet/common/features.py

@@ -237,7 +237,7 @@ class FilterbankFeatures(BaseFeatures):
         window_tensor = window_fn(self.win_length,
                                   periodic=False) if window_fn else None
         filterbanks = torch.tensor(
-            librosa.filters.mel(sample_rate, self.n_fft, n_mels=n_filt,
+            librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=n_filt,
                                 fmin=lowfreq, fmax=highfreq),
             dtype=torch.float).unsqueeze(0)
         # torchscript

+ 2 - 2
PyTorch/SpeechRecognition/QuartzNet/requirements.txt

@@ -1,10 +1,10 @@
 inflect==5.3.0
 ipdb
-librosa==0.8.0
+librosa==0.9.0
 pandas==1.1.4
 pycuda==2020.1
 pyyaml>=5.4
 soundfile
 sox==1.4.1
 tqdm==4.53.0
-git+https://github.com/NVIDIA/[email protected]#egg=dllogger
+git+https://github.com/NVIDIA/[email protected]#egg=dllogger

+ 0 - 1
PyTorch/SpeechRecognition/QuartzNet/utils/preprocessing_utils.py

@@ -15,7 +15,6 @@
 #!/usr/bin/env python
 import os
 import multiprocessing
-import librosa
 import functools
 
 import sox

+ 1 - 1
PyTorch/SpeechSynthesis/FastPitch/common/audio_processing.py

@@ -74,7 +74,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
     # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
 
     # Fill the envelope
     for i in range(n_frames):

+ 7 - 1
PyTorch/SpeechSynthesis/FastPitch/common/layers.py

@@ -94,7 +94,13 @@ class TacotronSTFT(torch.nn.Module):
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 

+ 1 - 1
PyTorch/SpeechSynthesis/FastPitch/common/stft.py

@@ -64,7 +64,7 @@ class STFT(torch.nn.Module):
             assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
+            fft_window = pad_center(fft_window, size=filter_length)
             fft_window = torch.from_numpy(fft_window).float()
 
             # window the bases

+ 2 - 2
PyTorch/SpeechSynthesis/FastPitch/requirements.txt

@@ -1,7 +1,7 @@
 matplotlib
 numpy
 inflect
-librosa==0.8.0
+librosa==0.9.0
 scipy
 tensorboardX==2.0
-git+https://github.com/NVIDIA/[email protected]#egg=dllogger
+git+https://github.com/NVIDIA/[email protected]#egg=dllogger

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/audio_processing.py

@@ -74,7 +74,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
     # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
 
     # Fill the envelope
     for i in range(n_frames):

+ 7 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/layers.py

@@ -74,7 +74,13 @@ class TacotronSTFT(torch.nn.Module):
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2_common/stft.py

@@ -64,7 +64,7 @@ class STFT(torch.nn.Module):
             assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
+            fft_window = pad_center(fft_window, size=filter_length)
             fft_window = torch.from_numpy(fft_window).float()
 
             # window the bases