audio_processing.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # BSD 3-Clause License
  2. # Copyright (c) 2018-2020, NVIDIA Corporation
  3. # All rights reserved.
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # * Redistributions of source code must retain the above copyright notice, this
  7. # list of conditions and the following disclaimer.
  8. # * Redistributions in binary form must reproduce the above copyright notice,
  9. # this list of conditions and the following disclaimer in the documentation
  10. # and/or other materials provided with the distribution.
  11. # * Neither the name of the copyright holder nor the names of its
  12. # contributors may be used to endorse or promote products derived from
  13. # this software without specific prior written permission.
  14. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  15. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  17. # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  18. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  20. # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  21. # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  22. # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. """https://github.com/NVIDIA/tacotron2"""
  25. import torch
  26. import numpy as np
  27. from scipy.signal import get_window
  28. import librosa.util as librosa_util
  29. def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
  30. n_fft=800, dtype=np.float32, norm=None):
  31. """
  32. # from librosa 0.6
  33. Compute the sum-square envelope of a window function at a given hop length.
  34. This is used to estimate modulation effects induced by windowing
  35. observations in short-time fourier transforms.
  36. Parameters
  37. ----------
  38. window : string, tuple, number, callable, or list-like
  39. Window specification, as in `get_window`
  40. n_frames : int > 0
  41. The number of analysis frames
  42. hop_length : int > 0
  43. The number of samples to advance between frames
  44. win_length : [optional]
  45. The length of the window function. By default, this matches `n_fft`.
  46. n_fft : int > 0
  47. The length of each analysis frame.
  48. dtype : np.dtype
  49. The data type of the output
  50. Returns
  51. -------
  52. wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
  53. The sum-squared envelope of the window function
  54. """
  55. if win_length is None:
  56. win_length = n_fft
  57. n = n_fft + hop_length * (n_frames - 1)
  58. x = np.zeros(n, dtype=dtype)
  59. # Compute the squared window at the desired length
  60. win_sq = get_window(window, win_length, fftbins=True)
  61. win_sq = librosa_util.normalize(win_sq, norm=norm)**2
  62. win_sq = librosa_util.pad_center(win_sq, n_fft)
  63. # Fill the envelope
  64. for i in range(n_frames):
  65. sample = i * hop_length
  66. x[sample:min(n, sample + n_fft)
  67. ] += win_sq[:max(0, min(n_fft, n - sample))]
  68. return x
  69. def griffin_lim(magnitudes, stft_fn, n_iters=30):
  70. """
  71. PARAMS
  72. ------
  73. magnitudes: spectrogram magnitudes
  74. stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
  75. """
  76. angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
  77. angles = angles.astype(np.float32)
  78. angles = torch.autograd.Variable(torch.from_numpy(angles))
  79. signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
  80. for i in range(n_iters):
  81. _, angles = stft_fn.transform(signal)
  82. signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
  83. return signal
  84. def dynamic_range_compression(x, C=1, clip_val=1e-5):
  85. """
  86. PARAMS
  87. ------
  88. C: compression factor
  89. """
  90. return torch.log(torch.clamp(x, min=clip_val) * C)
  91. def dynamic_range_decompression(x, C=1):
  92. """
  93. PARAMS
  94. ------
  95. C: compression factor used to compress
  96. """
  97. return torch.exp(x) / C