# test_infer.py
  1. # *****************************************************************************
  2. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # * Redistributions of source code must retain the above copyright
  7. # notice, this list of conditions and the following disclaimer.
  8. # * Redistributions in binary form must reproduce the above copyright
  9. # notice, this list of conditions and the following disclaimer in the
  10. # documentation and/or other materials provided with the distribution.
  11. # * Neither the name of the NVIDIA CORPORATION nor the
  12. # names of its contributors may be used to endorse or promote products
  13. # derived from this software without specific prior written permission.
  14. #
  15. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  16. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  19. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20. # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21. # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22. # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. #
  26. # *****************************************************************************
  27. from tacotron2.text import text_to_sequence
  28. import models
  29. import torch
  30. import argparse
  31. import numpy as np
  32. from scipy.io.wavfile import write
  33. import sys
  34. from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
  35. import time
  36. import dllogger as DLLogger
  37. from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
  38. from apex import amp
  39. from waveglow.denoiser import Denoiser
  40. def parse_args(parser):
  41. """
  42. Parse commandline arguments.
  43. """
  44. parser.add_argument('--tacotron2', type=str,
  45. help='full path to the Tacotron2 model checkpoint file')
  46. parser.add_argument('--waveglow', type=str,
  47. help='full path to the WaveGlow model checkpoint file')
  48. parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
  49. parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
  50. parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
  51. help='Sampling rate')
  52. run_mode = parser.add_mutually_exclusive_group()
  53. run_mode.add_argument('--fp16', action='store_true',
  54. help='Run inference with FP16')
  55. run_mode.add_argument('--cpu', action='store_true',
  56. help='Run inference on CPU')
  57. parser.add_argument('--log-file', type=str, default='nvlog.json',
  58. help='Filename for logging')
  59. parser.add_argument('--stft-hop-length', type=int, default=256,
  60. help='STFT hop length for estimating audio length from mel size')
  61. parser.add_argument('--num-iters', type=int, default=10,
  62. help='Number of iterations')
  63. parser.add_argument('-il', '--input-length', type=int, default=64,
  64. help='Input length')
  65. parser.add_argument('-bs', '--batch-size', type=int, default=1,
  66. help='Batch size')
  67. return parser
  68. def print_stats(measurements_all):
  69. throughput = measurements_all['throughput']
  70. preprocessing = measurements_all['pre_processing']
  71. type_conversion = measurements_all['type_conversion']
  72. storage = measurements_all['storage']
  73. data_transfer = measurements_all['data_transfer']
  74. postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
  75. latency = measurements_all['latency']
  76. waveglow_latency = measurements_all['waveglow_latency']
  77. tacotron2_latency = measurements_all['tacotron2_latency']
  78. denoiser_latency = measurements_all['denoiser_latency']
  79. num_mels_per_audio = measurements_all['num_mels_per_audio']
  80. latency.sort()
  81. cf_50 = max(latency[:int(len(latency)*0.50)])
  82. cf_90 = max(latency[:int(len(latency)*0.90)])
  83. cf_95 = max(latency[:int(len(latency)*0.95)])
  84. cf_99 = max(latency[:int(len(latency)*0.99)])
  85. cf_100 = max(latency[:int(len(latency)*1.0)])
  86. print("Throughput average (samples/sec) = {:.0f}".format(np.mean(throughput)))
  87. print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
  88. print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
  89. print("Number of mels per audio average = {:.0f}".format(np.mean(num_mels_per_audio)))
  90. print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
  91. print("WaveGlow latency average (seconds) = {:.2f}".format(np.mean(waveglow_latency)))
  92. print("Denoiser latency average (seconds) = {:.4f}".format(np.mean(denoiser_latency)))
  93. print("Latency average (seconds) = {:.2f}".format(np.mean(latency)))
  94. print("Latency std (seconds) = {:.2f}".format(np.std(latency)))
  95. print("Latency cl 50 (seconds) = {:.2f}".format(cf_50))
  96. print("Latency cl 90 (seconds) = {:.2f}".format(cf_90))
  97. print("Latency cl 95 (seconds) = {:.2f}".format(cf_95))
  98. print("Latency cl 99 (seconds) = {:.2f}".format(cf_99))
  99. print("Latency cl 100 (seconds) = {:.2f}".format(cf_100))
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.

    Benchmarks Tacotron2 + WaveGlow + Denoiser over --num-iters iterations
    on a fixed, repeated input sentence, writes the synthesized audio of
    each iteration to audio_<i>.wav, and prints aggregate timing statistics
    via print_stats().
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    # parse_known_args: unrecognized flags are tolerated and kept in unknown_args.
    args, unknown_args = parser.parse_known_args()

    # Mirror every run parameter to the JSON log file and to stdout.
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k,v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k:v})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    # Per-metric lists; one entry appended per measured (post-warmup)
    # iteration.  Consumed by print_stats() at the end.
    measurements_all = {"pre_processing": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "denoiser_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    # Fixed benchmark sentence, truncated to --input-length characters and
    # replicated --batch-size times so every iteration sees identical input.
    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts*args.batch_size

    # The first warmup_iters iterations run fully but are excluded from the
    # collected statistics.
    warmup_iters = 3

    # NOTE(review): `iter` shadows the builtin of the same name within this loop.
    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

        with torch.no_grad():
            # "latency" covers Tacotron2 + WaveGlow inference only; denoising
            # and the post-processing stages below are timed separately.
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

        # Assumes mel is (batch, n_mel_channels, frames) and audios is
        # (batch, samples) — TODO confirm against the model implementations.
        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu):
            audios = audios.float()

        with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
            # Trim each waveform to its true length (mel frames * STFT hop
            # length) before writing; files are overwritten every iteration.
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        # Record and log only post-warmup iterations; steps are re-indexed
        # from 0 relative to the end of warmup.
        if iter >= warmup_iters:
            for k,v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iter-warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
# Script entry point: run the inference benchmark when executed directly.
if __name__ == '__main__':
    main()