| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- # *****************************************************************************
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of the NVIDIA CORPORATION nor the
- # names of its contributors may be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # *****************************************************************************
- from tacotron2.text import text_to_sequence
- import models
- import torch
- import argparse
- import numpy as np
- from scipy.io.wavfile import write
- import sys
- from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
- import time
- import dllogger as DLLogger
- from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
- from apex import amp
- from waveglow.denoiser import Denoiser
def parse_args(parser):
    """
    Register the benchmark's command line options on ``parser``.

    The ``--fp16`` / ``--cpu`` flags form a mutually exclusive run-mode
    pair; all remaining options are independent.  The populated parser is
    returned so the call can be chained.
    """
    model_options = (
        (('--tacotron2',),
         dict(type=str,
              help='full path to the Tacotron2 model checkpoint file')),
        (('--waveglow',),
         dict(type=str,
              help='full path to the WaveGlow model checkpoint file')),
        (('-s', '--sigma-infer'), dict(default=0.6, type=float)),
        (('-d', '--denoising-strength'), dict(default=0.01, type=float)),
        (('-sr', '--sampling-rate'),
         dict(default=22050, type=int, help='Sampling rate')),
    )
    for flags, kwargs in model_options:
        parser.add_argument(*flags, **kwargs)

    # Exactly one run mode may be selected: half-precision or CPU.
    run_mode = parser.add_mutually_exclusive_group()
    run_mode.add_argument('--fp16', action='store_true',
                          help='Run inference with FP16')
    run_mode.add_argument('--cpu', action='store_true',
                          help='Run inference on CPU')

    benchmark_options = (
        (('--log-file',),
         dict(type=str, default='nvlog.json', help='Filename for logging')),
        (('--stft-hop-length',),
         dict(type=int, default=256,
              help='STFT hop length for estimating audio length from mel size')),
        (('--num-iters',),
         dict(type=int, default=10, help='Number of iterations')),
        (('-il', '--input-length'),
         dict(type=int, default=64, help='Input length')),
        (('-bs', '--batch-size'),
         dict(type=int, default=1, help='Batch size')),
    )
    for flags, kwargs in benchmark_options:
        parser.add_argument(*flags, **kwargs)

    return parser
def print_stats(measurements_all):
    """
    Print aggregate statistics collected over the measured iterations.

    Args:
        measurements_all: dict mapping measurement names (as filled in by
            ``main``) to lists of per-iteration values.  The ``'latency'``
            list drives the percentile report.

    Prints throughput/latency averages and latency percentiles to stdout.
    If no iterations were measured (e.g. ``num_iters <= warmup_iters``),
    prints a notice and returns instead of raising on the empty lists.
    """
    throughput = measurements_all['throughput']
    preprocessing = measurements_all['pre_processing']
    type_conversion = measurements_all['type_conversion']
    storage = measurements_all['storage']
    data_transfer = measurements_all['data_transfer']
    # Post-processing time per iteration is the sum of its three cleanup
    # phases: dtype conversion, device-to-host transfer, and wav storage.
    postprocessing = [sum(p) for p in zip(type_conversion, storage, data_transfer)]
    # Sorted copy: the caller's list is left unmodified (the original
    # sorted in place).
    latency = sorted(measurements_all['latency'])
    waveglow_latency = measurements_all['waveglow_latency']
    tacotron2_latency = measurements_all['tacotron2_latency']
    denoiser_latency = measurements_all['denoiser_latency']
    num_mels_per_audio = measurements_all['num_mels_per_audio']

    if not latency:
        print("No measurements collected; skipping statistics.")
        return

    def _percentile(q):
        # max(latency[:int(n*q)]) on the ascending-sorted list equals the
        # element at index int(n*q) - 1.  Clamp the index to 0 so small
        # sample counts (where int(n*q) == 0) do not raise ValueError on
        # an empty slice, as the original max(latency[:0]) did.
        return latency[max(0, int(len(latency) * q) - 1)]

    cf_50 = _percentile(0.50)
    cf_90 = _percentile(0.90)
    cf_95 = _percentile(0.95)
    cf_99 = _percentile(0.99)
    cf_100 = _percentile(1.00)

    print("Throughput average (samples/sec) = {:.0f}".format(np.mean(throughput)))
    print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
    print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
    print("Number of mels per audio average = {:.0f}".format(np.mean(num_mels_per_audio)))
    print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
    print("WaveGlow latency average (seconds) = {:.2f}".format(np.mean(waveglow_latency)))
    print("Denoiser latency average (seconds) = {:.4f}".format(np.mean(denoiser_latency)))
    print("Latency average (seconds) = {:.2f}".format(np.mean(latency)))
    print("Latency std (seconds) = {:.2f}".format(np.std(latency)))
    print("Latency cl 50 (seconds) = {:.2f}".format(cf_50))
    print("Latency cl 90 (seconds) = {:.2f}".format(cf_90))
    print("Latency cl 95 (seconds) = {:.2f}".format(cf_95))
    print("Latency cl 99 (seconds) = {:.2f}".format(cf_99))
    print("Latency cl 100 (seconds) = {:.2f}".format(cf_100))
def main():
    """
    Launches text to speech (inference).

    Inference is executed on a single GPU or CPU.  Loads Tacotron2 and
    WaveGlow checkpoints, synthesizes a fixed text (truncated/replicated per
    ``--input-length`` / ``--batch-size``), times every pipeline stage with
    ``MeasureTime`` over ``--num-iters`` iterations, discards the first
    ``warmup_iters`` of them, logs per-iteration figures through DLLogger,
    writes the generated wavs to ``audio_<i>.wav``, and prints aggregate
    statistics.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "denoiser_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    # Fixed benchmark text; truncated to --input-length characters and
    # replicated --batch-size times so every iteration sees identical input.
    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts*args.batch_size

    # First iterations are excluded from the statistics (e.g. CUDA context
    # setup and allocator warm-up would skew them).
    warmup_iters = 3

    # 'iteration' rather than 'iter': avoids shadowing the builtin.
    for iteration in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu):
            audios = audios.float()

        with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                # Trim the padded waveform to the true audio length
                # estimated from the mel length and the STFT hop.
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        if iteration >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iteration-warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
# Script entry point: run the inference benchmark only when executed
# directly, not when this module is imported.
if __name__ == '__main__':
    main()
|