Răsfoiți Sursa

[Tacotron2/PyT] Updating for Ampere

Przemek Strzelczyk 5 ani în urmă
părinte
comite
f0c8bc571a
57 a modificat fișierele cu 550 adăugiri și 445 ștergeri
  1. 3 1
      PyTorch/SpeechSynthesis/Tacotron2/Dockerfile
  2. 254 118
      PyTorch/SpeechSynthesis/Tacotron2/README.md
  3. 6 5
      PyTorch/SpeechSynthesis/Tacotron2/common/stft.py
  4. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/common/utils.py
  5. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py
  6. 12 85
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py
  7. BIN
      PyTorch/SpeechSynthesis/Tacotron2/img/Taco2WG_train_loss.png
  8. BIN
      PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_amp_loss.png
  9. BIN
      PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_tf32_loss.png
  10. BIN
      PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_amp_loss.png
  11. BIN
      PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_tf32_loss.png
  12. 37 38
      PyTorch/SpeechSynthesis/Tacotron2/inference.py
  13. 8 8
      PyTorch/SpeechSynthesis/Tacotron2/notebooks/Tacotron2.ipynb
  14. 2 2
      PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/README.md
  15. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md
  16. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_1NGPU_train.sh
  17. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_4NGPU_train.sh
  18. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_8NGPU_train.sh
  19. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_1NGPU_train.sh
  20. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_4NGPU_train.sh
  21. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_8NGPU_train.sh
  22. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_1NGPU_train.sh
  23. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_4NGPU_train.sh
  24. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_8NGPU_train.sh
  25. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_1NGPU_train.sh
  26. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_4NGPU_train.sh
  27. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_8NGPU_train.sh
  28. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_1NGPU_train.sh
  29. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_4NGPU_train.sh
  30. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_8NGPU_train.sh
  31. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_1NGPU_train.sh
  32. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_4NGPU_train.sh
  33. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_8NGPU_train.sh
  34. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_1NGPU_train.sh
  35. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_4NGPU_train.sh
  36. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_8NGPU_train.sh
  37. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_1NGPU_train.sh
  38. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_4NGPU_train.sh
  39. 2 0
      PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_8NGPU_train.sh
  40. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_1GPU.sh
  41. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_4GPU.sh
  42. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_8GPU.sh
  43. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_1GPU.sh
  44. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_4GPU.sh
  45. 0 2
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_8GPU.sh
  46. 11 4
      PyTorch/SpeechSynthesis/Tacotron2/run_latency_tests.sh
  47. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/scripts/train_tacotron2.sh
  48. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/scripts/train_waveglow.sh
  49. 1 0
      PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py
  50. 49 76
      PyTorch/SpeechSynthesis/Tacotron2/test_infer.py
  51. 19 22
      PyTorch/SpeechSynthesis/Tacotron2/test_infer.sh
  52. 73 29
      PyTorch/SpeechSynthesis/Tacotron2/train.py
  53. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/trt/README.md
  54. 2 2
      PyTorch/SpeechSynthesis/Tacotron2/trt/export_onnx2trt.py
  55. 1 4
      PyTorch/SpeechSynthesis/Tacotron2/trt/inference_trt.py
  56. 7 16
      PyTorch/SpeechSynthesis/Tacotron2/waveglow/denoiser.py
  57. 23 17
      PyTorch/SpeechSynthesis/Tacotron2/waveglow/model.py

+ 3 - 1
PyTorch/SpeechSynthesis/Tacotron2/Dockerfile

@@ -1,6 +1,8 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD . /workspace/tacotron2
 WORKDIR /workspace/tacotron2
 RUN pip install --no-cache-dir -r requirements.txt
+
+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4

+ 254 - 118
PyTorch/SpeechSynthesis/Tacotron2/README.md

@@ -4,17 +4,18 @@ This repository provides a script and recipe to train Tacotron 2 and WaveGlow
 v1.6 models to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
 
 ## Table of Contents
-* [Model overview](#model-overview)
+- [Model overview](#model-overview)
    * [Model architecture](#model-architecture)
    * [Default configuration](#default-configuration)
    * [Feature support matrix](#feature-support-matrix)
       * [Features](#features)
    * [Mixed precision training](#mixed-precision-training)
       * [Enabling mixed precision](#enabling-mixed-precision)
-* [Setup](#setup)
+      * [Enabling TF32](#enabling-tf32)
+- [Setup](#setup)
    * [Requirements](#requirements)
-* [Quick Start Guide](#quick-start-guide)
-* [Advanced](#advanced)
+- [Quick Start Guide](#quick-start-guide)
+- [Advanced](#advanced)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Parameters](#parameters)
       * [Shared parameters](#shared-parameters)
@@ -27,20 +28,24 @@ v1.6 models to achieve state of the art accuracy, and is tested and maintained b
       * [Multi-dataset](#multi-dataset)
    * [Training process](#training-process)
    * [Inference process](#inference-process)
-* [Performance](#performance)
+- [Performance](#performance)
    * [Benchmarking](#benchmarking)
       * [Training performance benchmark](#training-performance-benchmark)
       * [Inference performance benchmark](#inference-performance-benchmark)
    * [Results](#results)
       * [Training accuracy results](#training-accuracy-results)
-         * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
+         * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+         * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
+         * [Training curves](#training-curves)
       * [Training performance results](#training-performance-results)
-         * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
+         * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+         * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
          * [Expected training time](#expected-training-time)
       * [Inference performance results](#inference-performance-results)
-         * [Inference performance: NVIDIA V100 16G](#inference-performance-nvidia-v100-16g)
+         * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
+         * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
          * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
-* [Release notes](#release-notes)
+- [Release notes](#release-notes)
    * [Changelog](#changelog)
    * [Known issues](#known-issues)
 
@@ -71,12 +76,12 @@ available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
 The Tacotron 2 and WaveGlow model enables you to efficiently synthesize high
 quality speech from text.
 
-Both models are trained with mixed precision using Tensor Cores on NVIDIA
-Volta and Turing GPUs. Therefore, researchers can get results 1.5x faster for Tacotron 2
-and 2.2x faster for WaveGlow than training without Tensor Cores, while
-experiencing the benefits of mixed precision training. The models are tested
-against each NGC monthly container release to ensure consistent accuracy and
-performance over time.
+Both models are trained with mixed precision using Tensor Cores on Volta,
+Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can
+get results 2.0x faster for Tacotron 2 and 3.1x faster for WaveGlow than
+training without Tensor Cores, while experiencing the benefits of mixed
+precision training. The models are tested against each NGC monthly
+container release to ensure consistent accuracy and performance over time.
 
 ### Model architecture
 
@@ -143,7 +148,7 @@ performance by overlapping communication with computation during `backward()`
 and bucketing smaller gradient transfers to reduce the total number of transfers
 required.
 
-## Mixed precision training
+### Mixed precision training
 
 *Mixed precision* is the combined use of different numerical precisions in a
 computational method. [Mixed precision](https://arxiv.org/abs/1710.03740)
@@ -151,7 +156,8 @@ training offers significant computational speedup by performing operations in
 half-precision format, while storing minimal information in single-precision
 to retain as much information as possible in critical parts of the network.
 Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores)
-in the Volta and Turing architecture, significant training speedups are
+in Volta, and following with both the Turing and Ampere architectures,
+significant training speedups are
 experienced by switching to mixed precision -- up to 3x overall speedup on
 the most arithmetically intense model architectures.  Using mixed precision
 training requires two steps:
@@ -170,7 +176,7 @@ documentation.
 blog.
 * APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
 
-### Enabling mixed precision
+#### Enabling mixed precision
 
 Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
 (AMP)  library from [APEX](https://github.com/NVIDIA/apex) that casts variables
@@ -183,7 +189,7 @@ to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.f
 
 By default, the `train_tacotron2.sh` and `train_waveglow.sh` scripts will
 launch mixed precision training with Tensor Cores. You can change this
-behaviour by removing the `--amp-run` flag from the `train.py` script.
+behaviour by removing the `--amp` flag from the `train.py` script.
 
 To enable mixed precision, the following steps were performed in the Tacotron 2 and
 WaveGlow models:
@@ -219,6 +225,18 @@ called `losses`):
             scaled_losses.backward()
         ```
 
+#### Enabling TF32
+
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
+
+
+
 ## Setup
 
 The following section lists the requirements in order to start training the
@@ -230,18 +248,21 @@ This repository contains Dockerfile which extends the PyTorch NGC container
 and encapsulates some dependencies. Aside from these dependencies, ensure you
 have the following components:
 
-* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 20.03-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
+- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+- [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
 or newer
-* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+- Supported GPUs:
+   - [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+   - [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
+   - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
 For more information about how to get started with NGC containers, see the
 following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
 Documentation:
 
-* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
-* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
-* [Running PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
+- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
+- [Running PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
 
 For those unable to use the PyTorch NGC container, to set up the required
 environment or create your own container, see the versioned
@@ -285,7 +306,7 @@ After you build the container image, you can start an interactive CLI session wi
    ```
 
    The `interactive.sh` script requires that the location on the dataset is specified.
-   For example, `LJSpeech-1.1`. To preprocess the datasets for Tacotron 2 training, use 
+   For example, `LJSpeech-1.1`. To preprocess the datasets for Tacotron 2 training, use
    the `./scripts/prepare_mels.sh` script:
    ```bash
    bash scripts/prepare_mels.sh
@@ -323,27 +344,27 @@ inference using the respective checkpoints that are passed as `--tacotron2`
 and `--waveglow` arguments. Tacotron2 and WaveGlow checkpoints can also be downloaded from NGC:
 
    https://ngc.nvidia.com/catalog/models/nvidia:tacotron2pyt_fp16/files?version=3
-   
+
    https://ngc.nvidia.com/catalog/models/nvidia:waveglow256pyt_fp16/files?version=2
 
    To run inference issue:
 
    ```bash
-   python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 -o output/ -i phrases/phrase.txt --amp-run
+   python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 -o output/ -i phrases/phrase.txt --fp16
    ```
 
    The speech is generated from lines of text in the file that is passed with
    `-i` argument. The number of lines determines inference batch size. To run
-   inference in mixed precision, use the `--amp-run` flag. The output audio will
+   inference in mixed precision, use the `--fp16` flag. The output audio will
    be stored in the path specified by the `-o` argument.
 
-   You can also run inference on CPU with TorchScript by adding flag --cpu-run:
+   You can also run inference on CPU with TorchScript by adding flag --cpu:
    ```bash
    export CUDA_VISIBLE_DEVICES=
    ```
    ```bash
-   python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 --cpu-run -o output/ -i phrases/phrase.txt
-   ```    
+   python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 --cpu -o output/ -i phrases/phrase.txt
+   ```
 
 ## Advanced
 
@@ -383,8 +404,8 @@ WaveGlow models.
 * `--epochs` - number of epochs (Tacotron 2: 1501, WaveGlow: 1001)
 * `--learning-rate` - learning rate (Tacotron 2: 1e-3, WaveGlow: 1e-4)
 * `--batch-size` - batch size (Tacotron 2 FP16/FP32: 104/48, WaveGlow FP16/FP32: 10/4)
-* `--amp-run` - use mixed precision training
-* `--cpu-run` - use CPU with TorchScript for inference
+* `--amp` - use mixed precision training
+* `--cpu` - use CPU with TorchScript for inference
 
 #### Shared audio/STFT parameters
 
@@ -482,24 +503,24 @@ models and input text as a text file, with one phrase per line.
 
 To run inference, issue:
 ```bash
-python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 -o output/ --include-warmup -i phrases/phrase.txt --amp-run
+python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 -o output/ --include-warmup -i phrases/phrase.txt --fp16
 ```
 Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained
-checkpoints for the respective models, and `phrases/phrase.txt` contains input 
-phrases. The number of text lines determines the inference batch size. Audio 
+checkpoints for the respective models, and `phrases/phrase.txt` contains input
+phrases. The number of text lines determines the inference batch size. Audio
 will be saved in the output folder. The audio files [audio_fp16](./audio/audio_fp16.wav)
-and [audio_fp32](./audio/audio_fp32.wav) were generated using checkpoints from 
+and [audio_fp32](./audio/audio_fp32.wav) were generated using checkpoints from
 mixed precision and FP32 training, respectively.
 
 You can find all the available options by calling `python inference.py --help`.
 
-You can also run inference on CPU with TorchScript by adding flag --cpu-run:
+You can also run inference on CPU with TorchScript by adding flag --cpu:
 ```bash
 export CUDA_VISIBLE_DEVICES=
 ```
 ```bash
-python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 --cpu-run -o output/ -i phrases/phrase.txt
-```    
+python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> --wn-channels 256 --cpu -o output/ -i phrases/phrase.txt
+```
 
 ## Performance
 
@@ -517,9 +538,9 @@ To benchmark the training performance on a specific batch size, run:
 * For 1 GPU
 	* FP16
         ```bash
-        python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path> --amp-run
+        python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path> --amp
         ```
-	* FP32
+	* TF32 (or FP32 if TF32 not enabled)
         ```bash
         python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path>
         ```
@@ -527,9 +548,9 @@ To benchmark the training performance on a specific batch size, run:
 * For multiple GPUs
 	* FP16
         ```bash
-        python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path> --amp-run
+        python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path> --amp
         ```
-	* FP32
+	* TF32 (or FP32 if TF32 not enabled)
         ```bash
         python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_subset_2500_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --dataset-path <dataset-path>
         ```
@@ -539,9 +560,9 @@ To benchmark the training performance on a specific batch size, run:
 * For 1 GPU
 	* FP16
         ```bash
-        python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
+        python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp
         ```
-	* FP32
+	* TF32 (or FP32 if TF32 not enabled)
         ```bash
         python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path>
         ```
@@ -549,9 +570,9 @@ To benchmark the training performance on a specific batch size, run:
 * For multiple GPUs
 	* FP16
         ```bash
-        python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
+        python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp
         ```
-	* FP32
+	* TF32 (or FP32 if TF32 not enabled)
         ```bash
         python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path>
         ```
@@ -566,9 +587,9 @@ To benchmark the inference performance on a batch size=1, run:
 
 * For FP16
     ```bash
-    python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ --include-warmup -i phrases/phrase_1_64.txt --amp-run --log-file=output/nvlog_fp16.json
+    python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ --include-warmup -i phrases/phrase_1_64.txt --fp16 --log-file=output/nvlog_fp16.json
     ```
-* For FP32
+* For TF32 (or FP32 if TF32 not enabled)
     ```bash
     python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ --include-warmup -i phrases/phrase_1_64.txt --log-file=output/nvlog_fp32.json
     ```
@@ -586,10 +607,43 @@ and accuracy in training and inference.
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+Our results were obtained by running the `./platform/DGXA100_{tacotron2,waveglow}_{AMP,TF32}_{1,4,8}NGPU_train.sh`
+training script in the PyTorch-20.06-py3 NGC container on
+NVIDIA DGX A100 (8x A100 40GB) GPUs.
+
+All of the results were produced using the `train.py` script as described in the
+[Training process](#training-process) section of this document. For each model,
+the loss is taken from a sample run.
+
+| Loss (Model/Epoch) |       1 |     250 |     500 |     750 |    1000 |
+| :----------------: | ------: | ------: | ------: | ------: | ------: |
+| Tacotron 2 FP16 | 3.82| 0.56| 0.42| 0.38| 0.35|
+| Tacotron 2 TF32 | 3.50| 0.54| 0.41| 0.37| 0.35|
+| WaveGlow FP16   | -3.31| -5.72| -5.87 | -5.94| -5.99
+| WaveGlow TF32   | -4.46| -5.93| -5.98| | |
 
-Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{AMP,FP32}_DGX1_16GB_8GPU.sh` training script in the PyTorch-19.06-py3
-NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
+![](./img/tacotron2_a100_amp_loss.png "Tacotron 2 FP16 loss")
+
+Figure 4. Tacotron 2 FP16 loss - batch size 128 (sample run)
+
+![](./img/tacotron2_a100_tf32_loss.png "Tacotron 2 TF32 loss")
+
+Figure 5. Tacotron 2 TF32 loss - batch size 128 (sample run)
+
+![](./img/waveglow_a100_amp_loss.png "WaveGlow FP16 loss")
+
+Figure 6. WaveGlow FP16 loss - batch size 10 (sample run)
+
+![](./img/waveglow_a100_tf32_loss.png "WaveGlow TF32 loss")
+
+Figure 7. WaveGlow TF32 loss - batch size 4 (sample run)
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
+
+Our results were obtained by running the `./platform/DGX1_{tacotron2,waveglow}_{AMP,TF32}_{1,4,8}NGPU_train.sh`
+training script in the PyTorch-20.06-py3 NGC container on
+NVIDIA DGX-1 with 8x V100 16G GPUs.
 
 All of the results were produced using the `train.py` script as described in the
 [Training process](#training-process) section of this document.
@@ -601,25 +655,78 @@ All of the results were produced using the `train.py` script as described in the
 | WaveGlow FP16  | -2.2054 | -5.7602 |  -5.901 | -5.9706 | -6.0258 |
 | WaveGlow FP32  | -3.0327 |  -5.858 | -6.0056 | -6.0613 | -6.1087 |
 
-Tacotron 2 FP16 loss - batch size 104 (mean and std over 16 runs)
 ![](./img/tacotron2_amp_loss.png "Tacotron 2 FP16 loss")
 
-Tacotron 2 FP32 loss - batch size 48 (mean and std over 16 runs)
+Figure 4. Tacotron 2 FP16 loss - batch size 104 (mean and std over 16 runs)
+
 ![](./img/tacotron2_fp32_loss.png "Tacotron 2 FP16 loss")
 
-WaveGlow FP16 loss - batch size 10 (mean and std over 16 runs)
+Figure 5. Tacotron 2 FP32 loss - batch size 48 (mean and std over 16 runs)
+
 ![](./img/waveglow_fp16_loss.png "WaveGlow FP16 loss")
 
-WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
+Figure 6. WaveGlow FP16 loss - batch size 10 (mean and std over 16 runs)
+
 ![](./img/waveglow_fp32_loss.png "WaveGlow FP32 loss")
 
+Figure 7. WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
+
+#### Training curves
+
+![](./img/Taco2WG_train_loss.png "Tacotron 2 and WaveGlow training loss")
+
+Figure 3. Tacotron 2 and WaveGlow training loss.
 
 #### Training performance results
 
-##### Training performance: NVIDIA DGX-1 (8x V100 16G)
+##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+
+Our results were obtained by running the `./platform/DGXA100_{tacotron2,waveglow}_{AMP,TF32}_{1,4,8}NGPU_train.sh`
+training script in the PyTorch-20.06-py3 NGC container on
+NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in output mel-spectrograms per second for
+Tacotron 2 and output samples per second for WaveGlow)
+were averaged over an entire training epoch.
+
+This table shows the results for Tacotron 2:
+
+|Number of GPUs|Batch size per GPU|Number of mels used with mixed precision|Number of mels used with TF32|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with TF32|
+|---:|---:|---:|---:|---:|---:|---:|
+|1| 128|  26,484|  31,499| 0.84| 1.00| 1.00|
+|4| 128| 107,482| 124,591| 0.86| 4.06| 3.96|
+|8| 128| 209,186| 250,556| 0.83| 7.90| 7.95|
+
+The following table shows the results for WaveGlow:
+
+|Number of GPUs|Batch size per GPU|Number of samples used with mixed precision|Number of samples used with TF32|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with TF32|
+|---:|---:|---:|---:|---:|---:|---:|
+|1| 10@FP16, 4@TF32 | 149,479|  67,581| 2.21| 1.00| 1.00|
+|4| 10@FP16, 4@TF32 | 532,363| 233,846| 2.28| 3.56| 3.46|
+|8| 10@FP16, 4@TF32 | 905,043| 383,043| 2.36| 6.05| 5.67|
+
+
+##### Expected training time
+
+The following table shows the expected training time for convergence for Tacotron 2 (1501 epochs):
 
-Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{AMP,FP32}_DGX1_16GB_8GPU.sh`
-training script in the PyTorch-19.12-py3 NGC container on NVIDIA DGX-1 with
+|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with TF32 (Hrs)|Speed-up with mixed precision|
+|---:|---:|---:|---:|---:|
+|1| 128| 112| 94| 0.84|
+|4| 128|  29| 25| 0.87|
+|8| 128|  16| 14| 0.84|
+
+  
+The following table shows the expected training time for convergence for WaveGlow (1001 epochs):
+
+|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with TF32 (Hrs)|Speed-up with mixed precision|
+|---:|---:|---:|---:|---:|
+|1| 10@FP16, 4@TF32 | 188| 416| 2.21|
+|4| 10@FP16, 4@TF32 |  54| 122| 2.27|
+|8| 10@FP16, 4@TF32 |  33|  75| 2.29|
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
+
+Our results were obtained by running the `./platform/DGX1_{tacotron2,waveglow}_{AMP,TF32}_{1,4,8}NGPU_train.sh`
+training script in the PyTorch-20.06-py3 NGC container on NVIDIA DGX-1 with
 8x V100 16G GPUs. Performance numbers (in output mel-spectrograms per second for
 Tacotron 2 and output samples per second for WaveGlow) were averaged over
 an entire training epoch.
@@ -628,17 +735,17 @@ This table shows the results for Tacotron 2:
 
 |Number of GPUs|Batch size per GPU|Number of mels used with mixed precision|Number of mels used with FP32|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
 |---:|---:|---:|---:|---:|---:|---:|
-|1|104@FP16, 48@FP32 | 15,313 | 9,674 | 1.58 | 1.00 | 1.00 |
-|4|104@FP16, 48@FP32 | 53,661 | 32,778 | 1.64 | 3.50 | 3.39 |
-|8|104@FP16, 48@FP32 | 100,422 | 59,549 | 1.69 | 6.56 | 6.16 |
+|1|104@FP16, 48@FP32|  15,891|  9,174| 1.73| 1.00| 1.00|
+|4|104@FP16, 48@FP32|  53,417| 32,035| 1.67| 3.36| 3.49|
+|8|104@FP16, 48@FP32| 115,032| 58,703| 1.96| 7.24| 6.40|
 
 The following table shows the results for WaveGlow:
 
 |Number of GPUs|Batch size per GPU|Number of samples used with mixed precision|Number of samples used with FP32|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
 |---:|---:|---:|---:|---:|---:|---:|
-|1| 10@FP16, 4@FP32 | 81,503 | 36,671 | 2.22 | 1.00 | 1.00 |
-|4| 10@FP16, 4@FP32 | 275,803 | 124,504 | 2.22 | 3.38 | 3.40 |
-|8| 10@FP16, 4@FP32 | 583,887 | 264,903 | 2.20 | 7.16 | 7.22 |
+|1| 10@FP16, 4@FP32 | 105,873|  33,761| 3.14| 1.00| 1.00|
+|4| 10@FP16, 4@FP32 | 364,471| 118,254| 3.08| 3.44| 3.50|
+|8| 10@FP16, 4@FP32 | 690,909| 222,794| 3.10| 6.53| 6.60|
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
@@ -648,99 +755,128 @@ The following table shows the expected training time for convergence for Tacotro
 
 |Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
 |---:|---:|---:|---:|---:|
-|1| 104@FP16, 48@FP32 | 193 | 312 | 1.62 |
-|4| 104@FP16, 48@FP32 | 53 | 85 | 1.58 |
-|8| 104@FP16, 48@FP32 | 31 | 45 | 1.47 |
+|1| 104@FP16, 48@FP32| 181| 333| 1.84|
+|4| 104@FP16, 48@FP32|  53|  88| 1.66|
+|8| 104@FP16, 48@FP32|  31|  48| 1.56|
 
 The following table shows the expected training time for convergence for WaveGlow (1001 epochs):
 
 |Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
 |---:|---:|---:|---:|---:|
-|1| 10@FP16, 4@FP32 | 347 | 768 | 2.21 |
-|4| 10@FP16, 4@FP32 | 106 | 231 | 2.18 |
-|8| 10@FP16, 4@FP32 | 49 | 105 | 2.16 |
+|1| 10@FP16, 4@FP32 | 249| 793| 3.18|
+|4| 10@FP16, 4@FP32 |  78| 233| 3.00|
+|8| 10@FP16, 4@FP32 |  48| 127| 2.98|
 
 #### Inference performance results
 
 The following tables show inference statistics for the Tacotron2 and WaveGlow
-text-to-speech system, gathered from 1000 inference runs, on 1 V100 and 1 T4,
+text-to-speech system, gathered from 1000 inference runs, on 1x A100, 1x V100 and 1x T4,
 respectively. Latency is measured from the start of Tacotron 2 inference to
 the end of WaveGlow inference. The tables include average latency, latency standard
 deviation, and latency confidence intervals. Throughput is measured
 as the number of generated audio samples per second. RTF is the real-time factor
 which tells how many seconds of speech are generated in 1 second of compute.
 
-##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
+##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
 
-|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|1| 128| FP16| 1.27| 0.06| 1.34| 1.38| 1.41| 121,190| 1.37| 603| 7.00| 5.51|
-|4| 128| FP16| 2.32| 0.09| 2.42| 2.45| 2.59| 277,711| 2.03| 628| 7.23| 3.12|
-|1| 128| FP32| 1.70| 0.05| 1.77| 1.79| 1.84|  88,650| 1.00| 590| 6.85| 4.03|
-|4| 128| FP32| 4.56| 0.12| 4.72| 4.77| 4.87| 136,518| 1.00| 608| 7.06| 1.55|
+Our results were obtained by running the `./run_latency_tests.sh` inferencing
+benchmarking script in the PyTorch-20.06-py3 NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU.
 
-##### Inference performance: NVIDIA T4
+|Batch size|Input length|Precision|WN channels|Avg latency (s)|Latency std (s)|Latency confidence interval 50% (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16| 256| 0.80| 0.02| 0.80| 0.83| 0.84| 0.86| 192,086| 1.08| 602| 6.99| 8.74|
+|4| 128| FP16| 256| 1.05| 0.03| 1.05| 1.09| 1.10| 1.13| 602,856| 1.20| 619| 7.19| 6.85|
+|1| 128| FP32| 256| 0.87| 0.02| 0.87| 0.90| 0.91| 0.93| 177,210| 1.00| 601| 6.98| 8.02|
+|4| 128| FP32| 256| 1.27| 0.03| 1.26| 1.31| 1.32| 1.35| 500,458| 1.00| 620| 7.20| 5.67|
+|1| 128| FP16| 512| 0.87| 0.02| 0.87| 0.90| 0.92| 0.94| 176,135| 1.12| 601| 6.98| 8.02|
+|4| 128| FP16| 512| 1.37| 0.03| 1.36| 1.42| 1.43| 1.45| 462,691| 1.32| 619| 7.19| 5.25|
+|1| 128| FP32| 512| 0.98| 0.03| 0.98| 1.02| 1.03| 1.07| 156,586| 1.00| 602| 6.99| 7.13|
+|4| 128| FP32| 512| 1.81| 0.05| 1.79| 1.86| 1.90| 1.93| 351,465| 1.00| 620| 7.20| 3.98|
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
 
-|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|1| 128| FP16|  3.13| 0.13|  3.28|  3.36|  3.46| 49,276| 1.26| 602| 6.99| 2.24|
-|4| 128| FP16| 11.98| 0.42| 12.44| 12.70| 13.29| 53,676| 1.23| 628| 7.29| 0.61| 
-|1| 128| FP32|  3.88| 0.12|  4.04|  4.09|  4.19| 38,964| 1.00| 591| 6.86| 1.77|
-|4| 128| FP32| 14.34| 0.42| 14.89| 15.08| 15.55| 43,489| 1.00| 609| 7.07| 0.49|
+|Batch size|Input length|Precision|WN channels|Avg latency (s)|Latency std (s)|Latency confidence interval 50% (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16| 256| 1.14| 0.07| 1.12| 1.20| 1.33| 1.40| 136,069| 1.58| 602| 6.99| 6.13|
+|4| 128| FP16| 256| 1.52| 0.05| 1.52| 1.58| 1.61| 1.65| 416,688| 1.72| 619| 7.19| 4.73|
+|1| 128| FP32| 256| 1.79| 0.06| 1.78| 1.86| 1.89| 1.99|  86,175| 1.00| 602| 6.99| 3.91|
+|4| 128| FP32| 256| 2.61| 0.07| 2.61| 2.71| 2.74| 2.78| 242,656| 1.00| 619| 7.19| 2.75|
+|1| 128| FP16| 512| 1.25| 0.08| 1.23| 1.32| 1.44| 1.50| 124,057| 1.90| 602| 6.99| 5.59|
+|4| 128| FP16| 512| 2.11| 0.06| 2.10| 2.19| 2.22| 2.29| 300,505| 2.37| 620| 7.20| 3.41|
+|1| 128| FP32| 512| 2.36| 0.08| 2.35| 2.46| 2.54| 2.61|  65,239| 1.00| 601| 6.98| 2.96|
+|4| 128| FP32| 512| 5.00| 0.14| 4.96| 5.18| 5.26| 5.42| 126,810| 1.00| 618| 7.18| 1.44|
 
 
+##### Inference performance: NVIDIA T4
+
+|Batch size|Input length|Precision|WN channels|Avg latency (s)|Latency std (s)|Latency confidence interval 50% (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16| 256|  1.23| 0.05|  1.22|  1.29|  1.33|  1.42| 125,397| 2.46| 602| 6.99| 5.68|
+|4| 128| FP16| 256|  2.85| 0.08|  2.84|  2.96|  2.99|  3.07| 222,672| 1.90| 620| 7.20| 2.53|
+|1| 128| FP32| 256|  3.03| 0.10|  3.02|  3.14|  3.19|  3.32|  50,900| 1.00| 602| 6.99| 2.31|
+|4| 128| FP32| 256|  5.41| 0.15|  5.38|  5.61|  5.66|  5.85| 117,325| 1.00| 620| 7.20| 1.33|
+|1| 128| FP16| 512|  1.75| 0.08|  1.73|  1.87|  1.91|  1.98|  88,319| 2.79| 602| 6.99| 4.00|
+|4| 128| FP16| 512|  4.59| 0.13|  4.57|  4.77|  4.83|  4.94| 138,226| 2.84| 620| 7.20| 1.57|
+|1| 128| FP32| 512|  4.87| 0.14|  4.86|  5.03|  5.13|  5.27|  31,630| 1.00| 602| 6.99| 1.44|
+|4| 128| FP32| 512| 13.02| 0.37| 12.96| 13.53| 13.67| 14.13|  48,749| 1.00| 620| 7.20| 0.55|
+
 Our results were obtained by running the `./run_latency_tests.sh` script in
-the PyTorch-19.09-py3 NGC container. Please note that to reproduce the results,
+the PyTorch-20.06-py3 NGC container. Please note that to reproduce the results,
 you need to provide pretrained checkpoints for Tacotron 2 and WaveGlow. Please
 edit the script to provide your checkpoint filenames.
 
 
-To compare with inference performance on CPU with TorchScript, benchmark inference on CPU using `./run_latency_tests_cpu.sh` script and get the performance numbers for batch size 1 and 4. Intel's optimization for PyTorch on CPU are added, you need to set "export OMP_NUM_THREADS=num physical cores" based on your CPU's core number, for your reference: https://software.intel.com/content/www/us/en/develop/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html
+To compare with inference performance on CPU with TorchScript, benchmark inference on CPU using the `./run_latency_tests_cpu.sh` script and get the performance numbers for batch size 1 and 4. Intel's optimizations for PyTorch on CPU are enabled; you need to set `export OMP_NUM_THREADS=<num physical cores>` based on the number of physical cores in your CPU. For reference, see: https://software.intel.com/content/www/us/en/develop/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html
 
 
 ## Release notes
 
 ### Changelog
 
-March 2019
-* Initial release
+June 2020
+* Updated performance tables to include A100 results
 
-June 2019
-* AMP support
-* Data preprocessing for Tacotron 2 training
-* Fixed dropouts on LSTMCells
+March 2020
+* Added Tacotron 2 and WaveGlow inference using TensorRT Inference Server with custom TensorRT backend in `trtis_cpp`
+* Added Conversational AI demo script in `notebooks/conversationalai`
+* Fixed loading CUDA RNG state in `load_checkpoint()` function in `train.py`
+* Fixed FP16 export to TensorRT in `trt/README.md`
 
-July 2019
-* Changed measurement units for Tacotron 2 training and inference performance
-benchmarks from input tokes per second to output mel-spectrograms per second
-* Introduced batched inference
-* Included warmup in the inference script
+January 2020
+* Updated batch sizes and performance results for Tacotron 2.
 
-August 2019
-* Fixed inference results
-* Fixed initialization of Batch Normalization
+December 2019
+* Added export and inference scripts for TensorRT. See [Tacotron2 TensorRT README](trt/README.md).
 
-September 2019
-* Introduced inference statistics
+November 2019
+* Implemented training resume from checkpoint
+* Added notebook for running Tacotron 2 and WaveGlow in TRTIS.
 
 October 2019
 * Tacotron 2 inference with torch.jit.script
 
-November 2019
-* Implemented training resume from checkpoint
-* Added notebook for running Tacotron 2 and WaveGlow in TRTIS.
+September 2019
+* Introduced inference statistics
 
-December 2019
-* Added export and inference scripts for TensorRT. See [Tacotron2 TensorRT README](trt/README.md).
+August 2019
+* Fixed inference results
+* Fixed initialization of Batch Normalization
+
+July 2019
+* Changed measurement units for Tacotron 2 training and inference performance
benchmarks from input tokens per second to output mel-spectrograms per second
+* Introduced batched inference
+* Included warmup in the inference script
+
+June 2019
+* AMP support
+* Data preprocessing for Tacotron 2 training
+* Fixed dropouts on LSTMCells
+
+March 2019
+* Initial release
 
-January 2020
-* Updated batch sizes and performance results for Tacotron 2.
 
-March 2020
-* Added Tacotron 2 and WaveGlow inference using TensorRT Inference Server with custom TensorRT backend in `trtis_cpp`
-* Added Conversational AI demo script in `notebooks/conversationalai`
-* Fixed loading CUDA RNG state in `load_checkpoint()` function in `train.py`
-* Fixed FP16 export to TensorRT in `trt/README.md`
 
 ### Known issues
 

+ 6 - 5
PyTorch/SpeechSynthesis/Tacotron2/common/stft.py

@@ -108,11 +108,12 @@ class STFT(torch.nn.Module):
         recombine_magnitude_phase = torch.cat(
             [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
 
-        inverse_transform = F.conv_transpose1d(
-            recombine_magnitude_phase,
-            Variable(self.inverse_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0)
+        inverse_transform = F.conv_transpose2d(
+            recombine_magnitude_phase.unsqueeze(-1),
+            Variable(self.inverse_basis.unsqueeze(-1), requires_grad=False),
+            stride=(self.hop_length,1),
+            padding=(0,0))
+        inverse_transform = inverse_transform.squeeze(-1)
 
         if self.window is not None:
             window_sum = window_sumsquare(

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/common/utils.py

@@ -63,4 +63,4 @@ def to_gpu(x):
 
     if torch.cuda.is_available():
         x = x.cuda(non_blocking=True)
-    return torch.autograd.Variable(x)
+    return x

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py

@@ -297,7 +297,7 @@ def main():
     args, _ = parser.parse_known_args()
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     amp_run=args.fp16, cpu_run=False)
+                                     fp16_run=args.fp16, cpu_run=False)
 
     opset_version = 10
 

+ 12 - 85
PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py

@@ -49,72 +49,6 @@ def parse_args(parser):
     return parser
 
 
-def convert_convinv_1d_to_2d(convinv):
-    """
-    Takes an invertible 1x1 1-d convolution and returns a 2-d convolution that does
-    the inverse
-    """
-    conv2d = torch.nn.Conv2d(convinv.W_inverse.size(1),
-                             convinv.W_inverse.size(0),
-                             1, bias=False)
-    conv2d.weight.data[:,:,:,0] = convinv.W_inverse.data
-    return conv2d
-
-
-def convert_conv_1d_to_2d(conv1d):
-    conv2d = torch.nn.Conv2d(conv1d.weight.size(1),
-                             conv1d.weight.size(0),
-                             (conv1d.weight.size(2), 1),
-                             stride=(conv1d.stride[0], 1),
-                             dilation=(conv1d.dilation[0], 1),
-                             padding=(conv1d.padding[0], 0))
-    conv2d.weight.data[:,:,:,0] = conv1d.weight.data
-    conv2d.bias.data = conv1d.bias.data
-    return conv2d
-
-
-def convert_WN_1d_to_2d_(WN):
-    """
-    Modifies the WaveNet like affine coupling layer in-place to use 2-d convolutions
-    """
-    WN.start = convert_conv_1d_to_2d(WN.start)
-    WN.end = convert_conv_1d_to_2d(WN.end)
-
-    for i in range(len(WN.in_layers)):
-        WN.in_layers[i] = convert_conv_1d_to_2d(WN.in_layers[i])
-
-    for i in range(len(WN.res_skip_layers)):
-        WN.res_skip_layers[i] = convert_conv_1d_to_2d(WN.res_skip_layers[i])
-
-    for i in range(len(WN.res_skip_layers)):
-        WN.cond_layers[i] = convert_conv_1d_to_2d(WN.cond_layers[i])
-
-def convert_1d_to_2d_(glow):
-    """
-    Caffe2 and TensorRT don't seem to support 1-d convolutions or properly
-    convert ONNX exports with 1d convolutions to 2d convolutions yet, so we
-    do the conversion to 2-d convolutions before ONNX export
-    """
-    # Convert upsample to 2d
-    upsample = torch.nn.ConvTranspose2d(glow.upsample.weight.size(0),
-                                        glow.upsample.weight.size(1),
-                                        (glow.upsample.weight.size(2), 1),
-                                        stride=(glow.upsample.stride[0], 1))
-    upsample.weight.data[:,:,:,0] = glow.upsample.weight.data
-    upsample.bias.data = glow.upsample.bias.data
-    glow.upsample = upsample.cuda()
-
-    # Convert WN to 2d
-    for WN in glow.WN:
-        convert_WN_1d_to_2d_(WN)
-
-    # Convert invertible conv to 2d
-    for i in range(len(glow.convinv)):
-        glow.convinv[i] = convert_convinv_1d_to_2d(glow.convinv[i])
-
-    glow.cuda()
-
-
 def infer_onnx(self, spect, z, sigma=0.9):
 
     spect = self.upsample(spect)
@@ -126,37 +60,33 @@ def infer_onnx(self, spect, z, sigma=0.9):
     mel_dim = 80
     batch_size = spect.size(0)
 
-    spect = torch.squeeze(spect, 3)
     spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group))
     spect = spect.permute(0, 2, 1, 3)
     spect = spect.contiguous()
     spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim))
     spect = spect.permute(0, 2, 1)
-    spect = torch.unsqueeze(spect, 3)
     spect = spect.contiguous()
 
-    audio = z[:, :self.n_remaining_channels, :, :]
-    z = z[:, self.n_remaining_channels:self.n_group, :, :]
+    audio = z[:, :self.n_remaining_channels, :]
+    z = z[:, self.n_remaining_channels:self.n_group, :]
     audio = sigma*audio
 
     for k in reversed(range(self.n_flows)):
         n_half = int(audio.size(1) / 2)
-        audio_0 = audio[:, :n_half, :, :]
-        audio_1 = audio[:, n_half:(n_half+n_half), :, :]
+        audio_0 = audio[:, :n_half, :]
+        audio_1 = audio[:, n_half:(n_half+n_half), :]
 
         output = self.WN[k]((audio_0, spect))
-        s = output[:, n_half:(n_half+n_half), :, :]
-        b = output[:, :n_half, :, :]
+        s = output[:, n_half:(n_half+n_half), :]
+        b = output[:, :n_half, :]
         audio_1 = (audio_1 - b) / torch.exp(s)
         audio = torch.cat([audio_0, audio_1], 1)
-
-        audio = self.convinv[k](audio)
+        audio = self.convinv[k].infer(audio)
 
         if k % self.n_early_every == 0 and k > 0:
-            audio = torch.cat((z[:, :self.n_early_size, :, :], audio), 1)
-            z = z[:, self.n_early_size:self.n_group, :, :]
+            audio = torch.cat((z[:, :self.n_early_size, :], audio), 1)
+            z = z[:, self.n_early_size:self.n_group, :]
 
-    audio = torch.squeeze(audio, 3)
     audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group))
 
     return audio
@@ -165,7 +95,7 @@ def infer_onnx(self, spect, z, sigma=0.9):
 def export_onnx(parser, args):
 
     waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
-                                    amp_run=args.fp16, cpu_run=False,
+                                    fp16_run=args.fp16, cpu_run=False,
                                     forward_is_infer=False)
 
     # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
@@ -173,7 +103,7 @@ def export_onnx(parser, args):
     stride = 256 # value from waveglow upsample
     n_group = 8
     z_size2 = (mel.size(2)*stride)//n_group
-    z = torch.randn(1, n_group, z_size2, 1).cuda()
+    z = torch.randn(1, n_group, z_size2).cuda()
 
     if args.fp16:
         mel = mel.half()
@@ -183,16 +113,13 @@ def export_onnx(parser, args):
         waveglow.infer(mel, sigma=args.sigma_infer)
 
         # export to ONNX
-        convert_1d_to_2d_(waveglow)
         if args.fp16:
             waveglow = waveglow.half()
 
         fType = types.MethodType
         waveglow.forward = fType(infer_onnx, waveglow)
 
-        mel = mel.unsqueeze(3)
-
-        opset_version = 10
+        opset_version = 12
 
         torch.onnx.export(waveglow, (mel, z), args.output+"/"+"waveglow.onnx",
                           opset_version=opset_version,

BIN
PyTorch/SpeechSynthesis/Tacotron2/img/Taco2WG_train_loss.png


BIN
PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_amp_loss.png


BIN
PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_tf32_loss.png


BIN
PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_amp_loss.png


BIN
PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_tf32_loss.png


+ 37 - 38
PyTorch/SpeechSynthesis/Tacotron2/inference.py

@@ -31,6 +31,7 @@ import torch
 import argparse
 import numpy as np
 from scipy.io.wavfile import write
+import matplotlib
 import matplotlib.pyplot as plt
 
 import sys
@@ -39,8 +40,6 @@ import time
 import dllogger as DLLogger
 from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
 
-from apex import amp
-
 from waveglow.denoiser import Denoiser
 
 def parse_args(parser):
@@ -60,16 +59,19 @@ def parse_args(parser):
     parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
     parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
                         help='Sampling rate')
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+
+    run_mode = parser.add_mutually_exclusive_group()
+    run_mode.add_argument('--fp16', action='store_true',
+                        help='Run inference with mixed precision')
+    run_mode.add_argument('--cpu', action='store_true',
+                        help='Run inference on CPU')
+
     parser.add_argument('--log-file', type=str, default='nvlog.json',
                         help='Filename for logging')
     parser.add_argument('--include-warmup', action='store_true',
                         help='Include warmup')
     parser.add_argument('--stft-hop-length', type=int, default=256,
                         help='STFT hop length for estimating audio length from mel size')
-    parser.add_argument('--cpu-run', action='store_true', 
-                        help='Run inference on CPU')
 
     return parser
 
@@ -103,18 +105,18 @@ def unwrap_distributed(state_dict):
     return new_state_dict
 
 
-def load_and_setup_model(model_name, parser, checkpoint, amp_run, cpu_run, forward_is_infer=False):
+def load_and_setup_model(model_name, parser, checkpoint, fp16_run, cpu_run, forward_is_infer=False):
     model_parser = models.parse_model_args(model_name, parser, add_help=False)
     model_args, _ = model_parser.parse_known_args()
     model_config = models.get_model_config(model_name, model_args)
-    model = models.get_model(model_name, model_config, cpu_run, forward_is_infer=forward_is_infer)
-    
+    model = models.get_model(model_name, model_config, to_cuda=(not cpu_run),
+                             forward_is_infer=forward_is_infer)
+
     if checkpoint is not None:
         if cpu_run:
             state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))['state_dict']
         else:
             state_dict = torch.load(checkpoint)['state_dict']
-            
         if checkpoint_from_distributed(state_dict):
             state_dict = unwrap_distributed(state_dict)
 
@@ -125,7 +127,7 @@ def load_and_setup_model(model_name, parser, checkpoint, amp_run, cpu_run, forwa
 
     model.eval()
 
-    if amp_run:
+    if fp16_run:
         model.half()
 
     return model
@@ -156,29 +158,29 @@ def prepare_input_sequence(texts, cpu_run=False):
             text_to_sequence(text, ['english_cleaners'])[:]))
 
     text_padded, input_lengths = pad_sequences(d)
-    if torch.cuda.is_available() and not cpu_run:
-        text_padded = torch.autograd.Variable(text_padded).cuda().long()
-        input_lengths = torch.autograd.Variable(input_lengths).cuda().long()
+    if not cpu_run:
+        text_padded = text_padded.cuda().long()
+        input_lengths = input_lengths.cuda().long()
     else:
-        text_padded = torch.autograd.Variable(text_padded).long()
-        input_lengths = torch.autograd.Variable(input_lengths).long()
+        text_padded = text_padded.long()
+        input_lengths = input_lengths.long()
 
     return text_padded, input_lengths
 
 
 class MeasureTime():
-    def __init__(self, measurements, key, cpu_run):
+    def __init__(self, measurements, key, cpu_run=False):
         self.measurements = measurements
         self.key = key
         self.cpu_run = cpu_run
 
     def __enter__(self):
-        if self.cpu_run == False:
+        if not self.cpu_run:
             torch.cuda.synchronize()
         self.t0 = time.perf_counter()
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
-        if self.cpu_run == False:
+        if not self.cpu_run:
             torch.cuda.synchronize()
         self.measurements[self.key] = time.perf_counter() - self.t0
 
@@ -201,14 +203,12 @@ def main():
     DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     args.amp_run, args.cpu_run, forward_is_infer=True)
+                                     args.fp16, args.cpu, forward_is_infer=True)
     waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
-                                    args.amp_run, args.cpu_run, forward_is_infer=True)
-    
-    if args.cpu_run:
-        denoiser = Denoiser(waveglow, args.cpu_run)
-    else:
-        denoiser = Denoiser(waveglow, args.cpu_run).cuda()
+                                    args.fp16, args.cpu, forward_is_infer=True)
+    denoiser = Denoiser(waveglow)
+    if not args.cpu:
+        denoiser.cuda()
 
     jitted_tacotron2 = torch.jit.script(tacotron2)
 
@@ -221,14 +221,11 @@ def main():
         sys.exit(1)
 
     if args.include_warmup:
-        if args.cpu_run:
-            sequence = torch.randint(low=0, high=148, size=(1,50),
-                                 dtype=torch.long)
-            input_lengths = torch.IntTensor([sequence.size(1)]).long()
-        else:
-            sequence = torch.randint(low=0, high=148, size=(1,50),
-                                 dtype=torch.long).cuda()
-            input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
+        sequence = torch.randint(low=0, high=148, size=(1,50)).long()
+        input_lengths = torch.IntTensor([sequence.size(1)]).long()
+        if not args.cpu:
+            sequence = sequence.cuda()
+            input_lengths = input_lengths.cuda()
         for i in range(3):
             with torch.no_grad():
                 mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths)
@@ -236,14 +233,15 @@ def main():
 
     measurements = {}
 
-    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)
+    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)
 
-    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu_run):
+    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu):
         mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded, input_lengths)
 
-    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu_run):
+    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
         audios = waveglow(mel, sigma=args.sigma_infer)
         audios = audios.float()
+    with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
         audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
 
     print("Stopping after",mel.size(2),"decoder steps")
@@ -255,7 +253,8 @@ def main():
     DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
     DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
     DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
-    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time'])})
+    DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']})
+    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time']+measurements['denoiser_time'])})
 
     for i, audio in enumerate(audios):
 

+ 8 - 8
PyTorch/SpeechSynthesis/Tacotron2/notebooks/Tacotron2.ipynb

@@ -193,7 +193,7 @@
     "\n",
     "The training loss is averaged over an entire training epoch, whereas the validation loss is averaged over the validation dataset. Performance is reported in total input tokens per second for the Tacotron 2 model, and in total output samples per second for the WaveGlow model. Both measures are recorded as train_iter_items/sec (after each iteration) and train_epoch_items/sec (averaged over epoch) in the output log. The result is averaged over an entire training epoch and summed over all GPUs that were included in the training.\n",
     "\n",
-    "By default, the train_tacotron2.sh and train_waveglow.sh scripts will launch mixed precision training with tensor cores. You can change this behaviour by removing the --amp-run flag from the train.py script.\n",
+    "By default, the train_tacotron2.sh and train_waveglow.sh scripts will launch mixed precision training with tensor cores. You can change this behaviour by removing the --amp flag from the train.py script.\n",
     "\n",
     "To run Tacotron 2 training:"
    ]
@@ -205,7 +205,7 @@
    "outputs": [],
    "source": [
     "#For single GPU \n",
-    "!nvidia-docker exec -it myTacotron2 python train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1500 -bs 80 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp-run "
+    "!nvidia-docker exec -it myTacotron2 python train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1500 -bs 80 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp "
    ]
   },
   {
@@ -215,7 +215,7 @@
    "outputs": [],
    "source": [
     "#For multiple GPUs\n",
-    "!nvidia-docker exec -it myTacotron2 python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1500 -bs 80 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled. I a --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp-run "
+    "!nvidia-docker exec -it myTacotron2 python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1500 -bs 80 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp "
    ]
   },
   {
@@ -232,7 +232,7 @@
    "outputs": [],
    "source": [
     "#For single GPU\n",
-    "!nvidia-docker exec -it myTacotron2 python train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1000 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --epochs-per-checkpoint 50 --cudnn-enabled --cudnn-benchmark --log-file output/nvlog.json --amp-run"
+    "!nvidia-docker exec -it myTacotron2 python train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1000 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --epochs-per-checkpoint 50 --cudnn-enabled --cudnn-benchmark --log-file output/nvlog.json --amp"
    ]
   },
   {
@@ -242,7 +242,7 @@
    "outputs": [],
    "source": [
     "#For multiple GPUs\n",
-    "!nvidia-docker exec -it myTacotron2 python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1000 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --epochs-per-checkpoint 50 --cudnn-enabled --cudnn-benchmark --log-file output/nvlog.json --amp-run"
+    "!nvidia-docker exec -it myTacotron2 python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1000 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --epochs-per-checkpoint 50 --cudnn-enabled --cudnn-benchmark --log-file output/nvlog.json --amp"
    ]
   },
   {
@@ -377,7 +377,7 @@
     "\n",
     "The output audio will be stored in the path specified by -o argument.\n",
     "\n",
-    "To run inference in mixed precision, use --amp-run flag:   "
+    "To run inference in mixed precision, use --fp16 flag:   "
    ]
   },
   {
@@ -386,7 +386,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!nvidia-docker exec -it myTacotron2 python inference.py --tacotron2 JoC_Tacotron2_FP16_PyT_20190306 --max-decoder-steps 2000 --waveglow JoC_WaveGlow_FP32_PyT_20190306 -o output/ --include-warmup -i text.txt --amp-run"
+    "!nvidia-docker exec -it myTacotron2 python inference.py --tacotron2 JoC_Tacotron2_FP16_PyT_20190306 --max-decoder-steps 2000 --waveglow JoC_WaveGlow_FP32_PyT_20190306 -o output/ --include-warmup -i text.txt --fp16"
    ]
   },
   {
@@ -411,7 +411,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To run inference using FP32, simply remove --amp-run flag: "
+    "To run inference using FP32, simply remove --fp16 flag: "
    ]
   },
   {

+ 2 - 2
PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/README.md

@@ -178,7 +178,7 @@ Export Tacotron 2 to TorchScript:
 ```bash
 cd /workspace/tacotron2/
 mkdir -p output
-python exports/export_tacotron2_ts.py --tacotron2 checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/model.pt --amp-run
+python exports/export_tacotron2_ts.py --tacotron2 checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/model.pt --amp
 ```
 
 To export WaveGlow to TensorRT 7, install ONNX-TRT
@@ -194,7 +194,7 @@ cd /workspace/tacotron2
 Export WaveGlow to ONNX intermediate representation:
 
 ```bash
-python exports/export_waveglow_onnx.py --waveglow checkpoints/nvidia_waveglow256pyt_fp16 --wn-channels 256 --amp-run -o output/
+python exports/export_waveglow_onnx.py --waveglow checkpoints/nvidia_waveglow256pyt_fp16 --wn-channels 256 --fp16 -o output/
 ```
 
 Use the exported ONNX IR to generate TensorRT engine:

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md

@@ -34,7 +34,7 @@ file by executing points 1-5. You have to train WaveGlow in a different way than
 the following command instead of the one given in QuickStart at point 5:
 
 ```bash
-python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 2001 --wn-channels 256 -bs 12 --segment-length 16000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 2001 --wn-channels 256 -bs 12 --segment-length 16000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
 ```
 
 This will train the WaveGlow model with a smaller number of residual connections

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_AMP_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_1GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_1NGPU_train.sh


+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_4GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_4NGPU_train.sh


+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_8GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_tacotron2_FP32_8NGPU_train.sh


+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_AMP_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_1GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_1NGPU_train.sh


+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_4GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_4NGPU_train.sh


+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_8GPU.sh → PyTorch/SpeechSynthesis/Tacotron2/platform/DGX1_waveglow_FP32_8NGPU_train.sh


+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_AMP_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ --amp -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_tacotron2_TF32_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_AMP_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_1NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_4NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 2 - 0
PyTorch/SpeechSynthesis/Tacotron2/platform/DGXA100_waveglow_TF32_8NGPU_train.sh

@@ -0,0 +1,2 @@
+mkdir -p output
+python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_1GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_4GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_8GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_1GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_4GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_8GPU.sh

@@ -1,2 +0,0 @@
-mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 11 - 4
PyTorch/SpeechSynthesis/Tacotron2/run_latency_tests.sh

@@ -1,4 +1,11 @@
-bash test_infer.sh -bs 1 -il 128 -p amp --num-iters 1003 --tacotron2 tacotron2_1032590_6000_amp --waveglow waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 4 -il 128 -p amp --num-iters 1003 --tacotron2 tacotron2_1032590_6000_amp --waveglow waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 1 -il 128 -p fp32 --num-iters 1003 --tacotron2 tacotron2_1032590_6000_amp --waveglow waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 4 -il 128 -p fp32 --num-iters 1003 --tacotron2 tacotron2_1032590_6000_amp --waveglow waveglow_1076430_14000_amp --wn-channels 256
+unset CUDA_VISIBLE_DEVICES
+bash test_infer.sh -bs 1 -il 128 --fp16 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
+bash test_infer.sh -bs 4 -il 128 --fp16 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
+bash test_infer.sh -bs 1 -il 128 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
+bash test_infer.sh -bs 4 -il 128 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
+export CUDA_VISIBLE_DEVICES=
+export OMP_NUM_THREADS=6
+export KMP_BLOCKTIME=0
+export KMP_AFFINITY=granularity=fine,compact,1,0
+bash test_infer.sh -bs 1 -il 128 --cpu --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
+bash test_infer.sh -bs 4 -il 128 --cpu --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/scripts/train_tacotron2.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o ./output/ -lr 1e-3 --epochs 1501 -bs 104 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp-run
+python -m multiproc train.py -m Tacotron2 -o ./output/ -lr 1e-3 --epochs 1501 -bs 48 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/scripts/train_waveglow.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o ./output/ -lr 1e-4 --epochs 1001 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --amp-run
+python -m multiproc train.py -m WaveGlow -o ./output/ -lr 1e-4 --epochs 1501 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json

+ 1 - 0
PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

@@ -535,6 +535,7 @@ class Decoder(nn.Module):
          attention_weights_cum,
          attention_context,
          processed_memory) = self.initialize_decoder_states(memory)
+
         mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=memory.device)
         not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=memory.device)
 

+ 49 - 76
PyTorch/SpeechSynthesis/Tacotron2/test_infer.py

@@ -34,7 +34,7 @@ from scipy.io.wavfile import write
 
 import sys
 
-from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence
+from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
 
 import time
 import dllogger as DLLogger
@@ -56,8 +56,13 @@ def parse_args(parser):
     parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
     parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
                         help='Sampling rate')
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+
+    run_mode = parser.add_mutually_exclusive_group()
+    run_mode.add_argument('--fp16', action='store_true',
+                        help='Run inference with FP16')
+    run_mode.add_argument('--cpu', action='store_true',
+                        help='Run inference on CPU')
+
     parser.add_argument('--log-file', type=str, default='nvlog.json',
                         help='Filename for logging')
     parser.add_argument('--stft-hop-length', type=int, default=256,
@@ -68,50 +73,11 @@ def parse_args(parser):
                         help='Input length')
     parser.add_argument('-bs', '--batch-size', type=int, default=1,
                         help='Batch size')
-    parser.add_argument('--cpu-run', action='store_true', 
-                        help='Run inference on CPU')
     return parser
 
 
-def load_and_setup_model(model_name, parser, checkpoint, amp_run, cpu_run, forward_is_infer=False):
-    model_parser = models.parse_model_args(model_name, parser, add_help=False)
-    model_args, _ = model_parser.parse_known_args()
-
-    model_config = models.get_model_config(model_name, model_args)
-    model = models.get_model(model_name, model_config, cpu_run, forward_is_infer=forward_is_infer)
-
-    if checkpoint is not None:
-        if cpu_run:
-            state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))['state_dict']
-        else:
-            state_dict = torch.load(checkpoint)['state_dict']
-
-        if checkpoint_from_distributed(state_dict):
-            state_dict = unwrap_distributed(state_dict)
-
-        model.load_state_dict(state_dict)
-
-    if model_name == "WaveGlow":
-        model = model.remove_weightnorm(model)
-
-    model.eval()
-
-    if amp_run:
-        model, _ = amp.initialize(model, [], opt_level="O3")
-
-    return model
-
-
 def print_stats(measurements_all):
 
-    print(np.mean(measurements_all['latency'][1:]),
-          np.mean(measurements_all['throughput'][1:]),
-          np.mean(measurements_all['pre_processing'][1:]),
-          np.mean(measurements_all['type_conversion'][1:])+
-          np.mean(measurements_all['storage'][1:])+
-          np.mean(measurements_all['data_transfer'][1:]),
-          np.mean(measurements_all['num_mels_per_audio'][1:]))
-
     throughput = measurements_all['throughput']
     preprocessing = measurements_all['pre_processing']
     type_conversion = measurements_all['type_conversion']
@@ -119,6 +85,9 @@ def print_stats(measurements_all):
     data_transfer = measurements_all['data_transfer']
     postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
     latency = measurements_all['latency']
+    waveglow_latency = measurements_all['waveglow_latency']
+    tacotron2_latency = measurements_all['tacotron2_latency']
+    denoiser_latency = measurements_all['denoiser_latency']
     num_mels_per_audio = measurements_all['num_mels_per_audio']
 
     latency.sort()
@@ -129,17 +98,20 @@ def print_stats(measurements_all):
     cf_99 = max(latency[:int(len(latency)*0.99)])
     cf_100 = max(latency[:int(len(latency)*1.0)])
 
-    print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
-    print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
-    print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
-    print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio)))
-    print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
-    print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
-    print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
-    print("Latency cl 90 (seconds) = {:.4f}".format(cf_90))
-    print("Latency cl 95 (seconds) = {:.4f}".format(cf_95))
-    print("Latency cl 99 (seconds) = {:.4f}".format(cf_99))
-    print("Latency cl 100 (seconds) = {:.4f}".format(cf_100))
+    print("Throughput average (samples/sec)    = {:.0f}".format(np.mean(throughput)))
+    print("Preprocessing average (seconds)     = {:.4f}".format(np.mean(preprocessing)))
+    print("Postprocessing average (seconds)    = {:.4f}".format(np.mean(postprocessing)))
+    print("Number of mels per audio average    = {:.0f}".format(np.mean(num_mels_per_audio)))
+    print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
+    print("WaveGlow latency average (seconds)  = {:.2f}".format(np.mean(waveglow_latency)))
+    print("Denoiser latency average (seconds)  = {:.4f}".format(np.mean(denoiser_latency)))
+    print("Latency average (seconds)           = {:.2f}".format(np.mean(latency)))
+    print("Latency std (seconds)               = {:.2f}".format(np.std(latency)))
+    print("Latency cl 50 (seconds)             = {:.2f}".format(cf_50))
+    print("Latency cl 90 (seconds)             = {:.2f}".format(cf_90))
+    print("Latency cl 95 (seconds)             = {:.2f}".format(cf_95))
+    print("Latency cl 99 (seconds)             = {:.2f}".format(cf_99))
+    print("Latency cl 100 (seconds)            = {:.2f}".format(cf_100))
 
 
 def main():
@@ -161,6 +133,7 @@ def main():
     measurements_all = {"pre_processing": [],
                         "tacotron2_latency": [],
                         "waveglow_latency": [],
+                        "denoiser_latency": [],
                         "latency": [],
                         "type_conversion": [],
                         "data_transfer": [],
@@ -172,15 +145,13 @@ def main():
 
     print("args:", args, unknown_args)
 
-    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, args.cpu_run, forward_is_infer=True)
-    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, args.cpu_run)
-
-    if args.cpu_run:
-        denoiser = Denoiser(waveglow, args.cpu_run)
-    else:
-        denoiser = Denoiser(waveglow, args.cpu_run).cuda()
-
-    jitted_tacotron2 = torch.jit.script(tacotron2)
+    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
+                                     args.fp16, args.cpu, forward_is_infer=True)
+    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
+                                    args.fp16, args.cpu, forward_is_infer=True)
+    denoiser = Denoiser(waveglow)
+    if not args.cpu:
+        denoiser.cuda()
 
     texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
     texts = [texts[0][:args.input_length]]
@@ -192,29 +163,31 @@ def main():
 
         measurements = {}
 
-        with MeasureTime(measurements, "pre_processing", args.cpu_run):
-            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)
+        with MeasureTime(measurements, "pre_processing", args.cpu):
+            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)
 
         with torch.no_grad():
-            with MeasureTime(measurements, "latency", args.cpu_run):
-                with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
-                    mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths)
+            with MeasureTime(measurements, "latency", args.cpu):
+                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
+                    mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)
 
-                with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
+                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                     audios = waveglow.infer(mel, sigma=args.sigma_infer)
-                    audios = audios.float()
-                    audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
 
-        num_mels = mel.size(0)*mel.size(2)
-        num_samples = audios.size(0)*audios.size(1)
+                num_mels = mel.size(0)*mel.size(2)
+                num_samples = audios.size(0)*audios.size(1)
+
 
-        with MeasureTime(measurements, "type_conversion", args.cpu_run):
-            audios = audios.float()
+                with MeasureTime(measurements, "type_conversion", args.cpu):
+                    audios = audios.float()
+
+                with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
+                    audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
 
-        with MeasureTime(measurements, "data_transfer", args.cpu_run):
+        with MeasureTime(measurements, "data_transfer", args.cpu):
             audios = audios.cpu()
 
-        with MeasureTime(measurements, "storage", args.cpu_run):
+        with MeasureTime(measurements, "storage", args.cpu):
             audios = audios.numpy()
             for i, audio in enumerate(audios):
                 audio_path = "audio_"+str(i)+".wav"

+ 19 - 22
PyTorch/SpeechSynthesis/Tacotron2/test_infer.sh

@@ -2,14 +2,14 @@
 
 BATCH_SIZE=1
 INPUT_LENGTH=128
-PRECISION="fp32"
 NUM_ITERS=1003 # extra 3 iterations for warmup
 TACOTRON2_CKPT="tacotron2_1032590_6000_amp"
 WAVEGLOW_CKPT="waveglow_1076430_14000_amp"
-AMP_RUN=""
+RUN_MODE="" # = fp32
+LOG_RUN_MODE="gpu_fp32"
 TEST_PROGRAM="test_infer.py"
-WN_CHANNELS=256
-CPU_RUN=""
+WN_CHANNELS=512
+LOG_SUFFIX_ADD="" #additional info, e.g., GPU type
 
 while [ -n "$1" ]
 do
@@ -22,10 +22,6 @@ do
 	    INPUT_LENGTH="$2"
 	    shift
 	    ;;
-	-p|--prec)
-	    PRECISION="$2"
-	    shift
-	    ;;
 	--num-iters)
 	    NUM_ITERS="$2"
 	    shift
@@ -58,8 +54,16 @@ do
 	    WN_CHANNELS="$2"
 	    shift
 	    ;;
-	--cpu-run)
-	    CPU_RUN="--cpu-run"
+	--cpu)
+	    RUN_MODE="--cpu"
+	    LOG_RUN_MODE="cpu_fp32"
+	    ;;
+	--fp16)
+	    RUN_MODE="--fp16"
+	    LOG_RUN_MODE="gpu_fp16"
+	    ;;
+	--log-suffix)
+	    LOG_SUFFIX_ADD="$2"
 	    shift
 	    ;;
 	*)
@@ -68,15 +72,7 @@ do
     shift
 done
 
-if [ "$PRECISION" = "amp" ]
-then
-    AMP_RUN="--amp-run"
-elif  [ "$PRECISION" = "fp16" ]
-then
-    AMP_RUN="--fp16"
-fi
-
-LOG_SUFFIX=bs${BATCH_SIZE}_il${INPUT_LENGTH}_${PRECISION}
+LOG_SUFFIX=bs${BATCH_SIZE}_il${INPUT_LENGTH}_${LOG_RUN_MODE}_wn${WN_CHANNELS}_${LOG_SUFFIX_ADD}
 NVLOG_FILE=nvlog_${LOG_SUFFIX}.json
 TMP_LOGFILE=tmp_log_${LOG_SUFFIX}.log
 LOGFILE=log_${LOG_SUFFIX}.log
@@ -94,11 +90,11 @@ python $TEST_PROGRAM \
        $TACOTRON2_PARAMS \
        --waveglow $WAVEGLOW_CKPT \
        --batch-size $BATCH_SIZE \
-       --input-length $INPUT_LENGTH $AMP_RUN \
+       --input-length $INPUT_LENGTH \
        --log-file $NVLOG_FILE \
        --num-iters $NUM_ITERS \
        --wn-channels $WN_CHANNELS \
-       $CPU_RUN \
+       $RUN_MODE \
        |& tee $TMP_LOGFILE
 set +x
 
@@ -107,8 +103,9 @@ PERF=$(cat $TMP_LOGFILE | grep -F 'Throughput average (samples/sec)' | awk -F'=
 NUM_MELS=$(cat $TMP_LOGFILE | grep -F 'Number of mels per audio average' | awk -F'= ' '{print $2}')
 LATENCY=$(cat $TMP_LOGFILE | grep -F 'Latency average (seconds)' | awk -F'= ' '{print $2}')
 LATENCYSTD=$(cat $TMP_LOGFILE | grep -F 'Latency std (seconds)' | awk -F'= ' '{print $2}')
+LATENCY50=$(cat $TMP_LOGFILE | grep -F 'Latency cl 50 (seconds)' | awk -F'= ' '{print $2}')
 LATENCY90=$(cat $TMP_LOGFILE | grep -F 'Latency cl 90 (seconds)' | awk -F'= ' '{print $2}')
 LATENCY95=$(cat $TMP_LOGFILE | grep -F 'Latency cl 95 (seconds)' | awk -F'= ' '{print $2}')
 LATENCY99=$(cat $TMP_LOGFILE | grep -F 'Latency cl 99 (seconds)' | awk -F'= ' '{print $2}')
 
-echo "$BATCH_SIZE,$INPUT_LENGTH,$PRECISION,$NUM_ITERS,$LATENCY,$LATENCYSTD,$LATENCY90,$LATENCY95,$LATENCY99,$PERF,$NUM_MELS" >> $LOGFILE
+echo "$BATCH_SIZE,$INPUT_LENGTH,$LOG_RUN_MODE,$NUM_ITERS,$LATENCY,$LATENCYSTD,$LATENCY50,$LATENCY90,$LATENCY95,$LATENCY99,$PERF,$NUM_MELS" | tee $LOGFILE

+ 73 - 29
PyTorch/SpeechSynthesis/Tacotron2/train.py

@@ -81,9 +81,11 @@ def parse_args(parser):
                           help='Number of epochs per checkpoint')
     training.add_argument('--checkpoint-path', type=str, default='',
                           help='Checkpoint path to resume training')
+    training.add_argument('--resume-from-last', action='store_true',
+                          help='Resumes training from the last checkpoint; uses the directory provided with \'--output\' option to search for the checkpoint \"checkpoint_<model_name>_last.pt\"')
     training.add_argument('--dynamic-loss-scaling', type=bool, default=True,
                           help='Enable dynamic loss scaling')
-    training.add_argument('--amp-run', action='store_true',
+    training.add_argument('--amp', action='store_true',
                           help='Enable AMP')
     training.add_argument('--cudnn-enabled', action='store_true',
                           help='Enable cudnn')
@@ -179,29 +181,70 @@ def init_distributed(args, world_size, rank, group_name):
     print("Done initializing distributed")
 
 
-def save_checkpoint(model, optimizer, epoch, config, amp_run, filepath):
-    print("Saving model and optimizer state at epoch {} to {}".format(
-        epoch, filepath))
-    checkpoint = {'epoch': epoch,
-                  'cuda_rng_state_all': torch.cuda.get_rng_state_all(),
-                  'random_rng_state': torch.random.get_rng_state(),
-                  'config': config,
-                  'state_dict': model.state_dict(),
-                  'optimizer': optimizer.state_dict()}
-    if amp_run:
-        checkpoint['amp'] = amp.state_dict()
+def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_name,
+                    local_rank, world_size):
+
+    random_rng_state = torch.random.get_rng_state().cuda()
+    cuda_rng_state = torch.cuda.get_rng_state(local_rank).cuda()
+
+    random_rng_states_all = [torch.empty_like(random_rng_state) for _ in range(world_size)]
+    cuda_rng_states_all = [torch.empty_like(cuda_rng_state) for _ in range(world_size)]
 
-    torch.save(checkpoint, filepath)
+    if world_size > 1:
+        dist.all_gather(random_rng_states_all, random_rng_state)
+        dist.all_gather(cuda_rng_states_all, cuda_rng_state)
+    else:
+        random_rng_states_all = [random_rng_state]
+        cuda_rng_states_all = [cuda_rng_state]
 
+    random_rng_states_all = torch.stack(random_rng_states_all).cpu()
+    cuda_rng_states_all = torch.stack(cuda_rng_states_all).cpu()
 
-def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, rank):
+    if local_rank == 0:
+        checkpoint = {'epoch': epoch,
+                      'cuda_rng_state_all': cuda_rng_states_all,
+                      'random_rng_states_all': random_rng_states_all,
+                      'config': config,
+                      'state_dict': model.state_dict(),
+                      'optimizer': optimizer.state_dict()}
+        if amp_run:
+            checkpoint['amp'] = amp.state_dict()
+
+        checkpoint_filename = "checkpoint_{}_{}.pt".format(model_name, epoch)
+        checkpoint_path = os.path.join(
+            output_dir, checkpoint_filename)
+        print("Saving model and optimizer state at epoch {} to {}".format(
+            epoch, checkpoint_path))
+        torch.save(checkpoint, checkpoint_path)
+
+        symlink_src = checkpoint_filename
+        symlink_dst = os.path.join(
+            output_dir, "checkpoint_{}_last.pt".format(model_name))
+        if os.path.exists(symlink_dst) and os.path.islink(symlink_dst):
+            print("|||| Updating symlink", symlink_dst, "to point to", symlink_src)
+            os.remove(symlink_dst)
+
+        os.symlink(symlink_src, symlink_dst)
+
+
+def get_last_checkpoint_filename(output_dir, model_name):
+    symlink = os.path.join(output_dir, "checkpoint_{}_last.pt".format(model_name))
+    if os.path.exists(symlink):
+        print("|||| Loading checkpoint from symlink", symlink)
+        return os.path.join(output_dir, os.readlink(symlink))
+    else:
+        print("|||| No last checkpoint available - starting from epoch 0 ")
+        return ""
+
+
+def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_rank):
 
     checkpoint = torch.load(filepath, map_location='cpu')
 
     epoch[0] = checkpoint['epoch']+1
-    device_id = rank % torch.cuda.device_count()
+    device_id = local_rank % torch.cuda.device_count()
     torch.cuda.set_rng_state(checkpoint['cuda_rng_state_all'][device_id])
-    torch.random.set_rng_state(checkpoint['random_rng_state'])
+    torch.random.set_rng_state(checkpoint['random_rng_states_all'][device_id])
     config = checkpoint['config']
     model.load_state_dict(checkpoint['state_dict'])
     optimizer.load_state_dict(checkpoint['optimizer'])
@@ -209,6 +252,7 @@ def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, rank):
     if amp_run:
         amp.load_state_dict(checkpoint['amp'])
 
+
 # adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3
 # Following snippet is licensed under MIT license
 
@@ -247,7 +291,7 @@ def validate(model, criterion, valset, epoch, batch_iter, batch_size,
             if distributed_run:
                 reduced_val_loss = reduce_tensor(loss.data, world_size).item()
                 reduced_num_items = reduce_tensor(num_items.data, 1).item()
-            else:
+            else:
                 reduced_val_loss = loss.item()
                 reduced_num_items = num_items.item()
             val_loss += reduced_val_loss
@@ -334,13 +378,13 @@ def main():
                              cpu_run=False,
                              uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)
 
-    if not args.amp_run and distributed_run:
+    if not args.amp and distributed_run:
         model = DDP(model)
 
     optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                  weight_decay=args.weight_decay)
 
-    if args.amp_run:
+    if args.amp:
         model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
         if distributed_run:
             model = DDP(model)
@@ -352,9 +396,12 @@ def main():
 
     start_epoch = [0]
 
+    if args.resume_from_last:
+        args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)
+
     if args.checkpoint_path is not "":
         load_checkpoint(model, optimizer, start_epoch, model_config,
-                        args.amp_run, args.checkpoint_path, local_rank)
+                        args.amp, args.checkpoint_path, local_rank)
 
     start_epoch = start_epoch[0]
 
@@ -399,11 +446,10 @@ def main():
         # used to calculate avg items/sec over epoch
         reduced_num_items_epoch = 0
 
-        # used to calculate avg loss over epoch
-        train_epoch_avg_loss = 0.0
         train_epoch_items_per_sec = 0.0
 
         num_iters = 0
+        reduced_loss = 0
 
         # if overflow at the last iteration then do not save checkpoint
         overflow = False
@@ -437,13 +483,12 @@ def main():
 
             DLLogger.log(step=(epoch,i), data={'train_loss': reduced_loss})
 
-            train_epoch_avg_loss += reduced_loss
             num_iters += 1
 
             # accumulate number of items processed in this epoch
             reduced_num_items_epoch += reduced_num_items
 
-            if args.amp_run:
+            if args.amp:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
                 grad_norm = torch.nn.utils.clip_grad_norm_(
@@ -471,18 +516,17 @@ def main():
 
         DLLogger.log(step=(epoch,), data={'train_items_per_sec':
                                           (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
-        DLLogger.log(step=(epoch,), data={'train_loss': (train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0)})
+        DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
         DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})
 
         val_loss = validate(model, criterion, valset, epoch, iteration,
                             args.batch_size, world_size, collate_fn,
                             distributed_run, local_rank, batch_to_gpu)
 
-        if (epoch % args.epochs_per_checkpoint == 0) and local_rank == 0 and args.bench_class == "":
-            checkpoint_path = os.path.join(
-                args.output, "checkpoint_{}_{}".format(model_name, epoch))
+        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
             save_checkpoint(model, optimizer, epoch, model_config,
-                            args.amp_run, checkpoint_path)
+                            args.amp, args.output, args.model_name,
+                            local_rank, world_size)
         if local_rank == 0:
             DLLogger.flush()
 

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/trt/README.md

@@ -51,7 +51,7 @@ NVIDIA TensorRT is a platform for high-performance deep learning inference. It i
    Export Tacotron 2 to three ONNX parts: Encoder, Decoder, and Postnet:
 
 	```bash
-   mkdir -p output
+	mkdir -p output
 	python exports/export_tacotron2_onnx.py --tacotron2 ./checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/ --fp16
 	```
 

+ 2 - 2
PyTorch/SpeechSynthesis/Tacotron2/trt/export_onnx2trt.py

@@ -113,8 +113,8 @@ def main():
             sys.exit()
 
     # WaveGlow
-    shapes=[{"name": "mel", "min": (1,80,32,1),  "opt": (1,80,768,1),  "max": (1,80,1664,1)},
-            {"name": "z",   "min": (1,8,1024,1), "opt": (1,8,24576,1), "max": (1,8,53248,1)}]
+    shapes=[{"name": "mel", "min": (1,80,32),  "opt": (1,80,768),  "max": (1,80,1664)},
+            {"name": "z",   "min": (1,8,1024), "opt": (1,8,24576), "max": (1,8,53248)}]
     if args.waveglow != "":
         print("Building WaveGlow ...")
         waveglow_engine = build_engine(args.waveglow, shapes=shapes, fp16=args.fp16)

+ 1 - 4
PyTorch/SpeechSynthesis/Tacotron2/trt/inference_trt.py

@@ -28,8 +28,6 @@
 import tensorrt as trt
 import numpy as np
 from scipy.io.wavfile import write
-import pycuda.autoinit
-import pycuda.driver as cuda
 import time
 import torch
 import argparse
@@ -270,14 +268,13 @@ def infer_tacotron2_trt(encoder, decoder_iter, postnet,
 
 def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
 
-    mel = mel.unsqueeze(3)
     mel_size = mel.size(2)
     batch_size = mel.size(0)
     stride = 256
     n_group = 8
     z_size = mel_size*stride
     z_size = z_size//n_group
-    z = torch.randn(batch_size, n_group, z_size, 1).cuda()
+    z = torch.randn(batch_size, n_group, z_size).cuda()
     audios = torch.zeros(batch_size, mel_size*stride).cuda()
 
     if fp16:

+ 7 - 16
PyTorch/SpeechSynthesis/Tacotron2/waveglow/denoiser.py

@@ -37,24 +37,15 @@ class Denoiser(torch.nn.Module):
     def __init__(self, waveglow, cpu_run=False, filter_length=1024, n_overlap=4,
                  win_length=1024, mode='zeros'):
         super(Denoiser, self).__init__()
-        if cpu_run:
-            self.stft = STFT(filter_length=filter_length,
+        device = waveglow.upsample.weight.device
+        dtype = waveglow.upsample.weight.dtype
+        self.stft = STFT(filter_length=filter_length,
                          hop_length=int(filter_length/n_overlap),
-                         win_length=win_length)
-        else:
-            self.stft = STFT(filter_length=filter_length,
-                         hop_length=int(filter_length/n_overlap),
-                         win_length=win_length).cuda()
+                         win_length=win_length).to(device)
         if mode == 'zeros':
-            mel_input = torch.zeros(
-                (1, 80, 88),
-                dtype=waveglow.upsample.weight.dtype,
-                device=waveglow.upsample.weight.device)
+            mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device)
         elif mode == 'normal':
-            mel_input = torch.randn(
-                (1, 80, 88),
-                dtype=waveglow.upsample.weight.dtype,
-                device=waveglow.upsample.weight.device)
+            mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device)
         else:
             raise Exception("Mode {} if not supported".format(mode))
 
@@ -65,7 +56,7 @@ class Denoiser(torch.nn.Module):
         self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
 
     def forward(self, audio, strength=0.1):
-        audio_spec, audio_angles = self.stft.transform(audio.float())
+        audio_spec, audio_angles = self.stft.transform(audio)
         audio_spec_denoised = audio_spec - self.bias_spec * strength
         audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
         audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)

+ 23 - 17
PyTorch/SpeechSynthesis/Tacotron2/waveglow/model.py

@@ -60,27 +60,33 @@ class Invertible1x1Conv(torch.nn.Module):
         W = W.view(c, c, 1)
         self.conv.weight.data = W
 
-    def forward(self, z, reverse=False):
+    def forward(self, z):
         # shape
         batch_size, group_size, n_of_groups = z.size()
 
         W = self.conv.weight.squeeze()
 
-        if reverse:
-            if not hasattr(self, 'W_inverse'):
-                # Reverse computation
-                W_inverse = W.float().inverse()
-                W_inverse = Variable(W_inverse[..., None])
-                if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor':
-                    W_inverse = W_inverse.half()
-                self.W_inverse = W_inverse
-            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
-            return z
-        else:
-            # Forward computation
-            log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze()
-            z = self.conv(z)
-            return z, log_det_W
+        # Forward computation
+        log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze()
+        z = self.conv(z)
+        return z, log_det_W
+
+
+    def infer(self, z):
+        # shape
+        batch_size, group_size, n_of_groups = z.size()
+
+        W = self.conv.weight.squeeze()
+
+        if not hasattr(self, 'W_inverse'):
+            # Reverse computation
+            W_inverse = W.float().inverse()
+            W_inverse = Variable(W_inverse[..., None])
+            if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor':
+                W_inverse = W_inverse.half()
+            self.W_inverse = W_inverse
+        z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+        return z
 
 
 class WN(torch.nn.Module):
@@ -260,7 +266,7 @@ class WaveGlow(torch.nn.Module):
             audio_1 = (audio_1 - b) / torch.exp(s)
             audio = torch.cat([audio_0, audio_1], 1)
 
-            audio = self.convinv[k](audio, reverse=True)
+            audio = self.convinv[k].infer(audio)
 
             if k % self.n_early_every == 0 and k > 0:
                 z = torch.randn(spect.size(0), self.n_early_size, spect.size(