@@ -12,10 +12,10 @@ Server with a custom TensorRT
- [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
- [Export the models](#export-the-models)
- - [Setup the TRTIS server](#setup-the-trtis-server)
- - [Setup the TRTIS client](#setup-the-trtis-client)
- - [Starting the TRTIS server](#starting-the-trtis-server)
- - [Running the TRTIS client](#running-the-trtis-client)
+ - [Setup the Triton server](#setup-the-triton-server)
+ - [Setup the Triton client](#setup-the-triton-client)
+ - [Run the Triton server](#run-the-triton-server)
+ - [Run the Triton client](#run-the-triton-client)
* [Advanced](#advanced)
- [Code structure](#code-structure)
- [Precision](#precision)
@@ -93,14 +93,14 @@ mkdir models
./export_weights.sh checkpoints/nvidia_tacotron2pyt_fp16_20190427 checkpoints/nvidia_waveglow256pyt_fp16 models/
```

-### Setup the TRTIS server
+### Setup the Triton server
```bash
./build_trtis.sh models/tacotron2.json models/waveglow.onnx models/denoiser.json
```
This will take some time as TensorRT tries out different tactics for best
performance while building the engines.

-### Setup the TRTIS client
+### Setup the Triton client

Next you need to build the client docker container. To do this, enter the
`trtis_client` directory and run the script `build_trtis_client.sh`.
@@ -111,7 +111,7 @@ cd trtis_client
cd ..
```

-### Run the TRTIS server
+### Run the Triton server

To run the server locally, use the script `run_trtis_server.sh`:
```bash
@@ -119,10 +119,10 @@ To run the server locally, use the script `run_trtis_server.sh`:
```

You can use the environment variable `NVIDIA_VISIBLE_DEVICES` to set which GPUs
-the TRTIS server sees.
+the Triton server sees.


-### Run the TRTIS client
+### Run the Triton client

Leave the server running. In another terminal, type:
```bash
@@ -142,13 +142,11 @@ to detect the end of the phrase.
### Code structure

The `src/` contains the following sub-directories:
-* `trtis`: The directory containing code for the custom TRTIS backend.
+* `trtis`: The directory containing code for the custom Triton backend.
* `trt/tacotron2`: The directory containing the Tacotron2 implementation in TensorRT.
* `trt/waveglow`: The directory containing the WaveGlow implementation in TensorRT.
* `trt/denoiser`: The directory containing the Denoiser (STFT) implementation in TensorRT.
* `trt/plugins`: The directory containing plugins used by the TensorRT engines.
-* `trt/helpers`: The directory containing scripts for exporting models from
-PyTorch.

The `trtis_client/` directory contains the code for running the client.

@@ -172,21 +170,6 @@ For all tests in these tables, we used WaveGlow with 256 residual channels.

### Performance on NVIDIA T4

-#### TensorRT \w Plugins in TRTIS
-
-Latency in this table is measured from the client sending the request, to it
-receiving back the generated audio.
-
-|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)| Latency interval 90% (s)|Latency interval 95% (s)|Latency interval 99% (s)|Avg mels generated |Avg audio length (s)|Avg RTF|
-|---:|----:|-----:|------:|------:|------:|------:|------:|----:|------:|-------:|
-| 1 | 128 | FP16 | 0.49 | 0.00 | 0.49 | 0.49 | 0.50 | 564 | 6.59 | 13.48 |
-| 4 | 128 | FP16 | 1.37 | 0.01 | 1.38 | 1.38 | 1.38 | 563 | 6.54 | 4.77 |
-| 1 | 128 | FP32 | 1.30 | 0.01 | 1.30 | 1.30 | 1.31 | 567 | 6.58 | 5.08 |
-| 4 | 128 | FP32 | 3.63 | 0.01 | 3.64 | 3.64 | 3.64 | 568 | 6.59 | 1.82 |
-
-To reproduce this table, see [Running the benchmark](#running-the-benchmark)
-below.
-

#### TensorRT \w Plugins vs. PyTorch

@@ -194,12 +177,12 @@ Latency in this table is measured from just before the input sequence starts
being copied from host memory to the GPU,
to just after the generated audio finishes being copied back to the host
memory.
-That is, what is taking place in the custom backend inside of TRTIS.
+That is, it measures only what takes place in the custom backend inside of Triton.

|Framework|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)| Latency interval 90% (s)|Latency interval 95% (s)|Latency interval 99% (s)| Throughput (samples/sec) | Speed-up vs. PyT FP32 | Speed-up vs. PyT FP16 | Avg mels generated |Avg audio length (s)|Avg RTF|
|------:|----:|-----:|-----------:|--------:|------:|------:|------:|------:|------:|------:|----:|------:|-------:|---:|
-| TRT \w plugins | 1 | 128 | FP16 | 0.45 | 0.00 | 0.45 | 0.45 | 0.46 | 320,950 | __3.72x__ | __3.39x__ | 564 | 6.55 | 14.59 |
-| TRT \w plugins | 1 | 128 | FP32 | 1.26 | 0.01 | 1.27 | 1.27 | 1.27 | 115,150 | __1.33x__ | __1.21x__ | 567 | 6.58 | 5.22 |
+| TRT \w plugins | 1 | 128 | FP16 | 0.40 | 0.00 | 0.40 | 0.40 | 0.40 | 369,862 | __4.27x__ | __3.90x__ | 579 | 6.72 | 16.77 |
+| TRT \w plugins | 1 | 128 | FP32 | 1.20 | 0.01 | 1.21 | 1.21 | 1.21 | 123,922 | __1.43x__ | __1.31x__ | 581 | 6.74 | 5.62 |
| PyTorch | 1 | 128 | FP16 | 1.63 | 0.07 | 1.71 | 1.73 | 1.81 | 94,758 | __1.10x__ | __1.00x__ | 601 | 6.98 | 4.30 |
| PyTorch | 1 | 128 | FP32 | 1.77 | 0.08 | 1.88 | 1.92 | 2.00 | 86,705 | __1.00x__ | __0.91x__ | 600 | 6.96 | 3.92 |

@@ -207,16 +190,36 @@ That is a __3.72x__ speedup when using TensorRT FP16 with plugins when compared
PyTorch FP32, and still a __3.39x__ speedup when compared to PyTorch FP16.

The TensorRT entries in this table can be reproduced by using the output of
-the TRTIS server, when performing the steps for [Running the
+the Triton server, when performing the steps for [Running the
benchmark](#running-the-benchmark) below.
The PyTorch entries can be reproduced by following the instructions
[here](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2).


+
+#### TensorRT \w Plugins in Triton
+
+Latency in this table is measured from the client sending the request, to it
+receiving back the generated audio. This includes network time and
+request/response formatting time, as well as the backend time shown in the
+section above.
+
+|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)| Latency interval 90% (s)|Latency interval 95% (s)|Latency interval 99% (s)|Avg mels generated |Avg audio length (s)|Avg RTF|
+|---:|----:|-----:|------:|------:|------:|------:|------:|----:|------:|-------:|
+| 1 | 128 | FP16 | 0.42 | 0.00 | 0.42 | 0.42 | 0.42 | 579 | 6.72 | 15.95 |
+| 8 | 128 | FP16 | 2.55 | 0.01 | 2.56 | 2.56 | 2.57 | 571 | 6.62 | 2.60 |
+| 1 | 128 | FP32 | 1.22 | 0.01 | 1.22 | 1.23 | 1.23 | 581 | 6.75 | 5.54 |
+| 8 | 128 | FP32 | 8.64 | 0.01 | 8.68 | 8.69 | 8.71 | 569 | 6.61 | 0.72 |
+
+To reproduce this table, see [Running the benchmark](#running-the-benchmark)
+below.
+
+
+
### Running the benchmark

-Once you have performed the steps in [Setup the TRTIS server](#setup-the-trtis-server) and
-[Setup the TRTIS client](#setup-the-trtis-client), you can run the benchmark by starting the TRTIS server via:
+Once you have performed the steps in [Setup the Triton server](#setup-the-triton-server) and
+[Setup the Triton client](#setup-the-triton-client), you can run the benchmark by starting the Triton server via:
```bash
./run_trtis_server.sh
```
@@ -233,15 +236,14 @@ Replace <batch size> with the desired batch size between 1 and 32. The engines a
After some time this should produce output like:
```
Performed 1000 runs.
-batch size = 1
-input size = 128
-avg latency (s) = 0.485718
-latency std (s) = 0.00448834
-latency interval 50% (s) = 0.485836
-latency interval 90% (s) = 0.489517
-latency interval 95% (s) = 0.490613
-latency interval 99% (s) = 0.494721
-average mels generated = 564
-average audio generated (s) = 6.54803
-average real-time factor = 13.4811
+batch size = 1
+avg latency (s) = 0.421375
+latency std (s) = 0.00170839
+latency interval 50% (s) = 0.421553
+latency interval 90% (s) = 0.422805
+latency interval 95% (s) = 0.423273
+latency interval 99% (s) = 0.424153
+average mels generated = 582
+average audio generated (s) = 6.72218
+average real-time factor = 15.953
```