ソースを参照

[Jasper/PyT] Added: inference support for TRT6 and TRT-IS with various backends; new Jupyter notebooks

Przemek Strzelczyk 6 年 前
コミット
09622fa363
65 ファイル変更3841 行追加781 行削除
  1. 3 0
      PyTorch/SpeechRecognition/Jasper/.dockerignore
  2. 4 0
      PyTorch/SpeechRecognition/Jasper/.gitmodules
  3. 6 2
      PyTorch/SpeechRecognition/Jasper/Dockerfile
  4. 24 12
      PyTorch/SpeechRecognition/Jasper/README.md
  5. 1 0
      PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
  6. 4 14
      PyTorch/SpeechRecognition/Jasper/helpers.py
  7. BIN
      PyTorch/SpeechRecognition/Jasper/images/trtis_dynamic_batching.png
  8. BIN
      PyTorch/SpeechRecognition/Jasper/images/trtis_static_batching_bs1.png
  9. BIN
      PyTorch/SpeechRecognition/Jasper/images/trtis_static_batching_bs8.png
  10. BIN
      PyTorch/SpeechRecognition/Jasper/images/trtis_throughput_latency_summary.png
  11. 151 67
      PyTorch/SpeechRecognition/Jasper/inference.py
  12. 17 17
      PyTorch/SpeechRecognition/Jasper/inference_benchmark.py
  13. 7 3
      PyTorch/SpeechRecognition/Jasper/metrics.py
  14. 44 73
      PyTorch/SpeechRecognition/Jasper/model.py
  15. 34 8
      PyTorch/SpeechRecognition/Jasper/notebooks/JasperTRT.ipynb
  16. 274 0
      PyTorch/SpeechRecognition/Jasper/notebooks/JasperTRTIS.ipynb
  17. 138 18
      PyTorch/SpeechRecognition/Jasper/notebooks/README.md
  18. BIN
      PyTorch/SpeechRecognition/Jasper/notebooks/keynote.wav
  19. 0 0
      PyTorch/SpeechRecognition/Jasper/parts/__init__.py
  20. 85 56
      PyTorch/SpeechRecognition/Jasper/parts/features.py
  21. 30 23
      PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh
  22. 1 1
      PyTorch/SpeechRecognition/Jasper/scripts/inference.sh
  23. 1 1
      PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh
  24. 9 4
      PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh
  25. 43 27
      PyTorch/SpeechRecognition/Jasper/train.py
  26. 5 13
      PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
  27. 107 104
      PyTorch/SpeechRecognition/Jasper/trt/README.md
  28. 13 0
      PyTorch/SpeechRecognition/Jasper/trt/onnx-trt.patch
  29. 39 47
      PyTorch/SpeechRecognition/Jasper/trt/perf.py
  30. 41 196
      PyTorch/SpeechRecognition/Jasper/trt/perfprocedures.py
  31. 102 38
      PyTorch/SpeechRecognition/Jasper/trt/perfutils.py
  32. 2 0
      PyTorch/SpeechRecognition/Jasper/trt/requirements.txt
  33. 0 0
      PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh
  34. 6 2
      PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh
  35. 11 0
      PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference.sh
  36. 25 14
      PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference_benchmark.sh
  37. 11 6
      PyTorch/SpeechRecognition/Jasper/trt/scripts/walk_benchmark.sh
  38. 98 35
      PyTorch/SpeechRecognition/Jasper/trt/trtutils.py
  39. 40 0
      PyTorch/SpeechRecognition/Jasper/trtis/Dockerfile
  40. 381 0
      PyTorch/SpeechRecognition/Jasper/trtis/README.md
  41. 404 0
      PyTorch/SpeechRecognition/Jasper/trtis/jasper-client.py
  42. 45 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-decoder/config.pbtxt
  43. 32 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-feature-extractor/config.pbtxt
  44. 60 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt
  45. 53 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu/config.pbtxt
  46. 60 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-ensemble/config.pbtxt
  47. 56 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx/config.pbtxt
  48. 61 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt-ensemble/config.pbtxt
  49. 31 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt/config.pbtxt
  50. 60 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt-ensemble/config.pbtxt
  51. 65 0
      PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt/config.pbtxt
  52. 1 0
      PyTorch/SpeechRecognition/Jasper/trtis/requirements.txt
  53. 8 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/build.sh
  54. 39 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/launch.sh
  55. 102 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/execute_all_perf_runs.sh
  56. 36 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model.sh
  57. 102 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model_helper.sh
  58. 125 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/generate_perf_results.sh
  59. 45 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/prepare_model_repository.sh
  60. 52 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_client.sh
  61. 81 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_perf_client.sh
  62. 58 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_server.sh
  63. 33 0
      PyTorch/SpeechRecognition/Jasper/trtis/scripts/wait_for_trtis_server.sh
  64. 472 0
      PyTorch/SpeechRecognition/Jasper/trtis/speech_utils.py
  65. 3 0
      PyTorch/SpeechRecognition/Jasper/utils/download_utils.py

+ 3 - 0
PyTorch/SpeechRecognition/Jasper/.dockerignore

@@ -2,3 +2,6 @@ results/
 *__pycache__
 checkpoints/
 .git/
+datasets/
+external/tensorrt-inference-server/
+checkpoints/

+ 4 - 0
PyTorch/SpeechRecognition/Jasper/.gitmodules

@@ -0,0 +1,4 @@
+[submodule "external/tensorrt-inference-server"]
+	path = external/tensorrt-inference-server
+	url = https://github.com/NVIDIA/tensorrt-inference-server.git
+	branch = r19.06

+ 6 - 2
PyTorch/SpeechRecognition/Jasper/Dockerfile

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3  
 FROM ${FROM_IMAGE_NAME}
 
 
@@ -20,5 +20,9 @@ RUN apt-get update && apt-get install -y libsndfile1 && apt-get install -y sox &
 
 WORKDIR /workspace/jasper
 
-COPY . .
+# Install requirements (do this first for better caching)
+COPY requirements.txt .
 RUN pip install --disable-pip-version-check -U -r requirements.txt
+
+# Copy rest of files
+COPY . .

+ 24 - 12
PyTorch/SpeechRecognition/Jasper/README.md

@@ -24,6 +24,7 @@ This repository provides scripts to train the Jasper model to achieve near state
    * [Inference process](#inference-process)
    * [Evaluation process](#evaluation-process)
    * [Inference process with TensorRT](#inference-process-with-tensorrt)
+   * [Inference process with TensorRT Inference Server](#inference-process-with-tensorrt-inference-server)
 - [Performance](#performance)
    * [Benchmarking](#benchmarking)
        * [Training performance benchmark](#training-performance-benchmark)
@@ -33,9 +34,9 @@ This repository provides scripts to train the Jasper model to achieve near state
            * [Training accuracy: NVIDIA DGX-1 (8x V100 32G)](#training-accuracy-nvidia-dgx-1-8x-v100-32G)
            * [Training stability test](#training-stability-test)
        * [Training performance results](#training-performance-results)
-           * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16G)
-           * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32G)
-           * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32G)
+         * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16G)
+         * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32G)
+         * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32G)
        * [Inference performance results](#inference-performance-results)
            * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16G)
            * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32G)
@@ -217,10 +218,10 @@ The following section lists the requirements in order to start training and eval
 
 ### Requirements
 
-This repository contains a `Dockerfile` which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+This repository contains a `Dockerfile` which extends the PyTorch 19.10-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
 * [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
 
 Further required python packages are listed in `requirements.txt`, which are automatically installed with the Docker container built. To manually install them, run
@@ -383,7 +384,7 @@ The `scripts/` folder encapsulates all the one-click scripts required for runnin
 
 
 Other folders included in the `root` directory are:
-* `notebooks/` - Contains Jupyter notebook
+* `notebooks/` - Contains Jupyter notebooks and example audio files
 * `configs/` - Model configurations
 * `utils/` - Contains the necessary files for data download and  processing
 * `parts/` - Contains the necessary files for data pre-processing
@@ -558,6 +559,11 @@ Apart from the default arguments as listed in the [Parameters](#parameters) sect
 NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications. Jasper’s architecture, which is of deep convolutional nature, is designed to facilitate fast GPU inference. After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, inference throughput increased by up to 1.8x over native PyTorch. 
 More information on how to perform inference using TensorRT and speed up comparison between TensorRT and native PyTorch can be found in the subfolder [./trt/README.md](trt/README.md)
 
+### Inference process with TensorRT Inference Server
+The NVIDIA TensorRT Inference Server provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+More information on how to perform inference using TensorRT Inference Server with different model backends can be found in the subfolder [./trtis/README.md](trtis/README.md)
+
+
 ## Performance
 
 ### Benchmarking
@@ -610,7 +616,7 @@ The results for Jasper Large's word error rate from the original paper after gre
 
 ##### Training accuracy: NVIDIA DGX-1 (8x V100 32G)
 
-Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.09-py3 NGC container with NVIDIA DGX-1 with (8x V100 32G) GPUs.
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.10-py3 NGC container with NVIDIA DGX-1 with (8x V100 32G) GPUs.
 The following tables report the word error rate(WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
 
 FP16 (seed #6)
@@ -638,7 +644,7 @@ The following table compares greedy decoding word error rates across 8 different
 
 #### Training performance results
 
-Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.09-py3 NGC container. Performance (in sequences per second) is the steady-state throughput.
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.10-py3 NGC container. Performance (in sequences per second) is the steady-state throughput.
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 
@@ -706,7 +712,7 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 
 #### Inference performance results
 
-Our results were obtained by running the `scripts/inference_benchmark.sh` script in the PyTorch 19.09-py3 NGC container on NVIDIA DGX-1, DGX-2 and T4 on a single GPU. Performance numbers (latency in milliseconds per batch) were averaged over 1000 iterations.
+Our results were obtained by running the `scripts/inference_benchmark.sh` script in the PyTorch 19.10-py3 NGC container on NVIDIA DGX-1, DGX-2 and T4 on a single GPU. Performance numbers (latency in milliseconds per batch) were averaged over 1000 iterations.
 
 ##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
 
@@ -804,13 +810,19 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 ## Release notes
+December 2019
+* Inference support for TRT 6 with dynamic shapes
+* Inference support for TensorRT Inference Server with acoustic model backends in ONNX, PyTorch JIT, TensorRT
+* Jupyter notebook for inference with TensorRT Inference Server
+
+November 2019
+* Google Colab notebook for inference with native TensorRT
 
-### Changelog
 September 2019
-* Inference support for TRT 6
+* Inference support for TensorRT 6 with static shapes
 * Jupyter notebook for inference
 
-July 2019
+August 2019
 * Initial release
 
 

+ 1 - 0
PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server

@@ -0,0 +1 @@
+Subproject commit 71f0771cb8cb2a2eb1c6a9433f9a56dd1f206c96

+ 4 - 14
PyTorch/SpeechRecognition/Jasper/helpers.py

@@ -19,20 +19,10 @@ from enum import Enum
 from metrics import word_error_rate
 
 
-class Optimization(Enum):
-    """Various levels of Optimization.
-    WARNING: This might have effect on model accuracy."""
-    nothing = 0
-    mxprO0 = 1
-    mxprO1 = 2
-    mxprO2 = 3
-    mxprO3 = 4
-
-
-AmpOptimizations = {Optimization.mxprO0: "O0",
-                    Optimization.mxprO1: "O1",
-                    Optimization.mxprO2: "O2",
-                    Optimization.mxprO3: "O3"}
+
+
+
+AmpOptimizations = ["O0", "O1", "O2", "O3"]
 
 def print_once(msg):
     if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):

BIN
PyTorch/SpeechRecognition/Jasper/images/trtis_dynamic_batching.png


BIN
PyTorch/SpeechRecognition/Jasper/images/trtis_static_batching_bs1.png


BIN
PyTorch/SpeechRecognition/Jasper/images/trtis_static_batching_bs8.png


BIN
PyTorch/SpeechRecognition/Jasper/images/trtis_throughput_latency_summary.png


+ 151 - 67
PyTorch/SpeechRecognition/Jasper/inference.py

@@ -19,19 +19,24 @@ from tqdm import tqdm
 import math
 import toml
 from dataset import AudioToTextDataLayer
-from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_ctc_labels, AmpOptimizations, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
 from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
 from parts.features import audio_from_file
 import torch
+import torch.nn as nn
 import apex
 from apex import amp
 import random
 import numpy as np
 import pickle
 import time
+import os
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Jasper')
+
+    parser.register("type", "bool", lambda x: x.lower() in ("yes", "true", "t", "1"))
+
     parser.add_argument("--local_rank", default=None, type=int)
     parser.add_argument("--batch_size", default=16, type=int, help='data batch size')
     parser.add_argument("--steps", default=None, help='if not specified do evaluation on full dataset. otherwise only evaluates the specified number of iterations for each worker', type=int)
@@ -42,85 +47,143 @@ def parse_args():
     parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file')
     parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value")
     parser.add_argument("--fp16", action='store_true', help='use half precision')
+    parser.add_argument("--pyt_fp16", action='store_true', help='use half precision')
     parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark")
     parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location")
     parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path")
     parser.add_argument("--seed", default=42, type=int, help='seed')
+    parser.add_argument("--masked_fill", type="bool", help="Overrides the masked_fill option for the Encoder")
+    parser.add_argument("--output_dir", default="results/", type=str, help="Output directory to store exported models. Only used if --export_model is used")
+    parser.add_argument("--export_model", action='store_true', help="Exports the audio_featurizer, encoder and decoder using torch.jit to the output_dir")
     parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
     return parser.parse_args()
 
-def eval(
-        data_layer,
-        audio_processor,
-        encoderdecoder,
-        greedy_decoder,
-        labels,
-        multi_gpu,
-        args):
-    """performs inference / evaluation
-    Args:
-        data_layer: data layer object that holds data loader
-        audio_processor: data processing module
-        encoderdecoder: acoustic model
-        greedy_decoder: greedy decoder
-        labels: list of labels as output vocabulary
-        multi_gpu: true if using multiple gpus
-        args: script input arguments
-    """
-    logits_save_to=args.logits_save_to
-    audio_processor.eval()
-    encoderdecoder.eval()
+def calc_wer(data_layer, audio_processor, 
+             encoderdecoder, greedy_decoder, 
+             labels, args):
+
+    encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
     with torch.no_grad():
+        # reset global_var_dict - results of evaluation will be stored there
         _global_var_dict = {
             'predictions': [],
             'transcripts': [],
             'logits' : [],
         }
 
-
-        
-        if args.wav:
-            features, p_length_e = audio_processor(audio_from_file(args.wav))
-            torch.cuda.synchronize()
-            t0 = time.perf_counter()
-            t_log_probs_e = encoderdecoder(features)
-            torch.cuda.synchronize()
-            t1 = time.perf_counter()
-            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
-            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
-            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
-            print("TRANSCRIPT\t\t:", hypotheses[0])
-            return
-        
+        # Evaluation mini-batch for loop
         for it, data in enumerate(tqdm(data_layer.data_iterator)):
+
             tensors = []
             for d in data:
                 tensors.append(d.cuda())
-
+    
             t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
-
-            inp = (t_audio_signal_e, t_a_sig_length_e)
-
-            t_processed_signal, p_length_e = audio_processor(x=inp)
-            if args.use_conv_mask:
-                t_log_probs_e, t_encoded_len_e  = encoderdecoder((t_processed_signal, p_length_e))
-            else:
-                t_log_probs_e  = encoderdecoder(t_processed_signal)
-            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
-
+    
+            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e) 
+            t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
+            t_predictions_e = greedy_decoder(t_log_probs_e)
+    
             values_dict = dict(
                 predictions=[t_predictions_e],
                 transcript=[t_transcript_e],
                 transcript_length=[t_transcript_len_e],
                 output=[t_log_probs_e]
             )
+            # values_dict will contain results from all workers
             process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
-
+    
             if args.steps is not None and it + 1 >= args.steps:
                 break
+
+        # final aggregation (over minibatches) and logging of results
         wer, _ = process_evaluation_epoch(_global_var_dict)
+
+        return wer, _global_var_dict
+
+
+def jit_export(
+         audio, audio_len,
+         audio_processor,
+         encoderdecoder,
+         greedy_decoder,
+         args):
+
+                print("##############")
+
+                module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.fp16 else "fp32")
+
+                if args.masked_fill is not None and args.masked_fill == False:
+                    module_name = module_name + "_noMaskConv"
+
+                # Export just the featurizer
+                print("exporting featurizer ...")
+                traced_module_feat = torch.jit.script(audio_processor)
+                traced_module_feat.save(os.path.join(args.output_dir, module_name + "_feat.pt"))
+
+                # Export just the acoustic model
+                print("exporting acoustic model ...")
+                inp_postFeat, _ = audio_processor(audio, audio_len)
+                traced_module_acoustic = torch.jit.trace(encoderdecoder, inp_postFeat)
+                traced_module_acoustic.save(os.path.join(args.output_dir, module_name + "_acoustic.pt"))
+
+                # Export just the decoder
+                print("exporting decoder ...")
+
+                inp_postAcoustic = encoderdecoder(inp_postFeat)
+                traced_module_decode = torch.jit.script(greedy_decoder, inp_postAcoustic)
+                traced_module_decode.save(os.path.join(args.output_dir, module_name + "_decoder.pt"))
+                print("JIT export complete")
+
+                return traced_module_feat, traced_module_acoustic, traced_module_decode
+
+def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels):
+            features = audio_processor(audio, audio_len)
+            torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            t_log_probs_e = encoderdecoder(features[0])
+            torch.cuda.synchronize()
+            t1 = time.perf_counter()
+            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
+            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
+            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
+            print("TRANSCRIPT\t\t:", hypotheses[0])
+
+
+def eval(
+         data_layer,
+         audio_processor,
+         encoderdecoder,
+         greedy_decoder,
+         labels,
+         multi_gpu,
+         args):
+    """performs inference / evaluation
+    Args:
+        data_layer: data layer object that holds data loader
+        audio_processor: data processing module
+        encoderdecoder: acoustic model
+        greedy_decoder: greedy decoder
+        labels: list of labels as output vocabulary
+        multi_gpu: true if using multiple gpus
+        args: script input arguments
+    """
+    logits_save_to=args.logits_save_to
+    
+    with torch.no_grad():
+        if args.wav:
+            audio, audio_len = audio_from_file(args.wav)
+            run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels)
+            if args.export_model:
+                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor,
+                                                                                         encoderdecoder,
+                                                                                         greedy_decoder,args)
+            run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels)
+            return
+        wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args)
         if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
             print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
+      
             if args.save_prediction is not None:
                 with open(args.save_prediction, 'w') as fp:
                     fp.write('\n'.join(_global_var_dict['predictions']))
@@ -132,6 +195,15 @@ def eval(
                 with open(logits_save_to, 'wb') as f:
                     pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
 
+            # if args.export_model:            
+            #     feat, acoustic, decoder = jit_export(inp, audio_processor, encoderdecoder, greedy_decoder,args)
+            #     wer_after = calc_wer(data_layer, feat, acoustic, decoder, labels, args)
+            #     print("===>>>Before WER: {0}".format(wer))
+            #     print("===>>>Traced WER: {0}".format(wer_after))
+            #     print("===>>>Diff      : {0} %".format((wer_after - wer_before) * 100.0 / wer_before))
+            #     print("")
+
+                
 def main(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -148,9 +220,9 @@ def main(args):
         print("DISTRIBUTED with ", torch.distributed.get_world_size())
 
     if args.fp16:
-        optim_level = Optimization.mxprO3
+        optim_level = 3
     else:
-        optim_level = Optimization.mxprO0
+        optim_level = 0
 
     jasper_model_definition = toml.load(args.model_toml)
     dataset_vocab = jasper_model_definition['labels']['labels']
@@ -159,17 +231,27 @@ def main(args):
     val_manifest = args.val_manifest
     featurizer_config = jasper_model_definition['input_eval']
     featurizer_config["optimization_level"] = optim_level
+    featurizer_config["fp16"] = args.fp16
     args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
 
+    if args.masked_fill is not None:
+        print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
+        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill
+
     if args.max_duration is not None:
         featurizer_config['max_duration'] = args.max_duration
     if args.pad_to is not None:
-        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"
+        featurizer_config['pad_to'] = args.pad_to 
 
-    print('model_config')
+    if featurizer_config['pad_to'] == "max":
+        featurizer_config['pad_to'] = -1
+        
+    print('=== model_config ===')
     print_dict(jasper_model_definition)
-    print('feature_config')
+    print()
+    print('=== feature_config ===')
     print_dict(featurizer_config)
+    print()
     data_layer = None
     
     if args.wav is None:
@@ -179,20 +261,23 @@ def main(args):
             manifest_filepath=val_manifest,
             labels=dataset_vocab,
             batch_size=args.batch_size,
-            pad_to_max=featurizer_config['pad_to'] == "max",
+            pad_to_max=featurizer_config['pad_to'] == -1,
             shuffle=False,
             multi_gpu=multi_gpu)
     audio_preprocessor = AudioPreprocessing(**featurizer_config)
-
     encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab))
 
     if args.ckpt is not None:
         print("loading model from ", args.ckpt)
-        checkpoint = torch.load(args.ckpt, map_location="cpu")
-        for k in audio_preprocessor.state_dict().keys():
-            checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
-        audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
-        encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)
+
+        if os.path.isdir(args.ckpt):
+            exit(0)
+        else:
+            checkpoint = torch.load(args.ckpt, map_location="cpu")
+            for k in audio_preprocessor.state_dict().keys():
+                checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
+            audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
+            encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)
 
     greedy_decoder = GreedyCTCDecoder()
 
@@ -211,19 +296,18 @@ def main(args):
             print('Have {0} examples to eval on.'.format(N))
             print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
             print('-----------------')
-    else:
-            audio_preprocessor.featurizer.normalize = "per_feature"
 
     print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
     audio_preprocessor.cuda()
     encoderdecoder.cuda()
     if args.fp16:
-        encoderdecoder = amp.initialize(
-            models=encoderdecoder,
-            opt_level=AmpOptimizations[optim_level])
+        encoderdecoder = amp.initialize( models=encoderdecoder,
+                                         opt_level=AmpOptimizations[optim_level])
 
     encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
-
+    audio_preprocessor.eval()
+    encoderdecoder.eval()
+    greedy_decoder.eval()
     
     eval(
         data_layer=data_layer,

+ 17 - 17
PyTorch/SpeechRecognition/Jasper/inference_benchmark.py

@@ -26,7 +26,7 @@ import toml
 import torch
 from apex import amp
 from dataset import AudioToTextDataLayer
-from helpers import process_evaluation_batch, process_evaluation_epoch, Optimization, add_ctc_labels, AmpOptimizations, print_dict
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict
 from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
 
 def parse_args():
@@ -88,21 +88,16 @@ def eval(
                 dl_device = torch.device("cuda")
                 for d in data:
                     tensors.append(d.to(dl_device))
-
-
+     
                 t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
-
-                inp=(t_audio_signal_e, t_a_sig_length_e)
                 torch.cuda.synchronize()
                 t0 = time.perf_counter()
-                t_processed_signal, p_length_e = audio_processor(x=inp)
+                t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
                 torch.cuda.synchronize()
                 t1 = time.perf_counter()
                 
-                if args.use_conv_mask:
-                    t_log_probs_e, t_encoded_len_e  = encoderdecoder((t_processed_signal, p_length_e))
-                else:
-                    t_log_probs_e  = encoderdecoder(t_processed_signal)
+                t_log_probs_e, _  = encoderdecoder.infer(t_processed_signal)
+
                 torch.cuda.synchronize()
                 stop_time = time.perf_counter()
 
@@ -118,7 +113,7 @@ def eval(
                 process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
                 durations_dnn.append(time_dnn)
                 durations_dnn_and_prep.append(time_prep_and_dnn)
-                seq_lens.append(t_processed_signal.shape[-1])
+                seq_lens.append(t_processed_signal[0].shape[-1])
 
             if it >= steps:
 
@@ -135,7 +130,7 @@ def eval(
 
 def take_durations_and_output_percentile(durations, ratios):
     durations = np.asarray(durations) * 1000 # in ms
-    latency = durations
+    latency = durations 
 
     latency = latency[5:]
     mean_latency = np.mean(latency)
@@ -162,9 +157,9 @@ def main(args):
     assert(torch.cuda.is_available())
 
     if args.fp16:
-        optim_level = Optimization.mxprO3
+        optim_level = 3
     else:
-        optim_level = Optimization.mxprO0
+        optim_level = 0
     batch_size = args.batch_size
 
     jasper_model_definition = toml.load(args.model_toml)
@@ -174,11 +169,16 @@ def main(args):
     val_manifest = args.val_manifest
     featurizer_config = jasper_model_definition['input_eval']
     featurizer_config["optimization_level"] = optim_level
-    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+
     if args.max_duration is not None:
         featurizer_config['max_duration'] = args.max_duration
+    
+    # TORCHSCRIPT: Cant use mixed types. Using -1 for "max"
     if args.pad_to is not None:
-        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"
+        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else -1
+    
+    if featurizer_config['pad_to'] == "max":
+        featurizer_config['pad_to'] = -1
 
     print('model_config')
     print_dict(jasper_model_definition)
@@ -191,7 +191,7 @@ def main(args):
                             manifest_filepath=val_manifest,
                             labels=dataset_vocab,
                             batch_size=batch_size,
-                            pad_to_max=featurizer_config['pad_to'] == "max",
+                            pad_to_max=featurizer_config['pad_to'] == -1,
                             shuffle=False,
                             multi_gpu=False)
 

+ 7 - 3
PyTorch/SpeechRecognition/Jasper/metrics.py

@@ -51,10 +51,14 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float:
     """
     scores = 0
     words = 0
-    if len(hypotheses) != len(references):
+    len_diff = len(references) - len(hypotheses) 
+    if len_diff > 0:
         raise ValueError("In word error rate calculation, hypotheses and reference"
-                                         " lists must have the same number of elements. But I got:"
-                                         "{0} and {1} correspondingly".format(len(hypotheses), len(references)))
+                         " lists must have the same number of elements. But I got:"
+                         "{0} and {1} correspondingly".format(len(hypotheses), len(references)))
+    elif len_diff < 0:
+        hypotheses = hypotheses[:len_diff]
+    
     for h, r in zip(hypotheses, references):
         h_list = h.split()
         r_list = r.split()

+ 44 - 73
PyTorch/SpeechRecognition/Jasper/model.py

@@ -16,7 +16,6 @@ from apex import amp
 import torch
 import torch.nn as nn
 from parts.features import FeatureFactory
-from helpers import Optimization
 import random
 
 
@@ -55,23 +54,23 @@ def get_same_padding(kernel_size, stride, dilation):
 class AudioPreprocessing(nn.Module):
     """GPU accelerated audio preprocessing
     """
+    __constants__ = ["optim_level"]
     def __init__(self, **kwargs):
         nn.Module.__init__(self)    # For PyTorch API
-        self.optim_level = kwargs.get('optimization_level', Optimization.nothing)
+        self.optim_level = kwargs.get('optimization_level', 0)
         self.featurizer = FeatureFactory.from_config(kwargs)
+        self.transpose_out = kwargs.get("transpose_out", False)
 
-    def forward(self, x):
-        input_signal, length = x
-        length.requires_grad_(False)
-        if self.optim_level not in  [Optimization.nothing, Optimization.mxprO0, Optimization.mxprO3]:
-            with amp.disable_casts():
-                processed_signal = self.featurizer(x)
-                processed_length = self.featurizer.get_seq_len(length)
+    @torch.no_grad()
+    def forward(self, input_signal, length):
+        processed_signal = self.featurizer(input_signal, length)
+        processed_length = self.featurizer.get_seq_len(length)    
+        if self.transpose_out:
+            processed_signal.transpose_(2,1)
+            return processed_signal, processed_length
         else:
-                processed_signal = self.featurizer(x)
-                processed_length = self.featurizer.get_seq_len(length)
-        return processed_signal, processed_length
-
+            return processed_signal, processed_length
+                
 class SpectrogramAugmentation(nn.Module):
     """Spectrogram augmentation
     """
@@ -131,7 +130,7 @@ class SpecCutoutRegions(nn.Module):
     def forward(self, x):
         sh = x.shape
 
-        mask = torch.zeros(x.shape).byte()
+        mask = torch.zeros(x.shape, dtype=torch.uint8)
 
         for idx in range(sh[0]):
             for i in range(self.cutout_rect_regions):
@@ -148,7 +147,7 @@ class SpecCutoutRegions(nn.Module):
         return x
 
 class JasperEncoder(nn.Module):
-
+    __constants__ = ["use_conv_mask"]    
     """Jasper encoder
     """
     def __init__(self, **kwargs):
@@ -214,83 +213,58 @@ class JasperDecoderForCTC(nn.Module):
         out = self.decoder_layers(encoder_output[-1]).transpose(1, 2)
         return nn.functional.log_softmax(out, dim=2)
 
-class Jasper(nn.Module):
-    """Contains data preprocessing, spectrogram augmentation, jasper encoder and decoder
+class JasperEncoderDecoder(nn.Module):
+    """Contains jasper encoder and decoder
     """
     def __init__(self, **kwargs):
         nn.Module.__init__(self)
-        if kwargs.get("no_featurizer", False):
-            self.audio_preprocessor = None
-        else:
-            self.audio_preprocessor = AudioPreprocessing(**kwargs.get("feature_config"))
-
-        self.data_spectr_augmentation = SpectrogramAugmentation(**kwargs.get("feature_config"))
+        self.transpose_in=kwargs.get("transpose_in", False)
         self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition"))
         self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"),
                                                   num_classes=kwargs.get("num_classes"))
-        self.acoustic_model = JasperAcousticModel(self.jasper_encoder, self.jasper_decoder)
-
+        
     def num_weights(self):
         return sum(p.numel() for p in self.parameters() if p.requires_grad)
 
-    def forward(self, x):
-
-        # Apply optional preprocessing
-        if self.audio_preprocessor is not None:
-            t_processed_signal, p_length_t = self.audio_preprocessor(x)
-        # Apply optional spectral augmentation
-        if self.training:
-            t_processed_signal = self.data_spectr_augmentation(input_spec=t_processed_signal)
-            
-        if (self.jasper_encoder.use_conv_mask):
-            a_inp = (t_processed_signal, p_length_t)
-        else:
-            a_inp = t_processed_signal
-        # Forward Pass through Encoder-Decoder
-        return self.acoustic_model.forward(a_inp)
-
-
-class JasperAcousticModel(nn.Module):
-    def __init__(self, enc, dec, transpose_in=False):
-        nn.Module.__init__(self)
-        self.jasper_encoder = enc
-        self.jasper_decoder = dec
-        self.transpose_in = transpose_in
+    
     def forward(self, x):
         if self.jasper_encoder.use_conv_mask:
             t_encoded_t, t_encoded_len_t = self.jasper_encoder(x)
         else:
             if self.transpose_in:
-                x = x.transpose(1, 2)                
+                x = x.transpose(1, 2)   
             t_encoded_t = self.jasper_encoder(x)
-
-        out = self.jasper_decoder(encoder_output=t_encoded_t)
+            
+        out = self.jasper_decoder(t_encoded_t)
         if self.jasper_encoder.use_conv_mask:
             return out, t_encoded_len_t
         else:
             return out
 
-class JasperEncoderDecoder(nn.Module):
-    """Contains jasper encoder and decoder
+    def infer(self, x):
+        if self.jasper_encoder.use_conv_mask:
+            return self.forward(x)
+        else:
+            ret = self.forward(x[0])
+            return ret, len(ret)
+        
+    
+class Jasper(JasperEncoderDecoder):
+    """Contains data preprocessing, spectrogram augmentation, jasper encoder and decoder
     """
     def __init__(self, **kwargs):
-        nn.Module.__init__(self)
-        self.jasper_encoder = JasperEncoder(**kwargs.get("jasper_model_definition"))
-        self.jasper_decoder = JasperDecoderForCTC(feat_in=kwargs.get("feat_in"),
-                                                  num_classes=kwargs.get("num_classes"))
-        self.acoustic_model = JasperAcousticModel(self.jasper_encoder,
-                                                  self.jasper_decoder,
-                                                  kwargs.get("transpose_in", False))
-        
-    def num_weights(self):
-        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+        JasperEncoderDecoder.__init__(self, **kwargs)
+        feature_config = kwargs.get("feature_config")
+        if self.transpose_in:
+            feature_config["transpose"] = True
+        self.audio_preprocessor = AudioPreprocessing(**feature_config)
+        self.data_spectr_augmentation = SpectrogramAugmentation(**feature_config)
 
-    def forward(self, x):
-        return self.acoustic_model.forward(x)
 
 class MaskedConv1d(nn.Conv1d):
     """1D convolution with sequence masking
     """
+    __constants__ = ["use_conv_mask"]
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                              padding=0, dilation=1, groups=1, bias=False, use_conv_mask=True):
         super(MaskedConv1d, self).__init__(in_channels, out_channels, kernel_size,
@@ -313,16 +287,14 @@ class MaskedConv1d(nn.Conv1d):
             del mask
             del idxs
             lens = self.get_seq_len(lens)
+            return super(MaskedConv1d, self).forward(x), lens
         else:
-            x = inp
-        out = super(MaskedConv1d, self).forward(x)
+            return super(MaskedConv1d, self).forward(inp)
 
-        if self.use_conv_mask:
-            return out, lens
-        else:
-            return out
 
 class JasperBlock(nn.Module):
+    __constants__ = ["use_conv_mask", "conv"]
+
     """Jasper Block. See https://arxiv.org/pdf/1904.03288.pdf
     """
     def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1,
@@ -428,9 +400,8 @@ class GreedyCTCDecoder(nn.Module):
     """
     def __init__(self, **kwargs):
         nn.Module.__init__(self)    # For PyTorch API
-
+    @torch.no_grad()
     def forward(self, log_probs):
-        with torch.no_grad():
             argmx = log_probs.argmax(dim=-1, keepdim=False).int()
             return argmx
 

+ 34 - 8
PyTorch/SpeechRecognition/Jasper/notebooks/JasperTRT.ipynb

@@ -29,7 +29,7 @@
     "<img src=http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png style=\"width: 90px; float: right;\">\n",
     "\n",
     "# Jasper Inference For TensorRT 6\n",
-    "This Jupyter notebook provides scripts to perform high-performance inference using NVIDIA TensorRT. \n",
+    "This Jupyter notebook provides scripts to perform high-performance inference using NVIDIA TensorRT 6 with dynamic shapes. \n",
     "Jasper is a neural acoustic model for speech recognition. Its network architecture is designed to facilitate fast GPU inference. \n",
     "NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications.\n",
     "After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, inference throughput increased by up to 1.8x over native PyTorch."
@@ -77,15 +77,41 @@
    "source": [
     "## 2. Requirements\n",
     "\n",
-    "Please refer to README.md"
+    "Please refer to Jasper TensorRT README.md"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3. Jasper Inference\n",
-    "### 3.1  Start a detached session in the NGC container"
+    "## 3. Jasper Inference\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.1  Prepare Working Directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "if not 'workbookDir' in globals():\n",
+    "    workbookDir = os.getcwd() + \"/../\"\n",
+    "print('workbookDir: ' + workbookDir)\n",
+    "os.chdir(workbookDir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.2 Start a detached session in the NGC container"
    ]
   },
   {
@@ -127,7 +153,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 3.2 Download and preprocess the dataset.\n",
+    "### 3.3 Download and preprocess the dataset.\n",
     "You will not need to download the dataset if you directly go to Section 5 to play with audio examples.\n",
     "\n",
     "If LibriSpeech http://www.openslr.org/12 has already been downloaded and preprocessed, no further steps in this subsection need to be taken.\n",
@@ -190,7 +216,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 3.3. Start TensorRT inference prediction\n",
+    "### 3.4. Start TensorRT inference prediction\n",
     "\n",
     "Inside the container, use the following script to run inference with TensorRT.\n",
     "You will need to set the parameters such as: \n",
@@ -220,7 +246,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    " ### 3.4. Start TensorRT Inference Benchmark\n",
+    " ### 3.5. Start TensorRT Inference Benchmark\n",
     "\n",
     "Run the following commmand to run inference benchmark with TensorRT inside the container.\n",
     "\n",
@@ -424,7 +450,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.7.3"
   }
  },
  "nbformat": 4,

+ 274 - 0
PyTorch/SpeechRecognition/Jasper/notebooks/JasperTRTIS.ipynb

@@ -0,0 +1,274 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png style=\"width: 90px; float: right;\">\n",
+    "\n",
+    "# Jasper inference using TensorRT Inference Server\n",
+    "This Jupyter notebook provides scripts to deploy high-performance inference in NVIDIA TensorRT Inference Server offering different options for the model backend, among others NVIDIA TensorRT. \n",
+    "Jasper is a neural acoustic model for speech recognition. Its network architecture is designed to facilitate fast GPU inference. \n",
+    "NVIDIA TensorRT Inference Server provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server\n",
+    "NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications.\n",
+    "## 1. Overview\n",
+    "\n",
+    "The Jasper model is an end-to-end neural acoustic model for automatic speech recognition (ASR) that provides near state-of-the-art results on LibriSpeech among end-to-end ASR models without any external data. The Jasper architecture of convolutional layers was designed to facilitate fast GPU inference, by allowing whole sub-blocks to be fused into a single GPU kernel. This is important for meeting strict real-time requirements of ASR systems in deployment.The results of the acoustic model are combined with the results of external language models to get the top-ranked word sequences corresponding to a given audio segment. This post-processing step is called decoding.\n",
+    "\n",
+    "The original paper is Jasper: An End-to-End Convolutional Neural Acoustic Model https://arxiv.org/pdf/1904.03288.pdf.\n",
+    "\n",
+    "### 1.1 Model architecture\n",
+    "By default the model configuration is Jasper 10x5 with dense residuals. A Jasper BxR model has B blocks, each consisting of R repeating sub-blocks.\n",
+    "Each sub-block applies the following operations in sequence: 1D-Convolution, Batch Normalization, ReLU activation, and Dropout. \n",
+    "In the original paper Jasper is trained with masked convolutions, which masks out the padded part of an input sequence in a batch before the 1D-Convolution.\n",
+    "For inference masking is not used. The reason for this is that in inference, the original mask operation does not achieve better accuracy than without the mask operation on the test and development dataset. However, no masking achieves better inference performance especially after TensorRT optimization.\n",
+    "More information on the model architecture can be found in the [Jasper Pytorch README](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper)\n",
+    "\n",
+    "### 1.2 TensorRT Inference Server Overview\n",
+    "\n",
+    "A typical TensorRT Inference Server pipeline can be broken down into the following 8 steps:\n",
+    "1. Client serializes the inference request into a message and sends it to the server (Client Send)\n",
+    "2. Message travels over the network from the client to the server (Network)\n",
+    "3. Message arrives at server, and is deserialized (Server Receive)\n",
+    "4. Request is placed on the queue (Server Queue)\n",
+    "5. Request is removed from the queue and computed (Server Compute)\n",
+    "6. Completed request is serialized in a message and sent back to the client (Server Send)\n",
+    "7. Completed message travels over network from the server to the client (Network)\n",
+    "8. Completed message is deserialized by the client and processed as a completed inference request (Client Receive)\n",
+    "\n",
+    "Generally, for local clients, steps 1-4 and 6-8 will only occupy a small fraction of time, compared to steps 5-6. As backend deep learning systems like Jasper are rarely exposed directly to end users, but instead only interfacing with local front-end servers, for the sake of Jasper, we can consider that all clients are local.\n",
+    "In this section, we will go over how to launch TensorRT Inference Server and client and get the best performant solution that fits your specific application needs.\n",
+    "\n",
+    "Note: The following instructions are run from outside the container and call `docker run` commands as required.\n",
+    "\n",
+    "### 1.3 Inference Pipeline in TensorRT Inference Server\n",
+    "The Jasper model pipeline consists of 3 components, where each part can be customized to be a different backend: \n",
+    "\n",
+    "**Data preprocessor**\n",
+    "\n",
+    "The data processor transforms an input raw audio file into a spectrogram. By default the pipeline uses mel filter banks as spectrogram features. This part does not have any learnable weights.\n",
+    "\n",
+    "**Acoustic model**\n",
+    "\n",
+    "The acoustic model takes in the spectrogram and outputs a probability over a list of characters. This part is the most compute intensive, taking more than 90% of the entire end-to-end pipeline. The acoustic model is the only component with learnable parameters and what differentiates Jasper from other end-to-end neural speech recognition models. In the original paper, the acoustic model contains a masking operation for training (More details in [Jasper PyTorch README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/README.md)). We do not use masking for inference . \n",
+    "\n",
+    "**Greedy decoder**\n",
+    "\n",
+    "The decoder takes the probabilities over the list of characters and outputs the final transcription. Greedy decoding is a fast and simple way of doing this by always choosing the character with the maximum probability. \n",
+    "\n",
+    "To run a model with TensorRT, we first construct the model in PyTorch, which is then exported into a ONNX static graph. Finally, a TensorRT engine is constructed from the ONNX file and can be launched to do inference. The following table shows which backends are supported for each part along the model pipeline.\n",
+    "\n",
+    "|Backend\\Pipeline component|Data preprocessor|Acoustic Model|Decoder|\n",
+    "|---|---|---|---|\n",
+    "|PyTorch JIT|x|x|x|\n",
+    "|ONNX|-|x|-|\n",
+    "|TensorRT|-|x|-|\n",
+    "\n",
+    "In order to run inference with TensorRT outside of the inference server, refer to the [Jasper TensorRT README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/trt/README.md)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.3 Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Speed up Jasper Inference with TensorRT in TensorRT Inference Server\n",
+    "- Use of Mixed Precision for Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "Please refer to Jasper TensorRT Inference Server README.md"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Jasper Inference\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.1  Prepare Working Directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "if not 'workbookDir' in globals():\n",
+    "    workbookDir = os.getcwd() + \"/../\"\n",
+    "print('workbookDir: ' + workbookDir)\n",
+    "os.chdir(workbookDir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.2  Generate TRTIS Model Checkpoints\n",
+    "Use the PyTorch model checkpoint to generate all 3 model backends. You can find a pretrained checkpoint at https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16.\n",
+    "\n",
+    "Set the following parameters:\n",
+    "\n",
+    "* `ARCH`: hardware architecture. use 70 for Volta, 75 for Turing.\n",
+    "* `CHECKPOINT_DIR`: absolute path to model checkpoint directory.\n",
+    "* `CHECKPOINT`: model checkpoint name. (default: jasper10x5dr.pt)\n",
+    "* `PRECISION`: model precision. Default is using mixed precision.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ARCH=70\n",
+    "# replace with absolute path to checkpoint directory, which should include CHECKPOINT file\n",
+    "%env CHECKPOINT_DIR=<CHECKPOINT_DIR> \n",
+    "# CHECKPOINT file name\n",
+    "%env CHECKPOINT=jasper_fp16.pt \n",
+    "%env PRECISION=fp16\n",
+    "!echo \"ARCH=${ARCH} CHECKPOINT_DIR=${CHECKPOINT_DIR} CHECKPOINT=${CHECKPOINT} PRECISION=${PRECISION} trtis/scripts/export_model.sh\"\n",
+    "!ARCH=${ARCH} CHECKPOINT_DIR=${CHECKPOINT_DIR} CHECKPOINT=${CHECKPOINT} PRECISION=${PRECISION} trtis/scripts/export_model.sh"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!bash trtis/scripts/prepare_model_repository.sh"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.3  Start the TensorRT Inference Server using Docker"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!bash trtis/scripts/run_server.sh"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.4. Start inference prediction in TRTIS\n",
+    "\n",
+    "Use the following script to run inference with TensorRT Inference Server.\n",
+    "You will need to set the parameters such as: \n",
+    "\n",
+    "\n",
+    "* `MODEL_TYPE`: Model pipeline type. Choose from [pyt, onnx, trt] for Pytorch JIT, ONNX, or TensorRT model pipeline.\n",
+    "* `DATA_DIR`: absolute path to directory with audio files\n",
+    "* `FILE`: relative path of audio file to `DATA_DIR`\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_TYPE=\"trt\"\n",
+    "DATA_DIR=os.path.join(workbookDir, \"notebooks/\")\n",
+    "FILE=\"example1.wav\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!bash trtis/scripts/run_client.sh $MODEL_TYPE $DATA_DIR $FILE"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can play with other examples from the 'notebooks' directory. You can also add your own audio files and generate the output text files in this way."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.5. Stop your container in the end"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!docker stop jasper-trtis"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 138 - 18
PyTorch/SpeechRecognition/Jasper/notebooks/README.md

@@ -1,37 +1,49 @@
-# Jasper notebook
+# Jasper notebooks
 
-## 1. JasperTRT.ipynb
-### Overview
+This folder provides different notebooks to run Jasper inference step by step. 
 
-This notebook provides scripts for you to run Jasper with TRT for inference step by step. You can run inference using either LibriSpeech dataset or your own audio input in .wav format, to generate the corresponding text file for the audio file.
+## Table Of Contents
 
+- [Jasper Jupyter Notebook for TensorRT](#jasper-jupyter-notebook-for-tensorrt)
+  * [Requirements](#requirements)
+  * [Quick Start Guide](#quick-start-guide)
+- [Jasper Colab Notebook for TensorRT](#jasper-colab-notebook-for-tensorrt)
+  * [Requirements](#requirements-1)
+  * [Quick Start Guide](#quick-start-guide-1)
+- [Jasper Jupyter Notebook for TensorRT Inference Server](#jasper-jupyter-notebook-for-tensorrt-inference-server)
+  * [Requirements](#requirements-2)
+  * [Quick Start Guide](#quick-start-guide-2)
+
+## Jasper Jupyter Notebook for TensorRT
 ### Requirements
 
-This repository contains a Dockerfile which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+`./trt/` contains a Dockerfile which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) or [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) based GPU    
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
 * [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [NVIDIA machine learning repository](https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb) and [NVIDIA cuda repository](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb) for NVIDIA TensorRT 6
+* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
 * [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
 
 ### Quick Start Guide
 
 Running the following scripts will build and launch the container containing all required dependencies for both TensorRT as well as native PyTorch. This is necessary for using inference with TensorRT and can also be used for data download, processing and training of the model.
 
-##### 1. Clone the repository.
+#### 1. Clone the repository.
 
 ```
 git clone https://github.com/NVIDIA/DeepLearningExamples
 cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
 ```
 
-##### 2. Build the Jasper PyTorch with TRT 6 container:
+#### 2. Build the Jasper PyTorch with TRT 6 container:
 
 ```
-bash trt/scripts/docker/trt_build.sh
+bash trt/scripts/docker/build.sh
 ```
 
-##### 3. Create directories
+#### 3. Create directories
 Prepare to start a detached session in the NGC container.
 Create three directories on your local machine for dataset, checkpoint, and result, respectively, naming "data" "checkpoint" "result":
 
@@ -39,7 +51,7 @@ Create three directories on your local machine for dataset, checkpoint, and resu
 mkdir data checkpoint result
 ```
 
-##### 4. Download the checkpoint
+#### 4. Download the checkpoint
 Download the checkpoint file jasperpyt_fp16 from NGC Model Repository:  
 - https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16
 
@@ -49,18 +61,80 @@ The Jasper PyTorch container will be launched in the Jupyter notebook. Within th
 
 The /datasets, /checkpoints, /results directories are mounted as volumes and mapped to the corresponding directories "data" "checkpoint" "result" on the host.
 
-##### 5. Run the notebook
-Copy the notebook to the root directory of Jasper:
+#### 5. Run the notebook
+
+For running the notebook on your local machine, run:
+
+```
+jupyter notebook notebooks/JasperTRT.ipynb
+```
+
+For running the notebook on another machine remotely, run:
 
 ```
-cp notebooks/JasperTRT.ipynb .
+jupyter notebook --ip=0.0.0.0 --allow-root
 ```
 
-#### 6. Run the notebook
+And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`
+
+Use the token listed in the output from running the jupyter command to log in, for example: `http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`
+
+
+
+## Jasper Colab Notebook for TensorRT
+### Requirements
+
+`./trt/` contains a Dockerfile which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+
+* [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) or [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) based GPU    
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [NVIDIA machine learning repository](https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb) and [NVIDIA cuda repository](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb) for NVIDIA TensorRT 6
+* [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
+
+### Quick Start Guide
+
+Running the following scripts will build and launch the container containing all required dependencies for both TensorRT as well as native PyTorch. This is necessary for using inference with TensorRT and can also be used for data download, processing and training of the model.
+
+#### 1. Clone the repository.
+
+```
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
+```
+
+#### 2. Build the Jasper PyTorch with TRT 6 container:
+
+```
+bash trt/scripts/docker/build.sh
+```
+
+#### 3. Create directories
+Prepare to start a detached session in the NGC container.
+Create three directories on your local machine for dataset, checkpoint, and result, respectively, naming "data" "checkpoint" "result":
+
+```
+mkdir data checkpoint result
+```
+
+#### 4. Download the checkpoint
+Download the checkpoint file jasperpyt_fp16 from NGC Model Repository:  
+- https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16
+
+to the directory: _checkpoint_
+
+The Jasper PyTorch container will be launched in the Jupyter notebook. Within the container, the contents of the root repository will be copied to the /workspace/jasper directory.
+
+The /datasets, /checkpoints, /results directories are mounted as volumes and mapped to the corresponding directories "data" "checkpoint" "result" on the host.
+
+#### 5. Run the notebook
+
+
 For running the notebook on your local machine, run:
 
 ```
-jupyter notebook JasperTRT.ipynb
+jupyter notebook notebooks/Colab_Jasper_TRT_inference_demo.ipynb
 ```
 
 For running the notebook on another machine remotely, run:
@@ -74,10 +148,56 @@ And navigate a web browser to the IP address or hostname of the host machine at
 Use the token listed in the output from running the jupyter command to log in, for example: `http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`
 
 
-## 2. Colab_Jasper_TRT_inference_demo
 
-This notebook demoes the process of Jasper inference with TensorRT from a pretrained checkpoint in Google Colab. It can be opened directly in Colab via this [link](https://colab.research.google.com/github/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/notebooks/Colab_Jasper_TRT_inference_demo.ipynb).
+## Jasper Jupyter Notebook for TensorRT Inference Server
+### Requirements
 
-```python
+`./trtis/` contains a Dockerfile which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+
+* [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) or [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) based GPU    
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [TensorRT Inference Server 19.09 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrtserver)
+* [NVIDIA machine learning repository](https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb) and [NVIDIA cuda repository](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb) for NVIDIA TensorRT 6
+* [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
+
+### Quick Start Guide
+
+
+#### 1. Clone the repository.
 
 ```
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
+```
+
+#### 2. Build a container that extends NGC PyTorch 19.09, TensorRT, TensorRT Inference Server, and TensorRT Inference Client.
+
+```
+bash trtis/scripts/docker/build.sh
+```
+
+#### 3. Download the checkpoint
+Download the checkpoint file jasper_fp16.pt from NGC Model Repository:  
+- https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16
+
+to a user-specified directory _CHECKPOINT_DIR_
+
+#### 4. Run the notebook
+
+For running the notebook on your local machine, run:
+
+```
+jupyter notebook notebooks/JasperTRTIS.ipynb
+```
+
+For running the notebook on another machine remotely, run:
+
+```
+jupyter notebook --ip=0.0.0.0 --allow-root
+```
+
+And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`
+
+Use the token listed in the output from running the jupyter command to log in, for example: `http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`

BIN
PyTorch/SpeechRecognition/Jasper/notebooks/keynote.wav


+ 0 - 0
PyTorch/SpeechRecognition/Jasper/parts/__init__.py


+ 85 - 56
PyTorch/SpeechRecognition/Jasper/parts/features.py

@@ -58,8 +58,32 @@ class WaveformFeaturizer(object):
 
         return cls(input_config, augmentor=aa)
 
-constant = 1e-5
-def normalize_batch(x, seq_len, normalize_type):
+
+# @torch.jit.script
+# def normalize_batch_per_feature(x, seq_len):
+#     x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
+#     x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
+
+#     for i in range(x.shape[0]):
+#         x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1)
+#         x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1)
+#     # make sure x_std is not zero
+#     x_std += 1e-5
+#     return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2)
+
+# @torch.jit.script
+# def normalize_batch_all_features(x, seq_len):
+#     x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+#     x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+#     for i in range(x.shape[0]):
+#         x_mean[i] = x[i, :, :int(seq_len[i])].mean()
+#         x_std[i] = x[i, :, :int(seq_len[i])].std()
+#     # make sure x_std is not zero
+#     x_std += 1e-5
+#     return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1)
+
[email protected]
+def normalize_batch(x, seq_len, normalize_type: str):
 #    print ("normalize_batch: x, seq_len, shapes: ", x.shape, seq_len, seq_len.shape)
     if normalize_type == "per_feature":
         x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype,
@@ -70,21 +94,22 @@ def normalize_batch(x, seq_len, normalize_type):
             x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1)
             x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1)
         # make sure x_std is not zero
-        x_std += constant
+        x_std += 1e-5
         return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2)
     elif normalize_type == "all_features":
         x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
         x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
         for i in range(x.shape[0]):
-            x_mean[i] = x[i, :, :seq_len[i].item()].mean()
-            x_std[i] = x[i, :, :seq_len[i].item()].std()
+            x_mean[i] = x[i, :, :int(seq_len[i])].mean()
+            x_std[i] = x[i, :, :int(seq_len[i])].std()
         # make sure x_std is not zero
-        x_std += constant
+        x_std += 1e-5
         return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1)
     else:
         return x
 
-def splice_frames(x, frame_splicing):
[email protected]
+def splice_frames(x, frame_splicing: int):
     """ Stacks frames together across feature dim
 
     input is batch_size, feature_dim, num_frames
@@ -92,15 +117,19 @@ def splice_frames(x, frame_splicing):
 
     """
     seq = [x]
-    for n in range(1, frame_splicing):
-        seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2))
+    # TORCHSCRIPT: JIT doesnt like range(start, stop)
+    for n in range(frame_splicing - 1):
+        seq.append(torch.cat([x[:, :, :n + 1], x[:, :, n + 1:]], dim=2))
     return torch.cat(seq, dim=1)
 
 class SpectrogramFeatures(nn.Module):
+    # For JIT. See https://pytorch.org/docs/stable/jit.html#python-defined-constants
+    __constants__ = ["dither", "preemph", "n_fft", "hop_length", "win_length", "log", "frame_splicing", "window", "normalize", "pad_to", "max_duration", "do_normalize"]
+
     def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
                        n_fft=None,
-                       window="hamming", normalize="per_feature", log=True, center=True,
-                       dither=constant, pad_to=8, max_duration=16.7,
+                       window="hamming", normalize="per_feature", log=True, 
+                       dither=1e-5, pad_to=8, max_duration=16.7,
                        frame_splicing=1):
         super(SpectrogramFeatures, self).__init__()
         torch_windows = {
@@ -121,13 +150,12 @@ class SpectrogramFeatures(nn.Module):
 
         self.normalize = normalize
         self.log = log
-        self.center = center
         self.dither = dither
         self.pad_to = pad_to
         self.frame_splicing = frame_splicing
 
         max_length = 1 + math.ceil(
-                (max_duration * sample_rate - self.win_length) / self.hop_length
+            (max_duration * sample_rate - self.win_length) / self.hop_length
         )
         max_pad = 16 - (max_length % 16)
         self.max_length = max_length + max_pad
@@ -137,8 +165,7 @@ class SpectrogramFeatures(nn.Module):
             dtype=torch.int)
 
     @torch.no_grad()
-    def forward(self, inp):
-        x, seq_len = inp
+    def forward(self, x, seq_len):
         dtype = x.dtype
 
         seq_len = self.get_seq_len(seq_len)
@@ -154,8 +181,8 @@ class SpectrogramFeatures(nn.Module):
 
         # get spectrogram
         x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
-                          win_length=self.win_length, center=self.center,
-                          window=self.window.to(torch.float))
+                       win_length=self.win_length,
+                       window=self.window.to(torch.float))
         x = torch.sqrt(x.pow(2).sum(-1))
 
         # log features if required
@@ -167,22 +194,25 @@ class SpectrogramFeatures(nn.Module):
             x = splice_frames(x, self.frame_splicing)
 
         # normalize if required
-        if self.normalize:
-            x = normalize_batch(x, seq_len, normalize_type=self.normalize)
+        x = normalize_batch(x, seq_len, normalize_type=self.normalize)
 
         # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency)
         max_len = x.size(-1)
-        mask = torch.arange(max_len).to(seq_len.dtype).to(seq_len.device).expand(x.size(0), max_len) >= seq_len.unsqueeze(1)
+        mask = torch.arange(max_len, dtype=seq_len.dtype).to(seq_len.device).expand(x.size(0), max_len) >= seq_len.unsqueeze(1)
         x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0)
-        del mask
+        
+        # TORCHSCRIPT: Is this del important? It breaks scripting
+        #del mask
+    
         pad_to = self.pad_to
-        if pad_to == "max":
+    
+        # TORCHSCRIPT: Cant have mixed types. Using pad_to < 0 for "max"
+        if pad_to < 0:
             x = nn.functional.pad(x, (0, self.max_length - x.size(-1)))
         elif pad_to > 0:
             pad_amt = x.size(-1) % pad_to
             if pad_amt != 0:
                 x = nn.functional.pad(x, (0, pad_to - pad_amt))
-
         return x.to(dtype)
 
     @classmethod
@@ -194,8 +224,11 @@ class SpectrogramFeatures(nn.Module):
                    max_duration=cfg.get('max_duration', 16.7),
                    dither=cfg.get('dither', 1e-5), pad_to=cfg.get("pad_to", 0),
                    frame_splicing=cfg.get("frame_splicing", 1), log=log)
-
+constant=1e-5
 class FilterbankFeatures(nn.Module):
+    # For JIT. See https://pytorch.org/docs/stable/jit.html#python-defined-constants
+    __constants__ = ["dither", "preemph", "n_fft", "hop_length", "win_length", "center", "log", "frame_splicing", "window", "normalize", "pad_to", "max_duration", "max_length"]
+
     def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
                        window="hamming", normalize="per_feature", n_fft=None,
                        preemph=0.97,
@@ -204,7 +237,6 @@ class FilterbankFeatures(nn.Module):
                        max_duration=16.7,
                        frame_splicing=1):
         super(FilterbankFeatures, self).__init__()
-#        print("PADDING: {}".format(pad_to))
 
         torch_windows = {
             'hann': torch.hann_window,
@@ -220,6 +252,7 @@ class FilterbankFeatures(nn.Module):
 
         self.normalize = normalize
         self.log = log
+        #TORCHSCRIPT: Check whether or not we need this
         self.dither = dither
         self.frame_splicing = frame_splicing
         self.nfilt = nfilt
@@ -238,26 +271,25 @@ class FilterbankFeatures(nn.Module):
         self.register_buffer("window", window_tensor)
         # Calculate maximum sequence length (# frames)
         max_length = 1 + math.ceil(
-                (max_duration * sample_rate - self.win_length) / self.hop_length
+            (max_duration * sample_rate - self.win_length) / self.hop_length
         )
         max_pad = 16 - (max_length % 16)
         self.max_length = max_length + max_pad
 
-
     def get_seq_len(self, seq_len):
         return torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to(
             dtype=torch.int)
-            # dtype=torch.long)
-
-    @torch.no_grad()
-    def forward(self, inp):
-        x, seq_len = inp
 
+    # do stft
+    # TORCHSCRIPT: center removed due to bug
+    def  stft(self, x):
+        return torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
+                          win_length=self.win_length,
+                          window=self.window.to(dtype=torch.float))
+    def forward(self, x, seq_len):
         dtype = x.dtype
 
         seq_len = self.get_seq_len(seq_len)
-
-#        print ("forward: x, seq_len, shapes: ", x.shape, seq_len, seq_len.shape)
         
         # dither
         if self.dither > 0:
@@ -267,13 +299,10 @@ class FilterbankFeatures(nn.Module):
         if self.preemph is not None:
             x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]),
                                         dim=1)
-
-        # do stft
-        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
-                       win_length=self.win_length,
-                       center=True, window=self.window.to(dtype=torch.float))
-
-        # get power spectrum
+            
+        x  = self.stft(x)
+            
+            # get power spectrum
         x = x.pow(2).sum(-1)
 
         # dot with filterbank energies
@@ -288,25 +317,25 @@ class FilterbankFeatures(nn.Module):
             x = splice_frames(x, self.frame_splicing)
 
         # normalize if required
-        if self.normalize:
-            x = normalize_batch(x, seq_len, normalize_type=self.normalize)
+        x = normalize_batch(x, seq_len, normalize_type=self.normalize)
 
         # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency)
         max_len = x.size(-1)
-        mask = torch.arange(max_len).to(seq_len.dtype).to(x.device).expand(x.size(0),
-                                                                           max_len) >= seq_len.unsqueeze(1)
-
-        x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0)
-        del mask
-        pad_to = self.pad_to
-        if pad_to == "max":
+        mask = torch.arange(max_len, dtype=seq_len.dtype).to(x.device).expand(x.size(0),
+                                                                              max_len) >= seq_len.unsqueeze(1)
+
+        x = x.masked_fill(mask.unsqueeze(1), 0)
+        # TORCHSCRIPT: Is this del important? It breaks scripting
+        # del mask
+        # TORCHSCRIPT: Cant have mixed types. Using pad_to < 0 for "max"
+        if self.pad_to < 0:
             x = nn.functional.pad(x, (0, self.max_length - x.size(-1)))
-        elif pad_to > 0:
-            pad_amt = x.size(-1) % pad_to
-            if pad_amt != 0:
-                x = nn.functional.pad(x, (0, pad_to - pad_amt))
-
-        return x.to(dtype)
+        elif self.pad_to > 0:
+            pad_amt = x.size(-1) % self.pad_to
+            #            if pad_amt != 0:
+            x = nn.functional.pad(x, (0, self.pad_to - pad_amt))
+        
+        return x # .to(dtype)
 
     @classmethod
     def from_config(cls, cfg, log=False):

+ 30 - 23
PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh

@@ -1,31 +1,38 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+#!/bin/bash
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+JASPER_REPO=${JASPER_REPO:-"${SCRIPT_DIR}/../.."}
 
+# Launch TRT JASPER container.
 
-#!/bin/bash
+DATA_DIR=${DATA_DIR:-"/datasets"}
+CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/checkpoints"}
+RESULT_DIR=${RESULT_DIR:-"/results"}
+PROGRAM_PATH=${PROGRAM_PATH}
+    
+
+MOUNTS=""
+if [ ! -z "$DATA_DIR" ]; 
+then
+    MOUNTS="$MOUNTS -v $DATA_DIR:/datasets "
+fi
+
+if [ ! -z "$CHECKPOINT_DIR" ]; 
+then
+    MOUNTS="$MOUNTS -v $CHECKPOINT_DIR:/checkpoints "
+fi
 
-DATA_DIR=$1
-CHECKPOINT_DIR=$2
-RESULT_DIR=$3
+if [ ! -z "$RESULT_DIR" ]; 
+then
+    MOUNTS="$MOUNTS -v $RESULT_DIR:/results "
+fi
 
-docker run -it --rm \
+echo $MOUNTS
+nvidia-docker run -it --rm \
   --runtime=nvidia \
   --shm-size=4g \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
-  -v "$DATA_DIR":/datasets \
-  -v "$CHECKPOINT_DIR":/checkpoints/ \
-  -v "$RESULT_DIR":/results/ \
-  -v $PWD:/code \
-  jasper bash
+  ${MOUNTS} \
+  -v ${JASPER_REPO}:/jasper \
+  ${EXTRA_JASPER_ENV} \
+  jasper:latest bash $PROGRAM_PATH

+ 1 - 1
PyTorch/SpeechRecognition/Jasper/scripts/inference.sh

@@ -21,7 +21,7 @@ DATA_DIR=${1-"/datasets/LibriSpeech"}
 DATASET=${2:-"dev-clean"}
 MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
 RESULT_DIR=${4:-"/results"}
-CHECKPOINT=$5
+CHECKPOINT=${5:-"/checkpoints/jasper_fp16.pt"}
 CREATE_LOGFILE=${6:-"true"}
 CUDNN_BENCHMARK=${7:-"false"}
 PRECISION=${8:-"fp32"}

+ 1 - 1
PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh

@@ -22,7 +22,7 @@ DATA_DIR=${1:-"/datasets/LibriSpeech"}
 DATASET=${2:-"dev-clean"}
 MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
 RESULT_DIR=${4:-"/results"}
-CHECKPOINT=$5
+CHECKPOINT=${5:-"/checkpoints/jasper_fp16.pt"}
 CREATE_LOGFILE=${6:-"true"}
 CUDNN_BENCHMARK=${7:-"true"}
 PRECISION=${8:-"fp32"}

+ 9 - 4
PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh

@@ -15,6 +15,8 @@
 #!/bin/bash
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/..
 
 DATA_DIR=${1:-"/datasets/LibriSpeech"}
 MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
@@ -30,7 +32,7 @@ BATCH_SIZE=${11:-64}
 LEARNING_RATE=${12:-"0.015"}
 GRADIENT_ACCUMULATION_STEPS=${13:-1}
 PRINT_FREQUENCY=${14:-1}
-
+USE_PROFILER=${USE_PROFILER:-"false"}
 
 PREC=""
 if [ "$PRECISION" = "fp16" ] ; then
@@ -59,8 +61,11 @@ else
    CUDNN=""
 fi
 
-
-CMD=" train.py"
+if [ "${USE_PROFILER}" = "true" ] ; then
+    PYTHON_ARGS+="-m cProfile  -s cumtime"
+fi
+    
+CMD="${PYTHON_ARGS} ${PROJECT_DIR}/train.py"
 CMD+=" --batch_size=$BATCH_SIZE"
 CMD+=" --num_epochs=400"
 CMD+=" --output_dir=$RESULT_DIR"
@@ -78,7 +83,7 @@ CMD+=" --eval_freq=100000"
 CMD+=" --max_duration=$MAX_DURATION"
 CMD+=" --pad_to_max"
 CMD+=" --train_freq=$PRINT_FREQUENCY"
-CMD+=" --lr_decay"
+CMD+=" --lr_decay "
 CMD+=" $CUDNN"
 CMD+=" $PREC"
 CMD+=" $STEPS"

+ 43 - 27
PyTorch/SpeechRecognition/Jasper/train.py

@@ -24,11 +24,10 @@ import random
 import numpy as np
 import math
 from dataset import AudioToTextDataLayer
-from helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch, Optimization, add_ctc_labels, AmpOptimizations, model_multi_gpu, print_dict, print_once
+from helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch,  add_ctc_labels, AmpOptimizations, model_multi_gpu, print_dict, print_once
 from model import AudioPreprocessing, CTCLossNM, GreedyCTCDecoder, Jasper
 from optimizers import Novograd, AdamW
 
-
 def lr_policy(initial_lr, step, N):
     """
     learning rate decay
@@ -114,7 +113,15 @@ def train(
                 t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
 
                 model.eval()
-                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
+                if optim_level == 1:
+                  with amp.disable_casts():
+                      t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(t_audio_signal_e, t_a_sig_length_e) 
+                else:
+                  t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(t_audio_signal_e, t_a_sig_length_e)
+                if jasper_encoder.use_conv_mask:
+                    t_log_probs_e, t_encoded_len_e = model.forward((t_processed_signal_e, t_processed_sig_length_e))
+                else:
+                    t_log_probs_e = model.forward(t_processed_signal_e)
                 t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                 t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
 
@@ -139,6 +146,10 @@ def train(
     epoch = args.start_epoch
     step = epoch * args.step_per_epoch
 
+    audio_preprocessor = model.module.audio_preprocessor if hasattr(model, 'module') else model.audio_preprocessor
+    data_spectr_augmentation = model.module.data_spectr_augmentation if hasattr(model, 'module') else model.data_spectr_augmentation
+    jasper_encoder = model.module.jasper_encoder if hasattr(model, 'module') else model.jasper_encoder
+
     while True:
         if multi_gpu:
             data_layer.sampler.set_epoch(epoch)
@@ -165,13 +176,22 @@ def train(
 
             t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
             model.train()
+            if optim_level == 1:
+              with amp.disable_casts():
+                  t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(t_audio_signal_t, t_a_sig_length_t) 
+            else:
+              t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(t_audio_signal_t, t_a_sig_length_t)
+            t_processed_signal_t = data_spectr_augmentation(t_processed_signal_t)
+            if jasper_encoder.use_conv_mask:
+                t_log_probs_t, t_encoded_len_t = model.forward((t_processed_signal_t, t_processed_sig_length_t))
+            else:
+                t_log_probs_t = model.forward(t_processed_signal_t)
             
-            t_log_probs_t, t_encoded_len_t = model(x=(t_audio_signal_t, t_a_sig_length_t))
             t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
             if args.gradient_accumulation_steps > 1:
                     t_loss_t = t_loss_t / args.gradient_accumulation_steps
 
-            if optim_level in AmpOptimizations:
+            if optim_level >=0 and optim_level <=3:
                 with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                     scaled_loss.backward()
             else:
@@ -189,7 +209,6 @@ def train(
                     train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                     print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                     print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
-
                 if step > 0 and step % args.eval_frequency == 0:
                     print_once("Doing Evaluation ....................... ......  ... .. . .")
                     eval()
@@ -224,15 +243,16 @@ def main(args):
         torch.cuda.set_device(args.local_rank)
         torch.distributed.init_process_group(backend='nccl', init_method='env://')
 
+
     multi_gpu = torch.distributed.is_initialized()
     if multi_gpu:
         print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))
-
+                
     # define amp optimiation level
     if args.fp16:
-        optim_level = Optimization.mxprO1
+        optim_level = 1
     else:
-        optim_level = Optimization.mxprO0
+        optim_level = 0
 
     jasper_model_definition = toml.load(args.model_toml)
     dataset_vocab = jasper_model_definition['labels']['labels']
@@ -251,8 +271,9 @@ def main(args):
         assert(args.max_duration > 0)
         featurizer_config['max_duration'] = args.max_duration
         featurizer_config_eval['max_duration'] = args.max_duration
-        featurizer_config['pad_to'] = "max"
-        featurizer_config_eval['pad_to'] = "max"
+        featurizer_config['pad_to'] = -1        
+        featurizer_config_eval['pad_to'] = -1
+        
     print_once('model_config')
     print_dict(jasper_model_definition)
 
@@ -325,32 +346,27 @@ def main(args):
                         weight_decay=args.weight_decay)
     else:
         raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))
-
-
-    if optim_level in AmpOptimizations:
+    if optim_level >= 0 and optim_level <=3:
         model, optimizer = amp.initialize(
             min_loss_scale=1.0,
             models=model,
             optimizers=optimizer,
             opt_level=AmpOptimizations[optim_level])
-
     if args.ckpt is not None:
         optimizer.load_state_dict(checkpoint['optimizer'])
 
     model = model_multi_gpu(model, multi_gpu)
 
-    train(
-        data_layer=data_layer,
-        data_layer_eval=data_layer_eval,
-        model=model,
-        ctc_loss=ctc_loss,
-        greedy_decoder=greedy_decoder,
-        optimizer=optimizer,
-        labels=ctc_vocab,
-        optim_level=optim_level,
-        multi_gpu=multi_gpu,
-        fn_lr_policy=fn_lr_policy if args.lr_decay else None,
-        args=args)
+
+    train(data_layer, data_layer_eval, model, \
+          ctc_loss=ctc_loss, \
+          greedy_decoder=greedy_decoder, \
+          optimizer=optimizer, \
+          labels=ctc_vocab, \
+          optim_level=optim_level, \
+          multi_gpu=multi_gpu, \
+          fn_lr_policy=fn_lr_policy if args.lr_decay else None, \
+          args=args)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Jasper')

+ 5 - 13
PyTorch/SpeechRecognition/Jasper/trt/Dockerfile

@@ -1,18 +1,12 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3 
 FROM ${FROM_IMAGE_NAME}
 
 RUN apt-get update && apt-get install -y python3
-RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb \
-&& dpkg -i cuda-repo-*.deb \
-&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb \
-&& dpkg -i nvidia-machine-learning-repo-*.deb \
-&& apt-get update \
-&& apt-get install -y --no-install-recommends python-libnvinfer python3-libnvinfer
 
-
-RUN cp -r /usr/lib/python3.6/dist-packages/tensorrt /opt/conda/lib/python3.6/site-packages/tensorrt
-# Add TensorRT executable to path (trtexec)
-ENV PATH=$PATH:/usr/src/tensorrt/bin
+WORKDIR /tmp/onnx-trt
+COPY trt/onnx-trt.patch .
+RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git submodule update --init --recursive && \
+    patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
 
 
 # Here's a good place to install pip reqs from JoC repo.
@@ -22,8 +16,6 @@ COPY requirements.txt /tmp/pipReqs/jocRequirements.txt
 COPY trt/requirements.txt /tmp/pipReqs/trtRequirements.txt
 RUN pip install --disable-pip-version-check -U -r jocRequirements.txt -r trtRequirements.txt
 
-# These packages are required for running preprocessing on the dataset to acquire manifest files and the like
-RUN apt-get install -y libsndfile1 && apt-get install -y ffmpeg sox && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /workspace/jasper
 COPY . .

+ 107 - 104
PyTorch/SpeechRecognition/Jasper/trt/README.md

@@ -1,6 +1,7 @@
+
 # Jasper Inference For TensorRT
 
-This is subfolder of the Jasper for PyTorch repository, tested and maintained by NVIDIA, and provides scripts to perform high-performance inference using NVIDIA TensorRT. Jasper is a neural acoustic model for speech recognition. Its network architecture is designed to facilitate fast GPU inference. More information about Jasper and its training and be found in the [root directory](../README.md). 
+This is a subfolder of the Jasper for PyTorch repository, tested and maintained by NVIDIA, and provides scripts to perform high-performance inference using NVIDIA TensorRT. Jasper is a neural acoustic model for speech recognition. Its network architecture is designed to facilitate fast GPU inference. More information about Jasper and its training can be found in the [Jasper PyTorch README](../README.md). 
 NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications.
 After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, inference throughput increased by up to 1.8x over native PyTorch. 
 
@@ -10,7 +11,7 @@ After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, infe
 
 - [Model overview](#model-overview)
    * [Model architecture](#model-architecture)
-   * [TRT Inference pipeline](#trt-inference-pipeline)
+   * [TensorRT Inference pipeline](#tensorrt-inference-pipeline)
    * [Version Info](#version-info)
 - [Setup](#setup)
    * [Requirements](#requirements)
@@ -18,8 +19,8 @@ After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, infe
 - [Advanced](#advanced)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Parameters](#parameters)
-   * [TRT Inference Process](#trt-inference-process)
-   * [TRT Inference Benchmark Process](#trt-inference-benchmark-process)
+   * [TensorRT Inference Process](#tensorrt-inference-process)
+   * [TensorRT Inference Benchmark Process](#tensorrt-inference-benchmark-process)
 - [Performance](#performance)
    * [Results](#results)
       * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
@@ -34,15 +35,15 @@ In the original paper Jasper is trained with masked convolutions, which masks ou
 For inference masking is not used. The reason for this is that in inference, the original mask operation does not achieve better accuracy than without the mask operation on the test and development dataset. However, no masking achieves better inference performance especially after TensorRT optimization.
 
 
-### TRT Inference pipeline
+### TensorRT Inference pipeline
 
 The Jasper inference pipeline consists of 3 components: data preprocessor, acoustic model and greedy decoder. The acoustic model is the most compute intensive, taking more than 90% of the entire end-to-end pipeline. The acoustic model is the only component with learnable parameters and also what differentiates Jasper from the competition. So, we focus on the acoustic model for the most part.
 
-For the non-TRT Jasper inference pipeline, all 3 components are implemented and run with native PyTorch. For the TensorRT inference pipeline, we show the speedup of running the acoustic model with TensorRT, while preprocessing and decoding are reused from the native PyTorch pipeline.
+For the non-TensorRT Jasper inference pipeline, all 3 components are implemented and run with native PyTorch. For the TensorRT inference pipeline, we show the speedup of running the acoustic model with TensorRT, while preprocessing and decoding are reused from the native PyTorch pipeline.
 
-To run a model with TensorRT, we first construct the model in PyTorch, which is then exported into an ONNX file. Finally, a TensorRT engine is constructed from the ONNX file, serialized to TRT plan file, and also launched to do inference.
+To run a model with TensorRT, we first construct the model in PyTorch, which is then exported into an ONNX file. Finally, a TensorRT engine is constructed from the ONNX file, serialized to TensorRT engine file, and also launched to do inference.
 
-Note that TensorRT engine is being runtime optimized before serialization. TRT tries a vast set of options to find the strategy that performs best on user’s GPU - so it takes a few minutes. After the TRT plan file is created, it can be reused. 
+Note that TensorRT engine is being runtime optimized before serialization. TensorRT tries a vast set of options to find the strategy that performs best on user’s GPU - so it takes a few minutes. After the TensorRT engine file is created, it can be reused. 
 
 ### Version Info
 
@@ -57,14 +58,14 @@ The following software version configuration has been tested and known to work:
 
 ## Setup
 
-The following section lists the requirements in order to start inference on the Jasper model with TRT.
+The following section lists the requirements in order to start inference on the Jasper model with TensorRT.
 
 ### Requirements
 
-This repository contains a `Dockerfile` which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Ensure you have the following components:
+This repository contains a `Dockerfile` which extends the PyTorch 19.10-py3 NGC container and encapsulates some dependencies. Ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
 * [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
 * [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
 
@@ -84,119 +85,121 @@ Running the following scripts will build and launch the container containing all
 
 1. Clone the repository.
 
-```bash
-git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
-```
+      ```bash
+      git clone https://github.com/NVIDIA/DeepLearningExamples
+      cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
+      ```
 2. Build the Jasper PyTorch with TensorRT container:
 
-```
-bash trt/scripts/docker/trt_build.sh
-```
+      ```bash
+      bash trt/scripts/docker/build.sh
+      ```
 3. Start an interactive session in the NGC docker container:
 
-```
-bash trt/scripts/docker/trt_launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR>
-```
+      ```bash
+      bash trt/scripts/docker/launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR>
+      ```
 
-Alternatively, to start a script in the docker container:
+      Alternatively, to start a script in the docker container:
 
-```
-bash trt/scripts/docker/trt_launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR> <SCRIPT_PATH>
-```
+      ```bash
+      bash trt/scripts/docker/launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR> <SCRIPT_PATH>
+      ```
 
-The `/datasets`, `/checkpoints`, `/results` directories will be mounted as volumes and mapped to the corresponding directories `<DATA_DIR>`, `<CHECKPOINT_DIR>`, `<RESULT_DIR>` on the host. **These three paths should be absolute and should already exist.** The contents of this repository will be mounted to the `/workspace/jasper` directory. Note that `<DATA_DIR>`, `<CHECKPOINT_DIR>`, and `<RESULT_DIR>` directly correspond to the same arguments in `scripts/docker/launch.sh` mentioned in the [Quick Start Guide](../README.md).
+      The `/datasets`, `/checkpoints`, `/results` directories will be mounted as volumes and mapped to the corresponding directories `<DATA_DIR>`, `<CHECKPOINT_DIR>`, `<RESULT_DIR>` on the host. **These three paths should be absolute and should already exist.** The contents of this repository will be mounted to the `/workspace/jasper` directory. Note that `<DATA_DIR>`, `<CHECKPOINT_DIR>`, and `<RESULT_DIR>` directly correspond to the same arguments in `scripts/docker/launch.sh` mentioned in the [Jasper PyTorch README](../README.md).
 
-Briefly, `<DATA_DIR>` should contain, or be prepared to contain a `LibriSpeech` sub-directory (created in [Acquiring Dataset](#acquiring-dataset)), `<CHECKPOINT_DIR>` should contain a PyTorch model checkpoint (`*.pt`) file obtained through training described in [Quick Start Guide](../README.md), and `<RESULT_DIR>` should be prepared to contain timing results, logs, serialized TRT engines, and ONNX files.
+      Briefly, `<DATA_DIR>` should contain, or be prepared to contain a `LibriSpeech` sub-directory (created in [Acquiring Dataset](#acquiring-dataset)), `<CHECKPOINT_DIR>` should contain a PyTorch model checkpoint (`*.pt`) file obtained through training described in [Jasper PyTorch README](../README.md), and `<RESULT_DIR>` should be prepared to contain timing results, logs, serialized TensorRT engines, and ONNX files.
 
-4.  Acquiring dataset
+      4.  Acquiring dataset
 
-If LibriSpeech has already been downloaded and preprocessed as defined in the [Quick Start Guide](../README.md), no further steps in this subsection need to be taken.
+      If LibriSpeech has already been downloaded and preprocessed as defined in the [Jasper PyTorch README](../README.md), no further steps in this subsection need to be taken.
 
-If LibriSpeech has not been downloaded already, note that only a subset of LibriSpeech is typically used for inference (`dev-*` and `test-*`). To acquire the inference subset of LibriSpeech run the following commands inside the container (does not require GPU):
+      If LibriSpeech has not been downloaded already, note that only a subset of LibriSpeech is typically used for inference (`dev-*` and `test-*`). To acquire the inference subset of LibriSpeech run the following commands inside the container (does not require GPU):
 
-```
-bash trt/scripts/download_inference_librispeech.sh
-```
+      ```bash
+      bash trt/scripts/download_inference_librispeech.sh
+      ```
 
-Once the data download is complete, the following folders should exist:
+      Once the data download is complete, the following folders should exist:
 
-* `/datasets/LibriSpeech/`
-   * `dev-clean/`
-   * `dev-other/`
-   * `test-clean/`
-   * `test-other/`
+      * `/datasets/LibriSpeech/`
+         * `dev-clean/`
+         * `dev-other/`
+         * `test-clean/`
+         * `test-other/`
 
-Next, preprocessing the data can be performed with the following command:
+      Next, preprocessing the data can be performed with the following command:
 
-```
-bash trt/scripts/preprocess_inference_librispeech.sh
-```
+      ```bash
+      bash trt/scripts/preprocess_inference_librispeech.sh
+      ```
 
-Once the data is preprocessed, the following additional files should now exist:
-* `/datasets/LibriSpeech/`
-   * `librispeech-dev-clean-wav.json`
-   * `librispeech-dev-other-wav.json`
-   * `librispeech-test-clean-wav.json`
-   * `librispeech-test-other-wav.json`
-   * `dev-clean-wav/`
-   * `dev-other-wav/`
-   * `test-clean-wav/`
-   * `test-other-wav/`
+      Once the data is preprocessed, the following additional files should now exist:
+      * `/datasets/LibriSpeech/`
+         * `librispeech-dev-clean-wav.json`
+         * `librispeech-dev-other-wav.json`
+         * `librispeech-test-clean-wav.json`
+         * `librispeech-test-other-wav.json`
+         * `dev-clean-wav/`
+         * `dev-other-wav/`
+         * `test-clean-wav/`
+         * `test-other-wav/`
 
-5. Start TRT inference prediction
+5. Start TensorRT inference prediction
 
-Inside the container, use the following script to run inference with TRT.
-```
-export CHECKPOINT=<CHECKPOINT>
-export TRT_PRECISION=<PRECISION>
-export PYTORCH_PRECISION=<PRECISION>
-export TRT_PREDICTION_PATH=<TRT_PREDICTION_PATH>
-bash trt/scripts/trt_inference.sh
-```
-A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
-More details can be found in [Advanced](#advanced) under [Scripts and sample code](#scripts-and-sample-code), [Parameters](#parameters) and [TRT Inference process](#trt-inference).
+      Inside the container, use the following script to run inference with TensorRT.
+      ```bash
+      export CHECKPOINT=<CHECKPOINT>
+      export TRT_PRECISION=<PRECISION>
+      export PYTORCH_PRECISION=<PRECISION>
+      export TRT_PREDICTION_PATH=<TRT_PREDICTION_PATH>
+      bash trt/scripts/trt_inference.sh
+      ```
+      A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
+      More details can be found in [Advanced](#advanced) under [Scripts and sample code](#scripts-and-sample-code), [Parameters](#parameters) and [TensorRT Inference process](#tensorrt-inference-process).
 
-6.  Start TRT inference benchmark
+6.  Start TensorRT inference benchmark
 
-Inside the container, use the following script to run inference benchmark with TRT.
-```
-export CHECKPOINT=<CHECKPOINT>
-export NUM_STEPS=<NUM_STEPS>
-export NUM_FRAMES=<NUM_FRAMES>
-export BATCH_SIZE=<BATCH_SIZE>
-export TRT_PRECISION=<PRECISION>
-export PYTORCH_PRECISION=<PRECISION>
-export CSV_PATH=<CSV_PATH>
-bash trt/scripts/trt_inference_benchmark.sh
-```
-A pretrained model checkpoint can be downloaded from the [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
-More details can be found in [Advanced](#advanced) under [Scripts and sample code](#scripts-and-sample-code), [Parameters](#parameters) and [TRT Inference Benchmark process](#trt-inference-benchmark).
+      Inside the container, use the following script to run inference benchmark with TensorRT.
+      ```bash
+      export CHECKPOINT=<CHECKPOINT>
+      export NUM_STEPS=<NUM_STEPS>
+      export NUM_FRAMES=<NUM_FRAMES>
+      export BATCH_SIZE=<BATCH_SIZE>
+      export TRT_PRECISION=<PRECISION>
+      export PYTORCH_PRECISION=<PRECISION>
+      export CSV_PATH=<CSV_PATH>
+      bash trt/scripts/trt_inference_benchmark.sh
+      ```
+      A pretrained model checkpoint can be downloaded from the [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
+      More details can be found in [Advanced](#advanced) under [Scripts and sample code](#scripts-and-sample-code), [Parameters](#parameters) and [TensorRT Inference Benchmark process](#tensorrt-inference-benchmark-process).
 
 7. Start Jupyter notebook to run inference interactively
-The Jupyter notebook  is an open-source web application that allows you to create and share documents that contain live code, equations, visualizations and narrative text.
-The notebook which is located at `notebooks/JasperTRT.ipynb` offers an interactive method to run the Steps 2,3,4,5. In addition, the notebook shows examples how to use TRT to transcribe a single audio file into text. To launch the application please follow the instructions under [../notebooks/README.md](../notebooks/README.md). 
-A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
+
+      The Jupyter notebook is an open-source web application that allows you to create and share documents that contain live code, equations, visualizations and narrative text.
+      The notebook which is located at `notebooks/JasperTRT.ipynb` offers an interactive method to run the Steps 2,3,4,5. In addition, the notebook shows examples how to use TensorRT to transcribe a single audio file into text. To launch the application please follow the instructions under [../notebooks/README.md](../notebooks/README.md). 
+      A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
 
 
 ## Advanced
-The following sections provide greater details on inference benchmarking with TRT and show inference results
+The following sections provide greater details on inference benchmarking with TensorRT and show inference results
 
 ### Scripts and sample code
 In the `trt/` directory, the most important files are:
-* `Dockerfile`: Container to run Jasper inference with TRT.
+* `Dockerfile`: Container to run Jasper inference with TensorRT.
 * `requirements.py`: Python package dependencies. Installed when building the Docker container.
-* `perf.py`: Entry point for inference pipeline using TRT.
-* `perfprocedures.py`: Contains functionality to run inference through both the PyTorch model and TRT Engine, taking runtime measurements of each component of the inference process for comparison.
-* `trtutils.py`: Helper functions for TRT components of Jasper inference.
-* `perfutils.py`: Helper functions for non-TRT components of Jasper inference.
+* `perf.py`: Entry point for inference pipeline using TensorRT.
+* `perfprocedures.py`: Contains functionality to run inference through both the PyTorch model and TensorRT Engine, taking runtime measurements of each component of the inference process for comparison.
+* `trtutils.py`: Helper functions for TensorRT components of Jasper inference.
+* `perfutils.py`: Helper functions for non-TensorRT components of Jasper inference.
+* `onnx-trt.patch`: Used to enable Onnx and TensorRT with dynamic shapes.
 
 The `trt/scripts/` directory has one-click scripts to run supported functionalities, such as:
 
 * `download_librispeech.sh`: Downloads LibriSpeech inference dataset.
 * `preprocess_librispeech.sh`: Preprocess LibriSpeech raw data files to be ready for inference.
-* `trt_inference_benchmark.sh`: Benchmarks and compares TRT and PyTorch inference pipelines using the `perf.py` script.
-* `trt_inference.sh`: Runs TRT and PyTorch inference using the `trt_inference_benchmark.sh` script.
+* `trt_inference_benchmark.sh`: Benchmarks and compares TensorRT and PyTorch inference pipelines using the `perf.py` script.
+* `trt_inference.sh`: Runs TensorRT and PyTorch inference using the `trt_inference_benchmark.sh` script.
 * `walk_benchmark.sh`: Illustrates an example of using `trt/scripts/trt_inference_benchmark.sh`, which *walks* a variety of values for `BATCH_SIZE` and `NUM_FRAMES`.
 * `docker/`: Contains the scripts for building and launching the container.
 
@@ -214,51 +217,51 @@ Arguments with Defaults:
 --------
 DATA_DIR: directory of the dataset (Default: `/datasets/Librispeech`)
 DATASET: name of dataset to use (default: `dev-clean`)
-RESULT_DIR: directory for results including TRT engines, ONNX files, logs, and CSVs (default: `/results`)
+RESULT_DIR: directory for results including TensorRT engines, ONNX files, logs, and CSVs (default: `/results`)
 CREATE_LOGFILE: boolean that indicates whether to create log of session to be stored in `$RESULT_DIR` (default: "true")
 CSV_PATH: file to store CSV results (default: `/results/res.csv`)
-TRT_PREDICTION_PATH: file to store inference prediction results generated with TRT (default: `none`)
+TRT_PREDICTION_PATH: file to store inference prediction results generated with TensorRT (default: `none`)
 PYT_PREDICTION_PATH: file to store inference prediction results generated with native PyTorch (default: `none`)
-VERBOSE: boolean that indicates whether to verbosely describe TRT engine building/deserialization and TRT inference (default: "false")
-TRT_PRECISION: "fp32" or "fp16". Defines which precision kernels will be used for TRT engine (default: "fp32")
+VERBOSE: boolean that indicates whether to verbosely describe TensorRT engine building/deserialization and TensorRT inference (default: "false")
+TRT_PRECISION: "fp32" or "fp16". Defines which precision kernels will be used for TensorRT engine (default: "fp32")
 PYTORCH_PRECISION: "fp32" or "fp16". Defines which precision will be used for inference in PyTorch (default: "fp32")
 NUM_STEPS: Number of inference steps. If -1 runs inference on entire dataset (default: 100)
 BATCH_SIZE: data batch size (default: 64)
 NUM_FRAMES: cuts/pads all pre-processed feature tensors to this length. 100 frames ~ 1 second of audio (default: 512)
-FORCE_ENGINE_REBUILD: boolean that indicates whether an already-built TRT engine of equivalent precision, batch-size, and number of frames should not be used.
-    Engines are specific to the GPU, library versions, TRT versions, and CUDA versions they were built in and cannot be used in a different environment. (default: "true")
+FORCE_ENGINE_REBUILD: boolean that indicates whether an already-built TensorRT engine of equivalent precision, batch-size, and number of frames should not be used. Engines are specific to the GPU, library versions, TensorRT versions, and CUDA versions they were built in and cannot be used in a different environment. (default: "true")
+USE_DYNAMIC_SHAPE: if 'yes' uses dynamic shapes (default: 'yes')
 ```
 
 The complete list of parameters available for `trt/scripts/trt_inference.sh` is the same as `trt/scripts/trt_inference_benchmark.sh` only with different default input arguments. In the following, only the parameters with different default values are listed:
 
 ```
-TRT_PREDICTION_PATH: file to store inference prediction results generated with TRT (default: `/results/trt_predictions.txt`)
+TRT_PREDICTION_PATH: file to store inference prediction results generated with TensorRT (default: `/results/trt_predictions.txt`)
 PYT_PREDICTION_PATH: file to store inference prediction results generated with native PyTorch (default: `/results/pyt_predictions.txt`)
 NUM_STEPS: Number of inference steps. If -1 runs inference on entire dataset (default: -1)
 BATCH_SIZE: data batch size (default: 1)
 NUM_FRAMES: cuts/pads all pre-processed feature tensors to this length. 100 frames ~ 1 second of audio (default: 3600)
 ```
 
-### TRT Inference Benchmark process
+### TensorRT Inference Benchmark process
 
 The inference benchmarking is performed on a single GPU by ‘trt/scripts/trt_inference_benchmark.sh’ which delegates to `trt/perf.py`,  which takes the following steps:
 
 
 1. Construct Jasper acoustic model in PyTorch.
 
-2. Construct TRT Engine of Jasper acoustic model
+2. Construct TensorRT Engine of Jasper acoustic model
 
    1. Perform ONNX export on the PyTorch model, if its ONNX file does not already exist.
 
-	2. Construct TRT engine from ONNX export, if a saved engine file does not already exist or `FORCE_ENGINE_REBUILD` is `true`.
+	2. Construct TensorRT engine from ONNX export, if a saved engine file does not already exist or `FORCE_ENGINE_REBUILD` is `true`.
 
-3. For each batch in the dataset, run inference through both the PyTorch model and TRT Engine, taking runtime measurements of each component of the inference process.
+3. For each batch in the dataset, run inference through both the PyTorch model and TensorRT Engine, taking runtime measurements of each component of the inference process.
 
 4. Compile performance and WER accuracy results in CSV format, written to `CSV_PATH` file.
 
-`trt/perf.py` utilizes `trt/trtutils.py` and `trt/perfutils.py`, helper functions for TRT and non-TRT components of Jasper inference respectively.
+`trt/perf.py` utilizes `trt/trtutils.py` and `trt/perfutils.py`, helper functions for TensorRT and non-TensorRT components of Jasper inference respectively.
 
-### TRT Inference process
+### TensorRT Inference process
 
 The inference is performed by `trt/scripts/trt_inference.sh` which delegates to `trt/scripts/trt_inference_benchmark.sh`. The script runs on a single GPU. To do inference prediction on the entire dataset `NUM_FRAMES` is set to 3600, which roughly corresponds to 36 seconds. This covers the longest sentences in both LibriSpeech dev and test dataset. By default, `BATCH_SIZE` is set to 1 to simulate the online inference scenario in deployment. Other batch sizes can be tried by setting a different value to this parameter. By default `TRT_PRECISION` is set to full precision and can be changed by setting `export TRT_PRECISION=fp16`. The prediction results are stored at `/results/trt_predictions.txt` and `/results/pyt_predictions.txt`.
 
@@ -267,7 +270,7 @@ The inference is performed by `trt/scripts/trt_inference.sh` which delegates to
 ## Performance
 
 To benchmark the inference performance on a specific batch size and audio length refer to [Quick-Start-Guide](#quick-start-guide). To do a sweep over multiple batch sizes and audio durations run:
-```
+```bash
 bash trt/scripts/walk_benchmark.sh
 ```
 The results are obtained by running inference on LibriSpeech dev-clean dataset on a single T4 GPU using half precision with AMP. We compare the throughput of the acoustic model between TensorRT and native PyTorch.   
@@ -278,7 +281,7 @@ The results are obtained by running inference on LibriSpeech dev-clean dataset o
 
 #### Inference performance: NVIDIA T4
 
-| Sequence Length (in seconds) | Batch size | TRT FP16 Throughput (#sequences/second) Percentiles |     	|     	|     	| PyTorch FP16 Throughput (#sequences/second) Percentiles |     	|     	|     	| TRT/PyTorch Speedup |
+| Sequence Length (in seconds) | Batch size | PyTorch FP16 Throughput (#sequences/second) Percentiles |     	|     	|     	| TensorRT FP16 Throughput (#sequences/second) Percentiles |     	|     	|     	| PyT/TRT Speedup |
 |---------------|------------|---------------------|---------|---------|---------|-----------------|---------|---------|---------|-----------------|
 |           	|        	| 90%             	| 95% 	| 99% 	| Avg 	| 90%         	| 95% 	| 99% 	| Avg 	|             	|
 |2|1|71.002|70.897|70.535|71.987|42.974|42.932|42.861|43.166|1.668|

+ 13 - 0
PyTorch/SpeechRecognition/Jasper/trt/onnx-trt.patch

@@ -0,0 +1,13 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index a1937f1..85b03f2 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -20,7 +20,7 @@
+ 
+ cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
+ # The version of CMake which is not compatible with the old CUDA CMake commands.
+-set(CMAKE_VERSION_THRESHOLD "3.10.0")
++set(CMAKE_VERSION_THRESHOLD "3.15.0")
+ 
+ if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
+   project(onnx2trt LANGUAGES CXX C)

+ 39 - 47
PyTorch/SpeechRecognition/Jasper/trt/perf.py

@@ -24,39 +24,54 @@ import trtutils
 import perfprocedures
 from model import GreedyCTCDecoder
 from helpers import __ctc_decoder_predictions_tensor
+import caffe2.python.onnx.backend as c2backend
+import onnxruntime as ort
 
-def main(args):
+import torch
+from torch import nn
+from torch.nn import functional as F
 
+
+def main(args):        
+    print ("Getting component")
     # Get shared utility across PyTorch and TRT
     pyt_components, saved_onnx = perfutils.get_pytorch_components_and_onnx(args)
 
+    print ("Getting engine")
     # Get a TRT engine. See function for argument parsing logic
-    engine = get_engine(args)
+    engine = trtutils.get_engine(args)
+    print ("Got engine.")
 
     if args.wav:
-        audio_processor = pyt_components['audio_preprocessor']
-        audio_processor.eval()
-        greedy_decoder = GreedyCTCDecoder()
-        input_wav, seq_len = pyt_components['input_wav']
-        features = audio_processor((input_wav, seq_len))
-        features = perfutils.adjust_shape(features, args.seq_len)
-        with engine.create_execution_context() as context:
-            t_log_probs_e, copyto, inference, copyfrom= perfprocedures.do_inference(context, features[0], 1)
-        log_probs=perfutils.torchify_trt_out(t_log_probs_e, 1)
-        
-        t_predictions_e = greedy_decoder(log_probs=log_probs)
-        hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=perfutils.get_vocab())
-        print("INTERENCE TIME: {} ms".format(inference*1000.0))
-        print("TRANSCRIPT: ", hypotheses[0])
-
-        return
+        with torch.no_grad():
+            audio_processor = pyt_components['audio_preprocessor']
+            audio_processor.eval()
+            greedy_decoder = GreedyCTCDecoder()
+            input_wav, num_audio_samples = pyt_components['input_wav']
+            features = audio_processor(input_wav, num_audio_samples)
+            features = perfutils.adjust_shape(features, args)
+            if not args.engine_path:
+                outputs = engine.run(None, {'FEATURES': features[0].data.cpu().numpy()})
+                inference = 1.0
+                t_log_probs_e = outputs[0]
+                t_log_probs_e=perfutils.torchify_trt_out(t_log_probs_e, t_log_probs_e.shape)
+            else:
+                with engine.create_execution_context() as context:
+                    t_log_probs_e, copyto, inference, copyfrom= perfprocedures.do_inference(context, [features[0]])
+            t_predictions_e = greedy_decoder(t_log_probs_e)
+            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=perfutils.get_vocab())
+            print("INTERENCE TIME: {} ms".format(inference*1000.0))
+            print("TRANSCRIPT: ", hypotheses)
+            return
 
-    
     wer, preds, times = perfprocedures.compare_times_trt_pyt_exhaustive(engine,
                                                                         pyt_components,
-                                                                        num_steps=args.num_steps)
+                                                                        args)
+
     string_header, string_data = perfutils.do_csv_export(wer, times, args.batch_size, args.seq_len)
+
     if args.csv_path is not None:
+        print ("Exporting to " + args.csv_path)
         with open(args.csv_path, 'a+') as f:
             # See if header is there, if so, check that it matches
             f.seek(0) # Read from start of file
@@ -93,6 +108,7 @@ def parse_args():
     parser.add_argument("--val_manifest", type=str, help="JSON manifest of dataset.")
     parser.add_argument("--onnx_path", default=None, type=str, help="Path to onnx model for engine creation")
     parser.add_argument("--seq_len", default=None, type=int, help="Generate an ONNX export with this fixed sequence length, and save to --onnx_path. Requires also using --onnx_path and --ckpt_path.")
+    parser.add_argument("--max_seq_len", default=3600, type=int, help="Max sequence length for TRT engine build. Default works with TRTIS benchmark. Set it larger than seq_len")
     parser.add_argument("--ckpt_path", default=None, type=str, help="If provided, will also construct pytorch acoustic model")
     parser.add_argument("--max_duration", default=None, type=float, help="Maximum possible length of audio data in seconds")
     parser.add_argument("--num_steps", default=-1, type=int, help="Number of inference steps to run")
@@ -104,36 +120,12 @@ def parse_args():
     parser.add_argument("--pyt_prediction_path", type=str, default=None, help="File to write predictions inferred with pytorch")
     parser.add_argument("--verbose", action="store_true", default=False, help="If set, will verbosely describe TRT engine building and deserialization as well as TRT inference")
     parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
-    parser.add_argument("--max_workspace_size", default=4*1024*1024*1024, type=int, help="Maximum batch size for constructed engine; needed when building")
+    parser.add_argument("--max_workspace_size", default=0, type=int, help="Maximum GPU memory workspace size for constructed engine; needed when building")
+    parser.add_argument("--transpose", action="store_true", default=False, help="If set, will transpose input")
+    parser.add_argument("--dynamic_shape", action="store_true", default=False, help="If set, use dynamic shape")
 
     return parser.parse_args()
 
-def get_engine(args):
-    '''Get a TRT engine
-
-    If --should_serialize is present, always build from ONNX and store result in --engine_path.
-    Else If an engine is provided as an argument (--engine_path) use that one.
-    Otherwise, make one from onnx (--onnx_load_path), but don't serialize it.
-    '''
-    engine = None
-
-    if args.engine_path is not None and args.use_existing_engine:
-        engine = trtutils.deserialize_engine(args.engine_path, args.verbose)
-    elif args.engine_path is not None and args.onnx_path is not None:
-        # Build a new engine and serialize it.
-        engine = trtutils.build_engine_from_parser(args.onnx_path, args.engine_batch_size, args.trt_fp16, args.verbose, args.max_workspace_size)
-        with open(args.engine_path, 'wb') as f:
-            f.write(engine.serialize())
-    else:
-        raise Exception("One of the following sets of arguments must be provided:\n"+
-                        "<engine_path> + --use_existing_engine\n"+
-                        "<engine_path> + <onnx_path>\n"+
-                        "in order to construct a TRT engine")
-    if engine is None:
-        raise Exception("Failed to acquire TRT engine")
-
-    return engine
-
 if __name__ == "__main__":
     args = parse_args()
 

+ 41 - 196
PyTorch/SpeechRecognition/Jasper/trt/perfprocedures.py

@@ -24,155 +24,10 @@ import time
 import torch
 from tqdm import tqdm
 
-def time_pyt(engine, pyt_components):
-    '''Times execution of PyTorch inference
-    '''
-    baked_seq_len = engine.get_binding_shape(0)[1]
-    preprocess_times = []
-    pyt_infers = []
-    pyt_components['audio_preprocessor'].eval()
-    pyt_components['acoustic_model'].eval()
-    with torch.no_grad():
-        for data in tqdm(pyt_components['data_layer'].data_iterator):
-            tensors = []
-            for d in data:
-                tensors.append(d.to(torch.device("cuda")))
-            input_tensor = (tensors[0], tensors[1])
-            t0 = time.perf_counter()
-            am_input = pyt_components['audio_preprocessor'](x=input_tensor)
-            # Pad or cut to the neccessary engine length
-            am_input = perfutils.adjust_shape(am_input, baked_seq_len)
-            batch_size = am_input[0].shape[0]
-            torch.cuda.synchronize()
-            t1 = time.perf_counter()
-            # Run PyT inference
-            pyt_out = pyt_components['acoustic_model'](x=am_input)
-            torch.cuda.synchronize()
-            t2 = time.perf_counter()
-            perfutils.global_process_batch(log_probs=pyt_out,
-                                           original_tensors=tensors,
-                                           batch_size=batch_size,
-                                           is_trt=False)
-            assemble_times.append(t1-t0)
-            pyt_infers.append(t2-t1)
-
-    pyt_wer = perfutils.global_process_epoch(is_trt=False)
-    trt_wer = None
-    trt_preds = perfutils._global_trt_dict['predictions']
-    pyt_preds = perfutils._global_pyt_dict['predictions']
-    times = {
-        'preprocess': assemble_times,
-        'pyt_infers': pyt_infers
-    }
-    wer = {
-        'trt': trt_wer,
-        'pyt': pyt_wer
-    }
-    preds = {
-        'trt': trt_preds,
-        'pyt': pyt_preds
-    }
-    return wer, preds, times
-
-def time_trt(engine, pyt_components):
-    '''Times execution of TRT inference
-    '''
-    baked_seq_len = engine.get_binding_shape(0)[1]
-    assemble_times = []
-    trt_copytos = []
-    trt_copyfroms = []
-    trt_infers = []
-    decodingandeval = []
-    with engine.create_execution_context() as context, torch.no_grad():
-        for data in tqdm(pyt_components['data_layer'].data_iterator):
-            tensors = []
-            for d in data:
-                tensors.append(d.to(torch.device("cuda")))
-            input_tensor = (tensors[0], tensors[1])
-            t0 = time.perf_counter()
-            am_input = pyt_components['audio_preprocessor'](x=input_tensor)
-            # Pad or cut to the neccessary engine length
-            am_input = perfutils.adjust_shape(am_input, baked_seq_len)
-            batch_size = am_input[0].shape[0]
-            torch.cuda.synchronize()
-            t1 = time.perf_counter()
-            # Run TRT inference
-            trt_out, time_to, time_infer, time_from= do_inference(
-                                                                  context=context,
-                                                                  inp=am_input,
-                                                                  batch_size=batch_size)
-            t3 = time.perf_counter()
-            trt_out = perfutils.torchify_trt_out(trt_out, batch_size)
-            perfutils.global_process_batch(log_probs=trt_out,
-                                           original_tensors=tensors,
-                                           batch_size=batch_size,
-                                           is_trt=True)
-            torch.cuda.synchronize()
-            t4 = time.perf_counter()
-
-
-            assemble_times.append(t1-t0)
-            trt_copytos.append(time_to)
-            trt_copyfroms.append(time_from)
-            trt_infers.append(time_infer)
-            decodingandeval.append(t4-t3)
-
-
-    trt_wer = perfutils.global_process_epoch(is_trt=True)
-    pyt_wer = perfutils.global_process_epoch(is_trt=False)
-    trt_preds = perfutils._global_trt_dict['predictions']
-    pyt_preds = perfutils._global_pyt_dict['predictions']
-    times = {
-        'assemble': assemble_times,
-        'trt_copyto': trt_copytos,
-        'trt_copyfrom': trt_copyfroms,
-        'trt_infers': trt_infers,
-        'decodingandeval': decodingandeval
-    }
-    wer = {
-        'trt': trt_wer,
-        'pyt': pyt_wer
-    }
-    preds = {
-        'trt': trt_preds,
-        'pyt': pyt_preds
-    }
-    return wer, preds, times
-
-def run_trt(engine, pyt_components):
-    '''Runs TRT inference for accuracy evaluation
-    '''
-    baked_seq_len = engine.get_binding_shape(0)[1]
-    wers = []
-    preds = []
-    with engine.create_execution_context() as context, torch.no_grad():
-        for data in tqdm(pyt_components['data_layer'].data_iterator):
-            tensors = []
-            for d in data:
-                tensors.append(d.to(torch.device("cuda")))
-            input_tensor = (tensors[0], tensors[1])
-            am_input = pyt_components['audio_preprocessor'](x=input_tensor)
-            # Pad or cut to the neccessary engine length
-            am_input = perfutils.adjust_shape(am_input, baked_seq_len)
-            batch_size = am_input[0].shape[0]
-            torch.cuda.synchronize()
-            # Run TRT inference
-            trt_out, _,_,_= do_inference(context=context, inp=am_input, batch_size=batch_size)
-            trt_out = perfutils.torchify_trt_out(trt_out, batch_size=batch_size)
-            wer, pred = perfutils.get_results(log_probs=trt_out,
-                                              original_tensors=tensors,
-                                              batch_size=batch_size)
-            wers.append(wer)
-            preds.append(pred)
-
-
-    return wers, preds
-
-def compare_times_trt_pyt_exhaustive(engine, pyt_components, num_steps):
+def compare_times_trt_pyt_exhaustive(engine, pyt_components, args):
     '''Compares execution times and WER between TRT and PyTorch'''
 
     # The engine has a fixed-size sequence length, which needs to be known for slicing/padding input
-    baked_seq_len = engine.get_binding_shape(0)[1]
     preprocess_times = []
     inputadjust_times = []
     outputadjust_times = []
@@ -185,39 +40,35 @@ def compare_times_trt_pyt_exhaustive(engine, pyt_components, num_steps):
 
     with engine.create_execution_context() as context, torch.no_grad():
         for data in tqdm(pyt_components['data_layer'].data_iterator):
-            if num_steps >= 1:
-                if step_counter > num_steps:
+            if args.num_steps >= 1:
+                if step_counter > args.num_steps:
                     break
                 step_counter +=1
             tensors = []
             for d in data:
-                tensors.append(d.to(torch.device("cuda")))
-
-            input_tensor = (tensors[0], tensors[1])
+                tensors.append(d.cuda())
             preprocess_start = time.perf_counter()
-            am_input = pyt_components['audio_preprocessor'](x=input_tensor)
+            am_input = pyt_components['audio_preprocessor'](tensors[0], tensors[1])
+            
             torch.cuda.synchronize()
             preprocess_end = time.perf_counter()
 
             # Pad or cut to the neccessary engine length
             inputadjust_start = time.perf_counter()
-            am_input = perfutils.adjust_shape(am_input, baked_seq_len)
+            am_input = perfutils.adjust_shape(am_input, args)
             torch.cuda.synchronize()
             inputadjust_end = time.perf_counter()
 
             batch_size = am_input[0].shape[0]
 
+            inp = [am_input[0]]
+            
             # Run TRT inference 1: Async copying and inference
-            trt_out, time_taken= do_inference_overlap(
-                                                      context=context,
-                                                      inp=am_input,
-                                                      batch_size=batch_size)
+
+            trt_out, time_taken= do_inference_overlap(context, inp)
             torch.cuda.synchronize()
             outputadjust_start = time.perf_counter()
-            trt_out = perfutils.torchify_trt_out(trt_out, batch_size)
-            torch.cuda.synchronize()
             outputadjust_end = time.perf_counter()
-
             process_batch_start = time.perf_counter()
             perfutils.global_process_batch(log_probs=trt_out,
                                            original_tensors=tensors,
@@ -225,9 +76,10 @@ def compare_times_trt_pyt_exhaustive(engine, pyt_components, num_steps):
                                            is_trt=True)
             torch.cuda.synchronize()
             process_batch_end = time.perf_counter()
+
             # Create explicit stream so pytorch doesn't complete asynchronously
             pyt_infer_start = time.perf_counter()
-            pyt_out = pyt_components['acoustic_model'](x=am_input[0])
+            pyt_out = pyt_components['acoustic_model'](am_input[0])
             torch.cuda.synchronize()
             pyt_infer_end = time.perf_counter()
             perfutils.global_process_batch(log_probs=pyt_out,
@@ -235,10 +87,8 @@ def compare_times_trt_pyt_exhaustive(engine, pyt_components, num_steps):
                                            batch_size=batch_size,
                                            is_trt=False)
             # Run TRT inference 2: Synchronous copying and inference
-            _, time_to, time_infer, time_from = do_inference(
-                                                             context=context,
-                                                             inp=am_input,
-                                                             batch_size=batch_size)
+            sync_out, time_to, time_infer, time_from = do_inference(context,inp)
+            del sync_out
             preprocess_times.append(preprocess_end - preprocess_start)
             inputadjust_times.append(inputadjust_end - inputadjust_start)
             outputadjust_times.append(outputadjust_end - outputadjust_start)
@@ -273,65 +123,60 @@ def compare_times_trt_pyt_exhaustive(engine, pyt_components, num_steps):
     }
     return wer, preds, times
 
-def do_inference(context, inp, batch_size):
+def do_inference(context, inp):
     '''Do inference using a TRT engine and time it
     Execution and device-to-host copy are completed synchronously
     '''
-
-
     # Typical Python-TRT used in samples would copy input data from host to device.
     # Because the PyTorch Tensor is already on the device, such a copy is unneeded.
-
-    # Create input array of device pointers
-    inputs = [inp[0].data_ptr()]
     t0 = time.perf_counter()
+    stream = cuda.Stream()
     # Create output buffers and stream
-    outputs, bindings, stream = trtutils.allocate_buffers_with_existing_inputs(context.engine,
-                                                                               inputs,
-                                                                               batch_size)
-    t1 = time.perf_counter()
-    # Run inference and transfer outputs to host asynchronously
-    context.execute_async(batch_size=batch_size,
-                          bindings=bindings,
-                          stream_handle=stream.handle)
+    outputs, bindings, out_shape = trtutils.allocate_buffers_with_existing_inputs(context, inp)
+    t01 = time.perf_counter()
+    # simulate sync call here
+    context.execute_async_v2(
+        bindings=bindings,
+        stream_handle=stream.handle)
     stream.synchronize()
+
     t2 = time.perf_counter()
+    # Copy outputs device->host asynchronously, then synchronize so the
+    # timing below (t3) captures the complete transfer.
     [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
     stream.synchronize()
+   
     t3 = time.perf_counter()
-
-
-    copyto = t1-t0
-    inference = t2-t1
+    copyto = t01-t0
+    inference = t2-t01
     copyfrom = t3-t2
     out = outputs[0].host
+    outputs[0].device.free()
+    out = perfutils.torchify_trt_out(outputs[0].host, out_shape)
     return out, copyto, inference, copyfrom
 
-def do_inference_overlap(context, inp, batch_size):
+def do_inference_overlap(context, inp):
     '''Do inference using a TRT engine and time it
     Execution and device-to-host copy are completed asynchronously
     '''
     # Typical Python-TRT used in samples would copy input data from host to device.
     # Because the PyTorch Tensor is already on the device, such a copy is unneeded.
-
-    # Create input array of device pointers
-    inputs = [inp[0].data_ptr()]
+    
     t0 = time.perf_counter()
     # Create output buffers and stream
-    outputs, bindings, stream = trtutils.allocate_buffers_with_existing_inputs(context.engine,
-                                                                               inputs,
-                                                                               batch_size)
+    stream = cuda.Stream()
+    outputs, bindings, out_shape = trtutils.allocate_buffers_with_existing_inputs(context, inp)
+    t01 = time.perf_counter()
     t1 = time.perf_counter()
     # Run inference and transfer outputs to host asynchronously
-    context.execute_async(batch_size=batch_size,
-                          bindings=bindings,
-                          stream_handle=stream.handle)
+    context.execute_async_v2(
+                             bindings=bindings,
+                             stream_handle=stream.handle)
     [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
     stream.synchronize()
     t2 = time.perf_counter()
-
-
     copyto = t1-t0
     inference = t2-t1
-    out = outputs[0].host
+    outputs[0].device.free()
+    out = perfutils.torchify_trt_out(outputs[0].host, out_shape)
     return out, t2-t1

+ 102 - 38
PyTorch/SpeechRecognition/Jasper/trt/perfutils.py

@@ -14,14 +14,16 @@
 '''Contains helper functions for non-TRT components of JASPER inference
 '''
 
-from model import GreedyCTCDecoder, AudioPreprocessing, Jasper
+from model import GreedyCTCDecoder, AudioPreprocessing, JasperEncoderDecoder
 from dataset import AudioToTextDataLayer
-from helpers import Optimization, AmpOptimizations, process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, norm
+from helpers import AmpOptimizations, process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, norm
 from apex import amp
 import torch
 import torch.nn as nn
 import toml
 from parts.features import audio_from_file
+import onnx
+import os
 
 _global_ctc_labels = None
 def get_vocab():
@@ -94,20 +96,61 @@ def global_process_epoch(is_trt=True):
     return wer
 
 
-def get_onnx(path, acoustic_model, signal_shape, dtype=torch.float):
+
+def get_onnx(path, acoustic_model,  args):
     ''' Get an ONNX model with float weights
 
     Requires an --onnx_save_path and --ckpt_path (so that an acoustic model could be constructed).
     Fixed-length --seq_len must be provided as well.
     '''
+    
+    dynamic_dim = 0
+    if args.dynamic_shape:
+        dynamic_dim = 1 if args.transpose else 2
+
+
+    if args.transpose:
+        signal_shape=(args.engine_batch_size, args.seq_len, 64)
+    else:
+        signal_shape=(args.engine_batch_size, 64, args.seq_len)
+        
     with torch.no_grad():
-        phony_signal = torch.zeros(signal_shape, dtype=dtype, device=torch.device("cuda"))
-        torch.onnx.export(acoustic_model, (phony_signal,), path, input_names=["FEATURES"], output_names=["LOGITS"])
+        phony_signal = torch.zeros(signal_shape, dtype=torch.float, device=torch.device("cuda"))
+        phony_len = torch.IntTensor(len(phony_signal))
+        phony_out = acoustic_model.infer((phony_signal, phony_len))
+        
+        input_names=["FEATURES"]
+        output_names=["LOGITS"]
+
+        if acoustic_model.jasper_encoder.use_conv_mask:
+            input_names.append("FETURES_LEN")
+            output_names.append("LOGITS_LEN")
+            phony_signal = [phony_signal, phony_len]
+        
+        if dynamic_dim > 0:
+            dynamic_axes={
+                "FEATURES" : {0 : "BATCHSIZE", dynamic_dim : "NUM_FEATURES"},
+                "LOGITS" : { 0: "BATCHSIZE", 1 : "NUM_LOGITS"}
+            }
+        else:
+            dynamic_axes = None
+
+        jitted_model = acoustic_model
+        
+        torch.onnx.export(jitted_model, phony_signal, path,
+                          input_names=input_names, output_names=output_names,
+                          opset_version=10,
+                          do_constant_folding=True,
+                          verbose=True,
+                          dynamic_axes=dynamic_axes,
+                          example_outputs = phony_out
+        )
+
         fn=path+".readable"
         with open(fn, 'w') as f:
             #Write human-readable graph representation to file as well.
-            import onnx
             tempModel = onnx.load(path)
+            onnx.checker.check_model(tempModel)
             pgraph = onnx.helper.printable_graph(tempModel.graph)
             f.write(pgraph)
 
@@ -124,16 +167,15 @@ def get_pytorch_components_and_onnx(args):
     _global_ctc_labels= add_ctc_labels(dataset_vocab)
     featurizer_config = model_definition['input_eval']
 
-    optim_level = Optimization.mxprO3 if args.pyt_fp16 else Optimization.mxprO0
+    optim_level = 3 if args.pyt_fp16 else 0
 
     featurizer_config["optimization_level"] = optim_level
-    acoustic_model = None
+
     audio_preprocessor = None
     onnx_path = None
     data_layer = None
     wav = None
     seq_len = None
-    dtype=torch.float
     
     if args.max_duration is not None:
         featurizer_config['max_duration'] = args.max_duration
@@ -146,64 +188,85 @@ def get_pytorch_components_and_onnx(args):
                                            shuffle=False)
     if args.wav is not None:
         args.batch_size=1
-        args.engine_batch_size=1
         wav, seq_len = audio_from_file(args.wav)
         if args.seq_len is None or args.seq_len == 0:
             args.seq_len = seq_len/(featurizer_config['sample_rate']/100)
-        
 
-    model = Jasper(feature_config=featurizer_config,
-                   jasper_model_definition=model_definition,
-                   feat_in=1024,
-                   num_classes=len(get_vocab()))
+    if args.transpose:
+        featurizer_config["transpose_out"] = True
+        model_definition["transpose_in"] = True
 
-    model.cuda()
+    model = JasperEncoderDecoder(jasper_model_definition=model_definition, feat_in=1024, num_classes=len(get_vocab()), transpose_in=args.transpose)
+    model = model.cuda()
     model.eval()
-    acoustic_model = model.acoustic_model
-    audio_preprocessor = model.audio_preprocessor
 
+    audio_preprocessor = AudioPreprocessing(**featurizer_config)
+    audio_preprocessor = audio_preprocessor.cuda()
+    audio_preprocessor.eval()
+    
     if args.ckpt_path is not None:
-        checkpoint = torch.load(args.ckpt_path, map_location="cpu")
-        model.load_state_dict(checkpoint['state_dict'], strict=False)
+        if os.path.isdir(args.ckpt_path):
+            d_checkpoint = torch.load(args.ckpt_path+"/decoder.pt", map_location="cpu")
+            e_checkpoint = torch.load(args.ckpt_path+"/encoder.pt", map_location="cpu")
+            model.jasper_encoder.load_state_dict(e_checkpoint, strict=False)            
+            model.jasper_decoder.load_state_dict(d_checkpoint, strict=False)            
+        else:
+            checkpoint = torch.load(args.ckpt_path, map_location="cpu")
+            model.load_state_dict(checkpoint['state_dict'], strict=False)
+            
+    # If we are going to build a TRT engine (rather than only run/create ONNX),
+    # postpone AMP initialization: the ONNX parser cannot handle mixed-FP16 ONNX yet.
+    if args.pyt_fp16 and args.engine_path is None:
+        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])
         
     if args.make_onnx:
-        if args.onnx_path is None or acoustic_model is None:
+        if args.onnx_path is None or args.ckpt_path is None:
             raise Exception("--ckpt_path, --onnx_path must be provided when using --make_onnx")
-        onnx_path = get_onnx(args.onnx_path, acoustic_model,
-                             signal_shape=(args.engine_batch_size, 64, args.seq_len), dtype=torch.float)
+        onnx_path = get_onnx(args.onnx_path, model, args)
 
-    if args.pyt_fp16:
-        amp.initialize(models=acoustic_model, opt_level=AmpOptimizations[optim_level])
-        
+    if args.pyt_fp16 and args.engine_path is not None:
+        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])
+    
     return {'data_layer': data_layer,
             'audio_preprocessor': audio_preprocessor,
-            'acoustic_model': acoustic_model,
+            'acoustic_model': model,
             'input_wav' : (wav, seq_len) }, onnx_path
 
-def adjust_shape(am_input, baked_length):
+def adjust_shape(am_input, args):
     '''Pads or cuts acoustic model input tensor to some fixed_length
 
     '''
-    in_seq_len = am_input[0].shape[2]
-    newSeq=am_input[0]
+    input = am_input[0]    
+    baked_length = args.seq_len
+    
+    if args.transpose:
+        in_seq_len = input.shape[1]
+    else:
+        in_seq_len = input.shape[2]
+
+    if  baked_length is None or in_seq_len == baked_length:
+        return (input, am_input[1])
+
+    if args.transpose:
+        return (input.resize_(input.shape[0], baked_length, 64), am_input[1])
+    
+    newSeq=input
     if in_seq_len > baked_length:
         # Cut extra bits off, no inference done
-        newSeq = am_input[0][...,0:baked_length].contiguous()
+        newSeq = input[...,0:baked_length].contiguous()
     elif in_seq_len < baked_length:
         # Zero-pad to satisfy length
         pad_length = baked_length - in_seq_len
-        newSeq = nn.functional.pad(am_input[0], (0, pad_length), 'constant', 0)
-    return (newSeq,)
+        newSeq = nn.functional.pad(input, (0, pad_length), 'constant', 0)
+    return (newSeq, am_input[1])
 
-def torchify_trt_out(trt_out, batch_size):
+def torchify_trt_out(trt_out, desired_shape):
     '''Reshapes flat data to format for greedy+CTC decoding
-
     Used to convert numpy array on host to PyT Tensor
     '''
-    desired_shape = (batch_size,-1,len(get_vocab()))
-
     # Predictions must be reshaped.
-    return torch.Tensor(trt_out).reshape(desired_shape)
+    ret = torch.from_numpy(trt_out)
+    return ret.reshape((desired_shape[0], desired_shape[1], desired_shape[2]))
 
 def do_csv_export(wers, times, batch_size, num_frames):
     '''Produces CSV header and data for input data
@@ -250,3 +313,4 @@ def do_csv_export(wers, times, batch_size, num_frames):
     string_header = ", ".join(header)
     string_data = ", ".join(data)
     return string_header, string_data
+

+ 2 - 0
PyTorch/SpeechRecognition/Jasper/trt/requirements.txt

@@ -1,2 +1,4 @@
 pycuda
 pillow
+onnx==1.5.0
+onnxruntime==0.5.0

+ 0 - 0
PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/trt_build.sh → PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh


+ 6 - 2
PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/trt_launch.sh → PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh

@@ -1,4 +1,6 @@
 #!/bin/bash
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+JASPER_REPO=${JASPER_REPO:-"${SCRIPT_DIR}/../../.."}
 
 # Launch TRT JASPER container.
 
@@ -6,9 +8,9 @@ DATA_DIR=$1
 CHECKPOINT_DIR=$2
 RESULT_DIR=$3
 PROGRAM_PATH=${PROGRAM_PATH}
-
+    
 if [ $# -lt 3 ]; then
-    echo "Usage: ./trt_launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR> (<SCRIPT_PATH>)"
+    echo "Usage: ./launch.sh <DATA_DIR> <CHECKPOINT_DIR> <RESULT_DIR> (<SCRIPT_PATH>)"
     echo "All directory paths must be absolute paths and exist"
     exit 1
 fi
@@ -36,4 +38,6 @@ nvidia-docker run -it --rm \
   -v $DATA_DIR:/datasets \
   -v $CHECKPOINT_DIR:/checkpoints/ \
   -v $RESULT_DIR:/results/ \
+  -v ${JASPER_REPO}:/jasper \
+  ${EXTRA_JASPER_ENV} \
   jasper:trt6 bash $PROGRAM_PATH

+ 11 - 0
PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference.sh

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 # Performs inference and measures latency and accuracy of TRT and PyTorch implementations of JASPER.
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
@@ -29,6 +30,7 @@ PYTORCH_PRECISION=${PYTORCH_PRECISION:-"fp32"}
 NUM_STEPS=${NUM_STEPS:-"-1"}
 BATCH_SIZE=${BATCH_SIZE:-1}
 NUM_FRAMES=${NUM_FRAMES:-3600}
+MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE:-$NUM_FRAMES}
 FORCE_ENGINE_REBUILD=${FORCE_ENGINE_REBUILD:-"true"}
 CSV_PATH=${CSV_PATH:-"/results/res.csv"}
 TRT_PREDICTION_PATH=${TRT_PREDICTION_PATH:-"/results/trt_predictions.txt"}
@@ -47,6 +49,7 @@ export PYTORCH_PRECISION="$PYTORCH_PRECISION"
 export NUM_STEPS="$NUM_STEPS"
 export BATCH_SIZE="$BATCH_SIZE"
 export NUM_FRAMES="$NUM_FRAMES"
+export MAX_SEQUENCE_LENGTH_FOR_ENGINE="$MAX_SEQUENCE_LENGTH_FOR_ENGINE"
 export FORCE_ENGINE_REBUILD="$FORCE_ENGINE_REBUILD"
 export CSV_PATH="$CSV_PATH"
 export TRT_PREDICTION_PATH="$TRT_PREDICTION_PATH"
@@ -54,3 +57,11 @@ export PYT_PREDICTION_PATH="$PYT_PREDICTION_PATH"
 export VERBOSE="$VERBOSE"
 
 bash ./trt/scripts/trt_inference_benchmark.sh $1 $2 $3 $4 $5 $6 $7
+
+trt_word_error_rate=`cat "$CSV_PATH" | awk '{print $3}'`
+pyt_word_error_rate=`cat "$CSV_PATH" | awk '{print $4}'`
+
+echo "word error rate for native PyTorch inference: "
+echo "${pyt_word_error_rate}"
+echo "word error rate for native TRT inference: "
+echo "${trt_word_error_rate}"

+ 25 - 14
PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference_benchmark.sh

@@ -16,24 +16,29 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
+trap "exit" INT
+
+
 # Mandatory Arguments
-CHECKPOINT=$CHECKPOINT
+CHECKPOINT=${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}
 
 # Arguments with Defaults
 DATA_DIR=${DATA_DIR:-"/datasets/LibriSpeech"}
 DATASET=${DATASET:-"dev-clean"}
 RESULT_DIR=${RESULT_DIR:-"/results"}
+LOG_DIR=${RESULT_DIR}/logs
 CREATE_LOGFILE=${CREATE_LOGFILE:-"true"}
-TRT_PRECISION=${TRT_PRECISION:-"fp32"}
-PYTORCH_PRECISION=${PYTORCH_PRECISION:-"fp32"}
+TRT_PRECISION=${TRT_PRECISION:-"fp16"}
+PYTORCH_PRECISION=${PYTORCH_PRECISION:-"fp16"}
 NUM_STEPS=${NUM_STEPS:-"100"}
 BATCH_SIZE=${BATCH_SIZE:-64}
 NUM_FRAMES=${NUM_FRAMES:-512}
-FORCE_ENGINE_REBUILD=${FORCE_ENGINE_REBUILD:-"true"}
+FORCE_ENGINE_REBUILD=${FORCE_ENGINE_REBUILD:-"false"}
 CSV_PATH=${CSV_PATH:-"/results/res.csv"}
 TRT_PREDICTION_PATH=${TRT_PREDICTION_PATH:-"none"}
 PYT_PREDICTION_PATH=${PYT_PREDICTION_PATH:-"none"}
 VERBOSE=${VERBOSE:-"false"}
+USE_DYNAMIC_SHAPE=${USE_DYNAMIC_SHAPE:-"yes"}
 
 
 # Set up flag-based arguments
@@ -62,7 +67,6 @@ if [ "$VERBOSE" = "true" ] ; then
     SHOULD_VERBOSE="--verbose"
 fi
 
-
 STEPS=""
 if [ "$NUM_STEPS" -gt 0 ] ; then
    STEPS=" --num_steps $NUM_STEPS"
@@ -73,21 +77,27 @@ ONNX_DIR=$RESULT_DIR/onnxs
 ENGINE_DIR=$RESULT_DIR/engines
 mkdir -p $ONNX_DIR
 mkdir -p $ENGINE_DIR
+mkdir -p $LOG_DIR
+
 
 
-PREFIX=BS${BATCH_SIZE}_NF${NUM_FRAMES}
+if [ "$USE_DYNAMIC_SHAPE" = "yes" ] ; then
+    DYNAMIC_PREFIX=" --dynamic_shape "
+    PREFIX=DYNAMIC
+else
+    PREFIX=BS${BATCH_SIZE}_NF${NUM_FRAMES}
+fi
 
-# Currently, TRT parser for ONNX can't parse half-precision weights, so ONNX
+# Currently, TRT parser for ONNX can't parse mixed-precision weights, so ONNX
 # export will always be FP32. This is also enforced in perf.py
 ONNX_FILE=fp32_${PREFIX}.onnx
 ENGINE_FILE=${TRT_PRECISION}_${PREFIX}.engine
 
 
-
 # If an ONNX with the same precision and number of frames exists, don't recreate it because
 # TRT engine construction can be done on an onnx of any batch size
 # "%P" only prints filenames (rather than absolute/relative path names)
-EXISTING_ONNX=$(find $ONNX_DIR -name "fp32_BS*_NF${NUM_FRAMES}.onnx" -printf "%P\n" | head -n 1)
+EXISTING_ONNX=$(find $ONNX_DIR -name ${ONNX_FILE} -printf "%P\n" | head -n 1)
 SHOULD_MAKE_ONNX=""
 if [ -z "$EXISTING_ONNX" ] ; then
     SHOULD_MAKE_ONNX="--make_onnx"
@@ -126,7 +136,7 @@ CMD+=" --engine_batch_size $BATCH_SIZE"
 CMD+=" --model_toml configs/jasper10x5dr_nomask.toml"
 CMD+=" --dataset_dir $DATA_DIR"
 CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json "
-CMD+=" --ckpt $CHECKPOINT"
+CMD+=" --ckpt_path $CHECKPOINT"
 CMD+=" $SHOULD_VERBOSE"
 CMD+=" $TRT_PREC"
 CMD+=" $PYTORCH_PREC"
@@ -136,6 +146,7 @@ CMD+=" --onnx_path ${RESULT_DIR}/onnxs/${ONNX_FILE}"
 CMD+=" --seq_len $NUM_FRAMES"
 CMD+=" $SHOULD_MAKE_ONNX"
 CMD+=" $SHOULD_MAKE_ENGINE"
+CMD+=" $DYNAMIC_PREFIX"
 CMD+=" --csv_path $CSV_PATH"
 CMD+=" $1 $2 $3 $4 $5 $6 $7 $8 $9"
 CMD+=" $TRT_PREDICTION_PATH"
@@ -146,17 +157,17 @@ if [ "$CREATE_LOGFILE" == "true" ] ; then
   export GBS=$(expr $BATCH_SIZE )
   printf -v TAG "jasper_trt_inference_benchmark_%s_gbs%d" "$PYTORCH_PRECISION" $GBS
   DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE=$RESULT_DIR/$TAG.$DATESTAMP.log
+  LOGFILE=$LOG_DIR/$TAG.$DATESTAMP.log
   printf "Logs written to %s\n" "$LOGFILE"
 fi
 
+mkdir -p ${RESULT_DIR}/logs
+
 set -x
 if [ -z "$LOGFILE" ] ; then
    $CMD
 else
-   (
-     $CMD
-   ) |& tee $LOGFILE
+   $CMD |& tee $LOGFILE
    grep 'latency' $LOGFILE
 fi
 set +x

+ 11 - 6
PyTorch/SpeechRecognition/Jasper/trt/scripts/walk_benchmark.sh

@@ -16,13 +16,18 @@
 
 
 export NUM_STEPS=100
-export FORCE_ENGINE_REBUILD="true"
-export CHECKPOINT="/checkpoints/jasper.pt"
-export CREATE_LOGFILE="false"
-for prec in fp16;
+export FORCE_ENGINE_REBUILD="false"
+export CHECKPOINT=${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}
+export CREATE_LOGFILE="true"
+prec=fp16
+export TRT_PRECISION=$prec
+export PYTORCH_PRECISION=$prec
+
+trap "exit" INT
+
+for use_dynamic in yes no;
 do
-    export TRT_PRECISION=$prec
-    export PYTORCH_PRECISION=$prec
+    export USE_DYNAMIC_SHAPE=${use_dynamic}
     export CSV_PATH="/results/${prec}.csv"
     for nf in 208 304 512 704 1008 1680;
     do

+ 98 - 35
PyTorch/SpeechRecognition/Jasper/trt/trtutils.py

@@ -15,8 +15,9 @@
 '''
 import pycuda.driver as cuda
 import tensorrt as trt
+import onnxruntime as ort
+import numpy as np
 
-# Simple class: more explicit than dealing with 2-tuple
 class HostDeviceMem(object):
     '''Type for managing host and device buffers
 
@@ -32,22 +33,48 @@ class HostDeviceMem(object):
     def __repr__(self):
         return self.__str__()
 
-def build_engine_from_parser(model_path, batch_size, is_fp16=True, is_verbose=False, max_workspace_size=4*1024*1024*1024):
+def build_engine_from_parser(args):
     '''Builds TRT engine from an ONNX file
     Note that network output 1 is unmarked so that the engine will not use
     vestigial length calculations associated with masked_fill
     '''
-    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if is_verbose else trt.Logger(trt.Logger.WARNING)
-    with trt.Builder(TRT_LOGGER) as builder:
-        builder.max_batch_size = batch_size
-        builder.fp16_mode = is_fp16
-        builder.max_workspace_size = max_workspace_size
-        with builder.create_network() as network:
-            with trt.OnnxParser(network, TRT_LOGGER) as parser:
-                with open(model_path, 'rb') as model:
-                    parser.parse(model.read())
-                
-                return builder.build_cuda_engine(network)
+    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if args.verbose else trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(TRT_LOGGER)
+    builder.max_batch_size = 64
+
+    if args.trt_fp16:
+        builder.fp16_mode = True
+        print("Optimizing for FP16")
+        config_flags = 1 << int(trt.BuilderFlag.FP16) # | 1 << int(trt.BuilderFlag.STRICT_TYPES)
+        max_size = 4*1024*1024*1024
+        max_len = args.max_seq_len
+    else:
+        config_flags = 0
+        max_size = 4*1024*1024*1024
+        max_len = args.max_seq_len
+    if args.max_workspace_size > 0:
+        builder.max_workspace_size = args.max_workspace_size
+    else:
+        builder.max_workspace_size = max_size
+        
+    config = builder.create_builder_config()
+    config.flags = config_flags
+    
+    if args.dynamic_shape:
+        profile = builder.create_optimization_profile()
+        if args.transpose:
+            profile.set_shape("FEATURES", min=(1,192,64), opt=(args.engine_batch_size,256,64), max=(builder.max_batch_size, max_len, 64))
+        else:
+            profile.set_shape("FEATURES", min=(1,64,192), opt=(args.engine_batch_size,64,256), max=(builder.max_batch_size, 64, max_len))        
+        config.add_optimization_profile(profile)    
+    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(explicit_batch)
+
+    with trt.OnnxParser(network, TRT_LOGGER) as parser:
+        with open(args.onnx_path, 'rb') as model:
+            parsed = parser.parse(model.read())
+            print ("Parsing returned ", parsed, "dynamic_shape= " , args.dynamic_shape, "\n")
+            return builder.build_engine(network, config=config)
 
 def deserialize_engine(engine_path, is_verbose):
     '''Deserializes TRT engine at engine_path
@@ -58,7 +85,7 @@ def deserialize_engine(engine_path, is_verbose):
     return engine
 
 
-def allocate_buffers_with_existing_inputs(engine, inp, batch_size=1):
+def allocate_buffers_with_existing_inputs(context, inp):
    '''
    allocate_buffers() (see TRT python samples) but uses an existing inputs on device

@@ -66,27 +93,63 @@ def allocate_buffers_with_existing_inputs(engine, inp, batch_size=1):
          would be produced by allocate_buffers(). That is, inputs are in the
          order defined by iterating through `engine`
    '''
-
    # Add input to bindings
-    bindings = []
+    # Two binding slots are pre-allocated: one for the "FEATURES" input tensor
+    # and one for the "LOGITS" output tensor; indices are queried from the
+    # engine below rather than assumed.
+    bindings = [0,0]
    outputs = []
-    stream = cuda.Stream()
-    inp_idx = 0
+    engine = context.engine
+    # NOTE(review): assigned a shape tuple here but overwritten with sh[0]
+    # a few lines below -- this first assignment looks redundant.
+    batch_size = inp[0].shape
+    inp_idx = engine.get_binding_index("FEATURES")    
+    # Bind the existing device tensor directly (no host->device copy).
+    inp_b = inp[0].data_ptr()
+    assert(inp[0].is_contiguous())
+    bindings[inp_idx] = inp_b
+    sh = inp[0].shape
+    batch_size = sh[0]
+    orig_shape = context.get_binding_shape(inp_idx)
+    # A leading -1 means the engine was built with a dynamic shape profile;
+    # pin the binding shape to the actual input tensor shape before inference.
+    if orig_shape[0]==-1:
+        context.set_binding_shape(inp_idx, trt.Dims([batch_size, sh[1], sh[2]]))

-    for binding in engine:
-        if engine.binding_is_input(binding):
-            bindings.append(inp[inp_idx])
-            inp_idx += 1
-        else:
-            # Unchanged from do_inference()
-            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
-            dtype = trt.nptype(engine.get_binding_dtype(binding))
-            # Allocate host and device buffers
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes*2)
-            # Append the device buffer to device bindings.
-            bindings.append(int(device_mem))
-            # Append to the appropriate list.
-            outputs.append(HostDeviceMem(host_mem, device_mem))
-
-    return outputs, bindings, stream
+    assert context.all_binding_shapes_specified
+
+    out_idx = engine.get_binding_index("LOGITS")
+    # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
+    out_shape = context.get_binding_shape(out_idx)
+    #print ("Out_shape: ", out_shape)
+    h_output = cuda.pagelocked_empty(tuple(out_shape), dtype=np.float32())
+    # print ("Out bytes: " , h_output.nbytes)
+    d_output = cuda.mem_alloc(h_output.nbytes)
+    bindings[out_idx] = int(d_output)
+    hdm = HostDeviceMem(h_output, d_output)
+    outputs.append(hdm)
+    # Returns host/device output wrappers, the binding pointer list, and the
+    # resolved logits shape (no CUDA stream is created here, unlike the
+    # original helper).
+    return outputs, bindings, out_shape
+
+def get_engine(args):
+    '''Acquire an inference engine/session based on the parsed CLI arguments.
+
+    Resolution order:
+    * --engine_path + --use_existing_engine: deserialize and return the TRT
+      engine stored at --engine_path.
+    * --engine_path + --onnx_path: build a new TRT engine from the ONNX model,
+      serialize it to --engine_path, and return it.
+    * --onnx_path only: return an ONNX Runtime InferenceSession instead of a
+      TRT engine -- callers must be prepared to handle both return types.
+    * otherwise: raise an Exception describing the accepted combinations.
+    '''
+    engine = None
+
+    if args.engine_path is not None and args.use_existing_engine:
+        engine = deserialize_engine(args.engine_path, args.verbose)
+    elif args.engine_path is not None and args.onnx_path is not None:
+        # Build a new engine and serialize it.
+        print("Building TRT engine ....") 
+        engine = build_engine_from_parser(args)
+        if engine is not None:
+            with open(args.engine_path, 'wb') as f:
+                f.write(engine.serialize())
+                print("TRT engine saved at " + args.engine_path + " ...") 
+    elif args.onnx_path is not None:
+        # No TRT engine requested: fall back to ONNX Runtime for inference.
+        ort_session = ort.InferenceSession(args.onnx_path)
+        return ort_session
+    else:
+        # NOTE(review): this message omits the ONNX-only fallback
+        # (<onnx_path> without <engine_path>) handled above.
+        raise Exception("One of the following sets of arguments must be provided:\n"+
+                        "<engine_path> + --use_existing_engine\n"+
+                        "<engine_path> + <onnx_path>\n"+
+                        "in order to construct a TRT engine")
+    # build_engine_from_parser() may return None on parse/build failure.
+    if engine is None:
+        raise Exception("Failed to acquire TRT engine")
+
+    return engine

+ 40 - 0
PyTorch/SpeechRecognition/Jasper/trtis/Dockerfile

@@ -0,0 +1,40 @@
+# Container for Jasper inference with the TensorRT Inference Server (TRTIS).
+# Extends the NGC PyTorch image with TensorRT 6, a patched onnx-tensorrt
+# build, and the TRTIS client artifacts copied from a prebuilt client image.
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 
+
+# Stage holding the prebuilt TensorRT Inference Server client binaries/wheel.
+FROM tensorrtserver_client as trtis-client
+FROM ${FROM_IMAGE_NAME}
+RUN apt-get update && apt-get install -y python3
+# TensorRT package version pin; must match the CUDA repo added below.
+ARG version=6.0.1-1+cuda10.1
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb \
+&& dpkg -i cuda-repo-*.deb \
+&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb \
+&& dpkg -i nvidia-machine-learning-repo-*.deb \
+&& apt-get update \
+&& apt-get install -y --no-install-recommends libnvinfer6=${version} libnvonnxparsers6=${version} libnvparsers6=${version} libnvinfer-plugin6=${version} libnvinfer-dev=${version} libnvonnxparsers-dev=${version} libnvparsers-dev=${version} libnvinfer-plugin-dev=${version} python-libnvinfer=${version} python3-libnvinfer=${version}
+# Make the system TensorRT Python bindings visible to the conda interpreter.
+RUN cp -r /usr/lib/python3.6/dist-packages/tensorrt /opt/conda/lib/python3.6/site-packages/tensorrt
+
+
+ENV PATH=$PATH:/usr/src/tensorrt/bin
+# Build onnx-tensorrt at a pinned commit with a local patch applied, then
+# install the resulting libnvonnx* libraries system-wide.
+WORKDIR /tmp/onnx-trt
+COPY trt/onnx-trt.patch .
+RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git checkout  b677b9cbf19af803fa6f76d05ce558e657e4d8b6  && git submodule update --init --recursive && \
+    patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
+
+
+# Here's a good place to install pip reqs from JoC repo.
+# At the same step, also install TRT pip reqs
+WORKDIR /tmp/pipReqs
+COPY requirements.txt /tmp/pipReqs/pytRequirements.txt
+COPY trt/requirements.txt /tmp/pipReqs/trtRequirements.txt
+COPY trtis/requirements.txt /tmp/pipReqs/trtisRequirements.txt
+RUN apt-get update && apt-get install -y --no-install-recommends portaudio19-dev && pip install -r pytRequirements.txt && pip install -r trtRequirements.txt && pip install -r trtisRequirements.txt
+
+
+#Copy the perf_client over
+COPY --from=trtis-client /workspace/install/bin/perf_client /workspace/install/bin/perf_client
+#Copy the python wheel and install with pip
+COPY --from=trtis-client /workspace/install/python/tensorrtserver*.whl /tmp/
+RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
+
+# Bring in the Jasper sources last so code changes don't bust earlier layers.
+WORKDIR /workspace/jasper
+COPY . .
+
+

+ 381 - 0
PyTorch/SpeechRecognition/Jasper/trtis/README.md

@@ -0,0 +1,381 @@
+# Jasper Inference Using TensorRT Inference Server
+
+This is a subfolder of the Jasper for PyTorch repository that provides scripts to deploy high-performance inference using NVIDIA TensorRT Inference Server offering different options for the inference model pipeline.
+
+
+## Table Of Contents
+
+- [Model overview](#model-overview)
+   * [Model architecture](#model-architecture)
+   * [TensorRT Inference Server Overview](#tensorrt-inference-server-overview)
+   * [Inference Pipeline in TensorRT Inference Server](#inference-pipeline-in-tensorrt-inference-server)
+- [Setup](#setup)
+  * [Supported Software](#supported-software)
+  * [Requirements](#requirements)
+- [Quick Start Guide](#quick-start-guide)
+- [Advanced](#advanced)
+  * [Scripts and sample code](#scripts-and-sample-code)
+- [Performance](#performance)
+  * [Inference Benchmarking in TensorRT Inference Server](#inference-benchmarking-in-tensorrt-inference-server)
+  * [Results](#results)
+    * [Performance analysis for TensorRT Inference Server: NVIDIA T4](#performance-analysis-for-tensorrt-inference-server-nvidia-t4)
+	* [Maximum Batch Size](#maximum-batch-size)
+	* [Batching techniques: Static versus Dynamic Batching](#batching-techniques-static-versus-dynamic-batching)
+    	* [TensorRT/ONNX/PyTorch JIT comparisons](#tensorrtonnxpytorch-jit-comparisons)
+		    * [Throughput Comparison](#throughput-comparison)
+		    * [Latency Comparison](#latency-comparison)
+
+## Model overview
+
+### Model architecture
+
+
+Jasper is a neural acoustic model for speech recognition. Its network architecture is designed to facilitate fast GPU inference. More information about Jasper and its training can be found in the [Jasper PyTorch README](../README.md). 
+By default the model configuration is Jasper 10x5 with dense residuals. A Jasper BxR model has B blocks, each consisting of R repeating sub-blocks.
+Each sub-block applies the following operations in sequence: 1D-Convolution, Batch Normalization, ReLU activation, and Dropout.
+
+In the original paper Jasper is trained with masked convolutions, which masks out the padded part of an input sequence in a batch before the 1D-Convolution. 
+For inference, masking is not used. The reason for this is that the original mask operation does not achieve better accuracy than running without it on the test and development datasets, while omitting the mask yields better inference performance, especially after TensorRT optimization.
+
+More information on the Jasper model architecture can be found in the [Jasper PyTorch README](../README.md). 
+
+
+
+
+### TensorRT Inference Server Overview
+
+The [NVIDIA TensorRT Inference Server](https://github.com/NVIDIA/tensorrt-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+This folder contains detailed performance analysis as well as scripts to run Jasper inference using TensorRT Inference Server.
+
+A typical TensorRT Inference Server pipeline can be broken down into the following steps:
+
+1. The client serializes the inference request into a message and sends it to the server (Client Send). 
+2. The message travels over the network from the client to the server (Network).
+3. The message arrives at the server, and is deserialized (Server Receive).
+4. The request is placed on the queue (Server Queue).
+5. The request is removed from the queue and computed (Server Compute).
+6. The completed request is serialized in a message and sent back to the client (Server Send).
+7. The completed message then travels over the network from the server to the client (Network).
+8. The completed message is deserialized by the client and processed as a completed inference request (Client Receive).
+
+Generally, for local clients, steps 1-4 and 6-8 will only occupy a small fraction of time, compared to step 5. As backend deep learning systems like Jasper are rarely exposed directly to end users, but instead only interface with local front-end servers, for the sake of Jasper, we can consider that all clients are local.
+In this section, we will go over how to launch both the TensorRT Inference Server and the client and get the best performance solution that fits your specific application needs.
+
+Note: The following instructions are run from outside the container and call `docker run` commands as required.
+
+
+## Inference Pipeline in TensorRT Inference Server
+
+The Jasper model pipeline consists of 3 components, where each part can be customized to be a different backend: 
+
+**Data preprocessor**
+
+The data processor transforms an input raw audio file into a spectrogram. By default the pipeline uses mel filter banks as spectrogram features. This part does not have any learnable weights.
+
+**Acoustic model**
+
+The acoustic model takes in the spectrogram and outputs a probability over a list of characters. This part is the most compute intensive, taking more than 90% of the entire end-to-end pipeline. The acoustic model is the only component with learnable parameters and what differentiates Jasper from other end-to-end neural speech recognition models. In the original paper, the acoustic model contains a masking operation for training (more details in [../README.md]). We do not use masking for inference.
+
+**Greedy decoder**
+
+The decoder takes the probabilities over the list of characters and outputs the final transcription. Greedy decoding is a fast and simple way of doing this by always choosing the character with the maximum probability. 
+
+To run a model with TensorRT, we first construct the model in PyTorch, which is then exported into a ONNX static graph. Finally, a TensorRT engine is constructed from the ONNX file and can be launched to do inference. The following table shows which backends are supported for each part along the model pipeline.
+
+|Backend\Pipeline component|Data preprocessor|Acoustic Model|Decoder|
+|---|---|---|---|
+|PyTorch JIT|x|x|x|
+|ONNX|-|x|-|
+|TensorRT|-|x|-|
+
+In order to run inference with TensorRT outside of the inference server, refer to the [Jasper TensorRT README](../trt/README.md).
+
+
+
+
+## Setup
+
+### Supported Software
+
+The following software version configuration is supported and has been tested.
+
+|Software|Version|
+|--------|-------|
+|Python|3.6.9|
+|PyTorch|1.2.0|
+|TensorRT|6.0.1.5|
+|CUDA|10.1.243|
+
+
+The following section lists the requirements in order to start inference with Jasper in TensorRT Inference Server.
+
+### Requirements
+
+The repository contains a folder `./trtis/` with a `Dockerfile` which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Ensure you have the following components:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [TensorRT Inference Server 19.09 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrtserver)
+* Access to [NVIDIA machine learning repository](https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb) and [NVIDIA cuda repository](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb) for NVIDIA TensorRT 6
+* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+* [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
+
+Required Python packages are listed in `requirements.txt`, `trt/requirements.txt` and `trtis/requirements.txt`. These packages are automatically installed when the Docker container is built. 
+
+
+## Quick Start Guide
+
+Running the following scripts will build and launch the container containing all required dependencies for both TensorRT 6 as well as native PyTorch. This is necessary for using inference with TensorRT and can also be used for data download, processing and training of the model.
+
+1. Clone the repository.
+
+    ```bash
+    git clone https://github.com/NVIDIA/DeepLearningExamples
+    cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
+    ```
+
+2. Build a container that extends NGC PyTorch 19.09, TensorRT, TensorRT Inference Server, and TensorRT Inference Client:
+
+    ```bash
+    bash trtis/scripts/docker/build.sh
+    ```
+
+3. Start an interactive session in the Docker container:
+
+    ```bash
+    export DATA_DIR=<DATA_DIR>
+    export CHECKPOINT_DIR=<CHECKPOINT_DIR>
+    export RESULT_DIR=<RESULT_DIR>
+    bash trtis/scripts/docker/launch.sh
+    ```
+
+    Where <DATA_DIR>, <CHECKPOINT_DIR> and <RESULT_DIR> can be either empty or absolute directory paths to dataset, existing checkpoints or potential output files.
+      
+    Alternatively, to start a script `foo.sh` in the Docker container without an interactive session, run:
+
+    ```bash
+    export DATA_DIR=<DATA_DIR>
+    export CHECKPOINT_DIR=<CHECKPOINT_DIR>
+    export RESULT_DIR=<RESULT_DIR>
+    export PROGRAM_PATH=foo.sh
+    bash trtis/scripts/docker/trtis.sh
+    ```
+
+    The `/datasets`, `/checkpoints`, `/results` directories will be mounted as volumes and mapped to the corresponding directories `<DATA_DIR>`, `<CHECKPOINT_DIR>`, `<RESULT_DIR>` on the host. Note that `<DATA_DIR>`, `<CHECKPOINT_DIR>`, and `<RESULT_DIR>` directly correspond to the same arguments in `scripts/docker/launch.sh` and `trt/scripts/docker/launch.sh` mentioned in the [Jasper PyTorch README](../README.md) and [Jasper TensorRT README](../trt/README.md).
+
+    Briefly, `<DATA_DIR>` should contain, or be prepared to contain a `LibriSpeech` sub-directory (created in [Acquiring Dataset](../trt/README.md)), `<CHECKPOINT_DIR>` should contain a PyTorch model checkpoint (`*.pt`) file obtained through training described in [Jasper PyTorch README](../README.md), and `<RESULT_DIR>` should be prepared to contain timing results and logs. Downloading `LibriSpeech` is not required for Inference in TensorRT Inference Server on a single .wav audio file. To do inference and evaluation on LibriSpeech, download the dataset following the instructions in the [Jasper TensorRT README](../README.md)
+
+4. Convert pretrained PyTorch model checkpoint into TensorRT Inference Server compatible model backends.
+
+    From outside the container, run:
+
+    ```bash
+    export ARCH=<ARCH>
+    export CHECKPOINT_DIR=<CHECKPOINT_DIR>
+    export CHECKPOINT=<CHECKPOINT>
+    export PRECISION=<PRECISION>
+    export MAX_SEQUENCE_LENGTH_FOR_ENGINE=<MAX_SEQUENCE_LENGTH_FOR_ENGINE>
+    bash trtis/scripts/export_model.sh
+    bash trtis/scripts/prepare_model_repository.sh
+    ```
+
+    Where `<ARCH>` is either 70(Volta) or 75(Turing), `<CHECKPOINT_DIR>` is the absolute path that contains the pretrained checkpoint `<CHECKPOINT>`, and `<PRECISION>` is either `fp16` or `fp32`. `<MAX_SEQUENCE_LENGTH_FOR_ENGINE>` defines the maximum feasible audio length, where 100 corresponds to 1 second.
+    The exported models for deployment will be generated at `./trtis/deploy/`.
+
+    A pretrained PyTorch model checkpoint for model conversion can be downloaded from the [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
+
+    More details can be found in the [Advanced](#advanced) section under [Scripts and sample code](#scripts-and-sample-code).
+
+5. Download Pre-exported Inference Checkpoints from NGC 
+
+    If you would like to skip the manual model export, you can find already generated model backends in [https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_jit_fp16](https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_jit_fp16), [https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_onnx_fp16](https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_onnx_fp16), [https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_trt_turing_fp16](https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_trt_turing_fp16), [https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_trt_volta_fp16](https://ngc.nvidia.com/models/nvidian:swdl:jasperpyt_trt_volta_fp16) for every version of the model pipeline. If you wish to use the TensorRT pipeline, make sure to download the correct version for your hardware. The extracted model folder should contain 3 subfolders `jasper-feature-extractor`, `jasper-decoder` and `jasper-x` where x can be pyt, onnx, trt depending on the model backend. You will find folders with the same name in your local Jasper repository under `trtis/model_repo/`. Copy the content of each of the 3 model folders to the corresponding directory in your Jasper project, replacing files with the same name.
+
+    Then run:
+    ```bash
+    bash trtis/scripts/prepare_model_repository.sh
+    ```
+
+6. Launch TensorRT Inference Server.
+
+    Start the server:
+    ```bash
+    bash trtis/scripts/run_server.sh
+    ```
+
+7. Run all inference benchmarks. 
+
+    From outside the container, run:
+
+    ```bash
+    export ARCH=<ARCH>
+    export CHECKPOINT_DIR=<CHECKPOINT_DIR>
+    export RESULT_DIR=<RESULT_DIR>
+    export CHECKPOINT=<CHECKPOINT>
+    bash trtis/scripts/execute_all_perf_runs.sh
+    ```
+
+    Where `<ARCH>` is either 70(Volta) or 75(Turing), `<CHECKPOINT_DIR>` is the absolute path that contains the pretrained checkpoint `<CHECKPOINT>`, and `<RESULT_DIR>` is the absolute path to potential output files.
+
+    Note: This can take several hours to complete due to the extensiveness of the benchmark. More details about the benchmark are found in the [Advanced](#advanced) section under [Performance](#performance).
+
+8. Run inference using the Client and TensorRT Inference Server.
+
+    8.1 From outside the container, restart the server:
+    ```bash
+    bash trtis/scripts/run_server.sh
+    ``` 
+
+    8.2 From outside the container, submit the client request using:
+    ```bash
+    bash trtis/scripts/run_client.sh <MODEL_TYPE> <DATA_DIR> <FILE>
+    ```
+
+    Where `<MODEL_TYPE>` can be either “pyt” (default), “trt” or “onnx”. `<DATA_DIR>` is an absolute local path to the directory of files. <FILE> is the relative path to <DATA_DIR> to either an audio file in .wav format or a manifest file in .json format. 
+
+    Note: If <FILE> is *.json, <DATA_DIR> should be the path to the LibriSpeech dataset. In this case this script will do both inference and evaluation on the corresponding LibriSpeech dataset. 
+
+9. Start Jupyter Notebook to run inference interactively.
+
+    Run:
+    ```bash
+    jupyter notebook -- notebooks/JasperTRTIS.ipynb
+    ```
+
+    A pretrained model checkpoint necessary for using the jupyter notebook to be able to run inference can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16). 
+
+
+## Advanced
+
+The following sections provide greater details about the TensorRT Inference Server pipeline and inference analysis and benchmarking results.
+
+
+### Scripts and sample code
+
+The `trtis/` directory contains the following files:
+* `jasper-client.py`: Python client script that takes an audio file and a specific model pipeline type and submits a client request to the server to run inference with the model on the given audio file.
+* `speech-utils.py`: helper functions for `jasper-client.py`
+
+The `trtis/scripts/` directory has easy to use scripts to run supported functionalities, such as:
+* `./docker/build.sh`: builds container
+* `./docker/launch.sh`: launches container
+* `execute_all_perf_runs.sh`: runs all benchmarks using TRTIS perfclient calls `generate_perf_results.sh`
+* `export_model.sh`: from pretrained PyTorch checkpoint generates backends for every version of the model inference pipeline, calls `export_model_helper.sh`
+* `prepare_model_repository.sh`: copies model config files from `./model_repo/` to `./deploy/model_repo` and creates links to generated model backends, setting up the model repository for TensorRT Inference Server
+* `generate_perf_results.sh`: runs benchmark with perf-client for specific configuration and calls `run_perf_client.sh`
+* `run_server.sh`: launches TensorRT Inference Server
+* `run_client.sh`: launches client by using `jasper-client.py` to submit inference requests to server
+
+
+
+
+## Performance
+
+### Inference Benchmarking in TensorRT Inference Server
+
+To benchmark the inference performance on either Volta or Turing GPU, run `bash trtis/scripts/execute_all_perf_runs.sh` according to [Quick-Start-Guide](#quick-start-guide) Step 7 and set `ARCH` according to the underlying hardware (`ARCH=70` for Volta and `ARCH=75` for Turing)
+
+By default, this script measures inference performance for all 3 model pipelines: PyTorch JIT  (‘pyt’) pipeline, ONNX (‘onnx’) pipeline, TensorRT(‘trt’) pipeline, both with fp32 and fp16 precision. Each of these pipelines is measured for different audio input lengths (2sec, 7sec, 16.7sec) and a range of different server batch sizes (up to 64). This takes place in `trtis/scripts/generate_perf_results.sh`. For a specific audio length and batch size static and dynamic batching comparison is performed. For benchmarking we used `MAX_SEQUENCE_LENGTH_FOR_ENGINE=1792` for inference model generation.
+
+
+
+### Results
+
+
+#### Performance Analysis for TensorRT Inference Server: NVIDIA T4
+
+
+
+### Results
+
+
+#### Performance Analysis for TensorRT Inference Server: NVIDIA T4
+
+Based on the figure below, we recommend using the Dynamic Batcher with `max_batch_size=8`, `max_queue_delay_microseconds` as large as possible to fit within your latency window (the values used below are extremely large to exaggerate their effect). The largest improvements to both throughput and latency come from increasing the batch size due to efficiency gains in the GPU with larger batches. The Dynamic Batcher combines the best of both worlds by efficiently batching together a large number of concurrent requests, while also keeping latency down for infrequent requests. 
+
+All results below are obtained using the following configurations:
+* Single T4 16GB GPU on a local server
+* Jasper Large
+* Audio length = 7 seconds 
+* FP16 precision
+
+Latencies are indicated by bar plots using the left axis. Throughput is indicated by the blue line plot using the right axis. X-axis indicates the concurrency - the maximum number of inference requests that can be in the pipeline at any given time. For example, when the concurrency is set to 1, the client waits for an inference request to be completed (Step 8) before it sends another to the server (Step 1). A high number of concurrent requests can reduce the impact of network latency on overall throughput.
+
+
+<img src="../images/trtis_throughput_latency_summary.png" width="100%" height="100%"> 
+
+Figure 1: Latency vs Throughput for Jasper Large, FP16, Audio Length = 7sec using various configurations and all 3 model backends  available in TensorRT Inference Server. TensorRT is denoted as TRT, PyTorch as PyT.
+
+
+##### Maximum Batch Size
+In general, increasing batch size leads to higher throughput at the cost of higher latency. In the following sections, we analyze the results using the example of the TensorRT-pipeline. 
+ 
+As we can see in Figure 2, the throughput at Batch Size=1, Client Concurrent Requests = 8 is 45 and in Figure 3, the throughput at Batch Size=8, Client Concurrent Requests = 1 is 101, giving a speedup of ~2.24x. 
+Note: We compare Batch Size=1, Client Concurrent Requests = 8 to Batch Size=8, Client Concurrent Requests = 1 to keep the Total Number of Outstanding Requests equal between the two different modes. Where Total Number of Outstanding Requests = Batch Size * Client Concurrent Requests. 
+Increasing the batch size by 8-fold from 1 to 8 results in an increase in compute time by only 2.42x (45ms to 109ms) showing that computation is more efficient at higher batch sizes. Hence, an optimal batch size would be the maximum batch size that can both fit in memory and is within the preferred latency threshold.
+
+<img src="../images/trtis_static_batching_bs1.png" width="80%" height="80%"> 
+
+Figure 2: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 1
+
+<img src="../images/trtis_static_batching_bs8.png" width="80%" height="80%"> 
+
+Figure 3: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 8
+
+##### Batching techniques: Static versus Dynamic Batching
+In the following section, we analyze the results using the example of the TensorRT-pipeline.
+Static batching is a feature of the inference server that allows inference requests to be served as they are received. It is preferred in scenarios where low latency is desired at the cost of throughput when the GPU is under utilized.
+Dynamic batching is a feature of the inference server that allows inference requests to be combined by the server, so that a batch is created dynamically, resulting in an increased throughput. It is preferred in scenarios where we would like to maximize throughput and GPU utilization at the cost of higher latencies. You can set the Dynamic Batcher parameter `max_queue_delay_microseconds` to indicate the maximum amount of time you are willing to wait and `preferred_batch_size` to indicate your maximum server batch size in the TensorRT Inference Server model config. 
+Figure 4 emphasizes the increase in overall throughput with dynamic batching. At low numbers of concurrent requests, the increased throughput comes at the cost of increasing latency as the requests are queued up to max_queue_delay_microseconds. The effect of preferred_batchsize for dynamic batching is visually depicted by the dip in Server Queue time at integer multiples of the preferred batch sizes. At higher numbers of concurrent requests, the throughput approaches a maximum limit as we saturate the GPU utilization.
+
+<img src="../images/trtis_dynamic_batching.png" width="80%" height="80%"> 
+ 
+Figure 4: TensorRT pipeline - Latency & Throughput vs Concurrency using dynamic Batching at client Batch size = 1, maximum server batch size=4, max_queue_delay_microseconds = 5000
+
+
+ 
+##### TensorRT/ONNX/PyTorch JIT comparisons
+The following tables show inference and latency comparisons across all 3 backends for mixed precision and static batching. The main observations are:
+Increasing the batch size leads to higher inference throughput and latency up to a certain batch size, after which it slowly saturates.
+TensorRT is faster than both the PyTorch and ONNX pipeline, achieving a speedup of up to ~1.5x and ~2.4x respectively.
+The longer the audio length, the lower the throughput and the higher the latency.
+
+
+###### Throughput Comparison
+
+Following Table shows throughput benchmark results for all 3 model backends in TensorRT Inference Server using static batching under optimal concurrency
+
+
+|Audio length in seconds|Batch Size|TensorRT (inf/s)|PyTorch (inf/s)|ONNX (inf/s)|TensorRT/PyTorch Speedup|TensorRT/Onnx Speedup|
+|---    |---    |---    |---    |---    |---    |---    |
+|2.00|1.00|46.67|40.67|41.00|1.15|1.14|
+|2.00|2.00|90.67|74.67|58.00|1.21|1.56|
+|2.00|4.00|168.00|128.00|112.00|1.31|1.50|
+|2.00|8.00|248.00|213.33|194.67|1.16|1.27|
+|7.00|1.00|44.33|31.67|37.00|1.40|1.20|
+|7.00|2.00|74.67|56.67|49.33|1.32|1.51|
+|7.00|4.00|100.00|62.67|50.67|1.60|1.97|
+|7.00|8.00|106.67|80.00|53.33|1.33|2.00|
+|16.70|1.00|31.00|20.00|25.33|1.55|1.22|
+|16.70|2.00|42.00|29.33|19.33|1.43|2.17|
+|16.70|4.00|46.67|29.33|22.67|1.59|2.06|
+|16.70|8.00|50.67|37.33|21.33|1.36|2.38|
+
+###### Latency Comparison
+
+The following table shows latency benchmark results for all 3 model backends in TensorRT Inference Server using static batching and a single concurrent request. 
+
+
+|Audio length in seconds|Batch Size|TensorRT (ms)|PyTorch (ms)|ONNX (ms)|TensorRT/PyTorch Speedup|TensorRT/Onnx Speedup|
+|---    |---    |---    |---    |---    |---    |---    |
+|2.00|1.00|24.74|27.80|26.70|1.12|1.08|
+|2.00|2.00|23.75|29.76|38.54|1.25|1.62|
+|2.00|4.00|25.28|34.09|39.67|1.35|1.57|
+|2.00|8.00|36.18|41.18|45.84|1.14|1.27|
+|7.00|1.00|25.86|34.82|29.41|1.35|1.14|
+|7.00|2.00|29.83|38.04|43.37|1.28|1.45|
+|7.00|4.00|41.91|66.69|79.38|1.59|1.89|
+|7.00|8.00|80.72|106.86|151.61|1.32|1.88|
+|16.70|1.00|34.89|52.83|43.10|1.51|1.24|
+|16.70|2.00|51.91|73.52|105.58|1.42|2.03|
+|16.70|4.00|95.42|145.17|187.49|1.52|1.96|
+|16.70|8.00|167.67|229.67|413.74|1.37|2.47|

+ 404 - 0
PyTorch/SpeechRecognition/Jasper/trtis/jasper-client.py

@@ -0,0 +1,404 @@
+#!/usr/bin/python
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+import argparse
+import numpy as np
+import os
+from tensorrtserver.api import *
+from speech_utils import AudioSegment, SpeechClient
+import soundfile
+import pyaudio as pa
+import threading
+import math
+import time
+import glob
+
+FLAGS = None
+
+
+# read audio chunk from a file
+def get_audio_chunk_from_soundfile(sf, chunk_size, int_values):
+    """Read up to ``chunk_size`` frames from an open SoundFile.
+
+    Args:
+        sf: an open ``soundfile.SoundFile`` positioned at the next frame.
+        chunk_size: number of frames to read per chunk.
+        int_values: if True read raw int32 samples, otherwise float32.
+
+    Returns:
+        Tuple ``(audio_signal, end)`` where ``audio_signal`` is a numpy
+        array of exactly ``chunk_size`` samples (zero-padded at the tail
+        when the file is exhausted) and ``end`` is True when this was the
+        last chunk of the file.
+    """
+
+    dtype = 'int32' if int_values else 'float32'
+    audio_signal = sf.read(chunk_size, dtype=dtype)
+    end = False
+    # pad to chunk size
+    # NOTE(review): the (0, n) pad spec assumes a mono (1-D) signal;
+    # multi-channel files would need a 2-D spec -- confirm callers only
+    # pass mono audio.
+    if len(audio_signal) < chunk_size:
+        end = True
+        audio_signal = np.pad(audio_signal, (0, chunk_size-len(
+            audio_signal)), mode='constant')
+    return audio_signal, end
+
+
+# generator that returns chunks of audio data from file
+def audio_generator_from_file(input_filename, target_sr, int_values,
+                              chunk_duration):
+
+    sf = soundfile.SoundFile(input_filename, 'rb')
+    chunk_size = int(chunk_duration*sf.samplerate)
+    start = True
+    end = False
+
+    while not end:
+
+        audio_signal, end = get_audio_chunk_from_soundfile(
+            sf, chunk_size, int_values)
+
+        audio_segment = AudioSegment(audio_signal, sf.samplerate, target_sr)
+
+        yield audio_segment.samples, target_sr, start, end
+
+        start = False
+
+    sf.close()
+
+
+# generator that returns chunks of audio data from file
+class AudioGeneratorFromMicrophone:
+
+    def __init__(self,input_device_id, target_sr, chunk_duration):
+
+        self.recording_state = "init"
+        self.target_sr  = target_sr
+        self.chunk_duration = chunk_duration
+
+        self.p = pa.PyAudio()
+
+        device_info = self.p.get_host_api_info_by_index(0)
+        num_devices = device_info.get('deviceCount')
+        devices = {}
+        for i in range(0, num_devices):
+            if (self.p.get_device_info_by_host_api_device_index(0, i).get(
+                'maxInputChannels')) > 0:
+                devices[i] = self.p.get_device_info_by_host_api_device_index(
+                    0, i)
+
+        if (len(devices) == 0):
+            raise RuntimeError("Cannot find any valid input devices")
+
+        if input_device_id is None or input_device_id not in \
+            devices.keys():
+            print("\nInput Devices:")
+            for id, info in devices.items():
+                print("{}: {}".format(id,info.get("name")))
+            input_device_id = int(input("Enter device id to use: "))
+
+        self.input_device_id = input_device_id
+
+
+    def generate_audio(self):
+
+        chunk_size = int(self.chunk_duration*self.target_sr)
+
+
+        self. recording_state = "init"
+
+        def keyboard_listener():
+            input("Press Enter to start and end recording...")
+            self.recording_state = "capture"
+            print("Recording...")
+
+            input("")
+            self.recording_state = "release"
+
+        listener = threading.Thread(target=keyboard_listener)
+        listener.start()
+
+        start = True
+        end = False
+
+        stream_initialized = False
+        step = 0
+        while self.recording_state != "release":
+            try:
+                if self.recording_state == "capture":
+
+                    if not stream_initialized:
+                        stream = self.p.open(
+                            format=pa.paInt16,
+                            channels=1,
+                            rate=self.target_sr,
+                            input=True,
+                            input_device_index=self.input_device_id,
+                            frames_per_buffer=chunk_size)
+                        stream_initialized = True
+
+                    # Read audio chunk from microphone
+                    audio_signal = stream.read(chunk_size)
+                    audio_signal = np.frombuffer(audio_signal,dtype=np.int16)
+                    audio_segment = AudioSegment(audio_signal,
+                                                              self.target_sr,
+                                                              self.target_sr)
+
+                    yield audio_segment.samples, self.target_sr, start, end
+
+                    start = False
+                    step += 1
+            except Exception as e:
+                print(e)
+                break
+
+        stream.close()
+        self.p.terminate()
+
+    def generate_audio_signal(self):
+
+
+        #chunk_size = int(self.chunk_duration*self.target_sr)
+        chunk_size = int(0.2*self.target_sr)
+        self. recording_state = "init"
+
+        def keyboard_listener():
+            input("Press Enter to start and end recording...")
+            self.recording_state = "capture"
+            print("Recording...")
+
+            input("")
+            self.recording_state = "release"
+
+        listener = threading.Thread(target=keyboard_listener)
+        listener.start()
+
+        audio_samples = []
+        stream_initialized = False
+        step = 0
+        while self.recording_state != "release":
+            try:
+                if self.recording_state == "capture":
+
+                    if not stream_initialized:
+                        stream = self.p.open(
+                            format=pa.paInt16,
+                            channels=1,
+                            rate=self.target_sr,
+                            input=True,
+                            input_device_index=self.input_device_id,
+                            frames_per_buffer=chunk_size)
+                        stream_initialized = True
+
+                    # Read audio chunk from microphone
+                    audio_signal = stream.read(chunk_size)
+                    audio_signal = np.frombuffer(audio_signal,dtype=np.int16)
+                    audio_segment = AudioSegment(audio_signal,
+                                                              self.target_sr,
+                                                              self.target_sr)
+
+                    if step == 0:
+                        audio_samples = audio_segment.samples
+                    else:
+                        audio_samples = np.concatenate((audio_samples,
+                                                       audio_segment.samples))
+
+                    start = False
+                    step += 1
+            except Exception as e:
+                print(e)
+                break
+
+        stream.close()
+        self.p.terminate()
+
+        return audio_samples
+
+# generator that returns chunks of audio features from file
+def audio_features_generator(input_filename, speech_features_params,
+                             target_sr, int_values, chunk_duration):
+    """Yield ``(audio_features, start, end)`` tuples, one per chunk of
+    the given audio file.
+
+    NOTE(review): ``get_speech_features`` is neither defined nor imported
+    in this module, so calling this generator raises NameError as-is --
+    presumably it should be imported from ``speech_utils``; confirm.
+    """
+
+    sf = soundfile.SoundFile(input_filename, 'rb')
+
+    chunk_size = int(chunk_duration*sf.samplerate)
+
+    start = True
+    end = False
+
+    while not end:
+
+        audio_signal, end = get_audio_chunk_from_soundfile(sf, chunk_size,
+                                                       int_values)
+
+        audio_segment = AudioSegment(audio_signal, sf.samplerate, target_sr)
+        audio_features, features_length = get_speech_features(
+          audio_segment.samples, target_sr, speech_features_params)
+
+        yield audio_features, start, end
+
+        start = False
+
+    # closed only when the generator is fully consumed
+    sf.close()
+
+
+def audio_features_generator_with_buffer(input_filename,
+                                         speech_features_params, target_sr,
+                                         int_values, chunk_duration):
+    """Like ``audio_features_generator`` but computes features over a
+    rolling window of the three most recent chunks, giving the feature
+    extractor context around each newly read chunk.
+
+    NOTE(review): relies on ``get_speech_features`` which is neither
+    defined nor imported in this module -- confirm the intended import
+    (presumably ``speech_utils``).
+    """
+
+    sf = soundfile.SoundFile(input_filename, 'rb')
+
+    chunk_size = int(chunk_duration*sf.samplerate)
+
+    start = True
+    end = False
+
+    # rolling buffer of 3 chunks: [oldest | previous | newest]
+    audio_signal = np.zeros(shape=3*chunk_size, dtype=np.float32)
+
+    while not end:
+
+        # write the newest chunk into the tail of the buffer
+        audio_signal[-chunk_size:], end = get_audio_chunk_from_soundfile(sf, chunk_size, int_values)
+
+        audio_segment = AudioSegment(audio_signal, sf.samplerate, target_sr)
+        audio_features, features_length = get_speech_features(
+          audio_segment.samples, target_sr, speech_features_params)
+
+        yield audio_features, start, end
+
+        start = False
+        # shift the window left by one chunk for the next iteration
+        audio_signal[:-chunk_size] = audio_signal[chunk_size:]
+
+
+    # closed only when the generator is fully consumed
+    sf.close()
+
+
+if __name__ == '__main__':
+    # Command-line client for a Jasper ensemble served by TensorRT
+    # Inference Server: collects audio files (directly or via dataset
+    # manifests), sends them for transcription in batches and, when
+    # reference transcripts are available, reports word error rate.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-v', '--verbose', action="store_true", required=False,
+                        default=False, help='Enable verbose output')
+    parser.add_argument('--fixed_size', type=int, required=False,
+                        default=0,
+                        help="send fixed_size requests, pad or truncate")
+    parser.add_argument('--batch_size', type=int, required=False, default=1,
+                        help='batch size')
+    parser.add_argument('--model_platform', required=False,
+                        default='trt',
+                        help='Jasper model platform')
+    parser.add_argument('-u', '--url', type=str, required=False,
+                        default='localhost:8000',
+                        help='Inference server URL. Default is '
+                             'localhost:8000.')
+    parser.add_argument('-i', '--protocol', type=str, required=False,
+                        default='HTTP',
+                        help='Protocol (HTTP/gRPC) used to communicate with '
+                             'inference service. Default is HTTP.')
+    parser.add_argument('--audio_filename', type=str, required=False,
+                        default=None,
+                        help='Input audio filename')
+    parser.add_argument('--data_dir', type=str, required=False,
+                        default=None,
+                        help='data directory')
+    parser.add_argument('--manifest_filename', type=str, required=False,
+                        default=None,
+                        help='relative manifest paths to --data_dir directory.')
+
+    FLAGS = parser.parse_args()
+
+    protocol = ProtocolType.from_str(FLAGS.protocol)
+
+    valid_model_platforms = {"pyt","onnx", "trt"}
+
+    if FLAGS.model_platform not in valid_model_platforms:
+        raise ValueError("Invalid model_platform {}. Valid choices are {"
+                         "}".format(FLAGS.model_platform,
+            valid_model_platforms))
+
+    # must match the ensemble names under trtis/model_repo
+    model_name = "jasper-" + FLAGS.model_platform + "-ensemble"
+
+    speech_client = SpeechClient(
+        FLAGS.url, protocol, model_name, 1,
+        FLAGS.batch_size, model_platform=FLAGS.model_platform,
+        verbose=FLAGS.verbose, mode="synchronous",
+        from_features=False
+    )
+
+    # Collect input files: either a single file / directory of .wav files
+    # (--audio_filename) or one or more dataset manifests
+    # (--manifest_filename), in which case reference transcripts are also
+    # collected for WER computation.
+    # NOTE(review): if neither option is given, `filenames` stays empty
+    # and the batching loop below fails with a modulo-by-zero.
+    filenames = []
+    transcripts = []
+    if FLAGS.audio_filename is not None:
+        audio_file = os.path.join(FLAGS.data_dir, FLAGS.audio_filename)
+        if os.path.isdir(audio_file):
+            filenames = glob.glob(os.path.join(os.path.abspath(audio_file), "**", "*.wav"),
+                                recursive=True)
+        else:
+            filenames = [audio_file]
+    elif FLAGS.manifest_filename is not None:
+        filter_speed=1.0
+        data_dir=FLAGS.data_dir
+        # 28 symbols + CTC blank (last); must agree with the decoder
+        labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'", "<BLANK>"]
+        labels_map = dict([(labels[i], i) for i in range(len(labels))])
+        blank_index = len(labels)-1
+        table = None
+        import string
+        # translation table stripping punctuation (except "+" and "&")
+        # from reference transcripts before normalization
+        punctuation = string.punctuation
+        punctuation = punctuation.replace("+", "")
+        punctuation = punctuation.replace("&", "")
+        table = str.maketrans(punctuation, " " * len(punctuation))
+
+        import json
+        if "./trtis" not in sys.path:
+            sys.path.append("./")
+            sys.path.append("./trtis")
+        from speech_utils import normalize_string, parse_transcript
+        FLAGS.manifest_filename = FLAGS.manifest_filename.split(',')
+        for manifest in FLAGS.manifest_filename:
+            manifest=os.path.join(data_dir, manifest)
+            print(manifest)
+            with open(manifest, "r", encoding="utf-8") as fh:
+                a=json.load(fh)
+                for data in a:
+                    # each entry lists several speed-perturbed copies;
+                    # keep only the unperturbed (speed == 1.0) file
+                    files_and_speeds = data['files']
+                    audio_path = [x['fname'] for x in files_and_speeds if x['speed'] == filter_speed][0]
+                    filenames.append(os.path.join(data_dir, audio_path))
+                    transcript_text = data[
+                                'transcript']
+                    transcript_text = normalize_string(transcript_text, labels=labels, table=table)
+                    transcripts.append(transcript_text) #parse_transcript(transcript_text, labels_map, blank_index)) # convert to vocab indices
+
+
+
+    # Read the audio files
+    # Group requests in batches
+    audio_idx = 0
+    last_request = False
+    predictions = []
+    while not last_request:
+        batch_audio_samples = []
+        batch_filenames = []
+
+        for idx in range(FLAGS.batch_size):
+            filename = filenames[audio_idx]
+            print("Reading audio file: ", filename)
+            # NOTE(review): fixed_size is forwarded as the `duration`
+            # argument but then also used as a sample count in np.resize
+            # below -- confirm AudioSegment.from_file expects the same
+            # unit here.
+            audio = AudioSegment.from_file(
+                filename,
+                offset=0, duration=FLAGS.fixed_size).samples
+            if FLAGS.fixed_size:
+                audio = np.resize(audio, FLAGS.fixed_size)
+
+            # wrap around so the final batch is always full; this can
+            # re-send files from the start of the list as padding
+            audio_idx = (audio_idx + 1) % len(filenames)
+            if audio_idx == 0:
+                last_request = True
+
+            batch_audio_samples.append(audio)
+            batch_filenames.append(filename)
+
+        predictions += speech_client.recognize(
+            batch_audio_samples,
+            batch_filenames)
+
+    # With manifests we have references, so compute and print the WER
+    if transcripts:
+        predictions = [x for l in predictions for x in l ]
+        from metrics import word_error_rate
+        wer, scores, num_words = word_error_rate(predictions, transcripts)
+        print(wer)

+ 45 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-decoder/config.pbtxt

@@ -0,0 +1,45 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Greedy decoder stage (TorchScript): consumes per-frame class logits
+# produced by the acoustic model and emits the character-index transcript.
+default_model_filename: "jasper-decoder.pt"
+name: "jasper-decoder"
+platform: "pytorch_libtorch"
+
+# NOTE(review): sibling configs mark this value with a "#MAX_BATCH"
+# template placeholder; here 64 is hard-coded -- confirm intended.
+max_batch_size: 64
+input [
+  {
+    name: "CLASS_LOGITS__0"
+    data_type: TYPE_FP32
+    # (time, 29) -- 29 output symbols (28 characters + CTC blank)
+    dims: [ -1, 29 ]
+  }
+]
+output [
+  {
+    name: "CANDIDATE_TRANSCRIPT__0"
+    data_type: TYPE_INT32
+    dims: [ -1]
+  }
+]
+

+ 32 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-feature-extractor/config.pbtxt

@@ -0,0 +1,32 @@
+# Feature-extraction stage (TorchScript): converts a raw audio signal and
+# its valid-sample count into 64-band spectral features plus the number of
+# valid time steps.
+name: "jasper-feature-extractor"
+platform: "pytorch_libtorch"
+default_model_filename: "jasper-feature-extractor.pt"
+max_batch_size: 64
+
+input [ {
+  name: "AUDIO_SIGNAL__0"
+  data_type: TYPE_FP32
+  dims: [ -1 ]
+},
+{
+  name: "NUM_SAMPLES__1"
+  data_type: TYPE_INT32
+  dims: [ 1 ]
+  # drop the trailing [1] dim so the model receives a scalar per item
+  reshape { shape: [] }
+}
+]
+
+output [
+{
+  name: "AUDIO_FEATURES__0"
+  data_type: TYPE_FP32
+  dims: [64, -1]
+}
+,
+  {	
+    name: "NUM_TIME_STEPS__1"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [] }
+  }
+]

+ 60 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt

@@ -0,0 +1,60 @@
+# Ensemble pipeline (ONNX-on-CPU backend): feature extractor -> Jasper
+# acoustic model -> greedy decoder. Values suffixed "#MAX_BATCH" /
+# "#AUDIO_LENGTH" are template placeholders patched by deployment scripts.
+name: "jasper-onnx-cpu-ensemble"
+platform: "ensemble"
+max_batch_size: 64#MAX_BATCH
+input {
+  name: "AUDIO_SIGNAL"
+  data_type: TYPE_FP32
+  dims: -1#AUDIO_LENGTH
+}
+input {
+    name: "NUM_SAMPLES"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+}
+output {
+  name: "TRANSCRIPT"
+  data_type: TYPE_INT32
+  dims: [-1]
+}
+# data flow: (AUDIO_SIGNAL, NUM_SAMPLES) -> AUDIO_FEATURES
+#            -> CHARACTER_PROBABILITIES -> TRANSCRIPT
+ensemble_scheduling {
+  step {
+    model_name: "jasper-feature-extractor"
+    model_version: -1
+    input_map {
+      key: "AUDIO_SIGNAL__0"
+      value: "AUDIO_SIGNAL"
+    }
+    input_map {
+      key: "NUM_SAMPLES__1"
+      value: "NUM_SAMPLES"
+    }
+    output_map {
+      key: "AUDIO_FEATURES__0"
+      value: "AUDIO_FEATURES"
+    }
+  }
+  step {
+    model_name: "jasper-onnx-cpu"
+    model_version: -1
+    input_map {
+      key: "FEATURES"
+      value: "AUDIO_FEATURES"
+    }
+    output_map {
+      key: "LOGITS"
+      value: "CHARACTER_PROBABILITIES"
+    }
+  }
+  step {
+    model_name: "jasper-decoder"
+    model_version: -1
+    input_map {
+      key: "CLASS_LOGITS__0"
+      value: "CHARACTER_PROBABILITIES"
+    }
+    output_map {
+      key: "CANDIDATE_TRANSCRIPT__0"
+      value: "TRANSCRIPT"
+    }
+  }
+}

+ 53 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu/config.pbtxt

@@ -0,0 +1,53 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Jasper acoustic model served by ONNX Runtime on CPU. Values suffixed
+# "#MAX_BATCH" / "#NUM_ENGINES" are template placeholders substituted by
+# the deployment scripts.
+name: "jasper-onnx-cpu"
+platform: "onnxruntime_onnx"
+default_model_filename: "jasper.onnx"
+
+max_batch_size : 64#MAX_BATCH
+
+ input [
+   {
+     name: "FEATURES"
+     data_type: TYPE_FP32
+     dims: [ 64, -1 ]
+   }
+ ]
+ output [
+   {
+     name: "LOGITS"
+     data_type: TYPE_FP32
+     dims: [ -1, 29 ]
+   }
+  ]
+
+instance_group [
+ {
+   count: 1#NUM_ENGINES
+   kind: KIND_CPU
+  }
+]

+ 60 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-ensemble/config.pbtxt

@@ -0,0 +1,60 @@
+# Ensemble pipeline (ONNX-on-GPU backend): feature extractor -> Jasper
+# acoustic model -> greedy decoder. Values suffixed "#MAX_BATCH" /
+# "#AUDIO_LENGTH" are template placeholders patched by deployment scripts.
+name: "jasper-onnx-ensemble"
+platform: "ensemble"
+max_batch_size: 64#MAX_BATCH
+input {
+  name: "AUDIO_SIGNAL"
+  data_type: TYPE_FP32
+  dims: -1#AUDIO_LENGTH
+}
+input {
+    name: "NUM_SAMPLES"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+}
+output {
+  name: "TRANSCRIPT"
+  data_type: TYPE_INT32
+  dims: [-1]
+}
+# data flow: (AUDIO_SIGNAL, NUM_SAMPLES) -> AUDIO_FEATURES
+#            -> CHARACTER_PROBABILITIES -> TRANSCRIPT
+ensemble_scheduling {
+  step {
+    model_name: "jasper-feature-extractor"
+    model_version: -1
+    input_map {
+      key: "AUDIO_SIGNAL__0"
+      value: "AUDIO_SIGNAL"
+    }
+    input_map {
+      key: "NUM_SAMPLES__1"
+      value: "NUM_SAMPLES"
+    }
+    output_map {
+      key: "AUDIO_FEATURES__0"
+      value: "AUDIO_FEATURES"
+    }
+  }
+  step {
+    model_name: "jasper-onnx"
+    model_version: -1
+    input_map {
+      key: "FEATURES"
+      value: "AUDIO_FEATURES"
+    }
+    output_map {
+      key: "LOGITS"
+      value: "CHARACTER_PROBABILITIES"
+    }
+  }
+  step {
+    model_name: "jasper-decoder"
+    model_version: -1
+    input_map {
+      key: "CLASS_LOGITS__0"
+      value: "CHARACTER_PROBABILITIES"
+    }
+    output_map {
+      key: "CANDIDATE_TRANSCRIPT__0"
+      value: "TRANSCRIPT"
+    }
+  }
+}

+ 56 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx/config.pbtxt

@@ -0,0 +1,56 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Jasper acoustic model served by ONNX Runtime on GPU 0. Values suffixed
+# "#MAX_BATCH" / "#NUM_ENGINES" are template placeholders; lines prefixed
+# "#db#" are uncommented by scripts to enable dynamic batching.
+name: "jasper-onnx"
+platform: "onnxruntime_onnx"
+default_model_filename: "jasper.onnx"
+
+ max_batch_size : 64#MAX_BATCH
+ input [
+   {
+     name: "FEATURES"
+     data_type: TYPE_FP32
+     dims: [ 64, -1 ]
+   }
+ ]
+ output [
+   {
+     name: "LOGITS"
+     data_type: TYPE_FP32
+     dims: [ -1, 29 ]
+   }
+  ]
+
+instance_group {
+  count: 1#NUM_ENGINES
+  gpus: 0
+  kind: KIND_GPU
+}
+
+#db#dynamic_batching {
+#db#    preferred_batch_size: 64#MAX_BATCH
+#db#    max_queue_delay_microseconds: #MAX_QUEUE
+#db#}

+ 61 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt-ensemble/config.pbtxt

@@ -0,0 +1,61 @@
+# Ensemble pipeline (TorchScript backend): feature extractor -> Jasper
+# acoustic model -> greedy decoder. Values suffixed "#MAX_BATCH" /
+# "#AUDIO_LENGTH" are template placeholders patched by deployment scripts.
+name: "jasper-pyt-ensemble"
+platform: "ensemble"
+max_batch_size: 64#MAX_BATCH
+input {
+  name: "AUDIO_SIGNAL"
+  data_type: TYPE_FP32
+  dims: -1#AUDIO_LENGTH
+}
+input {
+    name: "NUM_SAMPLES"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+}
+output {
+  name: "TRANSCRIPT"
+  data_type: TYPE_INT32
+  dims: [-1]
+}
+
+# data flow: (AUDIO_SIGNAL, NUM_SAMPLES) -> AUDIO_FEATURES
+#            -> CHARACTER_PROBABILITIES -> TRANSCRIPT
+ensemble_scheduling {
+  step {
+    model_name: "jasper-feature-extractor"
+    model_version: -1
+    input_map {
+      key: "AUDIO_SIGNAL__0"
+      value: "AUDIO_SIGNAL"
+    }
+    input_map {
+      key: "NUM_SAMPLES__1"
+      value: "NUM_SAMPLES"
+    }
+    output_map {
+      key: "AUDIO_FEATURES__0"
+      value: "AUDIO_FEATURES"
+    }
+  }
+  step {
+    model_name: "jasper-pyt"
+    model_version: -1
+    input_map {
+      key: "AUDIO_FEATURES__0"
+      value: "AUDIO_FEATURES"
+    }
+    output_map {
+      key: "LOG_PROBS__0"
+      value: "CHARACTER_PROBABILITIES"
+    }
+  }
+  step {
+    model_name: "jasper-decoder"
+    model_version: -1
+    input_map {
+      key: "CLASS_LOGITS__0"
+      value: "CHARACTER_PROBABILITIES"
+    }
+    output_map {
+      key: "CANDIDATE_TRANSCRIPT__0"
+      value: "TRANSCRIPT"
+    }
+  }
+}

+ 31 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt/config.pbtxt

@@ -0,0 +1,31 @@
+# Jasper acoustic model served as a TorchScript (libtorch) module on
+# GPU 0. Values suffixed "#MAX_BATCH" / "#NUM_ENGINES" are template
+# placeholders; "#db#" lines are uncommented by scripts to enable
+# dynamic batching.
+name: "jasper-pyt"
+platform: "pytorch_libtorch"
+default_model_filename: "jasper.pt"
+
+max_batch_size: 64#MAX_BATCH
+input [
+  {
+    name: "AUDIO_FEATURES__0"
+    data_type: TYPE_FP32
+    dims: [64, -1]
+  }
+]
+
+output [
+  {
+    name: "LOG_PROBS__0"
+    data_type: TYPE_FP32
+    dims: [-1, 29]
+  }
+]
+
+instance_group {
+  count: 1#NUM_ENGINES
+  gpus: 0
+  kind: KIND_GPU
+}
+
+#db#dynamic_batching {
+#db#    preferred_batch_size: 64#MAX_BATCH
+#db#    max_queue_delay_microseconds: #MAX_QUEUE
+#db#}

+ 60 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt-ensemble/config.pbtxt

@@ -0,0 +1,60 @@
+# Ensemble pipeline (TensorRT backend): feature extractor -> Jasper
+# acoustic model -> greedy decoder. Values suffixed "#MAX_BATCH" /
+# "#AUDIO_LENGTH" are template placeholders patched by deployment scripts.
+name: "jasper-trt-ensemble"
+platform: "ensemble"
+max_batch_size: 64#MAX_BATCH
+input {
+  name: "AUDIO_SIGNAL"
+  data_type: TYPE_FP32
+  dims: -1#AUDIO_LENGTH
+}
+input {
+    name: "NUM_SAMPLES"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+}
+output {
+  name: "TRANSCRIPT"
+  data_type: TYPE_INT32
+  dims: [-1]
+}
+# data flow: (AUDIO_SIGNAL, NUM_SAMPLES) -> AUDIO_FEATURES
+#            -> CHARACTER_PROBABILITIES -> TRANSCRIPT
+ensemble_scheduling {
+ step {
+    model_name: "jasper-feature-extractor"
+    model_version: -1
+    input_map {
+      key: "AUDIO_SIGNAL__0"
+      value: "AUDIO_SIGNAL"
+    }
+    input_map {
+      key: "NUM_SAMPLES__1"
+      value: "NUM_SAMPLES"
+    }
+    output_map {
+      key: "AUDIO_FEATURES__0"
+      value: "AUDIO_FEATURES"
+    }
+  }
+  step {
+    model_name: "jasper-trt"
+    model_version: -1
+    input_map {
+      key: "FEATURES"
+      value: "AUDIO_FEATURES"
+    }
+    output_map {
+      key: "LOGITS"
+      value: "CHARACTER_PROBABILITIES"
+    }
+  }
+  step {
+    model_name: "jasper-decoder"
+    model_version: -1
+    input_map {
+      key: "CLASS_LOGITS__0"
+      value: "CHARACTER_PROBABILITIES"
+    }
+    output_map {
+      key: "CANDIDATE_TRANSCRIPT__0"
+      value: "TRANSCRIPT"
+    }
+  }
+}

+ 65 - 0
PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt/config.pbtxt

@@ -0,0 +1,65 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Jasper acoustic model served as a serialized TensorRT engine on GPU 0.
+# Values suffixed "#MAX_BATCH" / "#NUM_ENGINES" are template placeholders;
+# "#db#" lines are uncommented by scripts to enable dynamic batching.
+name: "jasper-trt"
+platform: "tensorrt_plan"
+default_model_filename: "jasper_70.plan"
+
+max_batch_size: 64#MAX_BATCH
+
+input [
+      {
+        name: "FEATURES"
+        data_type: TYPE_FP32
+        dims: [64, -1]
+      }
+]
+
+output [
+      {
+        name: "LOGITS"
+        data_type: TYPE_FP32
+        dims: [-1, 29 ]
+      }
+]
+
+# select the engine plan built for the GPU's CUDA compute capability
+# (7.0 = Volta, 7.5 = Turing)
+cc_model_filenames: [
+   { key: "7.0"
+     value: "jasper_70.plan"},
+   { key: "7.5"
+     value: "jasper_75.plan"}
+]
+
+instance_group {
+  count: 1#NUM_ENGINES
+  gpus: 0
+  kind: KIND_GPU
+}
+
+#db#dynamic_batching {
+#db#    preferred_batch_size: 64#MAX_BATCH
+#db#    max_queue_delay_microseconds: #MAX_QUEUE
+#db#}

+ 1 - 0
PyTorch/SpeechRecognition/Jasper/trtis/requirements.txt

@@ -0,0 +1 @@
+PyAudio

+ 8 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/build.sh

@@ -0,0 +1,8 @@
+#!/bin/bash
+# ensure the TRTIS submodule is added and build the clients
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/../../../
+docker pull nvcr.io/nvidia/tensorrtserver:19.09-py3
+git submodule update --init --recursive
+docker build -t tensorrtserver_client -f ${PROJECT_DIR}/external/tensorrt-inference-server/Dockerfile.client ${PROJECT_DIR}/external/tensorrt-inference-server
+docker build . --rm -f ${PROJECT_DIR}/trtis/Dockerfile -t jasper:trtis

+ 39 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/launch.sh

@@ -0,0 +1,39 @@
+#!/bin/bash
+# Launch TRT JASPER container.
+# Mounts the dataset/checkpoint/result directories (overridable via
+# DATA_DIR, CHECKPOINT_DIR, RESULT_DIR) plus the repo itself, then runs
+# "bash $PROGRAM_PATH" inside the jasper:trtis image (an interactive
+# shell when PROGRAM_PATH is unset).
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+JASPER_REPO=${JASPER_REPO:-"${SCRIPT_DIR}/../../.."}
+
+
+DATA_DIR=${DATA_DIR:-"/datasets"}
+CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/checkpoints"}
+RESULT_DIR=${RESULT_DIR:-"/results"}
+PROGRAM_PATH=${PROGRAM_PATH}
+
+
+# NOTE(review): since each variable defaults to a non-empty path above,
+# the -z guards below can only be skipped by explicitly exporting an
+# empty value (e.g. DATA_DIR="").
+MOUNTS=""
+if [ ! -z "$DATA_DIR" ];
+then
+    MOUNTS="$MOUNTS -v $DATA_DIR:/datasets "
+fi
+
+if [ ! -z "$CHECKPOINT_DIR" ];
+then
+    MOUNTS="$MOUNTS -v $CHECKPOINT_DIR:/checkpoints "
+fi
+
+if [ ! -z "$RESULT_DIR" ];
+then
+    MOUNTS="$MOUNTS -v $RESULT_DIR:/results "
+fi
+
+echo $MOUNTS
+docker run -it --rm \
+  --runtime=nvidia \
+  --shm-size=4g \
+  --ulimit memlock=-1 \
+  --ulimit stack=67108864 \
+  ${MOUNTS} \
+  -v ${JASPER_REPO}:/jasper \
+  ${EXTRA_JASPER_ENV} \
+  jasper:trtis bash $PROGRAM_PATH

+ 102 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/execute_all_perf_runs.sh

@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This will run all the necessary scripts to generate ALL needed output
+
+
+#### input arguments
+ARCH=${ARCH:-75}
+CHECKPOINT_DIR=${CHECKPOINT_DIR}
+RESULT_DIR=${RESULT_DIR}
+CHECKPOINT=${CHECKPOINT:-"jasper_fp16.pt"}
+MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE:-1792}
+####
+
+for dir in $CHECKPOINT_DIR $RESULT_DIR; do
+    if [[ $dir != /* ]]; then
+        echo "All directory paths must be absolute paths!"
+        echo "${dir} is not an absolute path"
+        exit 1
+    fi
+
+    if [ ! -d $dir ]; then
+        echo "All directory paths must exist!"
+        echo "${dir} does not exist"
+        exit 1
+    fi
+done
+
+REGENERATE_ENGINES=${REGENERATE_ENGINES:-"yes"}
+PRECISION_TESTS=${PRECISION_TESTS:-"fp16 fp32"}
+export GPU=${GPU:-}
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/../..
+MODEL_REPO=${MODEL_REPO:-"${PROJECT_DIR}/trtis/model_repo"}
+
+# We need to make sure TRTIS uses only one GPU, same as export does
+# for TRTIS
+export NVIDIA_VISIBLE_DEVICES=${GPU}
+
+export TRTIS_CLIENT_CONTAINER_TAG=tensorrtserver_client
+
+trap "exit" INT
+
+
+SCRIPT=${SCRIPT_DIR}/generate_perf_results.sh
+
+function run_for_length () {
+    TRTIS_CLIENT_CONTAINER_TAG=tensorrtserver_client AUDIO_LENGTH=$1 BATCH_SIZE=$2 RESULT_DIR=${RESULT_DIR} PRECISION=${PRECISION} ${SCRIPT} 
+}
+
+
+for PRECISION in ${PRECISION_TESTS};
+do
+
+    if [ "${REGENERATE_ENGINES}" == "yes" ]; then
+        echo "REGENERATE_ENGINES==yes, forcing re-export"
+    else	
+        if [ -f ${MODEL_REPO}/jasper-onnx/1/jasper.onnx ]; then
+            echo "Found ${MODEL_REPO}/jasper-onnx/1/jasper.onnx, skipping model export. Set REGENERATE_ENGINES=yes to force re-export"
+        else
+            REGENERATE_ENGINES=yes
+            echo "${MODEL_REPO}/jasper-onnx/1/jasper.onnx not found, forcing re-export"
+        fi
+    fi
+  
+    if [ "${REGENERATE_ENGINES}" == "yes" ]; then
+        ARCH=${ARCH} CHECKPOINT_DIR=${CHECKPOINT_DIR} CHECKPOINT=${CHECKPOINT} PRECISION=${PRECISION} MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} \
+        ${PROJECT_DIR}/trtis/scripts/export_model.sh || exit 1
+    fi
+  
+    for BATCH_SIZE in 1 2 4 8 16 32 64;
+    do
+        
+        # 7 Seconds
+        run_for_length 112000 $BATCH_SIZE
+
+        # 2 seconds
+        run_for_length 32000 $BATCH_SIZE
+
+        # 16.7 Seconds
+        run_for_length 267200 $BATCH_SIZE
+
+    done
+    # prepare for FP32 run
+    REGENERATE_ENGINES=yes
+done
+
+
+

+ 36 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#### input arguments
+ARCH=${ARCH:-75}
+CHECKPOINT_DIR=${CHECKPOINT_DIR:-/checkpoints}
+CHECKPOINT=${CHECKPOINT:-"jasper_fp16.pt"}
+PRECISION=${PRECISION:-fp16}
+MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE:-3600}
+GPU=${GPU:-0}
+####
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/../..
+if [ -f /.dockerenv ]; then # inside docker
+	CUDA_VISIBLE_DEVICES=${GPU} CHECKPOINT=${CHECKPOINT} CHECKPOINT_DIR=${CHECKPOINT_DIR} PRECISION=${PRECISION} ARCH=${ARCH} MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} ${PROJECT_DIR}/trtis/scripts/export_model_helper.sh || exit 1
+else
+	set -x
+	PROGRAM_PATH="/jasper/trtis/scripts/export_model_helper.sh"  \
+	EXTRA_JASPER_ENV="-e PRECISION=${PRECISION} -e CHECKPOINT=${CHECKPOINT} -e CHECKPOINT_DIR=/checkpoints -e ARCH=${ARCH} -e MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE} -e CUDA_VISIBLE_DEVICES=${GPU}" \
+	CHECKPOINT_DIR=${CHECKPOINT_DIR} DATA_DIR= RESULT_DIR= \
+	${PROJECT_DIR}/trtis/scripts/docker/launch.sh || exit 1
+	set +x
+fi

+ 102 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model_helper.sh

@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+#### input arguments
+CHECKPOINT=${CHECKPOINT}
+PRECISION=${PRECISION:-fp16}
+ARCH=${ARCH:-75}
+MODEL_REPO=${MODEL_REPO:-"${SCRIPT_DIR}/../model_repo"}
+JASPER_REPO=${JASPER_REPO:-"${SCRIPT_DIR}/../.."}
+MODEL_CONFIG=${MODEL_CONFIG:-"jasper10x5dr_nomask.toml"}
+CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/checkpoints"}
+MAX_SEQUENCE_LENGTH_FOR_ENGINE=${MAX_SEQUENCE_LENGTH_FOR_ENGINE}
+####
+
+
+
+export PYTHONPATH=${JASPER_REPO}
+
+echo "export_model.sh: Exporting TorchScript ... "
+
+mkdir -p ${MODEL_REPO}/jasper-trt/1/
+mkdir -p ${MODEL_REPO}/jasper-onnx/1/
+mkdir -p ${MODEL_REPO}/jasper-pyt/1/
+mkdir -p ${MODEL_REPO}/jasper-trt-ensemble/1/
+mkdir -p ${MODEL_REPO}/jasper-onnx-ensemble/1/
+mkdir -p ${MODEL_REPO}/jasper-pyt-ensemble/1/
+
+mkdir -p ${MODEL_REPO}/jasper-feature-extractor/1/
+mkdir -p ${MODEL_REPO}/jasper-decoder/1/
+
+PREC_FLAGS=""
+if [ "$PRECISION" == "fp16" ]
+then
+	PREC_FLAGS="--fp16 --pyt_fp16"
+fi
+
+python  ${JASPER_REPO}/inference.py \
+	--ckpt ${CHECKPOINT_DIR}/${CHECKPOINT} \
+	--wav=${JASPER_REPO}/notebooks/example1.wav  \
+	--model_toml=${JASPER_REPO}/configs/${MODEL_CONFIG} \
+	--export_model --output_dir ${PWD} ${PREC_FLAGS} ${ADDITIONAL_ARGS} || exit 1
+
+mv *_feat.pt ${MODEL_REPO}/jasper-feature-extractor/1/jasper-feature-extractor.pt
+mv *_acoustic.pt ${MODEL_REPO}/jasper-pyt/1/jasper.pt
+mv *_decoder.pt ${MODEL_REPO}/jasper-decoder/1/jasper-decoder.pt
+
+echo "TorchScript export succeeded."
+echo "export_model.sh: Exporting ONNX and TRT ... "
+
+# we need 2 separate export passes because OSS TRT ONNX parser currently chokes on hybrid ONNX
+echo "export_model.sh: Exporting TRT engine, CUDA ARCH = ${ARCH} ... "
+
+PREC_FLAGS=""
+if [ "$PRECISION" == "fp16" ]
+then
+	PREC_FLAGS="--trt_fp16"
+fi
+
+# remove targets first
+rm -f ${MODEL_REPO}/jasper-trt/1/jasper_${ARCH}.plan ${MODEL_REPO}/jasper-onnx/1/jasper.onnx
+
+python  ${JASPER_REPO}/trt/perf.py \
+	--ckpt_path ${CHECKPOINT_DIR}/${CHECKPOINT} \
+	--wav=${JASPER_REPO}/notebooks/example1.wav \
+	--model_toml=${JASPER_REPO}/configs/${MODEL_CONFIG} \
+	--make_onnx --onnx_path jasper-tmp.onnx --engine_path ${MODEL_REPO}/jasper-trt/1/jasper_${ARCH}.plan --seq_len=256 --max_seq_len ${MAX_SEQUENCE_LENGTH_FOR_ENGINE} --verbose --dynamic_shape ${PREC_FLAGS} || exit 1
+rm -fr jasper-tmp.onnx
+
+
+PREC_FLAGS=""
+if [ "$PRECISION" == "fp16" ]
+then
+	PREC_FLAGS="--trt_fp16 --pyt_fp16"
+fi
+python  ${JASPER_REPO}/trt/perf.py \
+	--ckpt_path ${CHECKPOINT_DIR}/${CHECKPOINT} \
+	--wav=${JASPER_REPO}/notebooks/example1.wav \
+	--model_toml=${JASPER_REPO}/configs/${MODEL_CONFIG} \
+	--make_onnx --onnx_path ${MODEL_REPO}/jasper-onnx/1/jasper.onnx --seq_len=256 --max_seq_len ${MAX_SEQUENCE_LENGTH_FOR_ENGINE} --verbose ${PREC_FLAGS} --dynamic_shape ${ADDITIONAL_TRT_ARGS} || exit 1
+
+mkdir -p ${MODEL_REPO}/jasper-onnx-cpu/1
+cp -f ${MODEL_REPO}/jasper-onnx/1/jasper.onnx ${MODEL_REPO}/jasper-onnx-cpu/1/jasper.onnx 
+
+if [ -f /.dockerenv ]; then # inside docker
+    # Make sure we do not leave read-only root-owned files
+    chmod -R a+wrX ${MODEL_REPO}/
+fi
+
+echo "export_model.sh: Exporting ONNX and TRT engines succeeded "

+ 125 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/generate_perf_results.sh

@@ -0,0 +1,125 @@
+#!/bin/bash
+
+#### input arguments
+RESULT_DIR=${RESULT_DIR} # used by perf_client to store results
+NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"0"}
+SERVER_HOSTNAME=${SERVER_HOSTNAME:-localhost}
+AUDIO_LENGTH=${AUDIO_LENGTH:-80000}
+BATCH_SIZE=${BATCH_SIZE:-16}
+####
+
+set -e
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+TRTIS_DIR=${SCRIPT_DIR}/..
+PROJECT_DIR=${TRTIS_DIR}/..
+
+GPU_TESTS=${GPU_TESTS:-"trt pyt onnx"}
+CPU_TESTS=${CPU_TESTS:-""}
+
+ENGINE_COUNT_TESTS=${ENGINE_COUNT_TESTS:-"1"}
+
+# Export the set variables in case they used the default
+export NVIDIA_VISIBLE_DEVICES
+export SERVER_HOSTNAME
+export AUDIO_LENGTH
+export MAX_LATENCY=2000 # Set max latency high to prevent errors
+TRTIS=${TRTIS:-jasper-trtis}
+max_queue_delays="10 5 2" #ms
+
+
+# Ensure that the server is closed when the script exits
+function cleanup_server {
+    current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+    logfile="/tmp/${TRTIS}-${current_time}.log"
+    echo "Shutting down ${TRTIS} container, log is in ${logfile}"
+    docker logs ${TRTIS} > ${logfile} 2>&1
+    docker stop ${TRTIS} > /dev/null 2>&1
+}
+
+trap cleanup_server EXIT
+
+trap "exit" INT
+
+function wait_for_trtis {
+    TIMEOUT=${1:-30}
+    timeout ${TIMEOUT} ${SCRIPT_DIR}/wait_for_trtis_server.sh || (echo '\nServer timeout!!!\n' && exit 1) 
+}
+
+function modify_ensemble {
+    
+    PLAT=$1
+    
+    REPO=${TRTIS_DIR}/deploy/model_repo
+    INPLACE="--in_place"
+            
+    CONF=${REPO}/jasper-${PLAT}/config.pbtxt
+    CONF_E=${REPO}/jasper-${PLAT}-ensemble/config.pbtxt
+    
+
+    echo "Modifying ${CONF} : batch size ${BATCH_SIZE} engines=${NUM_ENGINES} ..."
+    cleanup_server || true
+    sed -i -e "s/1#NUM_ENGINES/${NUM_ENGINES}/g" -e "s/64#MAX_BATCH/${BATCH_SIZE}/g" ${CONF}
+    if [ "$MAX_QUEUE" != "" ] ; then
+        sed -i -e "s/#db#//g" -e "s/#MAX_QUEUE/${MAX_QUEUE}/g" ${CONF}
+    fi
+
+
+    echo "Modifying ${CONF_E} for size $2, batch size ${BATCH_SIZE} ${TRTIS_DYN_BATCH_ARGS}.."
+    sed -i -e "s/-1#AUDIO_LENGTH/${AUDIO_LENGTH}/g" -e "s/64#MAX_BATCH/${BATCH_SIZE}/g" ${CONF_E} 
+
+    ${SCRIPT_DIR}/run_server.sh ${NORESTART}
+    
+    wait_for_trtis
+    
+    echo "done."
+}
+
+echo "GPU tests: ${GPU_TESTS}" 
+echo "CPU tests: ${CPU_TESTS}"
+echo "PRECISION: ${PRECISION}"
+
+for plat in ${GPU_TESTS} ${CPU_TESTS}; do
+    if [ "$plat" == "onnx-cpu" ]; then
+        export MAX_LATENCY=10000
+        export MEASUREMENT_WINDOW=15000
+    elif [ "$plat" == "none" ]; then
+	    continue
+    else
+        export MAX_LATENCY=2000
+        export MEASUREMENT_WINDOW=3000    
+    fi
+    
+
+    export BASE_SAVE_NAME="${plat}_${PRECISION}_${AUDIO_LENGTH}_BS${BATCH_SIZE}"
+    export MODEL_NAME=jasper-${plat}-ensemble
+
+    MODELS="jasper-${plat} jasper-${plat}-ensemble" ${SCRIPT_DIR}/prepare_model_repository.sh
+
+    if [ "$plat" == "onnx-cpu" ]; then
+        export MAX_LATENCY=10000
+        export MEASUREMENT_WINDOW=15000
+    fi    
+
+    # ############## Engine Count Comparison (static batching) ##############
+    for num_engines in ${ENGINE_COUNT_TESTS}; do
+	SAVE_RESULTS_DIR="${BASE_SAVE_NAME}/static/${num_engines}_engines"
+        NUM_ENGINES=${num_engines} BATCH_SIZE=${BATCH_SIZE} modify_ensemble ${plat} ${AUDIO_LENGTH}
+        echo "Running engines comparison, ${num_engines} engines..." 
+        MAX_CONCURRENCY=8 BATCH_SIZE=${BATCH_SIZE} ${SCRIPT_DIR}/run_perf_client.sh  ${SAVE_RESULTS_DIR} || echo '\nPerf Client Failure!!!\n'
+    done
+    
+    ############## Dynamic Batching Comparison ##############
+    for delay in ${max_queue_delays}; do
+	    echo "Running dynamic batching comparison, models=${MODELS}, delay ${delay}..."
+        TRTIS_DYN_BATCHING_DELAY=$((delay * 1000))
+	    SAVE_RESULTS_DIR="${BASE_SAVE_NAME}/batching/${TRTIS_DYN_BATCHING_DELAY}"
+        NUM_ENGINES=1 MAX_QUEUE=${TRTIS_DYN_BATCHING_DELAY} BATCH_SIZE=${BATCH_SIZE} modify_ensemble ${plat} ${AUDIO_LENGTH}
+        BATCH_SIZE=1 MAX_CONCURRENCY=$((BATCH_SIZE*2)) ${SCRIPT_DIR}/run_perf_client.sh ${SAVE_RESULTS_DIR} || echo '\nPerf Client Failure!!!\n'
+    done
+
+    
+    
+done
+
+echo "Complete!"

+ 45 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/prepare_model_repository.sh

@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Create folder deploy/model_repo that will be used by TRTIS
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/..
+DEPLOY_DIR=${PROJECT_DIR}/deploy
+HOST_REPO=${DEPLOY_DIR}/model_repo
+
+
+MODELS_TRT=${MODELS_TRT:-"jasper-trt jasper-trt-ensemble"}
+MODELS_PYT=${MODELS_PYT:-"jasper-pyt jasper-pyt-ensemble"}
+MODELS_ONNX=${MODELS_ONNX:-"jasper-onnx jasper-onnx-ensemble"}
+DECODERS="jasper-decoder"
+EXTRACTORS="jasper-feature-extractor"
+
+MODELS=${MODELS:-"${MODELS_ONNX} ${MODELS_TRT} ${MODELS_PYT}"} 
+
+# only link working models to install directory
+rm -fr ${HOST_REPO} && mkdir -p ${HOST_REPO}
+
+echo "Setting up model repo at ${HOST_REPO}, models: ${MODELS} ..."
+for m  in ${EXTRACTORS} ${DECODERS} ${MODELS}; do
+    mkdir -p ${HOST_REPO}/$m
+    cp ${PROJECT_DIR}/model_repo/$m/config.pbtxt ${HOST_REPO}/$m
+    ln -sf /model_repo/$m/1 ${HOST_REPO}/$m
+    if [ -f /.dockerenv ]; then # inside docker
+	    chmod -R a+w ${HOST_REPO}/$m
+    fi
+done
+

+ 52 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_client.sh

@@ -0,0 +1,52 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/../..
+
+
+MODEL_TYPE=${1:-"pyt"}
+DATA_DIR=${2} # folder with data
+FILE=${3} # json manifest file, OR single wav file
+
+
+
+JASPER_CONTAINER_TAG=${JASPER_CONTAINER_TAG:-jasper:trtis}
+
+if [ "$#" -ge 1 ] && [ "${FILE: -4}" == ".wav" ]; then 
+  CMD="python /jasper/trtis/jasper-client.py --data_dir /data --audio_filename ${FILE} --model_platform ${MODEL_TYPE}"
+  ARGS=""
+  ARGS="$ARGS -v $DATA_DIR:/data"
+elif [ "$#" -ge 1 ] && [ "${FILE: -4}" == "json" ]; then
+  ARGS=""
+  ARGS="$ARGS -v $DATA_DIR:/data"
+  CMD="python /jasper/trtis/jasper-client.py --manifest_filename ${FILE} --model_platform ${MODEL_TYPE} --data_dir /data"
+else
+  ARGS="-it"
+  CMD=""
+fi
+
+echo "========== STARTING ${JASPER_CONTAINER_TAG} =========="
+
+set -x
+nvidia-docker run --rm -it \
+   --net=host \
+   --shm-size=1g \
+   --ulimit memlock=-1 \
+   --ulimit stack=67108864 \
+   -v ${PROJECT_DIR}:/jasper \
+   --name=trtis-client \
+   ${ARGS} ${JASPER_CONTAINER_TAG} ${CMD}
+set +x

+ 81 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_perf_client.sh

@@ -0,0 +1,81 @@
+
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trap "exit" INT
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+PROJECT_DIR=${SCRIPT_DIR}/../..
+
+TRTIS_CLIENT_CONTAINER_TAG=${TRTIS_CLIENT_CONTAINER_TAG:-tensorrtserver_client}
+
+SERVER_HOSTNAME=${SERVER_HOSTNAME:-localhost}
+MODEL_NAME=${MODEL_NAME:-jasper-pyt-ensemble}
+MODEL_VERSION=${MODEL_VERSION:-1}
+BATCH_SIZE=${BATCH_SIZE:-1}
+RESULT_DIR=${RESULT_DIR:-${PROJECT_DIR}/results}
+MAX_LATENCY=${MAX_LATENCY:-500}
+MAX_CONCURRENCY=${MAX_CONCURRENCY:-64}
+MEASUREMENT_WINDOW=${MEASUREMENT_WINDOW:-3000}
+
+TIMESTAMP=$(date "+%y%m%d_%H%M")
+
+# RESULT_DIR_H is the path on the host, outside the container. Inside the container RESULT_DIR_H is always mounted to /results
+RESULT_DIR_H="${RESULT_DIR}/perf_client/${MODEL_NAME}"
+
+# Set the output folder using the first argument or pick a default
+if [ -z ${1+x} ]; then
+   RESULT_DIR_H=${RESULT_DIR_H}/batch_$BATCH_SIZE
+else
+   RESULT_DIR_H=${RESULT_DIR_H}/"$1"
+   shift
+fi
+
+# Make the directory if it doesn't exist
+if [ ! -d "${RESULT_DIR_H}" ]; then
+   mkdir -p "${RESULT_DIR_H}"
+fi
+
+echo "Saving output to ${RESULT_DIR_H}"
+
+LOGNAME="${RESULT_DIR_H}/log_${TIMESTAMP}.log"
+OUTPUT_FILE_CSV="results_${TIMESTAMP}.csv"
+
+ARGS="\
+   -m ${MODEL_NAME} \
+   -x ${MODEL_VERSION} \
+   -p ${MEASUREMENT_WINDOW} \
+   -v \
+   -i gRPC \
+   -u ${SERVER_HOSTNAME}:8001 \
+   -b ${BATCH_SIZE} \
+   -l ${MAX_LATENCY} \
+   --max-threads ${MAX_CONCURRENCY} "
+
+curl -s "http://${SERVER_HOSTNAME}:8000/api/status/${MODEL_NAME}" | grep ready_state | grep SERVER_READY || (echo "Model ${MODEL_NAME} is not ready, perf_client skipped..." && exit 1)
+
+echo "=== STARTING: perf client ${ARGS} --concurrency-range 1:4:1 ==="
+docker run  -e DISPLAY=${DISPLAY}  --runtime nvidia --rm \
+	      --privileged --net=host \
+	      -v ${RESULT_DIR_H}:/results --name jasper-perf-client \
+	      ${TRTIS_CLIENT_CONTAINER_TAG}  perf_client $ARGS -f /results/${OUTPUT_FILE_CSV}_p1 --concurrency-range 1:4:1 2>&1 | tee -a $LOGNAME
+
+echo "=== STARTING: perf client ${ARGS} --concurrency-range 8:${MAX_CONCURRENCY}:8 ==="
+docker run  -e DISPLAY=${DISPLAY}  --runtime nvidia --rm \
+	      --privileged --net=host \
+	      -v ${RESULT_DIR_H}:/results --name jasper-perf-client \
+	      ${TRTIS_CLIENT_CONTAINER_TAG}  perf_client $ARGS -f /results/${OUTPUT_FILE_CSV}_p2 --concurrency-range 8:${MAX_CONCURRENCY}:8 2>&1 | tee -a $LOGNAME
+
+cat ${RESULT_DIR_H}/${OUTPUT_FILE_CSV}_p1 ${RESULT_DIR_H}/${OUTPUT_FILE_CSV}_p2 > ${RESULT_DIR_H}/${OUTPUT_FILE_CSV}

+ 58 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_server.sh

@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR=$(cd $(dirname $0); pwd)
+TRTIS_DIR=${SCRIPT_DIR}/..
+
+DEPLOY_DIR=${DEPLOY_DIR:-${TRTIS_DIR}/deploy}
+MODELS_DIR=${MODEL_DIR:-"$DEPLOY_DIR/model_repo"}
+TRTIS_CONTAINER_TAG=${TRTIS_CONTAINER_TAG:-"nvcr.io/nvidia/tensorrtserver:19.09-py3"}
+NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"all"}
+TRTIS=${TRTIS:-jasper-trtis}
+
+# Ensure that the server is closed when the script exits
+function cleanup_server {
+    current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+    logfile="/tmp/${TRTIS}-${current_time}.log"
+    echo "Shutting down ${TRTIS} container, log is in ${logfile}"
+    docker logs ${TRTIS} > ${logfile} 2>&1
+    docker stop ${TRTIS} > /dev/null 2>&1
+}
+
+trap "exit" INT
+
+if [ "$(docker inspect -f "{{.State.Running}}" ${TRTIS})" = "true" ]; then
+    if [ "$1" == "norestart" ]; then
+       echo "${TRTIS} is already running ..."
+       exit 0
+    fi   
+    cleanup_server || true
+fi
+
+# To start  TRTIS container with alternative commandline, set CMD
+CMD=${CMD:-"/opt/tensorrtserver/bin/trtserver --log-verbose=100 --exit-on-error=false --strict-model-config=false --model-store=/models"}
+DAEMON=${DAEMON:-"-d"}
+RM=${RM:-"--rm"}
+
+set -x
+docker run -p 8000:8000 -p 8001:8001 -p 8002:8002 \
+       --runtime nvidia \
+       -e NVIDIA_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES} \
+       -v ${MODELS_DIR}:/models \
+       -v ${TRTIS_DIR}/model_repo:/model_repo \
+       --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864  \
+       ${DAEMON} ${RM} --name ${TRTIS} ${TRTIS_CONTAINER_TAG} \
+       ${CMD}
+set +x

+ 33 - 0
PyTorch/SpeechRecognition/Jasper/trtis/scripts/wait_for_trtis_server.sh

@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SERVER_URI=${1:-"localhost"}
+
+echo "Waiting for TRTIS Server to be ready at http://$SERVER_URI:8000..."
+
+live_command="curl -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/live"
+ready_command="curl -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/ready"
+
+current_status=$($live_command)
+
+# First check the current status. If that passes, check the json. If either fail, loop
+while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do
+
+   printf "."
+   sleep 1
+   current_status=$($live_command)
+done
+
+echo "TRTIS Server is ready!"

+ 472 - 0
PyTorch/SpeechRecognition/Jasper/trtis/speech_utils.py

@@ -0,0 +1,472 @@
+#!/usr/bin/python
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import soundfile as sf
+import math
+from os import system
+import numpy as np
+from tensorrtserver.api import *
+import tensorrtserver.api.model_config_pb2 as model_config
+import grpc
+from tensorrtserver.api import api_pb2
+from tensorrtserver.api import grpc_service_pb2
+from tensorrtserver.api import grpc_service_pb2_grpc
+if "./trtis" not in sys.path:
+    sys.path.append("./")
+    sys.path.append("./trtis")
+from parts.text import _clean_text
+
+WINDOWS_FNS = {"hanning": np.hanning, "hamming": np.hamming, "none": None}
+
+
+def model_dtype_to_np(model_dtype):
+    if model_dtype == model_config.TYPE_BOOL:
+        return np.bool
+    elif model_dtype == model_config.TYPE_INT8:
+        return np.int8
+    elif model_dtype == model_config.TYPE_INT16:
+        return np.int16
+    elif model_dtype == model_config.TYPE_INT32:
+        return np.int32
+    elif model_dtype == model_config.TYPE_INT64:
+        return np.int64
+    elif model_dtype == model_config.TYPE_UINT8:
+        return np.uint8
+    elif model_dtype == model_config.TYPE_UINT16:
+        return np.uint16
+    elif model_dtype == model_config.TYPE_UINT32:
+        return np.uint32
+    elif model_dtype == model_config.TYPE_FP16:
+        return np.float16
+    elif model_dtype == model_config.TYPE_FP32:
+        return np.float32
+    elif model_dtype == model_config.TYPE_FP64:
+        return np.float64
+    elif model_dtype == model_config.TYPE_STRING:
+        return np.dtype(object)
+    return None
+
+def load_transcript(transcript_path):
+    with open(transcript_path, 'r', encoding="utf-8") as transcript_file:
+        transcript = transcript_file.read().replace('\n', '')
+    return transcript
+
+def parse_transcript(transcript, labels_map, blank_index):
+    chars = [labels_map.get(x, blank_index) for x in list(transcript)]
+    transcript = list(filter(lambda x: x != blank_index, chars))
+    return transcript
+
+def normalize_string(s, labels, table, **unused_kwargs):
+    """
+    Normalizes string. For example:
+    'call me at 8:00 pm!' -> 'call me at eight zero pm'
+
+    Args:
+        s: string to normalize
+        labels: labels used during model training.
+
+    Returns:
+            Normalized string
+    """
+
+    def good_token(token, labels):
+        s = set(labels)
+        for t in token:
+            if not t in s:
+                return False
+        return True
+
+    try:
+        text = _clean_text(s, ["english_cleaners"], table).strip()
+        return ''.join([t for t in text if good_token(t, labels=labels)])
+    except:
+        print("WARNING: Normalizing {} failed".format(s))
+        return None
+
+def ctc_decoder_predictions_tensor(prediction_cpu_tensor, batch_size, labels):
+    """
+    Takes the output of the greedy CTC decoder and applies the CTC decoding
+    algorithm to remove duplicates and the blank symbol. Returns the prediction.
+    Args:
+        tensor: model output tensor
+        label: A list of labels
+    Returns:
+        prediction
+    """
+    blank_id = len(labels) - 1
+    hypotheses = []
+    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
+    # iterate over batch
+    prediction_cpu_tensor = prediction_cpu_tensor.reshape((batch_size, int(prediction_cpu_tensor.size/batch_size)))
+    for ind in range(batch_size):
+        prediction = prediction_cpu_tensor[ind].tolist()
+        # CTC decoding procedure
+        decoded_prediction = []
+        previous = len(labels) - 1 # id of a blank symbol
+        for p in prediction:
+            if (p != previous or previous == blank_id) and p != blank_id:
+                decoded_prediction.append(p)
+            previous = p
+        hypothesis = ''.join([labels_map[c] for c in decoded_prediction])
+        hypotheses.append(hypothesis)
+    return hypotheses
+
+class SpeechClient(object):
+
+    def __init__(self, url, protocol, model_name, model_version, batch_size,
+                 model_platform=None, verbose=False,
+                 mode="batch",
+                 from_features=True):
+
+        self.model_name = model_name
+        self.model_version = model_version
+        self.verbose = verbose
+        self.batch_size = batch_size
+        self.transpose_audio_features = False
+        self.grpc_stub = None
+        self.ctx = None
+        self.correlation_id = 0
+        self.first_run = True
+        if mode == "streaming" or mode == "asynchronous":
+            self.correlation_id = 1
+
+        self.buffer = []
+
+        self.ctx = InferContext(url, protocol, model_name, model_version,
+                                verbose, self.correlation_id, False)
+        server_ctx = ServerStatusContext(url, protocol, model_name,
+                                         verbose)
+        server_status = server_ctx.get_server_status()
+
+        self.audio_signals_name, self.num_samples_name, self.transcripts_name, \
+        self.audio_signals_type, self.num_samples_type, self.transcripts_type =  self.parse_model(server_status, model_name,
+                                                                                                  batch_size, model_platform, verbose)
+        self.labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'", "<BLANK>"]
+
+    def postprocess(self, results, labels):
+
+        if len(results) != 1:
+            raise Exception("expected 1 result, got {}".format(len(results)))
+
+        transcript_values = results['TRANSCRIPT']
+        res = []
+        for transcript, filename in zip(transcript_values,
+                                        labels):
+            print('---')
+            print('File: ', filename)
+            t=ctc_decoder_predictions_tensor(transcript, self.batch_size, self.labels)
+            print("Final transcript: ", t)
+            print('---')
+            res.append(t)
+        return res
+            
+
+    def check_num_samples(self, num_samples):
+        if num_samples.data_type != model_config.TYPE_UINT32 and num_samples.data_type != model_config.TYPE_INT32:
+             raise Exception(
+                    "expecting num_samples datatype to be TYPE_UINT32/TYPE_INT32, "
+                    "model '" + model_name + "' output type is " +
+                    model_config.DataType.Name(num_samples.data_type))
+        if len(num_samples.dims) != 1:
+            raise Exception("Expecting num_samples to have 1 dimension, "
+                            "model '{}' num_samples has {}".format(
+                                model_name,len(num_samples.dims)))
+
+    def parse_model(self, server_status,
+                    model_name, batch_size,
+                    model_platform=None, verbose=False):
+        """
+        Check the configuration of the ensemble model
+        """
+
+        if model_name not in server_status.model_status:
+            print("Server status:")
+            print(server_status)
+            raise Exception("unable to get status for '" + model_name + "'")
+
+        status = server_status.model_status[model_name]
+        config = status.config
+
+        self.model_platform = model_platform
+
+        # Inputs are:
+        #   1) audio_signal: raw audio samples [num_samples]
+        #   2) sample_rate: sample rate of audio
+        #   3) num_samples: length of audio
+
+        if len(config.input) < 2:
+            raise Exception(
+                "expecting 2-3 inputs, got {}".format(len(config.input)))
+
+        # Outputs are:
+        #   1) transcripts:        candidate transcripts
+
+        if len(config.output) != 1:
+            raise Exception(
+                "expecting 1 output, got {}".format(len(config.output)))
+
+        audio_signal = config.input[0]
+
+        if len(config.input) > 1:
+            num_samples = config.input[1]
+            self.check_num_samples(num_samples);
+            
+        transcripts = config.output[0]
+
+        expected_audio_signal_dim = 1
+        expected_audio_signal_type = model_config.TYPE_FP32
+
+        if audio_signal.data_type != expected_audio_signal_type:
+            raise Exception("expecting audio_signal datatype to be " +
+                            model_config.DataType.Name(
+                                expected_audio_signal_type) +
+                            "model '" + model_name + "' output type is " +
+                            model_config.DataType.Name(audio_signal.data_type))
+
+
+        # Model specifying maximum batch size of 0 indicates that batching
+        # is not supported and so the input tensors do not expect an "N"
+        # dimension (and 'batch_size' should be 1 so that only a single
+        # image instance is inferred at a time).
+        max_batch_size = config.max_batch_size
+        if max_batch_size == 0:
+            if batch_size != 1:
+                raise Exception(
+                    "batching not supported for model '" + model_name + "'")
+        else:  # max_batch_size > 0
+            if batch_size > max_batch_size:
+                raise Exception(
+                    "expecting batch size <= {} for model {}".format(
+                        max_batch_size, model_name))
+
+        if len(audio_signal.dims) != expected_audio_signal_dim:
+            raise Exception("Expecting audio signal to have {} dimensions, "
+                            "model '{}' audio_signal has {}".format(
+                expected_audio_signal_dim,
+                model_name,
+                len(audio_signal.dims)))
+
+        return (audio_signal.name, num_samples.name, transcripts.name, 
+                model_dtype_to_np(audio_signal.data_type),
+                model_dtype_to_np(num_samples.data_type),
+                model_dtype_to_np(transcripts.data_type),
+                )
+
+
    def update_audio_request(self, request, audio_generator):
        """Stream audio chunks into `request`, yielding it once per chunk.

        Mutates the given inference-request message in place for each chunk
        produced by `audio_generator` and yields it; after the final chunk it
        yields one extra all-zero request carrying the SEQUENCE_END flag.

        Args:
            request: inference request message mutated in place; input[0]
                is the audio signal, input[1] its sample count.
            audio_generator: yields (audio_signal, sample_rate, start, end)
                tuples, where start/end mark stream boundaries.
                NOTE(review): sample_rate is unpacked but never used here.

        Yields:
            The same `request` object, updated for each audio chunk.
        """
        for audio_signal, sample_rate, start, end in audio_generator:
            # Delete the current inputs

            input_batch = [audio_signal.astype(self.audio_signals_type)]
            num_samples_batch = audio_signal.shape[0]
            num_samples_batch = [np.asarray([num_samples_batch],
                                            dtype=self.num_samples_type)]


            flags = InferRequestHeader.FLAG_NONE
            # Add a leading batch dimension of 1 before serializing.
            input_batch[0] = np.expand_dims(input_batch[0], axis=0)

            audio_bytes = input_batch[0].tobytes()
            num_samples_bytes = num_samples_batch[0].tobytes()

            # Keep the request metadata consistent with the raw payload
            # sizes for this chunk.
            request.meta_data.input[0].dims[0] = audio_signal.shape[0]
            request.meta_data.input[0].batch_byte_size = len(audio_bytes)

            request.meta_data.input[1].dims[0] = 1
            request.meta_data.input[1].batch_byte_size = len(num_samples_bytes)

            # First chunk of a stream starts a new sequence on the server;
            # intermediate chunks carry no flags.
            if start:
                request.meta_data.flags = flags | \
                                          InferRequestHeader.FLAG_SEQUENCE_START
            else:
                request.meta_data.flags = flags;

            # Send request with audio signal
            del request.raw_input[:]
            request.raw_input.extend([audio_bytes])
            request.raw_input.extend([num_samples_bytes])
            yield request

            # If end, send empty request to flush out remaining audio
            if end:
                request.meta_data.flags = flags | \
                                          InferRequestHeader.FLAG_SEQUENCE_END
                # Zero-filled audio with the same shape/dtype as the last
                # chunk; the SEQUENCE_END flag is what matters here.
                zero_bytes = np.zeros(shape=input_batch[0].shape,
                                      dtype=input_batch[0].dtype).tobytes()
                del request.raw_input[:]
                request.raw_input.extend([zero_bytes])
                request.raw_input.extend([num_samples_bytes])
                yield request
+
+    def recognize(self, audio_signal, filenames):
+        # Send requests of FLAGS.batch_size audio signals. If the number of
+        # audios isn't an exact multiple of FLAGS.batch_size then just
+        # start over with the first audio until the batch is filled.
+
+        flags = InferRequestHeader.FLAG_NONE
+        flags = flags | InferRequestHeader.FLAG_SEQUENCE_START
+
+        input_batch = []
+        input_filenames = []
+        max_num_samples_batch = 0
+
+        for idx in range(self.batch_size):
+            input_batch.append(audio_signal[idx].astype(
+                self.audio_signals_type))
+            input_filenames.append(filenames[idx])
+            num_samples = audio_signal[idx].shape[0]
+
+            if (num_samples > max_num_samples_batch):
+                max_num_samples_batch = num_samples
+
+        for idx in range(self.batch_size):
+            num_samples = input_batch[idx].shape[0]
+            print("num_samples : ", num_samples)
+            #input_batch[idx] = np.pad(input_batch[idx],
+            #                          ((0,
+            #                            max_num_samples_batch -
+            #                            num_samples)),
+            #                          mode='constant')
+
+            mean = np.mean(input_batch[idx])
+            std_var = np.std(input_batch[idx])
+            gauss_noise = np.random.normal(
+                mean,std_var,
+                max_num_samples_batch-num_samples)
+
+            input_batch[idx]= np.concatenate(
+                (input_batch[idx], gauss_noise.astype(
+                    self.audio_signals_type)))
+
+        max_num_samples_batch = np.asarray([max_num_samples_batch],
+                                           dtype=self.num_samples_type)
+
+        num_samples_batch = [max_num_samples_batch] * self.batch_size
+
+        #print(num_samples_batch)
+        #print(input_batch)
+        #print(input_sample_rates)
+
+        # Send request
+        print("Sending request to transcribe file(s):", ",".join(
+            input_filenames))
+
+        if (self.model_platform == "obsolete_pyt"):
+            result = self.ctx.run(
+                {self.audio_signals_name: input_batch,
+                 self.num_samples_name: num_samples_batch},
+                {self.transcripts_name: InferContext.ResultFormat.RAW},
+                self.batch_size, flags)
+        else:
+            result = self.ctx.run(
+                {self.audio_signals_name: input_batch,
+                 self.num_samples_name: num_samples_batch},
+                {self.transcripts_name: InferContext.ResultFormat.RAW},
+                self.batch_size, flags)
+
+        res =self.postprocess(result, input_filenames)
+
+        return res
+ 
+
def preemphasis(signal, coeff=0.97):
    """Apply a pre-emphasis filter: y[0] = x[0], y[t] = x[t] - coeff*x[t-1]."""
    emphasized = signal[1:] - coeff * signal[:-1]
    return np.append(signal[0], emphasized)
+
+
def normalize_signal(signal, gain=None):
    """
    Normalize float32 signal to [-1, 1] range
    """
    if gain is None:
        peak = np.max(np.abs(signal)) + 1e-5
        gain = 1.0 / peak
    return signal * gain
+
+
+
class AudioSegment(object):
    """Monaural audio segment abstraction.
    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate, target_sr=16000, trim=False,
                 trim_db=60):
        """Create audio segment from samples.
        Samples are converted to float32 internally, with int scaled to
        [-1, 1].

        NOTE(review): target_sr, trim and trim_db are currently unused
        (no resampling or trimming happens here); kept for interface
        compatibility.
        """
        samples = self._convert_samples_to_float32(samples)
        self._samples = samples
        self._sample_rate = sample_rate
        # Downmix multi-channel audio to mono by averaging channels.
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

    @staticmethod
    def _convert_samples_to_float32(samples):
        """Convert sample type to float32.
        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        # np.sctypes was removed in NumPy 2.0; np.issubdtype performs the
        # same integer/float classification and works on all versions.
        if np.issubdtype(samples.dtype, np.integer):
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2 ** (bits - 1))
        elif np.issubdtype(samples.dtype, np.floating):
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    @classmethod
    def from_file(cls, filename, target_sr=16000, int_values=False, offset=0,
                  duration=0, trim=False):
        """
        Load a file supported by librosa and return as an AudioSegment.
        :param filename: path of file to load
        :param target_sr: the desired sample rate
        :param int_values: if true, load samples as 32-bit integers
        :param offset: offset in seconds when loading audio
        :param duration: duration in seconds when loading audio
        :return: numpy array of samples
        """
        with sf.SoundFile(filename, 'r') as f:
            dtype = 'int32' if int_values else 'float32'
            sample_rate = f.samplerate
            if offset > 0:
                f.seek(int(offset * sample_rate))
            if duration > 0:
                samples = f.read(int(duration * sample_rate), dtype=dtype)
            else:
                samples = f.read(dtype=dtype)

        # SoundFile returns [num_samples x num_channels]; transpose so
        # channels come first before the mono downmix in __init__.
        samples = samples.transpose()
        return cls(samples, sample_rate, target_sr=target_sr, trim=trim)

    @property
    def samples(self):
        # Return a copy so callers cannot mutate the internal buffer.
        return self._samples.copy()

    @property
    def sample_rate(self):
        return self._sample_rate
+
# define our clear function
def clear_screen():
    """Clear the terminal by shelling out to `clear` (POSIX terminals)."""
    system('clear')

+ 3 - 0
PyTorch/SpeechRecognition/Jasper/utils/download_utils.py

@@ -31,6 +31,9 @@ def download_file(url, dest_folder, fname, overwrite=False):
 
     tmp_fpath = fpath + '.tmp'
 
+    if not os.path.exists(os.path.dirname(tmp_fpath)):
+        os.makedirs(os.path.dirname(tmp_fpath))
+
     r = requests.get(url, stream=True)
     file_size = int(r.headers['Content-Length'])
     chunk_size = 1024 * 1024  # 1MB