Explorar el Código

updated Tacotron2: included trt, new dllogger

gkarch hace 6 años
padre
commit
2992264d3a
Se han modificado 42 ficheros con 1633 adiciones y 1108 borrados
  1. 2 1
      PyTorch/SpeechSynthesis/Tacotron2/Dockerfile
  2. 3 0
      PyTorch/SpeechSynthesis/Tacotron2/README.md
  3. 0 0
      PyTorch/SpeechSynthesis/Tacotron2/dllogger/__init__.py
  4. 0 61
      PyTorch/SpeechSynthesis/Tacotron2/dllogger/autologging.py
  5. 0 429
      PyTorch/SpeechSynthesis/Tacotron2/dllogger/logger.py
  6. 0 257
      PyTorch/SpeechSynthesis/Tacotron2/dllogger/tags.py
  7. 414 0
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py
  8. 1 3
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts.py
  9. 0 3
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts_config.py
  10. 47 8
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py
  11. 0 3
      PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_trt_config.py
  12. 23 47
      PyTorch/SpeechSynthesis/Tacotron2/inference.py
  13. 24 26
      PyTorch/SpeechSynthesis/Tacotron2/inference_perf.py
  14. 53 0
      PyTorch/SpeechSynthesis/Tacotron2/main.py
  15. 11 5
      PyTorch/SpeechSynthesis/Tacotron2/models.py
  16. 4 4
      PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md
  17. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_1GPU.sh
  18. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_4GPU.sh
  19. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_8GPU.sh
  20. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_1GPU.sh
  21. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_4GPU.sh
  22. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_8GPU.sh
  23. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_1GPU.sh
  24. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_4GPU.sh
  25. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_8GPU.sh
  26. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_1GPU.sh
  27. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_4GPU.sh
  28. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_8GPU.sh
  29. 4 5
      PyTorch/SpeechSynthesis/Tacotron2/run_latency_tests.sh
  30. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/scripts/train_tacotron2.sh
  31. 1 1
      PyTorch/SpeechSynthesis/Tacotron2/scripts/train_waveglow.sh
  32. 2 2
      PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py
  33. 45 135
      PyTorch/SpeechSynthesis/Tacotron2/test_infer.py
  34. 31 5
      PyTorch/SpeechSynthesis/Tacotron2/test_infer.sh
  35. 73 95
      PyTorch/SpeechSynthesis/Tacotron2/train.py
  36. 93 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/README.md
  37. 130 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/export_onnx2trt.py
  38. 368 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/inference_trt.py
  39. 4 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/run_latency_tests_trt.sh
  40. 181 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/test_infer_trt.py
  41. 98 0
      PyTorch/SpeechSynthesis/Tacotron2/trt/trt_utils.py
  42. 8 5
      PyTorch/SpeechSynthesis/Tacotron2/waveglow/model.py

+ 2 - 1
PyTorch/SpeechSynthesis/Tacotron2/Dockerfile

@@ -1,5 +1,6 @@
-FROM nvcr.io/nvidia/pytorch:19.10-py3
+FROM nvcr.io/nvidia/pytorch:19.11-py3
 
 ADD . /workspace/tacotron2
 WORKDIR /workspace/tacotron2
 RUN pip install -r requirements.txt
+RUN pip --no-cache-dir --no-cache install  'git+https://github.com/NVIDIA/dllogger'

+ 3 - 0
PyTorch/SpeechSynthesis/Tacotron2/README.md

@@ -704,6 +704,9 @@ November 2019
 * Implemented training resume from checkpoint
 * Added notebook for running Tacotron 2 and WaveGlow in TRTIS.
 
+December  2019
+* Added `trt` subfolder for running Tacotron 2 and WaveGlow in TensorRT.
+
 ### Known issues
 
 There are no known issues in this release.

+ 0 - 0
PyTorch/SpeechSynthesis/Tacotron2/dllogger/__init__.py


+ 0 - 61
PyTorch/SpeechSynthesis/Tacotron2/dllogger/autologging.py

@@ -1,61 +0,0 @@
-#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import subprocess
-import xml.etree.ElementTree as ET
-
-from dllogger.logger import LOGGER
-
-#TODO: print CUDA version, container version etc
-
-def log_hardware():
-    # TODO: asserts - what if you cannot launch those commands?
-    # number of CPU threads
-    cpu_info_command = 'cat /proc/cpuinfo'
-    cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
-    cpu_num = int(cpu_info[cpu_num_index]) + 1
-
-    # CPU name
-    cpu_name_begin_index = cpu_info.index(b'name')
-    cpu_name_end_index = cpu_info.index(b'stepping')
-    cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')
-
-    LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})
-
-    # RAM memory
-    ram_info_command = 'free -m -h'
-    ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    ram_index = ram_info.index(b'Mem:') + 1
-    ram = ram_info[ram_index].decode('utf-8')
-
-    LOGGER.log(key='mem_info', value={"ram": ram})
-
-    # GPU
-    nvidia_smi_command = 'nvidia-smi -q -x'
-    nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
-    nvidia_smi = ET.fromstring(nvidia_smi_output)
-    gpus = nvidia_smi.findall('gpu')
-    ver = nvidia_smi.findall('driver_version')
-
-    LOGGER.log(key="gpu_info",
-                 value={
-                      "driver_version": ver[0].text,
-                      "num": len(gpus),
-                      "name": [g.find('product_name').text for g in gpus],
-                      "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]})
-
-def log_args(args):
-    LOGGER.log(key='args', value=vars(args))

+ 0 - 429
PyTorch/SpeechSynthesis/Tacotron2/dllogger/logger.py

@@ -1,429 +0,0 @@
-# Copyright 2018 MLBenchmark Group. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import json
-import logging
-import os
-import inspect
-import sys
-import re
-from contextlib import contextmanager
-import functools
-from collections import OrderedDict
-
-NVLOGGER_NAME = 'nv_logger'
-NVLOGGER_VERSION = '0.2.2'
-NVLOGGER_TOKEN = ':::NVLOG'
-
-MLPERF_NAME = 'mlperf_logger'
-MLPERF_VERSION = '0.5.0'
-MLPERF_TOKEN = ':::MLP'
-
-DEFAULT_JSON_FILENAME = 'nvlog.json'
-
-RUN_SCOPE = 0
-EPOCH_SCOPE = 1
-TRAIN_ITER_SCOPE = 2
-
-_data = OrderedDict([
-    ('model', None),
-    ('epoch', -1),
-    ('iteration', -1),
-    ('total_iteration', -1),
-    ('metrics', OrderedDict()),
-    ('timed_blocks', OrderedDict()),
-    ('current_scope', RUN_SCOPE)
-    ])
-
-def get_caller(stack_index=2, root_dir=None):
-    caller = inspect.getframeinfo(inspect.stack()[stack_index][0])
-
-    # Trim the file names for readability.
-    filename = caller.filename
-    if root_dir is not None:
-        filename = re.sub("^" + root_dir + "/", "", filename)
-    return "%s:%d" % (filename, caller.lineno)
-
-class StandardMeter(object):
-
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        pass
-
-    def record(self, value):
-        self.value = value
-
-    def get_value(self):
-        return self.value
-
-    def get_last(self):
-        return self.value
-
-class AverageMeter(object):
-
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.n = 0
-        self.value = 0
-        self.last = 0
-
-    def record(self, value, n = 1):
-        self.last = value
-        self.n += n
-        self.value += value * n
-
-    def get_value(self):
-        return self.value / self.n
-
-    def get_last(self):
-        return self.last
-
-class JsonBackend(object):
-
-    def __init__(self, log_file=DEFAULT_JSON_FILENAME, logging_scope=TRAIN_ITER_SCOPE,
-            iteration_interval=1):
-        self.log_file = log_file
-        self.logging_scope = logging_scope
-        self.iteration_interval = iteration_interval
-
-        self.json_log = OrderedDict([
-            ('run', OrderedDict()),
-            ('epoch', OrderedDict()),
-            ('iter', OrderedDict()),
-            ('event', OrderedDict()),
-            ])
-        
-        self.json_log['epoch']['x'] = []
-        if self.logging_scope == TRAIN_ITER_SCOPE:
-            self.json_log['iter']['x'] = [[]]
-
-    def register_metric(self, key, metric_scope):
-        if (metric_scope == TRAIN_ITER_SCOPE and 
-                self.logging_scope == TRAIN_ITER_SCOPE):
-            if not key in self.json_log['iter'].keys():
-                self.json_log['iter'][key] = [[]]
-        if metric_scope == EPOCH_SCOPE:
-            if not key in self.json_log['epoch'].keys():
-                self.json_log['epoch'][key] = []
-
-    def log(self, key, value):
-        if _data['current_scope'] == RUN_SCOPE:
-            self.json_log['run'][key] = value
-        elif _data['current_scope'] == EPOCH_SCOPE: 
-            pass
-        elif _data['current_scope'] == TRAIN_ITER_SCOPE:
-            pass
-        else:
-            raise ValueError('log function for scope "', _data['current_scope'], 
-                    '" not implemented')
-    
-    def log_event(self, key, value):
-        if not key in self.json_log['event'].keys():
-            self.json_log['event'][key] = []
-        entry = OrderedDict()
-        entry['epoch'] = _data['epoch']
-        entry['iter'] = _data['iteration']
-        entry['timestamp'] = time.time()
-        if value:
-            entry['value'] = value
-        self.json_log['event'][key].append(entry)
-
-    def log_iteration_summary(self):
-        if (self.logging_scope == TRAIN_ITER_SCOPE and 
-                _data['total_iteration'] % self.iteration_interval == 0):
-            for key, m in _data['metrics'].items():
-                if m.metric_scope == TRAIN_ITER_SCOPE:
-                    self.json_log['iter'][key][-1].append(m.get_last())
-
-            # log x for iteration number
-            self.json_log['iter']['x'][-1].append(_data['iteration'])
-
-
-    def dump_json(self):
-        if self.log_file is None:
-            print(json.dumps(self.json_log, indent=4))
-        else:
-            with open(self.log_file, 'w') as f:
-                json.dump(self.json_log, fp=f, indent=4)
-
-    def log_epoch_summary(self):
-        for key, m in _data['metrics'].items():
-            if m.metric_scope == EPOCH_SCOPE:
-                self.json_log['epoch'][key].append(m.get_value())
-            elif (m.metric_scope == TRAIN_ITER_SCOPE and 
-                    self.logging_scope == TRAIN_ITER_SCOPE):
-                # create new sublists for each iter metric in the next epoch
-                self.json_log['iter'][key].append([])
-        
-        # log x for epoch number
-        self.json_log['epoch']['x'].append(_data['epoch'])
-
-        # create new sublist for iter's x in the next epoch
-        if self.logging_scope == TRAIN_ITER_SCOPE:
-            self.json_log['iter']['x'].append([])
-
-        self.dump_json()
-
-    def timed_block_start(self, name):
-        pass
-
-    def timed_block_stop(self, name):
-        pass
-
-    def finish(self):
-        self.dump_json()
-
-class _ParentStdOutBackend(object):
-
-    def __init__(self, name, token, version, log_file, logging_scope, iteration_interval):
-
-        self.root_dir = None
-        self.worker = [0]
-        self.prefix = ''
-
-        self.name = name
-        self.token = token
-        self.version = version
-        self.log_file = log_file
-        self.logging_scope = logging_scope
-        self.iteration_interval = iteration_interval
-
-        self.logger = logging.getLogger(self.name)
-        self.logger.setLevel(logging.DEBUG)
-        self.logger.handlers = []
-
-        if (self.log_file == None):
-            self.stream_handler = logging.StreamHandler(stream=sys.stdout)
-            self.stream_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.stream_handler)
-        else:
-            self.file_handler = logging.FileHandler(self.log_file, mode='w')
-            self.file_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.file_handler)
-
-    def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE):
-        pass
-
-    def log_epoch_summary(self):
-        pass
-
-    def log_iteration_summary(self):
-        pass
-
-    def log(self, key, value):
-        if _data['current_scope'] > self.logging_scope:
-            pass
-        elif (_data['current_scope'] == TRAIN_ITER_SCOPE and 
-                _data['total_iteration'] % self.iteration_interval != 0):
-            pass
-        else:
-            self.log_stdout(key, value)
-
-    def log_event(self, key, value):
-        self.log_stdout(key, value)
-        
-    def log_stdout(self, key, value=None, forced=False):
-        # TODO: worker 0 
-        # only the 0-worker will log
-        #if not forced and self.worker != 0:
-        #    pass
-
-        if value is None:
-            msg = key
-        else:
-            str_json = json.dumps(value)
-            msg = '{key}: {value}'.format(key=key, value=str_json)
-
-        call_site = get_caller(root_dir=self.root_dir)
-        now = time.time()
-
-        message = '{prefix}{token}v{ver} {model} {secs:.9f} ({call_site}) {msg}'.format(
-            prefix=self.prefix, token=self.token, ver=self.version, secs=now, 
-            model=_data['model'],
-            call_site=call_site, msg=msg)
-
-        self.logger.debug(message)
-
-    def timed_block_start(self, name):
-        self.log_stdout(key=name + "_start")
-
-    def timed_block_stop(self, name):
-        self.log_stdout(key=name + "_stop")
-
-    def finish(self):
-        pass
-
-class StdOutBackend(_ParentStdOutBackend):
-
-    def __init__(self, log_file=None, logging_scope=EPOCH_SCOPE, iteration_interval=1):
-        _ParentStdOutBackend.__init__(self, name=NVLOGGER_NAME, token=NVLOGGER_TOKEN, 
-                version=NVLOGGER_VERSION, log_file=log_file, logging_scope=logging_scope, 
-                iteration_interval=iteration_interval)
-        
-class MLPerfBackend(_ParentStdOutBackend):
-
-    def __init__(self, log_file=None, logging_scope=TRAIN_ITER_SCOPE, iteration_interval=1):
-        _ParentStdOutBackend.__init__(self, name=MLPERF_NAME, token=MLPERF_TOKEN, 
-                version=MLPERF_VERSION, log_file=log_file, logging_scope=logging_scope, 
-                iteration_interval=iteration_interval)
-
-class _Logger(object):
-    def __init__(self):
-
-        self.backends = [
-                StdOutBackend(),
-                JsonBackend()
-                ]
-   
-    def set_model_name(self, name):
-        _data['model'] = name
-
-
-    def set_backends(self, backends):
-        self.backends = backends
-
-        
-    def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE):
-        if meter == None:
-            meter = StandardMeter()
-        #TODO: move to argument of Meter?
-        meter.metric_scope = metric_scope
-        _data['metrics'][key] = meter
-        for b in self.backends:
-            b.register_metric(key, metric_scope)
-
-    def log(self, key, value=None, forced=False):
-        if _data['current_scope'] == TRAIN_ITER_SCOPE or _data['current_scope'] == EPOCH_SCOPE:
-            if key in _data['metrics'].keys():
-                if _data['metrics'][key].metric_scope == _data['current_scope']:
-                    _data['metrics'][key].record(value)
-        for b in self.backends:
-            b.log(key, value)
-
-    def log_event(self, key, value=None):
-        for b in self.backends:
-            b.log_event(key, value)
-    
-    def timed_block_start(self, name):
-        if not name in _data['timed_blocks']:
-            _data['timed_blocks'][name] = OrderedDict()
-        _data['timed_blocks'][name]['start'] = time.time()
-        for b in self.backends:
-            b.timed_block_start(name)
-    
-    def timed_block_stop(self, name):
-        if not name in _data['timed_blocks']:
-            raise ValueError('timed_block_stop called before timed_block_start for ' + name)
-        _data['timed_blocks'][name]['stop'] = time.time()
-        delta = _data['timed_blocks'][name]['stop'] - _data['timed_blocks'][name]['start']
-        self.log(name + '_time', delta)
-        for b in self.backends:
-            b.timed_block_stop(name)
-
-    def iteration_start(self):
-        _data['current_scope'] = TRAIN_ITER_SCOPE
-        _data['iteration'] += 1
-        _data['total_iteration'] += 1
-
-
-    def iteration_stop(self):
-        for b in self.backends:
-            b.log_iteration_summary()
-        _data['current_scope'] = EPOCH_SCOPE
-
-    def epoch_start(self):
-        _data['current_scope'] = EPOCH_SCOPE 
-        _data['epoch'] += 1
-        _data['iteration'] = -1
-
-        for n, m in _data['metrics'].items():
-            if m.metric_scope == TRAIN_ITER_SCOPE:
-                m.reset()
-
-    def epoch_stop(self):
-        for b in self.backends:
-            b.log_epoch_summary()
-        _data['current_scope'] = RUN_SCOPE
-
-    def finish(self):
-        for b in self.backends:
-            b.finish()
-
-    def iteration_generator_wrapper(self, gen):
-        for g in gen:
-            self.iteration_start()
-            yield g
-            self.iteration_stop()
-
-    def epoch_generator_wrapper(self, gen):
-        for g in gen:
-            self.epoch_start()
-            yield g
-            self.epoch_stop()
-
-LOGGER = _Logger()
-
-@contextmanager
-def timed_block(prefix, value=None, logger=LOGGER, forced=False):
-    """ This function helps with timed blocks
-        ----
-        Parameters:
-        prefix - one of items from TIMED_BLOCKS; the action to be timed
-        logger - NVLogger object
-        forced - if True then the events are always logged (even if it should be skipped)
-    """
-    if logger is None:
-        pass
-    logger.timed_block_start(prefix)
-    yield logger
-    logger.timed_block_stop(prefix)
-
-def timed_function(prefix, variable=None, forced=False):
-    """ This decorator helps with timed functions
-        ----
-        Parameters:
-        prefix - one of items from TIME_BLOCK; the action to be timed
-        logger - NVLogger object
-        forced - if True then the events are always logged (even if it should be skipped)
-    """
-    def timed_function_decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            logger = kwargs.get('logger', LOGGER)
-            value = kwargs.get(variable, next(iter(args), None))
-            with timed_block(prefix=prefix, logger=logger, value=value, forced=forced):
-                    func(*args, **kwargs)
-        return wrapper
-    return timed_function_decorator
-

+ 0 - 257
PyTorch/SpeechSynthesis/Tacotron2/dllogger/tags.py

@@ -1,257 +0,0 @@
-# Copyright 2018 MLBenchmark Group. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Common values reported
-
-VALUE_EPOCH = "epoch"
-VALUE_ITERATION = "iteration"
-VALUE_ACCURACY = "accuracy"
-VALUE_BLEU = "bleu"
-VALUE_TOP1 = "top1"
-VALUE_TOP5 = "top5"
-VALUE_BBOX_MAP = "bbox_map"
-VALUE_MASK_MAP = "mask_map"
-VALUE_BCE = "binary_cross_entropy"
-
-
-# Timed blocks (used with timed_function & timed_block
-# For each there should be *_start and *_stop tags defined
-
-RUN_BLOCK = "run"
-SETUP_BLOCK = "setup"
-PREPROC_BLOCK = "preproc"
-
-TRAIN_BLOCK = "train"
-TRAIN_PREPROC_BLOCK = "train_preproc"
-TRAIN_EPOCH_BLOCK = "train_epoch"
-TRAIN_EPOCH_PREPROC_BLOCK = "train_epoch_preproc"
-TRAIN_CHECKPOINT_BLOCK = "train_checkpoint"
-TRAIN_ITER_BLOCK = "train_iteration"
-
-EVAL_BLOCK = "eval"
-EVAL_ITER_BLOCK = "eval_iteration"
-
-#TODO: to remove?
-TIMED_BLOCKS = {
-    RUN_BLOCK,
-    SETUP_BLOCK,
-    PREPROC_BLOCK,
-    TRAIN_BLOCK,
-    TRAIN_PREPROC_BLOCK,
-    TRAIN_EPOCH_BLOCK,
-    TRAIN_EPOCH_PREPROC_BLOCK,
-    TRAIN_CHECKPOINT_BLOCK,
-    TRAIN_ITER_BLOCK,
-    EVAL_BLOCK,
-    EVAL_ITER_BLOCK,
-}
-
-
-# Events
-
-RUN_INIT = "run_init"
-
-SETUP_START = "setup_start"
-SETUP_STOP = "setup_stop"
-
-PREPROC_START = "preproc_start"
-PREPROC_STOP = "preproc_stop"
-
-RUN_START = "run_start"
-RUN_STOP = "run_stop"
-RUN_FINAL = "run_final"
-
-TRAIN_CHECKPOINT_START = "train_checkpoint_start"
-TRAIN_CHECKPOINT_STOP = "train_checkpoint_stop"
-
-TRAIN_PREPROC_START = "train_preproc_start"
-TRAIN_PREPROC_STOP = "train_preproc_stop"
-
-TRAIN_EPOCH_PREPROC_START = "train_epoch_preproc_start"
-TRAIN_EPOCH_PREPROC_STOP = "train_epoch_preproc_stop"
-
-TRAIN_ITER_START = "train_iter_start"
-TRAIN_ITER_STOP = "train_iter_stop"
-
-TRAIN_EPOCH_START = "train_epoch_start"
-TRAIN_EPOCH_STOP = "train_epoch_stop"
-
-
-# MLPerf specific tags
-
-RUN_CLEAR_CACHES = "run_clear_caches"
-
-PREPROC_NUM_TRAIN_EXAMPLES = "preproc_num_train_examples"
-PREPROC_NUM_EVAL_EXAMPLES = "preproc_num_eval_examples"
-PREPROC_TOKENIZE_TRAINING = "preproc_tokenize_training"
-PREPROC_TOKENIZE_EVAL = "preproc_tokenize_eval"
-PREPROC_VOCAB_SIZE = "preproc_vocab_size"
-
-RUN_SET_RANDOM_SEED = "run_set_random_seed"
-
-INPUT_SIZE = "input_size"
-INPUT_BATCH_SIZE = "input_batch_size"
-INPUT_ORDER = "input_order"
-INPUT_SHARD = "input_shard"
-INPUT_BN_SPAN = "input_bn_span"
-
-INPUT_CENTRAL_CROP = "input_central_crop"
-INPUT_CROP_USES_BBOXES = "input_crop_uses_bboxes"
-INPUT_DISTORTED_CROP_MIN_OBJ_COV = "input_distorted_crop_min_object_covered"
-INPUT_DISTORTED_CROP_RATIO_RANGE = "input_distorted_crop_aspect_ratio_range"
-INPUT_DISTORTED_CROP_AREA_RANGE = "input_distorted_crop_area_range"
-INPUT_DISTORTED_CROP_MAX_ATTEMPTS = "input_distorted_crop_max_attempts"
-INPUT_MEAN_SUBTRACTION = "input_mean_subtraction"
-INPUT_RANDOM_FLIP = "input_random_flip"
-
-INPUT_RESIZE = "input_resize"
-INPUT_RESIZE_ASPECT_PRESERVING = "input_resize_aspect_preserving"
-
-
-# Opt
-
-OPT_NAME = "opt_name"
-
-OPT_LR = "opt_learning_rate"
-OPT_MOMENTUM = "opt_momentum"
-
-OPT_WEIGHT_DECAY = "opt_weight_decay"
-
-OPT_HP_ADAM_BETA1 = "opt_hp_Adam_beta1"
-OPT_HP_ADAM_BETA2 = "opt_hp_Adam_beta2"
-OPT_HP_ADAM_EPSILON = "opt_hp_Adam_epsilon"
-
-OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps"
-
-
-#  Train
-
-TRAIN_LOOP = "train_loop"
-TRAIN_EPOCH = "train_epoch"
-TRAIN_CHECKPOINT = "train_checkpoint"
-TRAIN_LOSS = "train_loss"
-TRAIN_ITERATION_LOSS = "train_iteration_loss"
-
-
-# Eval
-
-EVAL_START = "eval_start"
-EVAL_SIZE = "eval_size"
-EVAL_TARGET = "eval_target"
-EVAL_ACCURACY = "eval_accuracy"
-EVAL_STOP = "eval_stop"
-
-
-# Perf
-
-PERF_IT_PER_SEC = "perf_it_per_sec"
-PERF_TIME_TO_TRAIN = "time_to_train"
-
-EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"
-
-
-# Model
-
-MODEL_HP_LOSS_FN = "model_hp_loss_fn"
-
-MODEL_HP_INITIAL_SHAPE = "model_hp_initial_shape"
-MODEL_HP_FINAL_SHAPE = "model_hp_final_shape"
-
-MODEL_L2_REGULARIZATION = "model_l2_regularization"
-MODEL_EXCLUDE_BN_FROM_L2 = "model_exclude_bn_from_l2"
-
-MODEL_HP_RELU = "model_hp_relu"
-MODEL_HP_CONV2D_FIXED_PADDING = "model_hp_conv2d_fixed_padding"
-MODEL_HP_BATCH_NORM = "model_hp_batch_norm"
-MODEL_HP_DENSE = "model_hp_dense"
-
-
-# GNMT specific
-
-MODEL_HP_LOSS_SMOOTHING = "model_hp_loss_smoothing"
-MODEL_HP_NUM_LAYERS = "model_hp_num_layers"
-MODEL_HP_HIDDEN_SIZE = "model_hp_hidden_size"
-MODEL_HP_DROPOUT = "model_hp_dropout"
-
-EVAL_HP_BEAM_SIZE = "eval_hp_beam_size"
-TRAIN_HP_MAX_SEQ_LEN = "train_hp_max_sequence_length"
-EVAL_HP_MAX_SEQ_LEN = "eval_hp_max_sequence_length"
-EVAL_HP_LEN_NORM_CONST = "eval_hp_length_normalization_constant"
-EVAL_HP_LEN_NORM_FACTOR = "eval_hp_length_normalization_factor"
-EVAL_HP_COV_PENALTY_FACTOR = "eval_hp_coverage_penalty_factor"
-
-
-# NCF specific
-
-PREPROC_HP_MIN_RATINGS = "preproc_hp_min_ratings"
-PREPROC_HP_NUM_EVAL = "preproc_hp_num_eval"
-PREPROC_HP_SAMPLE_EVAL_REPLACEMENT = "preproc_hp_sample_eval_replacement"
-
-INPUT_HP_NUM_NEG = "input_hp_num_neg"
-INPUT_HP_SAMPLE_TRAIN_REPLACEMENT = "input_hp_sample_train_replacement"
-INPUT_STEP_TRAIN_NEG_GEN = "input_step_train_neg_gen"
-INPUT_STEP_EVAL_NEG_GEN = "input_step_eval_neg_gen"
-
-EVAL_HP_NUM_USERS = "eval_hp_num_users"
-EVAL_HP_NUM_NEG = "eval_hp_num_neg"
-
-MODEL_HP_MF_DIM = "model_hp_mf_dim"
-MODEL_HP_MLP_LAYER_SIZES = "model_hp_mlp_layer_sizes"
-
-
-# RESNET specific
-
-EVAL_EPOCH_OFFSET = "eval_offset"
-
-MODEL_HP_INITIAL_MAX_POOL = "model_hp_initial_max_pool"
-MODEL_HP_BEGIN_BLOCK = "model_hp_begin_block"
-MODEL_HP_END_BLOCK = "model_hp_end_block"
-MODEL_HP_BLOCK_TYPE = "model_hp_block_type"
-MODEL_HP_PROJECTION_SHORTCUT = "model_hp_projection_shortcut"
-MODEL_HP_SHORTCUT_ADD = "model_hp_shorcut_add"
-MODEL_HP_RESNET_TOPOLOGY = "model_hp_resnet_topology"
-
-
-# Transformer specific
-
-INPUT_MAX_LENGTH = "input_max_length"
-
-MODEL_HP_INITIALIZER_GAIN = "model_hp_initializer_gain"
-MODEL_HP_VOCAB_SIZE = "model_hp_vocab_size"
-MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_hidden_layers"
-MODEL_HP_EMBEDDING_SHARED_WEIGHTS = "model_hp_embedding_shared_weights"
-MODEL_HP_ATTENTION_DENSE = "model_hp_attention_dense"
-MODEL_HP_ATTENTION_DROPOUT = "model_hp_attention_dropout"
-MODEL_HP_FFN_OUTPUT_DENSE = "model_hp_ffn_output_dense"
-MODEL_HP_FFN_FILTER_DENSE = "model_hp_ffn_filter_dense"
-MODEL_HP_RELU_DROPOUT = "model_hp_relu_dropout"
-MODEL_HP_LAYER_POSTPROCESS_DROPOUT = "model_hp_layer_postprocess_dropout"
-MODEL_HP_NORM = "model_hp_norm"
-MODEL_HP_SEQ_BEAM_SEARCH = "model_hp_sequence_beam_search"
-

+ 414 - 0
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py

@@ -0,0 +1,414 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+import argparse
+
+import sys
+sys.path.append('./')
+
+import models
+from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, prepare_input_sequence
+from common.utils import to_gpu, get_mask_from_lengths
+
def parse_args(parser):
    """Register the Tacotron 2 ONNX-export command line options.

    Args:
        parser: an argparse.ArgumentParser to extend in place.

    Returns:
        The same parser instance, for chaining.
    """
    arguments = (
        (('--tacotron2',),
         dict(type=str,
              help='full path to the Tacotron2 model checkpoint file')),
        (('-o', '--output'),
         dict(type=str, required=True,
              help='Directory for the exported Tacotron 2 ONNX model')),
    )
    for flags, options in arguments:
        parser.add_argument(*flags, **options)
    return parser
+
+
def encoder_infer(self, x, input_lengths):
    """ONNX-friendly inference pass for the Tacotron 2 encoder.

    Bound onto the encoder module by the Encoder wrapper below and called
    as ``self.infer(encoder, x, input_lengths)``.

    Args:
        self: the Tacotron 2 encoder module (provides .convolutions, .lstm).
        x: embedded input sequence, shape (batch, embedding_dim, text_len).
        input_lengths: per-sample sequence lengths, 1-D tensor on GPU.

    Returns:
        (outputs, lens): padded LSTM outputs (batch, text_len, encoder_dim)
        and a lengths tensor.
    """
    device = x.device
    for conv in self.convolutions:
        # training=False, so this F.dropout is a no-op; kept so the traced
        # graph matches the original encoder structure.
        x = F.dropout(F.relu(conv(x.to(device))), 0.5, False)

    x = x.transpose(1, 2)

    # pack_padded_sequence needs lengths on the CPU; the copy is kept out
    # of the exported graph.
    input_lengths_cpu = input_lengths[:] # TODO
    input_lengths_cpu = input_lengths_cpu.cpu().numpy() # TODO
    x = nn.utils.rnn.pack_padded_sequence(
        x, input_lengths_cpu, batch_first=True)

    outputs, _ = self.lstm(x)

    outputs, _ = nn.utils.rnn.pad_packed_sequence(
        outputs, batch_first=True)

    # NOTE(review): lengths are doubled here before being returned as the
    # "lens" ONNX output — presumably compensated for by the downstream
    # TRT/ONNX consumer; confirm against trt/inference_trt.py.
    lens = input_lengths*2

    return outputs, lens
+
+
class Encoder(torch.nn.Module):
    """ONNX-exportable wrapper around the Tacotron 2 encoder.

    Bundles the embedding lookup, the convolutional/LSTM encoder and the
    attention memory projection into one exportable graph producing
    (memory, processed_memory, lens).
    """

    def __init__(self, tacotron2):
        super(Encoder, self).__init__()
        self.tacotron2 = tacotron2
        # Coalesce LSTM weights once so cuDNN does not warn at runtime.
        self.tacotron2.encoder.lstm.flatten_parameters()
        # Use the ONNX-friendly inference routine defined above.
        self.infer = encoder_infer

    def forward(self, sequence, sequence_lengths):
        embedded = self.tacotron2.embedding(sequence)
        embedded = embedded.transpose(1, 2)
        memory, lens = self.infer(self.tacotron2.encoder,
                                  embedded, sequence_lengths)
        # Pre-compute the attention memory projection so the decoder loop
        # does not have to redo it every step.
        attention = self.tacotron2.decoder.attention_layer
        processed_memory = attention.memory_layer(memory)
        return memory, processed_memory, lens
+
class Postnet(torch.nn.Module):
    """Applies the Tacotron 2 postnet as a residual refinement.

    Exported separately so the mel spectrogram can be post-processed
    outside the decoder loop: output = mel_outputs + postnet(mel_outputs).
    """

    def __init__(self, tacotron2):
        super(Postnet, self).__init__()
        self.tacotron2 = tacotron2

    def forward(self, mel_outputs):
        residual = self.tacotron2.postnet(mel_outputs)
        return mel_outputs + residual
+
def lstmcell2lstm_params(lstm_mod, lstmcell_mod):
    """Copy an LSTMCell's parameters into a single-layer nn.LSTM.

    nn.LSTM exports to ONNX while LSTMCell does not; both store the same
    four tensors, only under layer-suffixed names in nn.LSTM.
    """
    for name in ('weight_ih', 'weight_hh', 'bias_ih', 'bias_hh'):
        source = getattr(lstmcell_mod, name)
        setattr(lstm_mod, name + '_l0', torch.nn.Parameter(source))
+
+
def prenet_infer(self, x):
    """Prenet forward pass with always-on dropout, ONNX-export friendly.

    Tacotron 2 keeps dropout active at inference time; F.dropout in train
    mode does not export, so it is emulated with an explicit Bernoulli(0.5)
    keep-mask scaled by 1/p = 2.0 to preserve the expected activation.

    Args:
        self: the prenet module (provides .layers, a list of nn.Linear).
        x: input frame(s), shape (batch, in_dim).

    Returns:
        Tensor of shape (batch, prenet_dim) after the linear/ReLU/dropout
        stack.
    """
    x1 = x[:]
    for linear in self.layers:
        x1 = F.relu(linear(x1))
        # Keep-mask sized from the actual layer width and placed on the
        # input's device/dtype (was hard-coded to 256 units on CUDA in
        # float32, which broke any other prenet_dim and AMP half runs).
        keep = torch.le(torch.rand(x1.size(1), device=x1.device),
                        0.5).to(x1.dtype)
        keep = keep.expand(x1.size(0), x1.size(1))
        x1 = x1 * keep * 2.0

    return x1
+
class DecoderIter(torch.nn.Module):
    """One Tacotron 2 decoder step, unrolled for ONNX export.

    The original decoder runs an internal Python loop over LSTMCell
    modules; neither exports to ONNX. Here each LSTMCell is re-expressed
    as a parameter-sharing single-layer nn.LSTM, and one loop iteration
    becomes this module's forward pass. The export driver (and the TRT
    runtime) is expected to loop over this module, threading all decoder
    states between calls.
    """
    def __init__(self, tacotron2):
        super(DecoderIter, self).__init__()

        self.tacotron2 = tacotron2
        dec = tacotron2.decoder

        self.p_attention_dropout = dec.p_attention_dropout
        self.p_decoder_dropout = dec.p_decoder_dropout
        self.prenet = dec.prenet

        # Monkey-patch the prenet with the ONNX-friendly inference routine
        # (explicit dropout mask instead of F.dropout in train mode).
        self.prenet.infer = prenet_infer

        # Rebuild the attention LSTMCell as a 1-layer nn.LSTM sharing the
        # same parameter tensors; nn.LSTM exports to ONNX directly.
        self.attention_rnn = nn.LSTM(dec.prenet_dim + dec.encoder_embedding_dim,
                                     dec.attention_rnn_dim, 1)
        lstmcell2lstm_params(self.attention_rnn, dec.attention_rnn)
        self.attention_rnn.flatten_parameters()

        self.attention_layer = dec.attention_layer

        # Same LSTMCell -> nn.LSTM conversion for the decoder RNN.
        self.decoder_rnn = nn.LSTM(dec.attention_rnn_dim + dec.encoder_embedding_dim,
                                   dec.decoder_rnn_dim, 1)
        lstmcell2lstm_params(self.decoder_rnn, dec.decoder_rnn)
        self.decoder_rnn.flatten_parameters()

        self.linear_projection = dec.linear_projection
        self.gate_layer = dec.gate_layer


    def decode(self, decoder_input, in_attention_hidden, in_attention_cell,
               in_decoder_hidden, in_decoder_cell, in_attention_weights,
               in_attention_weights_cum, in_attention_context, memory,
               processed_memory, mask):
        """Run a single decoder iteration given the previous states.

        Mirrors Tacotron2 Decoder.decode, with every piece of recurrent
        state passed in and returned explicitly (in_*/out_*) so that the
        exported graph is stateless.
        """

        # Attention RNN consumes the prenet output concatenated with the
        # previous attention context.
        cell_input = torch.cat((decoder_input, in_attention_context), -1)

        # nn.LSTM expects (seq, batch, feat); wrap the single step with
        # unsqueeze(0) and strip it from the returned states.
        _, (out_attention_hidden, out_attention_cell) = self.attention_rnn(
            cell_input.unsqueeze(0), (in_attention_hidden.unsqueeze(0),
                                      in_attention_cell.unsqueeze(0)))
        out_attention_hidden = out_attention_hidden.squeeze(0)
        out_attention_cell = out_attention_cell.squeeze(0)

        # training=False: a graph-structure no-op, kept to match the
        # original decoder.
        out_attention_hidden = F.dropout(
            out_attention_hidden, self.p_attention_dropout, False)

        # Location-sensitive attention sees both the current and the
        # cumulative attention weights.
        attention_weights_cat = torch.cat(
            (in_attention_weights.unsqueeze(1),
             in_attention_weights_cum.unsqueeze(1)), dim=1)
        out_attention_context, out_attention_weights = self.attention_layer(
            out_attention_hidden, memory, processed_memory,
            attention_weights_cat, mask)

        out_attention_weights_cum = in_attention_weights_cum + out_attention_weights
        decoder_input_tmp = torch.cat(
            (out_attention_hidden, out_attention_context), -1)

        _, (out_decoder_hidden, out_decoder_cell) = self.decoder_rnn(
            decoder_input_tmp.unsqueeze(0), (in_decoder_hidden.unsqueeze(0),
                                             in_decoder_cell.unsqueeze(0)))
        out_decoder_hidden = out_decoder_hidden.squeeze(0)
        out_decoder_cell = out_decoder_cell.squeeze(0)

        out_decoder_hidden = F.dropout(
            out_decoder_hidden, self.p_decoder_dropout, False)

        decoder_hidden_attention_context = torch.cat(
            (out_decoder_hidden, out_attention_context), 1)

        # Mel frame prediction and stop-token logit from the same
        # hidden+context concatenation.
        decoder_output = self.linear_projection(
            decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)

        return (decoder_output, gate_prediction, out_attention_hidden,
                out_attention_cell, out_decoder_hidden, out_decoder_cell,
                out_attention_weights, out_attention_weights_cum, out_attention_context)

    # @torch.jit.script
    def forward(self,
                decoder_input,
                attention_hidden,
                attention_cell,
                decoder_hidden,
                decoder_cell,
                attention_weights,
                attention_weights_cum,
                attention_context,
                memory,
                processed_memory,
                mask):
        """Prenet + one decode step; this is what gets exported to ONNX."""
        decoder_input1 = self.prenet.infer(self.prenet, decoder_input)
        outputs = self.decode(decoder_input1,
                              attention_hidden,
                              attention_cell,
                              decoder_hidden,
                              decoder_cell,
                              attention_weights,
                              attention_weights_cum,
                              attention_context,
                              memory,
                              processed_memory,
                              mask)
        return outputs
+
+
+
def test_inference(encoder, decoder_iter, postnet):
    """Sanity-run the three exported modules end to end in PyTorch.

    Synthesizes a mel spectrogram for a fixed sentence by looping over
    the single-step decoder until every sample's gate fires (or the step
    cap is hit), then applies the postnet. Used to produce mel.pt for the
    WaveGlow export to validate against.

    Args:
        encoder: Encoder wrapper (see above).
        decoder_iter: DecoderIter wrapper (see above).
        postnet: Postnet wrapper (see above).

    Returns:
        mel_outputs_postnet: tensor (batch, 80, n_steps).
    """

    encoder.eval()
    decoder_iter.eval()
    postnet.eval()

    from trt.inference_trt import init_decoder_inputs

    texts = ["Hello World, good day."]
    sequences, sequence_lengths = prepare_input_sequence(texts)

    # NOTE(review): measurements is never populated here — leftover from
    # the timed inference path.
    measurements = {}

    print("Running Tacotron2 Encoder")
    with torch.no_grad():
        memory, processed_memory, lens = encoder(sequences, sequence_lengths)

    print("Running Tacotron2 Decoder")
    device = memory.device
    # Per-sample decoded length and "still decoding" flags.
    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device = device)
    not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device = device)
    mel_outputs, gate_outputs, alignments = (torch.zeros(1), torch.zeros(1), torch.zeros(1))
    gate_threshold = 0.6
    max_decoder_steps = 1000
    first_iter = True

    (decoder_input, attention_hidden, attention_cell, decoder_hidden,
     decoder_cell, attention_weights, attention_weights_cum,
     attention_context, memory, processed_memory,
     mask) = init_decoder_inputs(memory, processed_memory, sequence_lengths)

    while True:
        with torch.no_grad():
            (mel_output, gate_output,
             attention_hidden, attention_cell,
             decoder_hidden, decoder_cell,
             attention_weights, attention_weights_cum,
             attention_context) = decoder_iter(decoder_input, attention_hidden, attention_cell, decoder_hidden,
                                               decoder_cell, attention_weights, attention_weights_cum,
                                               attention_context, memory, processed_memory, mask)

        # Accumulate per-step outputs along a new time axis (dim 2).
        if first_iter:
            mel_outputs = torch.unsqueeze(mel_output, 2)
            gate_outputs = torch.unsqueeze(gate_output, 2)
            alignments = torch.unsqueeze(attention_weights, 2)
            first_iter = False
        else:
            mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(mel_output, 2)), 2)
            gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(gate_output, 2)), 2)
            alignments = torch.cat((alignments, torch.unsqueeze(attention_weights, 2)), 2)

        # A sample stops once its gate sigmoid exceeds the threshold; a
        # stopped sample stays stopped (not_finished is monotone).
        dec = torch.le(torch.sigmoid(gate_output), gate_threshold).to(torch.int32).squeeze(1)
        not_finished = not_finished*dec
        mel_lengths += not_finished

        if torch.sum(not_finished) == 0:
            print("Stopping after ",mel_outputs.size(2)," decoder steps")
            break
        if mel_outputs.size(2) == max_decoder_steps:
            print("Warning! Reached max decoder steps")
            break

        # Feed the predicted frame back in as the next decoder input.
        decoder_input = mel_output


    print("Running Tacotron2 PostNet")
    with torch.no_grad():
        mel_outputs_postnet = postnet(mel_outputs)

    return mel_outputs_postnet
+
def main():
    """Export the Tacotron 2 checkpoint as three ONNX models.

    Loads the checkpoint, wraps it as Encoder / DecoderIter / Postnet,
    exports each with dynamic batch and sequence axes to the directory
    given by --output, then runs a PyTorch end-to-end sanity inference
    and saves the resulting mel spectrogram to mel.pt (consumed by the
    WaveGlow export test).
    """

    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 export to TRT')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, False)

    opset_version = 10

    # Dummy batch: one random 50-token sequence (148 = symbol-set size).
    sequences = torch.randint(low=0, high=148, size=(1,50),
                             dtype=torch.long).cuda()
    sequence_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
    dummy_input = (sequences, sequence_lengths)

    encoder = Encoder(tacotron2)
    encoder.eval()
    with torch.no_grad():
        encoder(*dummy_input)

    torch.onnx.export(encoder, dummy_input, args.output+"/"+"encoder.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["sequences", "sequence_lengths"],
                      output_names=["memory", "processed_memory", "lens"],
                      dynamic_axes={"sequences": {0: "batch_size", 1: "text_seq"},
                                    "sequence_lengths": {0: "batch_size"},
                                    "memory": {0: "batch_size", 1: "mem_seq"},
                                    "processed_memory": {0: "batch_size", 1: "mem_seq"},
                                    "lens": {0: "batch_size"},
                      })

    # Build dummy decoder states from the original decoder's initializers.
    memory = torch.randn((1,sequence_lengths[0],512)).cuda() #encoder_outputs
    memory_lengths = sequence_lengths
    decoder_input = tacotron2.decoder.get_go_frame(memory)
    mask = get_mask_from_lengths(memory_lengths)
    (attention_hidden,
     attention_cell,
     decoder_hidden,
     decoder_cell,
     attention_weights,
     attention_weights_cum,
     attention_context,
     processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)
    dummy_input = (decoder_input,
                   attention_hidden,
                   attention_cell,
                   decoder_hidden,
                   decoder_cell,
                   attention_weights,
                   attention_weights_cum,
                   attention_context,
                   memory,
                   processed_memory,
                   mask)

    # (The original constructed DecoderIter twice; the first instance was
    # never used, so a single construction is kept here.)
    decoder_iter = DecoderIter(tacotron2)
    decoder_iter.eval()
    with torch.no_grad():
        decoder_iter(*dummy_input)

    torch.onnx.export(decoder_iter, dummy_input, args.output+"/"+"decoder_iter.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["decoder_input",
                                   "attention_hidden",
                                   "attention_cell",
                                   "decoder_hidden",
                                   "decoder_cell",
                                   "attention_weights",
                                   "attention_weights_cum",
                                   "attention_context",
                                   "memory",
                                   "processed_memory",
                                   "mask"],
                      output_names=["decoder_output",
                                    "gate_prediction",
                                    "out_attention_hidden",
                                    "out_attention_cell",
                                    "out_decoder_hidden",
                                    "out_decoder_cell",
                                    "out_attention_weights",
                                    "out_attention_weights_cum",
                                    "out_attention_context"],
                      dynamic_axes={"decoder_input" : {0: "batch_size"},
                                    "attention_hidden" : {0: "batch_size"},
                                    "attention_cell" : {0: "batch_size"},
                                    "decoder_hidden" : {0: "batch_size"},
                                    "decoder_cell" : {0: "batch_size"},
                                    "attention_weights" : {0: "batch_size", 1: "seq_len"},
                                    "attention_weights_cum" : {0: "batch_size", 1: "seq_len"},
                                    "attention_context" : {0: "batch_size"},
                                    "memory" : {0: "batch_size", 1: "seq_len"},
                                    "processed_memory" : {0: "batch_size", 1: "seq_len"},
                                    "mask" : {0: "batch_size", 1: "seq_len"},
                                    "decoder_output" : {0: "batch_size"},
                                    "gate_prediction" : {0: "batch_size"},
                                    "out_attention_hidden" : {0: "batch_size"},
                                    "out_attention_cell" : {0: "batch_size"},
                                    "out_decoder_hidden" : {0: "batch_size"},
                                    "out_decoder_cell" : {0: "batch_size"},
                                    "out_attention_weights" : {0: "batch_size", 1: "seq_len"},
                                    "out_attention_weights_cum" : {0: "batch_size", 1: "seq_len"},
                                    "out_attention_context" : {0: "batch_size"}
                      })

    postnet = Postnet(tacotron2)
    # 80 mel channels, 620 frames ~ 7 seconds of speech.
    dummy_input = torch.randn((1,80,620)).cuda()
    torch.onnx.export(postnet, dummy_input, args.output+"/"+"postnet.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel_outputs"],
                      output_names=["mel_outputs_postnet"],
                      dynamic_axes={"mel_outputs": {0: "batch_size", 2: "mel_seq"},
                                    "mel_outputs_postnet": {0: "batch_size", 2: "mel_seq"}})

    # End-to-end PyTorch sanity check; mel.pt feeds the WaveGlow export test.
    mel = test_inference(encoder, decoder_iter, postnet)
    torch.save(mel, "mel.pt")

if __name__ == '__main__':
    main()

+ 1 - 3
PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts.py → PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts.py

@@ -28,7 +28,6 @@
 import torch
 import argparse
 from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
-from dllogger.autologging import log_hardware, log_args
 
 def parse_args(parser):
     """
@@ -52,9 +51,8 @@ def main():
     parser = parse_args(parser)
     args = parser.parse_args()
 
-    log_args(args)    
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     args.amp_run, rename=True)
+                                     args.amp_run, forward_is_infer=True)
     
     jitted_tacotron2 = torch.jit.script(tacotron2)
 

+ 0 - 3
PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts_config.py → PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts_config.py

@@ -28,7 +28,6 @@
 
 import os
 import argparse
-from dllogger.autologging import log_hardware, log_args
 
 
 def parse_args(parser):
@@ -59,8 +58,6 @@ def main():
     parser = parse_args(parser)
     args = parser.parse_args()
     
-    log_args(args)    
-    
     # prepare repository
     model_folder = os.path.join('./trtis_repo', args.trtis_model_name)
     version_folder = os.path.join(model_folder, str(args.trtis_model_version))

+ 47 - 8
PyTorch/SpeechSynthesis/Tacotron2/export_waveglow_trt.py → PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py

@@ -28,9 +28,10 @@
 import torch
 import argparse
 
-from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
+import sys
+sys.path.append('./')
 
-from dllogger.autologging import log_args
+from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
 
 def parse_args(parser):
     """
@@ -38,8 +39,8 @@ def parse_args(parser):
     """
     parser.add_argument('--waveglow', type=str, required=True,
                         help='full path to the WaveGlow model checkpoint file')
-    parser.add_argument('-o', '--output', type=str, default="waveglow.onnx",
-                        help='filename for the exported WaveGlow TRT engine')
+    parser.add_argument('-o', '--output', type=str, required=True,
+                        help='Directory for the exported WaveGlow ONNX model')
     parser.add_argument('--amp-run', action='store_true',
                         help='inference with AMP')
     parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
@@ -112,9 +113,38 @@ def convert_1d_to_2d_(glow):
 
     glow.cuda()
 
+def test_inference(waveglow):
+
+
+    from scipy.io.wavfile import write
+
+    mel = torch.load("mel.pt").cuda()
+    # mel = torch.load("mel_spectrograms/LJ001-0015.wav.pt").cuda()
+    # mel = mel.unsqueeze(0)
+    mel_lengths = [mel.size(2)]
+    stride = 256
+    kernel_size = 1024
+    n_group = 8
+    z_size2 = (mel.size(2)-1)*stride+(kernel_size-1)+1
+    # corresponds to cutoff in infer_onnx
+    z_size2 = z_size2 - (kernel_size-stride)
+    z_size2 = z_size2//n_group
+    z = torch.randn(1, n_group, z_size2, 1).cuda()
+    mel = mel.unsqueeze(3)
+
+    with torch.no_grad():
+        audios = waveglow(mel, z)
+
+    for i, audio in enumerate(audios):
+        audio = audio[:mel_lengths[i]*256]
+        audio = audio/torch.max(torch.abs(audio))
+        write("audio_pyt.wav", 22050, audio.cpu().numpy())
+
+
 def export_onnx(parser, args):
 
-    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run)
+    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
+                                    args.amp_run, forward_is_infer=False)
 
     # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
     mel = torch.randn(1, 80, 620).cuda()
@@ -140,7 +170,18 @@ def export_onnx(parser, args):
         if args.amp_run:
             waveglow.half()
         mel = mel.unsqueeze(3)
-        torch.onnx.export(waveglow, (mel, z), args.output)
+
+        opset_version = 10
+        torch.onnx.export(waveglow, (mel, z), args.output+"/"+"waveglow.onnx",
+                          opset_version=opset_version,
+                          do_constant_folding=True,
+                          input_names=["mel", "z"],
+                          output_names=["audio"],
+                          dynamic_axes={"mel":   {0: "batch_size", 2: "mel_seq"},
+                                        "z":     {0: "batch_size", 2: "z_seq"},
+                                        "audio": {0: "batch_size", 1: "audio_seq"}})
+
+    test_inference(waveglow)
 
 
 def main():
@@ -150,8 +191,6 @@ def main():
     parser = parse_args(parser)
     args, _ = parser.parse_known_args()
 
-    log_args(args)
-
     export_onnx(parser, args)
 
 if __name__ == '__main__':

+ 0 - 3
PyTorch/SpeechSynthesis/Tacotron2/export_waveglow_trt_config.py → PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_trt_config.py

@@ -28,7 +28,6 @@
 
 import os
 import argparse
-from dllogger.autologging import log_hardware, log_args
 
 
 def parse_args(parser):
@@ -54,8 +53,6 @@ def main():
     parser = parse_args(parser)
     args = parser.parse_args()
     
-    log_args(args)    
-    
     # prepare repository
     model_folder = os.path.join('./trtis_repo', args.trtis_model_name)
     version_folder = os.path.join(model_folder, str(args.trtis_model_version))

+ 23 - 47
PyTorch/SpeechSynthesis/Tacotron2/inference.py

@@ -35,9 +35,8 @@ from scipy.io.wavfile import write
 import sys
 
 import time
-from dllogger.logger import LOGGER
-import dllogger.logger as dllg
-from dllogger.autologging import log_hardware, log_args
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
 
 from apex import amp
 
@@ -72,16 +71,6 @@ def parse_args(parser):
     return parser
 
 
-def load_checkpoint(checkpoint_path, model_name):
-    assert os.path.isfile(checkpoint_path)
-
-    print("Loading checkpoint '{}'".format(checkpoint_path))
-    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
-    model.load_state_dict(checkpoint_dict['state_dict'])
-    print("Loaded '{}' checkpoint '{}'" .format(model_name, checkpoint_path))
-    return model
-
-
 def checkpoint_from_distributed(state_dict):
     """
     Checks whether checkpoint was generated by DistributedDataParallel. DDP
@@ -111,12 +100,13 @@ def unwrap_distributed(state_dict):
     return new_state_dict
 
 
-def load_and_setup_model(model_name, parser, checkpoint, amp_run, rename=False):
+def load_and_setup_model(model_name, parser, checkpoint, amp_run, forward_is_infer=False):
     model_parser = models.parse_model_args(model_name, parser, add_help=False)
     model_args, _ = model_parser.parse_known_args()
 
     model_config = models.get_model_config(model_name, model_args)
-    model = models.get_model(model_name, model_config, to_cuda=True, rename=rename)
+    model = models.get_model(model_name, model_config, to_cuda=True,
+                             forward_is_infer=forward_is_infer)
 
     if checkpoint is not None:
         state_dict = torch.load(checkpoint)['state_dict']
@@ -195,30 +185,19 @@ def main():
     parser = parse_args(parser)
     args, _ = parser.parse_known_args()
 
-    LOGGER.set_model_name("Tacotron2_PyT")
-    LOGGER.set_backends([
-        dllg.StdOutBackend(log_file=None,
-                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
-        dllg.JsonBackend(log_file=args.log_file,
-                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
-    ])
-    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("waveglow_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-
-    log_hardware()
-    log_args(args)
+    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
+                                              args.output+'/'+args.log_file),
+                            StdOutBackend(Verbosity.VERBOSE)])
+    for k,v in vars(args).items():
+        DLLogger.log(step="PARAMETER", data={k:v})
+    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     args.amp_run)
+                                     args.amp_run, forward_is_infer=True)
     waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
-                                    args.amp_run)
+                                    args.amp_run, forward_is_infer=True)
     denoiser = Denoiser(waveglow).cuda()
 
-    tacotron2.forward = tacotron2.infer
-    type(tacotron2).forward = type(tacotron2).infer
     jitted_tacotron2 = torch.jit.script(tacotron2)
 
     texts = []
@@ -228,7 +207,7 @@ def main():
     except:
         print("Could not read file")
         sys.exit(1)
-
+    
     if args.include_warmup:
         sequence = torch.randint(low=0, high=148, size=(1,50),
                                  dtype=torch.long).cuda()
@@ -236,9 +215,7 @@ def main():
         for i in range(3):
             with torch.no_grad():
                 mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
-                _ = waveglow.infer(mel)
-
-    LOGGER.iteration_start()
+                _ = waveglow(mel)
 
     measurements = {}
 
@@ -248,19 +225,19 @@ def main():
         mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)
 
     with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
-        audios = waveglow.infer(mel, sigma=args.sigma_infer)
+        audios = waveglow(mel, sigma=args.sigma_infer)
         audios = audios.float()
         audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
 
+    print("Stopping after",mel.size(2),"decoder steps")
     tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
     waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']
 
-    LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
-    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
-    LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
-    LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
-    LOGGER.log(key="latency", value=(measurements['tacotron2_time']+
-                                     measurements['waveglow_time']))
+    DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
+    DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
+    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
+    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
+    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time'])})
 
     for i, audio in enumerate(audios):
         audio = audio[:mel_lengths[i]*args.stft_hop_length]
@@ -268,8 +245,7 @@ def main():
         audio_path = args.output + "audio_"+str(i)+".wav"
         write(audio_path, args.sampling_rate, audio.cpu().numpy())
 
-    LOGGER.iteration_stop()
-    LOGGER.finish()
+    DLLogger.flush()
 
 if __name__ == '__main__':
     main()

+ 24 - 26
PyTorch/SpeechSynthesis/Tacotron2/inference_perf.py

@@ -34,10 +34,8 @@ import time
 
 from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, MeasureTime
 
-from dllogger.logger import LOGGER
-import dllogger.logger as dllg
-from dllogger import tags
-from dllogger.autologging import log_hardware, log_args
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
 
 from apex import amp
 
@@ -52,6 +50,8 @@ def parse_args(parser):
     parser.add_argument('--amp-run', action='store_true',
                         help='inference with AMP')
     parser.add_argument('-bs', '--batch-size', type=int, default=1)
+    parser.add_argument('-o', '--output', type=str, required=True,
+                        help='Directory to save results')
     parser.add_argument('--log-file', type=str, default='nvlog.json',
                         help='Filename for logging')
 
@@ -70,29 +70,23 @@ def main():
 
     log_file = args.log_file
 
-    LOGGER.set_model_name("Tacotron2_PyT")
-    LOGGER.set_backends([
-        dllg.StdOutBackend(log_file=None,
-                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
-        dllg.JsonBackend(log_file,
-                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
-    ])
-    LOGGER.register_metric("items_per_sec",
-                           metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("latency",
-                           metric_scope=dllg.TRAIN_ITER_SCOPE)
+    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
+                                              args.output+'/'+args.log_file),
+                            StdOutBackend(Verbosity.VERBOSE)])
+    for k,v in vars(args).items():
+        DLLogger.log(step="PARAMETER", data={k:v})
+    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
 
-    log_hardware()
-    log_args(args)
+    model = load_and_setup_model(args.model_name, parser, None, args.amp_run,
+                                 forward_is_infer=True)
 
-    model = load_and_setup_model(args.model_name, parser, None, args.amp_run)
+    if args.model_name == "Tacotron2":
+        model = torch.jit.script(model)
 
     warmup_iters = 3
     num_iters = 1+warmup_iters
 
     for i in range(num_iters):
-        if i >= warmup_iters:
-            LOGGER.iteration_start()
 
         measurements = {}
 
@@ -101,7 +95,7 @@ def main():
                                         dtype=torch.long).cuda()
             input_lengths = torch.IntTensor([text_padded.size(1)]*args.batch_size).cuda().long()
             with torch.no_grad(), MeasureTime(measurements, "inference_time"):
-                mels, _ = model.infer(text_padded, input_lengths)
+                mels, _ = model(text_padded, input_lengths)
             num_items = mels.size(0)*mels.size(2)
 
         if args.model_name == 'WaveGlow':
@@ -113,16 +107,20 @@ def main():
                 mel_padded = mel_padded.half()
 
             with torch.no_grad(), MeasureTime(measurements, "inference_time"):
-                audios = model.infer(mel_padded)
+                audios = model(mel_padded)
                 audios = audios.float()
             num_items = audios.size(0)*audios.size(1)
 
         if i >= warmup_iters:
-            LOGGER.log(key="items_per_sec", value=(num_items/measurements['inference_time']))
-            LOGGER.log(key="latency", value=measurements['inference_time'])
-            LOGGER.iteration_stop()
+            DLLogger.log(step=(i-warmup_iters,), data={"latency": measurements['inference_time']})
+            DLLogger.log(step=(i-warmup_iters,), data={"items_per_sec": num_items/measurements['inference_time']})
+
+    DLLogger.log(step=tuple(),
+                 data={'infer_latency': measurements['inference_time']})
+    DLLogger.log(step=tuple(),
+                 data={'infer_items_per_sec': num_items/measurements['inference_time']})
 
-    LOGGER.finish()
+    DLLogger.flush()
 
 if __name__ == '__main__':
     main()

+ 53 - 0
PyTorch/SpeechSynthesis/Tacotron2/main.py

@@ -0,0 +1,53 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import argparse
+from train import main as main_train
+from inference_perf import main as main_infer
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+    """
+
+    parser.add_argument('--bench-class',  type=str, choices=['train', 'perf-infer', 'perf-train'], required=True, help='Choose test class')
+
+    return parser
+
+def main():
+    
+    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Testing')
+    parser = parse_args(parser)
+    args, unknown_args = parser.parse_known_args()
+
+    if "train" in args.bench_class:
+        main_train()
+    else:
+        main_infer()
+
+if __name__ == '__main__':
+    main()

+ 11 - 5
PyTorch/SpeechSynthesis/Tacotron2/models.py

@@ -63,19 +63,25 @@ def init_bn(module):
 
 
 def get_model(model_name, model_config, to_cuda,
-              uniform_initialize_bn_weight=False, rename=False):
+              uniform_initialize_bn_weight=False, forward_is_infer=False):
     """ Code chooses a model based on name"""
     model = None
     if model_name == 'Tacotron2':
-        if rename:
-            class Tacotron2_extra(Tacotron2):
+        if forward_is_infer:
+            class Tacotron2__forward_is_infer(Tacotron2):
                 def forward(self, inputs, input_lengths):
                     return self.infer(inputs, input_lengths)
-            model = Tacotron2_extra(**model_config)
+            model = Tacotron2__forward_is_infer(**model_config)
         else:
             model = Tacotron2(**model_config)
     elif model_name == 'WaveGlow':
-        model = WaveGlow(**model_config)
+        if forward_is_infer:
+            class WaveGlow__forward_is_infer(WaveGlow):
+                def forward(self, spect, sigma=1.0):
+                    return self.infer(spect, sigma)
+            model = WaveGlow__forward_is_infer(**model_config)
+        else:
+            model = WaveGlow(**model_config)
     else:
         raise NotImplementedError(model_name)
 

+ 4 - 4
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md

@@ -48,7 +48,7 @@ Follow the Tacotron 2 Quick Start Guide (points 1-4) to start the container.
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python export_tacotron2_ts_config.py --amp-run
+python exports/export_tacotron2_ts_config.py --amp-run
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Tacotron 2. 
@@ -67,7 +67,7 @@ Move the downloaded model to `trtis_repo/tacotron2/1/model.pt`
 
 To export the Tacotron 2 model using TorchScript, type:
 ```bash
-python export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --amp-run
+python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --amp-run
 ```
 
 This will save the model as ``trtis_repo/tacotron2/1/model.pt``.
@@ -78,7 +78,7 @@ For WaveGlow, we also need to create the folder structure that will be used by t
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python export_waveglow_trt_config.py --amp-run
+python exports/export_waveglow_trt_config.py --amp-run
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Waveglow. 
@@ -106,7 +106,7 @@ cd /workspace/onnx-tensorrt/build && cmake .. -DCMAKE_CXX_FLAGS=-isystem\ /usr/l
 In order to export the model into the ONNX intermediate representation, type:
 
 ```bash
-python export_waveglow_trt.py --waveglow <waveglow_checkpoint> --wn-channels 256 --amp-run
+python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --amp-run
 ```
 
 This will save the model as `waveglow.onnx` (you can change its name with the flag `--output <filename>`).

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_1GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3
+python train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_4GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3
+python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_AMP_DGX1_16GB_8GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3
+python -m multiproc train.py -m Tacotron2 -o output/ --amp-run -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.3

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_1GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1
+python train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_4GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1
+python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_tacotron2_FP32_DGX1_16GB_8GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1
+python -m multiproc train.py -m Tacotron2 -o output/ -lr 1e-3 --epochs 1501 -bs 64 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --load-mel-from-disk --training-files=filelists/ljs_mel_text_train_filelist.txt --validation-files=filelists/ljs_mel_text_val_filelist.txt --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_1GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_4GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_AMP_DGX1_16GB_8GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python -m multiproc train.py -m WaveGlow -o output/ --amp-run -lr 1e-4 --epochs 1001 -bs 10 --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_1GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_4GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/platform/train_waveglow_FP32_DGX1_16GB_8GPU.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
+python -m multiproc train.py -m WaveGlow -o output/ -lr 1e-4 --epochs 1001 -bs 4 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 3.4028234663852886e+38 --cudnn-benchmark --cudnn-enabled --log-file nvlog.json

+ 4 - 5
PyTorch/SpeechSynthesis/Tacotron2/run_latency_tests.sh

@@ -1,5 +1,4 @@
-bash test_infer.sh -bs 1 -il 128 -p amp --num-iters 1003 --tacotron2 checkpoint_Tacotron2_amp --waveglow checkpoint_WaveGlow_amp
-bash test_infer.sh -bs 4 -il 128 -p amp --num-iters 1003 --tacotron2 checkpoint_Tacotron2_amp --waveglow checkpoint_WaveGlow_amp
-bash test_infer.sh -bs 1 -il 128 -p fp32 --num-iters 1003 --tacotron2 checkpoint_Tacotron2_fp32 --waveglow checkpoint_WaveGlow_fp32
-bash test_infer.sh -bs 4 -il 128 -p fp32 --num-iters 1003 --tacotron2 checkpoint_Tacotron2_fp32 --waveglow checkpoint_WaveGlow_fp32
-
+bash test_infer.sh -bs 1 -il 128 -p amp --num-iters 1003 --tacotron2 ./checkpoints/checkpoint_Tacotron2_amp --waveglow ./checkpoints/checkpoint_WaveGlow_amp
+bash test_infer.sh -bs 4 -il 128 -p amp --num-iters 1003 --tacotron2 ./checkpoints/checkpoint_Tacotron2_amp --waveglow ./checkpoints/checkpoint_WaveGlow_amp
+bash test_infer.sh -bs 1 -il 128 -p fp32 --num-iters 1003 --tacotron2 ./checkpoints/checkpoint_Tacotron2_fp32 --waveglow ./checkpoints/checkpoint_WaveGlow_fp32
+bash test_infer.sh -bs 4 -il 128 -p fp32 --num-iters 1003 --tacotron2 ./checkpoints/checkpoint_Tacotron2_fp32 --waveglow ./checkpoints/checkpoint_WaveGlow_fp32

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/scripts/train_tacotron2.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m Tacotron2 -o ./output/ -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file ./output/nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp-run
+python -m multiproc train.py -m Tacotron2 -o ./output/ -lr 1e-3 --epochs 1501 -bs 128 --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-enabled --log-file nvlog.json --anneal-steps 500 1000 1500 --anneal-factor 0.1 --amp-run

+ 1 - 1
PyTorch/SpeechSynthesis/Tacotron2/scripts/train_waveglow.sh

@@ -1,2 +1,2 @@
 mkdir -p output
-python -m multiproc train.py -m WaveGlow -o ./output/ -lr 1e-4 --epochs 1001 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file ./output/nvlog.json --amp-run
+python -m multiproc train.py -m WaveGlow -o ./output/ -lr 1e-4 --epochs 1001 -bs 10 --segment-length  8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-enabled --cudnn-benchmark --log-file nvlog.json --amp-run

+ 2 - 2
PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

@@ -90,7 +90,7 @@ class Attention(nn.Module):
         energies = self.v(torch.tanh(
             processed_query + processed_attention_weights + processed_memory))
 
-        energies = energies.squeeze(-1)
+        energies = energies.squeeze(2)
         return energies
 
     def forward(self, attention_hidden_state, memory, processed_memory,
@@ -107,7 +107,7 @@ class Attention(nn.Module):
         alignment = self.get_alignment_energies(
             attention_hidden_state, processed_memory, attention_weights_cat)
 
-        alignment.masked_fill_(mask, self.score_mask_value)
+        alignment = alignment.masked_fill(mask, self.score_mask_value)
 
         attention_weights = F.softmax(alignment, dim=1)
         attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)

+ 45 - 135
PyTorch/SpeechSynthesis/Tacotron2/test_infer.py

@@ -34,10 +34,11 @@ from scipy.io.wavfile import write
 
 import sys
 
+from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence
+
 import time
-from dllogger.logger import LOGGER
-import dllogger.logger as dllg
-from dllogger.autologging import log_hardware, log_args
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
 
 from apex import amp
 
@@ -69,35 +70,6 @@ def parse_args(parser):
     return parser
 
 
-def checkpoint_from_distributed(state_dict):
-    """
-    Checks whether checkpoint was generated by DistributedDataParallel. DDP
-    wraps model in additional "module.", it needs to be unwrapped for single
-    GPU inference.
-    :param state_dict: model's state dict
-    """
-    ret = False
-    for key, _ in state_dict.items():
-        if key.find('module.') != -1:
-            ret = True
-            break
-    return ret
-
-
-def unwrap_distributed(state_dict):
-    """
-    Unwraps model from DistributedDataParallel.
-    DDP wraps model in additional "module.", it needs to be removed for single
-    GPU inference.
-    :param state_dict: model's state dict
-    """
-    new_state_dict = {}
-    for key, value in state_dict.items():
-        new_key = key.replace('module.', '')
-        new_state_dict[new_key] = value
-    return new_state_dict
-
-
 def load_and_setup_model(model_name, parser, checkpoint, amp_run, to_cuda=True):
     model_parser = models.parse_model_args(model_name, parser, add_help=False)
     model_args, _ = model_parser.parse_known_args()
@@ -126,52 +98,44 @@ def load_and_setup_model(model_name, parser, checkpoint, amp_run, to_cuda=True):
     return model
 
 
-# taken from tacotron2/data_function.py:TextMelCollate.__call__
-def pad_sequences(batch):
-    # Right zero-pad all one-hot text sequences to max input length
-    input_lengths, ids_sorted_decreasing = torch.sort(
-        torch.LongTensor([len(x) for x in batch]),
-        dim=0, descending=True)
-    max_input_len = input_lengths[0]
-
-    text_padded = torch.LongTensor(len(batch), max_input_len)
-    text_padded.zero_()
-    for i in range(len(ids_sorted_decreasing)):
-        text = batch[ids_sorted_decreasing[i]]
-        text_padded[i, :text.size(0)] = text
-
-    return text_padded, input_lengths
+def print_stats(measurements_all):
 
+    print(np.mean(measurements_all['latency'][1:]),
+          np.mean(measurements_all['throughput'][1:]),
+          np.mean(measurements_all['pre_processing'][1:]),
+          np.mean(measurements_all['type_conversion'][1:])+
+          np.mean(measurements_all['storage'][1:])+
+          np.mean(measurements_all['data_transfer'][1:]),
+          np.mean(measurements_all['num_mels_per_audio'][1:]))
 
-def prepare_input_sequence(texts):
-
-    d = []
-    for i,text in enumerate(texts):
-        d.append(torch.IntTensor(
-            text_to_sequence(text, ['english_cleaners'])[:]))
-
-    text_padded, input_lengths = pad_sequences(d)
-    if torch.cuda.is_available():
-        text_padded = torch.autograd.Variable(text_padded).cuda().long()
-        input_lengths = torch.autograd.Variable(input_lengths).cuda().long()
-    else:
-        text_padded = torch.autograd.Variable(text_padded).long()
-        input_lengths = torch.autograd.Variable(input_lengths).long()
-
-    return text_padded, input_lengths
+    throughput = measurements_all['throughput']
+    preprocessing = measurements_all['pre_processing']
+    type_conversion = measurements_all['type_conversion']
+    storage = measurements_all['storage']
+    data_transfer = measurements_all['data_transfer']
+    postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
+    latency = measurements_all['latency']
+    num_mels_per_audio = measurements_all['num_mels_per_audio']
 
-class MeasureTime():
-    def __init__(self, measurements, key):
-        self.measurements = measurements
-        self.key = key
+    latency.sort()
 
-    def __enter__(self):
-        torch.cuda.synchronize()
-        self.t0 = time.perf_counter()
+    cf_50 = max(latency[:int(len(latency)*0.50)])
+    cf_90 = max(latency[:int(len(latency)*0.90)])
+    cf_95 = max(latency[:int(len(latency)*0.95)])
+    cf_99 = max(latency[:int(len(latency)*0.99)])
+    cf_100 = max(latency[:int(len(latency)*1.0)])
 
-    def __exit__(self, exc_type, exc_value, exc_traceback):
-        torch.cuda.synchronize()
-        self.measurements[self.key] = time.perf_counter() - self.t0
+    print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
+    print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
+    print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
+    print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio)))
+    print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
+    print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
+    print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
+    print("Latency cl 90 (seconds) = {:.4f}".format(cf_90))
+    print("Latency cl 95 (seconds) = {:.4f}".format(cf_95))
+    print("Latency cl 99 (seconds) = {:.4f}".format(cf_99))
+    print("Latency cl 100 (seconds) = {:.4f}".format(cf_100))
 
 
 def main():
@@ -184,22 +148,11 @@ def main():
     parser = parse_args(parser)
     args, unknown_args = parser.parse_known_args()
 
-    LOGGER.set_model_name("Tacotron2_PyT")
-    LOGGER.set_backends([
-        dllg.JsonBackend(log_file=args.log_file,
-                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
-    ])
-    LOGGER.register_metric("pre_processing", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("waveglow_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("type_conversion", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("storage", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("data_transfer", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("num_mels_per_audio", metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("throughput", metric_scope=dllg.TRAIN_ITER_SCOPE)
+    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
+                            StdOutBackend(Verbosity.VERBOSE)])
+    for k,v in vars(args).items():
+        DLLogger.log(step="PARAMETER", data={k:v})
+    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
 
     measurements_all = {"pre_processing": [],
                         "tacotron2_latency": [],
@@ -213,9 +166,6 @@ def main():
                         "num_mels_per_audio": [],
                         "throughput": []}
 
-    log_hardware()
-    log_args(args)
-
     print("args:", args, unknown_args)
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run)
@@ -229,9 +179,6 @@ def main():
 
     for iter in range(args.num_iters):
 
-        if iter >= warmup_iters:
-            LOGGER.iteration_start()
-
         measurements = {}
 
         with MeasureTime(measurements, "pre_processing"):
@@ -269,48 +216,11 @@ def main():
         if iter >= warmup_iters:
             for k,v in measurements.items():
                 measurements_all[k].append(v)
-                LOGGER.log(key=k, value=v)
+                DLLogger.log(step=(iter-warmup_iters,), data={k: v})
 
-            LOGGER.iteration_stop()
-
-    LOGGER.finish()
-
-    print(np.mean(measurements_all['latency'][1:]),
-          np.mean(measurements_all['throughput'][1:]),
-          np.mean(measurements_all['pre_processing'][1:]),
-          np.mean(measurements_all['type_conversion'][1:])+
-          np.mean(measurements_all['storage'][1:])+
-          np.mean(measurements_all['data_transfer'][1:]),
-          np.mean(measurements_all['num_mels_per_audio'][1:]))
+    DLLogger.flush()
 
-    throughput = measurements_all['throughput']
-    preprocessing = measurements_all['pre_processing']
-    type_conversion = measurements_all['type_conversion']
-    storage = measurements_all['storage']
-    data_transfer = measurements_all['data_transfer']
-    postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
-    latency = measurements_all['latency']
-    num_mels_per_audio = measurements_all['num_mels_per_audio']
-
-    latency.sort()
-
-    cf_50 = max(latency[:int(len(latency)*0.50)])
-    cf_90 = max(latency[:int(len(latency)*0.90)])
-    cf_95 = max(latency[:int(len(latency)*0.95)])
-    cf_99 = max(latency[:int(len(latency)*0.99)])
-    cf_100 = max(latency[:int(len(latency)*1.0)])
-
-    print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
-    print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
-    print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
-    print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio)))
-    print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
-    print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
-    print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
-    print("Latency cl 90 (seconds) = {:.4f}".format(cf_90))
-    print("Latency cl 95 (seconds) = {:.4f}".format(cf_95))
-    print("Latency cl 99 (seconds) = {:.4f}".format(cf_99))
-    print("Latency cl 100 (seconds) = {:.4f}".format(cf_100))
+    print_stats(measurements_all)
 
 if __name__ == '__main__':
     main()

+ 31 - 5
PyTorch/SpeechSynthesis/Tacotron2/test_infer.sh

@@ -7,6 +7,7 @@ NUM_ITERS=1003 # extra 3 iterations for warmup
 TACOTRON2_CKPT="checkpoint_Tacotron2_1500_fp32"
 WAVEGLOW_CKPT="checkpoint_WaveGlow_1000_fp32"
 AMP_RUN=""
+TEST_PROGRAM="test_infer.py"
 
 while [ -n "$1" ]
 do
@@ -27,10 +28,26 @@ do
 	    NUM_ITERS="$2"
 	    shift
 	    ;;
+	--test)
+	    TEST_PROGRAM="$2"
+	    shift
+	    ;;
 	--tacotron2)
 	    TACOTRON2_CKPT="$2"
 	    shift
 	    ;;
+	--encoder)
+	    ENCODER_CKPT="$2"
+	    shift
+	    ;;
+	--decoder)
+	    DECODER_CKPT="$2"
+	    shift
+	    ;;
+	--postnet)
+	    POSTNET_CKPT="$2"
+	    shift
+	    ;;
 	--waveglow)
 	    WAVEGLOW_CKPT="$2"
 	    shift
@@ -51,9 +68,17 @@ NVLOG_FILE=nvlog_${LOG_SUFFIX}.json
 TMP_LOGFILE=tmp_log_${LOG_SUFFIX}.log
 LOGFILE=log_${LOG_SUFFIX}.log
 
+
+if [ "$TEST_PROGRAM" = "trt/test_infer_trt.py" ]
+then
+    MODELS="--encoder $ENCODER_CKPT --decoder $DECODER_CKPT --postnet $POSTNET_CKPT"
+else
+    MODELS="--tacotron2 $TACOTRON2_CKPT"
+fi
+
 set -x
-python test_infer.py \
-       --tacotron2 $TACOTRON2_CKPT \
+python $TEST_PROGRAM \
+       $MODELS \
        --waveglow $WAVEGLOW_CKPT \
        --batch-size $BATCH_SIZE \
        --input-length $INPUT_LENGTH $AMP_RUN \
@@ -67,7 +92,8 @@ PERF=$(cat $TMP_LOGFILE | grep -F 'Throughput average (samples/sec)' | awk -F'=
 NUM_MELS=$(cat $TMP_LOGFILE | grep -F 'Number of mels per audio average' | awk -F'= ' '{print $2}')
 LATENCY=$(cat $TMP_LOGFILE | grep -F 'Latency average (seconds)' | awk -F'= ' '{print $2}')
 LATENCYSTD=$(cat $TMP_LOGFILE | grep -F 'Latency std (seconds)' | awk -F'= ' '{print $2}')
-LATENCY50=$(cat $TMP_LOGFILE | grep -F 'Latency cl 50 (seconds)' | awk -F'= ' '{print $2}')
-LATENCY100=$(cat $TMP_LOGFILE | grep -F 'Latency cl 100 (seconds)' | awk -F'= ' '{print $2}')
+LATENCY90=$(cat $TMP_LOGFILE | grep -F 'Latency cl 90 (seconds)' | awk -F'= ' '{print $2}')
+LATENCY95=$(cat $TMP_LOGFILE | grep -F 'Latency cl 95 (seconds)' | awk -F'= ' '{print $2}')
+LATENCY99=$(cat $TMP_LOGFILE | grep -F 'Latency cl 99 (seconds)' | awk -F'= ' '{print $2}')
 
-echo "$BATCH_SIZE,$INPUT_LENGTH,$PRECISION,$NUM_ITERS,$LATENCY,$LATENCYSTD,$LATENCY50,$LATENCY100,$PERF,$NUM_MELS" >> $LOGFILE
+echo "$BATCH_SIZE,$INPUT_LENGTH,$PRECISION,$NUM_ITERS,$LATENCY,$LATENCYSTD,$LATENCY90,$LATENCY95,$LATENCY99,$PERF,$NUM_MELS" >> $LOGFILE

+ 73 - 95
PyTorch/SpeechSynthesis/Tacotron2/train.py

@@ -45,10 +45,9 @@ import models
 import loss_functions
 import data_functions
 
-from dllogger.logger import LOGGER
-import dllogger.logger as dllg
-from dllogger import tags
-from dllogger.autologging import log_hardware, log_args
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
+
 from scipy.io.wavfile import write as write_wav
 
 from apex import amp
@@ -61,7 +60,7 @@ def parse_args(parser):
     Parse commandline arguments.
     """
 
-    parser.add_argument('-o', '--output_directory', type=str, required=True,
+    parser.add_argument('-o', '--output', type=str, required=True,
                         help='Directory to save checkpoints')
     parser.add_argument('-d', '--dataset-path', type=str,
                         default='./', help='Path to dataset')
@@ -154,6 +153,9 @@ def parse_args(parser):
     distributed.add_argument('--dist-backend', default='nccl', type=str, choices={'nccl'},
                              help='Distributed run backend')
 
+    benchmark = parser.add_argument_group('benchmark')
+    benchmark.add_argument('--bench-class', type=str, default='')
+
     return parser
 
 
@@ -223,8 +225,8 @@ def evaluating(model):
             model.train()
 
 
-def validate(model, criterion, valset, iteration, batch_size, world_size,
-             collate_fn, distributed_run, rank, batch_to_gpu):
+def validate(model, criterion, valset, epoch, batch_iter, batch_size,
+             world_size, collate_fn, distributed_run, rank, batch_to_gpu):
     """Handles all the validation scoring and printing"""
     with evaluating(model), torch.no_grad():
         val_sampler = DistributedSampler(valset) if distributed_run else None
@@ -245,11 +247,11 @@ def validate(model, criterion, valset, iteration, batch_size, world_size,
             val_loss += reduced_val_loss
         val_loss = val_loss / (i + 1)
 
-    LOGGER.log(key="val_iter_loss", value=val_loss)
+        DLLogger.log(step=(epoch, batch_iter, epoch), data={'val_iter_loss': val_loss})
+        return val_loss
 
-
-def adjust_learning_rate(epoch, optimizer, learning_rate,
-                         anneal_steps, anneal_factor):
+def adjust_learning_rate(iteration, epoch, optimizer, learning_rate,
+                         anneal_steps, anneal_factor, rank):
 
     p = 0
     if anneal_steps is not None:
@@ -263,8 +265,7 @@ def adjust_learning_rate(epoch, optimizer, learning_rate,
         lr = learning_rate*(anneal_factor ** p)
 
     if optimizer.param_groups[0]['lr'] != lr:
-        LOGGER.log_event("learning_rate changed",
-                         value=str(optimizer.param_groups[0]['lr']) + " -> " + str(lr))
+        DLLogger.log(step=(epoch, iteration), data={'learning_rate changed': str(optimizer.param_groups[0]['lr'])+" -> "+str(lr)})
 
     for param_group in optimizer.param_groups:
         param_group['lr'] = lr
@@ -276,51 +277,38 @@ def main():
     parser = parse_args(parser)
     args, _ = parser.parse_known_args()
 
-    LOGGER.set_model_name("Tacotron2_PyT")
-    LOGGER.set_backends([
-        dllg.StdOutBackend(log_file=None,
-                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
-        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
-                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
-    ])
-
-    LOGGER.timed_block_start("run")
-    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
-                           metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("iter_time",
-                           metric_scope=dllg.TRAIN_ITER_SCOPE)
-    LOGGER.register_metric("epoch_time",
-                           metric_scope=dllg.EPOCH_SCOPE)
-    LOGGER.register_metric("run_time",
-                           metric_scope=dllg.RUN_SCOPE)
-    LOGGER.register_metric("val_iter_loss",
-                           metric_scope=dllg.EPOCH_SCOPE)
-    LOGGER.register_metric("train_epoch_items/sec",
-                           metric_scope=dllg.EPOCH_SCOPE)
-    LOGGER.register_metric("train_epoch_avg_items/sec",
-                           metric_scope=dllg.EPOCH_SCOPE)
-    LOGGER.register_metric("train_epoch_avg_loss",
-                           metric_scope=dllg.EPOCH_SCOPE)
-
-    log_hardware()
+    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        local_rank = int(os.environ['LOCAL_RANK'])
+        world_size = int(os.environ['WORLD_SIZE'])
+    else:
+        local_rank = args.rank
+        world_size = args.world_size
 
-    model_name = args.model_name
-    parser = models.parse_model_args(model_name, parser)
-    parser.parse_args()
+    distributed_run = world_size > 1
 
-    args = parser.parse_args()
+    if local_rank == 0:
+        DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
+                                                  args.output+'/'+args.log_file),
+                                StdOutBackend(Verbosity.VERBOSE)])
+    else:
+        DLLogger.init(backends=[])
 
-    log_args(args)
+    for k,v in vars(args).items():
+        DLLogger.log(step="PARAMETER", data={k:v})
+    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
+
+    model_name = args.model_name
+    parser = models.parse_model_args(model_name, parser)
+    args, _ = parser.parse_known_args()
 
     torch.backends.cudnn.enabled = args.cudnn_enabled
     torch.backends.cudnn.benchmark = args.cudnn_benchmark
 
-    distributed_run = args.world_size > 1
     if distributed_run:
-        init_distributed(args, args.world_size, args.rank, args.group_name)
+        init_distributed(args, world_size, local_rank, args.group_name)
 
-    LOGGER.log(key=tags.RUN_START)
     run_start_time = time.time()
+    DLLogger.log(step=tuple(), data={'run_start': run_start_time})
 
     model_config = models.get_model_config(model_name, args)
     model = models.get_model(model_name, model_config,
@@ -374,21 +362,22 @@ def main():
     batch_to_gpu = data_functions.get_batch_to_gpu(model_name)
 
     iteration = 0
-    model.train()
+    train_epoch_avg_items_per_sec = 0.0
+    val_loss = 0.0
+    num_iters = 0
 
-    LOGGER.log(key=tags.TRAIN_LOOP)
+    model.train()
 
     for epoch in range(start_epoch, args.epochs):
-        LOGGER.epoch_start()
         epoch_start_time = time.time()
-        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
-
+        DLLogger.log(step=(epoch,) , data={'train_epoch_start': epoch_start_time})
         # used to calculate avg items/sec over epoch
         reduced_num_items_epoch = 0
 
         # used to calculate avg loss over epoch
         train_epoch_avg_loss = 0.0
         train_epoch_avg_items_per_sec = 0.0
+
         num_iters = 0
 
         # if overflow at the last iteration then do not save checkpoint
@@ -398,14 +387,14 @@ def main():
             train_loader.sampler.set_epoch(epoch)
 
         for i, batch in enumerate(train_loader):
-            print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))
-            LOGGER.iteration_start()
             iter_start_time = time.time()
-            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
+            DLLogger.log(step=(epoch, i),
+                         data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))})
+            DLLogger.log(step=(epoch, i), data={'train_iter_start': iter_start_time})
 
             start = time.perf_counter()
-            adjust_learning_rate(epoch, optimizer, args.learning_rate,
-                                 args.anneal_steps, args.anneal_factor)
+            adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
+                                 args.anneal_steps, args.anneal_factor, local_rank)
 
             model.zero_grad()
             x, y, num_items = batch_to_gpu(batch)
@@ -414,7 +403,7 @@ def main():
             loss = criterion(y_pred, y)
 
             if distributed_run:
-                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
+                reduced_loss = reduce_tensor(loss.data, world_size).item()
                 reduced_num_items = reduce_tensor(num_items.data, 1).item()
             else:
                 reduced_loss = loss.item()
@@ -422,7 +411,7 @@ def main():
             if np.isnan(reduced_loss):
                 raise Exception("loss is NaN")
 
-            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)
+            DLLogger.log(step=(epoch,i), data={'train_iter_loss': reduced_loss})
 
             train_epoch_avg_loss += reduced_loss
             num_iters += 1
@@ -442,60 +431,49 @@ def main():
 
             optimizer.step()
 
-            iteration += 1
-
-            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
-
             iter_stop_time = time.time()
             iter_time = iter_stop_time - iter_start_time
             items_per_sec = reduced_num_items/iter_time
             train_epoch_avg_items_per_sec += items_per_sec
 
-            LOGGER.log(key="train_iter_items/sec",
-                       value=items_per_sec)
-            LOGGER.log(key="iter_time", value=iter_time)
-            LOGGER.iteration_stop()
+            DLLogger.log(step=(epoch, i), data={'train_iter_items/sec': items_per_sec})
+            DLLogger.log(step=(epoch, i), data={'train_iter_stop': iter_stop_time})
+            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
+            iteration += 1
+
 
-        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
         epoch_stop_time = time.time()
         epoch_time = epoch_stop_time - epoch_start_time
 
-        LOGGER.log(key="train_epoch_items/sec",
-                   value=(reduced_num_items_epoch/epoch_time))
-        LOGGER.log(key="train_epoch_avg_items/sec",
-                   value=(train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0))
-        LOGGER.log(key="train_epoch_avg_loss", value=(
-            train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0))
-        LOGGER.log(key="epoch_time", value=epoch_time)
-
-        LOGGER.log(key=tags.EVAL_START, value=epoch)
-
-        validate(model, criterion, valset, iteration,
-                 args.batch_size, args.world_size, collate_fn,
-                 distributed_run, args.rank, batch_to_gpu)
+        DLLogger.log(step=(epoch,), data={'train_epoch_items/sec': reduced_num_items_epoch/epoch_time})
+        DLLogger.log(step=(epoch,), data={'train_epoch_avg_items/sec':
+                                          (train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0)})
+        DLLogger.log(step=(epoch,), data={'train_epoch_avg_loss': (train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0)})
+        DLLogger.log(step=(epoch,), data={'epoch_time': epoch_time})
 
-        LOGGER.log(key=tags.EVAL_STOP, value=epoch)
+        val_loss = validate(model, criterion, valset, epoch, i,
+                            args.batch_size, world_size, collate_fn,
+                            distributed_run, local_rank, batch_to_gpu)
 
-        if (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0:
+        if (epoch % args.epochs_per_checkpoint == 0) and local_rank == 0 and args.bench_class == "":
             checkpoint_path = os.path.join(
-                args.output_directory, "checkpoint_{}_{}".format(model_name, epoch))
+                args.output, "checkpoint_{}_{}".format(model_name, epoch))
             save_checkpoint(model, optimizer, epoch, model_config,
                             args.amp_run, checkpoint_path)
+        if local_rank == 0:
+            DLLogger.flush()
 
-        LOGGER.epoch_stop()
 
     run_stop_time = time.time()
+    DLLogger.log(step=tuple(), data={'run_stop': run_start_time})
     run_time = run_stop_time - run_start_time
-    LOGGER.log(key="run_time", value=run_time)
-    LOGGER.log(key=tags.RUN_FINAL)
-
-    print("training time", run_stop_time - run_start_time)
-
-    LOGGER.timed_block_stop("run")
-
-    if args.rank == 0:
-        LOGGER.finish()
+    DLLogger.log(step=tuple(), data={'run_time': run_time})
+    DLLogger.log(step=tuple(), data={'train_items_per_sec':
+                                     (train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0)})
+    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
 
+    if local_rank == 0:
+        DLLogger.flush()
 
 if __name__ == '__main__':
     main()

+ 93 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/README.md

@@ -0,0 +1,93 @@
+# Tacotron 2 and WaveGlow Inference For TensorRT
+
+This is a subfolder of the Tacotron 2 for PyTorch repository, tested and
+maintained by NVIDIA, and provides scripts to perform high-performance
+inference using NVIDIA TensorRT.
+The Tacotron 2 and WaveGlow models form a text-to-speech (TTS) system that
+enables users to synthesize natural sounding speech from raw transcripts
+without any additional information such as patterns and/or rhythms of speech.
+More information about the TTS system and its training can be found in the
+[Tacotron 2 PyTorch README](../README.md).
+NVIDIA TensorRT is a platform for high-performance deep learning inference.
+It includes a deep learning inference optimizer and runtime that delivers low
+latency and high-throughput for deep learning inference applications. After
+optimizing the compute-intensive acoustic model with NVIDIA TensorRT,
+inference throughput increased by up to *Xx* over native PyTorch.
+
+
+## Quick Start Guide
+
+1. Clone the repository.
+
+	```bash
+    git clone https://github.com/NVIDIA/DeepLearningExamples
+    cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
+    ```
+
+2. Download pretrained checkpoints from [NGC](https://ngc.nvidia.com/catalog/models)
+and store them in `./checkpoints` directory:
+
+- [Tacotron2 checkpoint](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16)
+- [WaveGlow checkpoint](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16)
+
+    ```bash
+    mkdir -p checkpoints
+    mv <Tacotron2_checkpoint> <WaveGlow_checkpoint> ./checkpoints/
+    ```
+
+3. Build the Tacotron 2 and WaveGlow PyTorch NGC container.
+
+    ```bash
+    bash scripts/docker/build.sh
+    ```
+
+4. Start an interactive session in the NGC container to run training/inference.
+After you build the container image, you can start an interactive CLI session with:
+
+    ```bash
+    bash scripts/docker/interactive.sh
+    ```
+
+5. Export the models to ONNX intermediate representations (ONNX IRs).
+Export Tacotron 2 to three ONNX parts: Encoder, Decoder, and Postnet:
+
+	```bash
+	python exports/export_tacotron2_onnx.py --tacotron2 ./checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/
+	```
+
+Export WaveGlow to ONNX IR:
+
+	```bash
+	python exports/export_waveglow_onnx.py --waveglow ./checkpoints/nvidia_waveglow256pyt_fp16 --wn-channels 256 -o output/
+	```
+
+After running the above commands, there should be four new files in `./output/`
+directory: `encoder.onnx`, `decoder_iter.onnx`, `postnet.onnx`, and `waveglow.onnx`.
+
+6. Export the ONNX IRs to TensorRT engines:
+
+	```bash
+	python trt/export_onnx2trt.py --encoder output/encoder.onnx --decoder output/decoder_iter.onnx --postnet output/postnet.onnx --waveglow output/waveglow.onnx -o output/ --fp16
+	```
+
+After running the command, there should be four new files in `./output/`
+directory: `encoder_fp16.engine`, `decoder_iter_fp16.engine`, 
+`postnet_fp16.engine`, and `waveglow_fp16.engine`.
+
+7. Run the inference:
+
+	```bash
+	python trt/inference_trt.py -i phrases/phrase.txt --encoder output/encoder_fp16.engine --decoder output/decoder_iter_fp16.engine --postnet output/postnet_fp16.engine --waveglow output/waveglow_fp16.engine -o output/
+	```
+
+## Inference performance: NVIDIA T4
+
+Our results were obtained by running the `./trt/run_latency_tests_trt.sh` script in
+the PyTorch-19.11-py3 NGC container. Please note that to reproduce the results,
+you need to provide pretrained checkpoints for Tacotron 2 and WaveGlow. Please
+edit the script to provide your checkpoint filenames.
+
+|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16| | | | | | | | | | |
+|1| 128| FP32| | | | | | | | | | |

+ 130 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/export_onnx2trt.py

@@ -0,0 +1,130 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import pycuda.driver as cuda
+import pycuda.autoinit
+import tensorrt as trt
+import onnx
+import argparse
+
+import sys
+sys.path.append('./')
+
+from trt.trt_utils import build_engine
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+    """
+    parser.add_argument('-o', '--output', required=True,
+                        help='output folder to save audio (file per phrase)')
+    parser.add_argument('--encoder', type=str, default="",
+                        help='full path to the Encoder ONNX')
+    parser.add_argument('--decoder', type=str, default="",
+                        help='full path to the DecoderIter ONNX')
+    parser.add_argument('--postnet', type=str, default="",
+                        help='full path to the Postnet ONNX')
+    parser.add_argument('--waveglow', type=str, default="",
+                        help='full path to the WaveGlow ONNX')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with FP16')
+
+    return parser
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description='Export from ONNX to TensorRT for Tacotron 2 and WaveGlow')
+    parser = parse_args(parser)
+    args = parser.parse_args()
+
+    engine_prec = "_fp16" if args.fp16 else "_fp32"
+
+    # Encoder
+    shapes=[{"name": "sequences",        "min": (1,4), "opt": (1,128), "max": (4,256)},
+            {"name": "sequence_lengths", "min": (1,),  "opt": (1,),    "max": (4,)}]
+    if args.encoder != "":
+        print("Building Encoder ...")
+        encoder_engine = build_engine(args.encoder, shapes=shapes, fp16=args.fp16)
+        if encoder_engine is not None:
+            with open(args.output+"/"+"encoder"+engine_prec+".engine", 'wb') as f:
+                f.write(encoder_engine.serialize())
+        else:
+            print("Failed to build engine from", args.encoder)
+            sys.exit()
+
+    # DecoderIter
+    shapes=[{"name": "decoder_input",         "min": (1,80),    "opt": (1,80),      "max": (4,80)},
+            {"name": "attention_hidden",      "min": (1,1024),  "opt": (1,1024),    "max": (4,1024)},
+            {"name": "attention_cell",        "min": (1,1024),  "opt": (1,1024),    "max": (4,1024)},
+            {"name": "decoder_hidden",        "min": (1,1024),  "opt": (1,1024),    "max": (4,1024)},
+            {"name": "decoder_cell",          "min": (1,1024),  "opt": (1,1024),    "max": (4,1024)},
+            {"name": "attention_weights",     "min": (1,4),     "opt": (1,128),     "max": (4,256)},
+            {"name": "attention_weights_cum", "min": (1,4),     "opt": (1,128),     "max": (4,256)},
+            {"name": "attention_context",     "min": (1,512),   "opt": (1,512),     "max": (4,512)},
+            {"name": "memory",                "min": (1,4,512), "opt": (1,128,512), "max": (4,256,512)},
+            {"name": "processed_memory",      "min": (1,4,128), "opt": (1,128,128), "max": (4,256,128)},
+            {"name": "mask",                  "min": (1,4),     "opt": (1,128),     "max": (4,256)}]
+    if args.decoder != "":
+        print("Building Decoder ...")
+        decoder_iter_engine = build_engine(args.decoder, shapes=shapes, fp16=args.fp16)
+        if decoder_iter_engine is not None:
+            with open(args.output+"/"+"decoder_iter"+engine_prec+".engine", 'wb') as f:
+                f.write(decoder_iter_engine.serialize())
+        else:
+            print("Failed to build engine from", args.decoder)
+            sys.exit()
+
+    # Postnet
+    shapes=[{"name": "mel_outputs", "min": (1,80,32), "opt": (1,80,768), "max": (4,80,1664)}]
+    if args.postnet != "":
+        print("Building Postnet ...")
+        postnet_engine = build_engine(args.postnet, shapes=shapes, fp16=args.fp16)
+        if postnet_engine is not None:
+            with open(args.output+"/"+"postnet"+engine_prec+".engine", 'wb') as f:
+                f.write(postnet_engine.serialize())
+        else:
+            print("Failed to build engine from", args.postnet)
+            sys.exit()
+
+    # WaveGlow
+    shapes=[{"name": "mel", "min": (1,80,32,1),  "opt": (1,80,768,1),  "max": (1,80,1664,1)},
+            {"name": "z",   "min": (1,8,1024,1), "opt": (1,8,24576,1), "max": (1,8,53248,1)}]
+    if args.waveglow != "":
+        print("Building WaveGlow ...")
+        waveglow_engine = build_engine(args.waveglow, shapes=shapes, fp16=args.fp16)
+        if waveglow_engine is not None:
+            with open(args.output+"/"+"waveglow"+engine_prec+".engine", 'wb') as f:
+                f.write(waveglow_engine.serialize())
+        else:
+            print("Failed to build engine from", args.waveglow)
+            sys.exit()
+
+
+if __name__ == '__main__':
+    main()

+ 368 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/inference_trt.py

@@ -0,0 +1,368 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import tensorrt as trt
+import numpy as np
+from scipy.io.wavfile import write
+import pycuda.autoinit
+import pycuda.driver as cuda
+import time
+import torch
+import argparse
+import sys
+
+sys.path.append('./')
+
+from common.utils import to_gpu, get_mask_from_lengths
+from tacotron2.text import text_to_sequence
+from inference import MeasureTime, prepare_input_sequence, load_and_setup_model
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
+from trt.trt_utils import load_engine, run_trt_engine
+
+from waveglow.denoiser import Denoiser
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+    """
+    parser.add_argument('-i', '--input', type=str, required=True,
+                        help='full path to the input text (phareses separated by new line)')
+    parser.add_argument('-o', '--output', required=True,
+                        help='output folder to save audio (file per phrase)')
+    parser.add_argument('--encoder', type=str, required=True,
+                        help='full path to the Encoder engine')
+    parser.add_argument('--decoder', type=str, required=True,
+                        help='full path to the DecoderIter engine')
+    parser.add_argument('--postnet', type=str, required=True,
+                        help='full path to the Postnet engine')
+    parser.add_argument('--waveglow', type=str, required=True,
+                        help='full path to the WaveGlow engine')
+    parser.add_argument('--waveglow-ckpt', type=str, default="",
+                        help='full path to the WaveGlow model checkpoint file')
+    parser.add_argument('--log-file', type=str, default='nvlog.json',
+                        help='Filename for logging')
+    parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
+    parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
+                        help='Sampling rate')
+    parser.add_argument('--stft-hop-length', type=int, default=256,
+                        help='STFT hop length for estimating audio length from mel size')
+
+    return parser
+
+
+def init_decoder_inputs(memory, processed_memory, memory_lengths):
+
+    bs = memory.size(0)
+    seq_len = memory.size(1)
+    attention_rnn_dim = 1024
+    decoder_rnn_dim = 1024
+    encoder_embedding_dim = 512
+    n_mel_channels = 80
+
+    attention_hidden = torch.zeros(bs, attention_rnn_dim).cuda().float()
+    attention_cell = torch.zeros(bs, attention_rnn_dim).cuda().float()
+    decoder_hidden = torch.zeros(bs, decoder_rnn_dim).cuda().float()
+    decoder_cell = torch.zeros(bs, decoder_rnn_dim).cuda().float()
+    attention_weights = torch.zeros(bs, seq_len).cuda().float()
+    attention_weights_cum = torch.zeros(bs, seq_len).cuda().float()
+    attention_context = torch.zeros(bs, encoder_embedding_dim).cuda().float()
+    mask = get_mask_from_lengths(memory_lengths).cuda()
+    decoder_input = torch.zeros(bs, n_mel_channels).cuda().float()
+
+    return (decoder_input, attention_hidden, attention_cell, decoder_hidden,
+            decoder_cell, attention_weights, attention_weights_cum,
+            attention_context, memory, processed_memory, mask)
+
+def init_decoder_outputs(memory, memory_lengths):
+
+    bs = memory.size(0)
+    seq_len = memory.size(1)
+    attention_rnn_dim = 1024
+    decoder_rnn_dim = 1024
+    encoder_embedding_dim = 512
+    n_mel_channels = 80
+
+    attention_hidden = torch.zeros(bs, attention_rnn_dim).cuda().float()
+    attention_cell = torch.zeros(bs, attention_rnn_dim).cuda().float()
+    decoder_hidden = torch.zeros(bs, decoder_rnn_dim).cuda().float()
+    decoder_cell = torch.zeros(bs, decoder_rnn_dim).cuda().float()
+    attention_weights = torch.zeros(bs, seq_len).cuda().float()
+    attention_weights_cum = torch.zeros(bs, seq_len).cuda().float()
+    attention_context = torch.zeros(bs, encoder_embedding_dim).cuda().float()
+    decoder_output = torch.zeros(bs, n_mel_channels).cuda().float()
+    gate_prediction = torch.zeros(bs, 1).cuda().float()
+
+    return (attention_hidden, attention_cell, decoder_hidden,
+            decoder_cell, attention_weights, attention_weights_cum,
+            attention_context, decoder_output, gate_prediction)
+
+def init_decoder_tensors(decoder_inputs, decoder_outputs):
+
+    decoder_tensors = {
+        # inputs
+        'decoder_input': decoder_inputs[0],
+        'attention_hidden': decoder_inputs[1],
+        'attention_cell': decoder_inputs[2],
+        'decoder_hidden': decoder_inputs[3],
+        'decoder_cell': decoder_inputs[4],
+        'attention_weights': decoder_inputs[5],
+        'attention_weights_cum': decoder_inputs[6],
+        'attention_context': decoder_inputs[7],
+        'memory': decoder_inputs[8],
+        'processed_memory': decoder_inputs[9],
+        'mask': decoder_inputs[10],
+        # outputs
+        'out_attention_hidden': decoder_outputs[0],
+        'out_attention_cell': decoder_outputs[1],
+        'out_decoder_hidden': decoder_outputs[2],
+        'out_decoder_cell': decoder_outputs[3],
+        'out_attention_weights': decoder_outputs[4],
+        'out_attention_weights_cum': decoder_outputs[5],
+        'out_attention_context': decoder_outputs[6],
+        'decoder_output': decoder_outputs[7],
+        'gate_prediction': decoder_outputs[8],
+    }
+    return decoder_tensors
+
+def swap_inputs_outputs(decoder_inputs, decoder_outputs):
+
+    new_decoder_inputs = (decoder_outputs[7], # decoder_output
+                          decoder_outputs[0], # attention_hidden
+                          decoder_outputs[1], # attention_cell
+                          decoder_outputs[2], # decoder_hidden
+                          decoder_outputs[3], # decoder_cell
+                          decoder_outputs[4], # attention_weights
+                          decoder_outputs[5], # attention_weights_cum
+                          decoder_outputs[6], # attention_context
+                          decoder_inputs[8],  # memory
+                          decoder_inputs[9],  # processed_memory
+                          decoder_inputs[10]) # mask
+
+    new_decoder_outputs = (decoder_inputs[1], # attention_hidden
+                           decoder_inputs[2], # attention_cell
+                           decoder_inputs[3], # decoder_hidden
+                           decoder_inputs[4], # decoder_cell
+                           decoder_inputs[5], # attention_weights
+                           decoder_inputs[6], # attention_weights_cum
+                           decoder_inputs[7], # attention_context
+                           decoder_inputs[0], # decoder_input
+                           decoder_outputs[8])# gate_output
+
+    return new_decoder_inputs, new_decoder_outputs
+
+
+def infer_tacotron2_trt(encoder, decoder_iter, postnet,
+                        encoder_context, decoder_context, postnet_context,
+                        sequences, sequence_lengths, measurements):
+    """Run Tacotron2 inference through three TensorRT engines.
+
+    encoder / decoder_iter / postnet are deserialized TRT engines and the
+    *_context arguments are their execution contexts.  sequences holds a
+    batch of encoded character ids; sequence_lengths their lengths
+    (sequence_lengths[0] is used as the max length, so the batch is
+    presumably sorted longest-first -- TODO confirm against the caller).
+    Per-stage timings are accumulated into `measurements`.
+
+    Returns (mel_outputs_postnet, mel_lengths).
+    """
+
+    # Pre-allocated encoder output buffers.  512 and 128 look like the
+    # encoder embedding and processed-attention dims of the exported
+    # model -- NOTE(review): confirm they match the engines.
+    memory = torch.zeros((len(sequence_lengths),sequence_lengths[0],512)).cuda().float()
+    processed_memory = torch.zeros((len(sequence_lengths),sequence_lengths[0],128)).cuda().float()
+    lens = torch.zeros_like(sequence_lengths)
+
+    encoder_tensors = {
+        # inputs
+        'sequences': sequences, 'sequence_lengths': sequence_lengths,
+        # outputs
+        'memory': memory, 'lens': lens, 'processed_memory': processed_memory
+    }
+
+    print("Running Tacotron2 Encoder")
+    with MeasureTime(measurements, "tacotron2_encoder_time"):
+        run_trt_engine(encoder_context, encoder, encoder_tensors)
+
+    # Per-sample decoding state: mel_lengths counts generated frames,
+    # not_finished flags samples whose stop gate has not yet fired.
+    device = memory.device
+    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device = device)
+    not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device = device)
+    mel_outputs, gate_outputs, alignments = (torch.zeros(1, device = device), torch.zeros(1, device = device), torch.zeros(1, device = device))
+    gate_threshold = 0.6
+    max_decoder_steps = 1664
+    first_iter = True
+
+    decoder_inputs = init_decoder_inputs(memory, processed_memory, sequence_lengths)
+    decoder_outputs = init_decoder_outputs(memory, sequence_lengths)
+
+    print("Running Tacotron2 Decoder")
+    while True:
+        decoder_tensors = init_decoder_tensors(decoder_inputs, decoder_outputs)
+        with MeasureTime(measurements, "step"):
+            run_trt_engine(decoder_context, decoder_iter, decoder_tensors)
+
+        # Tensor layout (see swap_inputs_outputs): outputs[7] is the new
+        # mel frame (decoder_input slot), outputs[8] the gate output,
+        # outputs[4] the attention weights for the current step.
+        if first_iter:
+            mel_outputs = torch.unsqueeze(decoder_outputs[7], 2)
+            gate_outputs = torch.unsqueeze(decoder_outputs[8], 2)
+            alignments = torch.unsqueeze(decoder_outputs[4], 2)
+            measurements['tacotron2_decoder_time'] = measurements['step']
+            first_iter = False
+        else:
+            mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(decoder_outputs[7], 2)), 2)
+            gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(decoder_outputs[8], 2)), 2)
+            alignments = torch.cat((alignments, torch.unsqueeze(decoder_outputs[4], 2)), 2)
+            measurements['tacotron2_decoder_time'] += measurements['step']
+
+        # A sample keeps running while sigmoid(gate) <= threshold; once a
+        # sample finishes it stops contributing to its mel_lengths entry.
+        dec = torch.le(torch.sigmoid(decoder_outputs[8]), gate_threshold).to(torch.int32).squeeze(1)
+        not_finished = not_finished*dec
+        mel_lengths += not_finished
+
+        if torch.sum(not_finished) == 0:
+            print("Stopping after",mel_outputs.size(2),"decoder steps")
+            break
+        if mel_outputs.size(2) == max_decoder_steps:
+            print("Warning! Reached max decoder steps")
+            break
+
+        # Double-buffering: this iteration's outputs become the next
+        # iteration's inputs without reallocating any tensors.
+        decoder_inputs, decoder_outputs = swap_inputs_outputs(decoder_inputs, decoder_outputs)
+
+    mel_outputs_postnet = torch.zeros_like(mel_outputs).cuda().float()
+
+    postnet_tensors = {
+        # inputs
+        'mel_outputs': mel_outputs,
+        # outputs
+        'mel_outputs_postnet': mel_outputs_postnet
+    }
+    print("Running Tacotron2 Postnet")
+    with MeasureTime(measurements, "tacotron2_postnet_time"):
+        run_trt_engine(postnet_context, postnet, postnet_tensors)
+
+    print("Tacotron2 Postnet done")
+
+    return mel_outputs_postnet, mel_lengths
+
+
+def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements):
+
+    mel = mel.unsqueeze(3)
+    mel_size = mel.size(2)
+    batch_size = mel.size(0)
+    stride = 256
+    kernel_size = 1024
+    n_group = 8
+    z_size = (mel_size-1)*stride+(kernel_size-1)+1
+    z_size = z_size - (kernel_size-stride)
+    z_size = z_size//n_group
+    z = torch.randn(batch_size, n_group, z_size, 1).cuda().float()
+
+    audios = torch.zeros(batch_size, mel_size*256).cuda()
+
+    waveglow_tensors = {
+        # inputs
+        'mel': mel, 'z': z,
+        # outputs
+        'audio': audios
+    }
+    print("Running WaveGlow")
+    with MeasureTime(measurements, "waveglow_time"):
+        run_trt_engine(waveglow_context, waveglow, waveglow_tensors)
+
+    return audios
+
+def main():
+    """Text-to-speech entry point running entirely on TensorRT engines.
+
+    Loads the four serialized engines, synthesizes audio for every line of
+    the input text file, optionally denoises using a WaveGlow PyTorch
+    checkpoint, writes one .wav per input line, and logs per-stage latency
+    through DLLogger.
+    """
+
+    parser = argparse.ArgumentParser(
+        description='TensorRT Tacotron 2 Inference')
+    parser = parse_args(parser)
+    args, _ = parser.parse_known_args()
+
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+    encoder = load_engine(args.encoder, TRT_LOGGER)
+    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
+    postnet = load_engine(args.postnet, TRT_LOGGER)
+    waveglow = load_engine(args.waveglow, TRT_LOGGER)
+
+
+
+    if args.waveglow_ckpt != "":
+        # setup denoiser using WaveGlow PyTorch checkpoint
+        waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
+                                             True, forward_is_infer=True)
+        denoiser = Denoiser(waveglow_ckpt).cuda()
+        # after initialization, we don't need WaveGlow PyTorch checkpoint
+        # anymore - deleting
+        del waveglow_ckpt
+        torch.cuda.empty_cache()
+
+
+    # initialize CUDA state
+    torch.cuda.init()
+    # create TRT contexts for each engine
+    encoder_context = encoder.create_execution_context()
+    decoder_context = decoder_iter.create_execution_context()
+    postnet_context = postnet.create_execution_context()
+    waveglow_context = waveglow.create_execution_context()
+
+    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
+                                              args.output+'/'+args.log_file),
+                            StdOutBackend(Verbosity.VERBOSE)])
+
+    # NOTE(review): the bare except hides the real failure cause (bad path
+    # vs. permissions) and the file handle is never closed -- consider
+    # `with open(...)` plus catching OSError.
+    texts = []
+    try:
+        f = open(args.input, 'r')
+        texts = f.readlines()
+    except:
+        print("Could not read file")
+        sys.exit(1)
+
+    measurements = {}
+
+    sequences, sequence_lengths = prepare_input_sequence(texts)
+
+    # The engines expect int32 character ids and lengths.
+    sequences = sequences.to(torch.int32)
+    sequence_lengths = sequence_lengths.to(torch.int32)
+    mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
+                                           encoder_context, decoder_context, postnet_context,
+                                           sequences, sequence_lengths, measurements)
+    audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements)
+
+    # Entering and immediately leaving the contexts releases their TRT
+    # resources via __exit__.
+    with encoder_context, decoder_context,  postnet_context, waveglow_context:
+        pass
+
+    # NOTE(review): Tensor.float() is not in-place and the result is
+    # discarded, so this line is a no-op as written.
+    audios.float()
+    if args.waveglow_ckpt != "":
+        with MeasureTime(measurements, "denoiser"):
+            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
+
+    # Trim each waveform to its true length and peak-normalize before saving.
+    for i, audio in enumerate(audios):
+        audio = audio[:mel_lengths[i]*args.stft_hop_length]
+        audio = audio/torch.max(torch.abs(audio))
+        # NOTE(review): unlike the log path above, no '/' separator is
+        # inserted here; assumes args.output ends with one -- TODO confirm.
+        audio_path = args.output + "audio_"+str(i)+"_trt.wav"
+        write(audio_path, args.sampling_rate, audio.cpu().numpy())
+
+    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
+    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
+    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
+    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
+    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_encoder_time']+
+                                           measurements['tacotron2_decoder_time']+
+                                           measurements['tacotron2_postnet_time']+
+                                           measurements['waveglow_time'])})
+    if args.waveglow_ckpt != "":
+        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
+    DLLogger.flush()
+
+if __name__ == "__main__":
+    main()

+ 4 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/run_latency_tests_trt.sh

@@ -0,0 +1,4 @@
+# Latency benchmarks for TRT inference: fp16 engines first, then fp32.
+bash test_infer.sh --test trt/test_infer_trt.py \
+     -bs 1 -il 128 -p fp16 --num-iters 1003 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_iter_fp16.engine --postnet ./output/postnet_fp16.engine  --waveglow ./output/waveglow_fp16.engine
+bash test_infer.sh --test trt/test_infer_trt.py \
+     -bs 1 -il 128 -p fp32 --num-iters 1003 --encoder ./output/encoder_fp32.engine --decoder ./output/decoder_iter_fp32.engine --postnet ./output/postnet_fp32.engine  --waveglow ./output/waveglow_fp32.engine

+ 181 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/test_infer_trt.py

@@ -0,0 +1,181 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+import argparse
+import numpy as np
+from scipy.io.wavfile import write
+import tensorrt as trt
+import sys
+sys.path.append('./')
+
+import time
+import dllogger as DLLogger
+from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
+
+from apex import amp
+
+from inference import MeasureTime, prepare_input_sequence
+from test_infer import print_stats
+from trt.inference_trt import infer_tacotron2_trt, infer_waveglow_trt
+from trt.trt_utils import load_engine
+import models
+
+from test_infer import print_stats
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+
+    Adds the TRT benchmark options (engine paths, precision, iteration
+    count, input/batch sizes) to `parser` and returns it.
+    """
+    parser.add_argument('--encoder', type=str, required=True,
+                        help='full path to the Encoder TRT engine')
+    parser.add_argument('--decoder', type=str, required=True,
+                        help='full path to the DecoderIter TRT engine')
+    parser.add_argument('--postnet', type=str, required=True,
+                        help='full path to the Postnet TRT engine')
+    parser.add_argument('--waveglow', type=str, required=True,
+                        help='full path to the WaveGlow TRT engine')
+    parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
+    parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
+                        help='Sampling rate')
+    # NOTE(review): help text is truncated; presumably "run inference in
+    # fp16" -- the string itself is runtime output, left unchanged here.
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference ')
+    parser.add_argument('--log-file', type=str, default='nvlog.json',
+                        help='Filename for logging')
+    parser.add_argument('--stft-hop-length', type=int, default=256,
+                        help='STFT hop length for estimating audio length from mel size')
+    parser.add_argument('--num-iters', type=int, default=10,
+                        help='Number of iterations')
+    parser.add_argument('-il', '--input-length', type=int, default=64,
+                        help='Input length')
+    parser.add_argument('-bs', '--batch-size', type=int, default=1,
+                        help='Batch size')
+
+    return parser
+
+
+def main():
+    """
+    Launches text to speech (inference).
+    Inference is executed on a single GPU.
+
+    Benchmark driver: runs --num-iters synthesis iterations on one fixed
+    sentence (truncated to --input-length chars and repeated --batch-size
+    times), discards the first `warmup_iters` runs, and reports per-stage
+    latency/throughput via DLLogger and print_stats.
+    """
+    parser = argparse.ArgumentParser(
+        description='PyTorch Tacotron 2 Inference')
+    parser = parse_args(parser)
+    args, unknown_args = parser.parse_known_args()
+
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+    encoder = load_engine(args.encoder, TRT_LOGGER)
+    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
+    postnet = load_engine(args.postnet, TRT_LOGGER)
+    waveglow = load_engine(args.waveglow, TRT_LOGGER)
+
+    # initialize CUDA state
+    torch.cuda.init()
+    # create TRT contexts for each engine
+    encoder_context = encoder.create_execution_context()
+    decoder_context = decoder_iter.create_execution_context()
+    postnet_context = postnet.create_execution_context()
+    waveglow_context = waveglow.create_execution_context()
+
+    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
+                            StdOutBackend(Verbosity.VERBOSE)])
+    for k,v in vars(args).items():
+        DLLogger.log(step="PARAMETER", data={k:v})
+    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
+
+    # Accumulators for post-warmup statistics; measurement keys not listed
+    # here (e.g. the decoder's per-step "step" timing) are not aggregated.
+    measurements_all = {"pre_processing": [],
+                        "tacotron2_latency": [],
+                        "waveglow_latency": [],
+                        "latency": [],
+                        "type_conversion": [],
+                        "data_transfer": [],
+                        "storage": [],
+                        "tacotron2_items_per_sec": [],
+                        "waveglow_items_per_sec": [],
+                        "num_mels_per_audio": [],
+                        "throughput": []}
+
+    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
+    texts = [texts[0][:args.input_length]]
+    texts = texts*args.batch_size
+
+    warmup_iters = 3
+
+    # NOTE(review): `iter` shadows the builtin of the same name.
+    for iter in range(args.num_iters):
+
+        measurements = {}
+
+        with MeasureTime(measurements, "pre_processing"):
+            sequences_padded, input_lengths = prepare_input_sequence(texts)
+
+        with torch.no_grad():
+            with MeasureTime(measurements, "latency"):
+                with MeasureTime(measurements, "tacotron2_latency"):
+                    mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
+                                                           encoder_context, decoder_context, postnet_context,
+                                                           sequences_padded, input_lengths, measurements)
+
+                with MeasureTime(measurements, "waveglow_latency"):
+                    audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements)
+
+        num_mels = mel.size(0)*mel.size(2)
+        num_samples = audios.size(0)*audios.size(1)
+
+        with MeasureTime(measurements, "type_conversion"):
+            audios = audios.float()
+
+        with MeasureTime(measurements, "data_transfer"):
+            audios = audios.cpu()
+
+        # Output files are overwritten on every iteration; the write is
+        # timed as the "storage" stage.
+        with MeasureTime(measurements, "storage"):
+            audios = audios.numpy()
+            for i, audio in enumerate(audios):
+                audio_path = "audio_"+str(i)+".wav"
+                write(audio_path, args.sampling_rate,
+                      audio[:mel_lengths[i]*args.stft_hop_length])
+
+        # Derived throughput metrics for this iteration.
+        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
+        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
+        measurements['num_mels_per_audio'] = mel.size(2)
+        measurements['throughput'] = num_samples/measurements['latency']
+
+        # Only record measurements once warmup iterations have passed.
+        if iter >= warmup_iters:
+            for k,v in measurements.items():
+                if k in measurements_all.keys():
+                    measurements_all[k].append(v)
+                    DLLogger.log(step=(iter-warmup_iters), data={k: v})
+
+    # Entering and leaving the contexts releases their TRT resources.
+    with encoder_context, decoder_context, postnet_context, waveglow_context:
+        pass
+
+    DLLogger.flush()
+
+    print_stats(measurements_all)
+
+if __name__ == '__main__':
+    main()

+ 98 - 0
PyTorch/SpeechSynthesis/Tacotron2/trt/trt_utils.py

@@ -0,0 +1,98 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import tensorrt as trt
+
+def binding_info(engine, context):
+    """Debug helper: print every engine binding and the context's shape state."""
+    for i in range(engine.num_bindings):
+        print("|||| binding", i)
+        print("|||| binding_is_input", engine.binding_is_input(i))
+        print("|||| get_binding_dtype", engine.get_binding_dtype(i))
+        print("|||| get_binding_name", engine.get_binding_name(i))
+        print("|||| get_binding_shape", engine.get_binding_shape(i))
+        print("|||| get_binding_vectorized_dim", engine.get_binding_vectorized_dim(i))
+
+    print("|||| all_binding_shapes_specified", context.all_binding_shapes_specified)
+    print("|||| all_shape_inputs_specified", context.all_shape_inputs_specified)
+
+
+def is_dimension_dynamic(dim):
+    # A binding dimension is dynamic when it is unknown (None) or
+    # non-positive (TensorRT uses -1 for dynamic dims).
+    return dim is None or dim <= 0
+
+
+def is_shape_dynamic(shape):
+    # A shape is dynamic if any of its dimensions is dynamic.
+    return any([is_dimension_dynamic(dim) for dim in shape])
+
+
+def run_trt_engine(context, engine, tensors):
+    """Execute a TRT engine synchronously.
+
+    `tensors` maps binding names to pre-allocated, device-resident torch
+    tensors (inputs and outputs alike); outputs are written in place.
+    """
+
+    bindings = [None]*engine.num_bindings
+    for name in tensors.keys():
+        idx = engine.get_binding_index(name)
+        tensor = tensors.get(name)
+        bindings[idx] = tensor.data_ptr()
+        # Shape-input bindings take the tensor itself as the shape value;
+        # data bindings with dynamic dims need their concrete shape set.
+        if engine.is_shape_binding(idx) and is_shape_dynamic(context.get_shape(idx)):
+            context.set_shape_input(idx, tensor)
+        elif is_shape_dynamic(context.get_binding_shape(idx)):
+            context.set_binding_shape(idx, tensor.shape)
+
+    # binding_info(engine, context)
+    context.execute_v2(bindings=bindings)
+
+
+def load_engine(engine_filepath, trt_logger):
+    """Deserialize and return a TRT engine from `engine_filepath`."""
+    with open(engine_filepath, "rb") as f, trt.Runtime(trt_logger) as runtime:
+        engine = runtime.deserialize_cuda_engine(f.read())
+    return engine
+
+
+def build_engine(model_file, shapes, max_ws=512*1024*1024, fp16=False):
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(TRT_LOGGER)
+    builder.fp16_mode = fp16
+
+    config = builder.create_builder_config()
+    config.max_workspace_size = max_ws
+    if fp16:
+        config.flags |= 1 << int(trt.BuilderFlag.FP16)
+    profile = builder.create_optimization_profile()
+    for s in shapes:
+        profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
+    config.add_optimization_profile(profile)
+    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(explicit_batch)
+
+    with trt.OnnxParser(network, TRT_LOGGER) as parser:
+        with open(model_file, 'rb') as model:
+            parsed = parser.parse(model.read())
+            for i in range(parser.num_errors):
+                e = parser.get_error(i)
+            engine = builder.build_engine(network, config=config)
+
+            return engine
+
+

+ 8 - 5
PyTorch/SpeechSynthesis/Tacotron2/waveglow/model.py

@@ -233,6 +233,7 @@ class WaveGlow(torch.nn.Module):
         return torch.cat(output_audio, 1), log_s_list, log_det_W_list
 
     def infer(self, spect, sigma=1.0):
+
         spect = self.upsample(spect)
         # trim conv artifacts. maybe pad spec to kernel multiple
         time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
@@ -272,21 +273,22 @@ class WaveGlow(torch.nn.Module):
         return audio
 
 
-    def infer_onnx(self, spect, z, sigma=1.0):
+    def infer_onnx(self, spect, z, sigma=0.9):
 
         spect = self.upsample(spect)
         # trim conv artifacts. maybe pad spec to kernel multiple
         time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
         spect = spect[:, :, :-time_cutoff]
 
-        length_spect_group = int(spect.size(2)/8)
+        length_spect_group = spect.size(2)//8
         mel_dim = 80
+        batch_size = spect.size(0)
 
         spect = torch.squeeze(spect, 3)
-        spect = spect.view((1, mel_dim, length_spect_group, self.n_group))
+        spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group))
         spect = spect.permute(0, 2, 1, 3)
         spect = spect.contiguous()
-        spect = spect.view((1, length_spect_group, self.n_group*mel_dim))
+        spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim))
         spect = spect.permute(0, 2, 1)
         spect = torch.unsqueeze(spect, 3)
 
@@ -312,9 +314,10 @@ class WaveGlow(torch.nn.Module):
                 z = z[:, self.n_early_size:self.n_group, :, :]
 
         audio = torch.squeeze(audio, 3)
-        audio = audio.permute(0,2,1).contiguous().view(1, (length_spect_group * self.n_group))
+        audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group))
 
         return audio
+
     @staticmethod
     def remove_weightnorm(model):
         waveglow = model