@@ -0,0 +1,414 @@
+# *****************************************************************************
+#  Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+import argparse
+
+import sys
+sys.path.append('./')
+
+import models
+from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, prepare_input_sequence
+from common.utils import to_gpu, get_mask_from_lengths
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+    """
+    parser.add_argument('--tacotron2', type=str,
+                        help='full path to the Tacotron2 model checkpoint file')
+    parser.add_argument('-o', '--output', type=str, required=True,
+                        help='directory for the exported Tacotron 2 ONNX models (encoder, decoder, postnet)')
+
+    return parser
+
+
+def encoder_infer(self, x, input_lengths):
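+    # Trace-friendly replacement for Encoder.infer (bound to the Encoder
+    # wrapper below): convolution stack plus bidirectional LSTM, with the
+    # sequence lengths moved to the CPU for packing.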
+    device = x.device
+    for conv in self.convolutions:
+        x = F.dropout(F.relu(conv(x.to(device))), 0.5, False)
+
+    x = x.transpose(1, 2)
+
+    # pack_padded_sequence requires the lengths on the CPU
+    input_lengths_cpu = input_lengths.cpu().numpy()
+    x = nn.utils.rnn.pack_padded_sequence(
+        x, input_lengths_cpu, batch_first=True)
+
+    outputs, _ = self.lstm(x)
+
+    outputs, _ = nn.utils.rnn.pad_packed_sequence(
+        outputs, batch_first=True)
+
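+    # NOTE: 'lens' is deliberately computed (input_lengths*2) instead of
+    # passed through unchanged; a pure pass-through would make this ONNX
+    # output an alias of a graph input.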
+    lens = input_lengths*2
+
+    return outputs, lens
+
+
+class Encoder(torch.nn.Module):
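+    """Wraps the Tacotron 2 encoder together with the attention memory layer
+    so the whole block can be exported as a single ONNX graph."""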
+    def __init__(self, tacotron2):
+        super(Encoder, self).__init__()
+        self.tacotron2 = tacotron2
+        self.tacotron2.encoder.lstm.flatten_parameters()
+        self.infer = encoder_infer
+
+    def forward(self, sequence, sequence_lengths):
+        embedded_inputs = self.tacotron2.embedding(sequence).transpose(1, 2)
+        memory, lens = self.infer(self.tacotron2.encoder, embedded_inputs, sequence_lengths)
+        processed_memory = self.tacotron2.decoder.attention_layer.memory_layer(memory)
+        return memory, processed_memory, lens
+
+class Postnet(torch.nn.Module):
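+    """Wraps the Tacotron 2 postnet and folds the residual addition into the
+    module, so the exported graph outputs the final mel spectrogram."""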
+    def __init__(self, tacotron2):
+        super(Postnet, self).__init__()
+        self.tacotron2 = tacotron2
+
+    def forward(self, mel_outputs):
+        mel_outputs_postnet = self.tacotron2.postnet(mel_outputs)
+        return mel_outputs + mel_outputs_postnet
+
+def lstmcell2lstm_params(lstm_mod, lstmcell_mod):
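+    # Copy the weights of a trained LSTMCell into a single-layer nn.LSTM;
+    # nn.LSTM exports as one ONNX LSTM node instead of an unrolled cell.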
+    lstm_mod.weight_ih_l0 = torch.nn.Parameter(lstmcell_mod.weight_ih)
+    lstm_mod.weight_hh_l0 = torch.nn.Parameter(lstmcell_mod.weight_hh)
+    lstm_mod.bias_ih_l0 = torch.nn.Parameter(lstmcell_mod.bias_ih)
+    lstm_mod.bias_hh_l0 = torch.nn.Parameter(lstmcell_mod.bias_hh)
+
+
+def prenet_infer(self, x):
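+    # Export-friendly prenet: Tacotron 2 keeps prenet dropout active at
+    # inference time, so it is written out as an explicit Bernoulli mask,
+    # scaled by 2.0 (= 1/(1-p) for p=0.5) to preserve the expected activation.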
+    x1 = x[:]
+    for linear in self.layers:
+        x1 = F.relu(linear(x1))
+        # dropout mask drawn on the GPU; 256 is the hard-coded prenet dimension
+        mask = torch.le(torch.rand(256, device='cuda').to(torch.float32), 0.5).to(torch.float32)
+        mask = mask.expand(x1.size(0), x1.size(1))
+        x1 = x1*mask*2.0
+
+    return x1
+
+class DecoderIter(torch.nn.Module):
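+    """One Tacotron 2 decoder step as a standalone module: consumes the
+    previous mel frame plus the recurrent state and returns the next frame,
+    the gate prediction, and the updated state."""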
+    def __init__(self, tacotron2):
+        super(DecoderIter, self).__init__()
+
+        self.tacotron2 = tacotron2
+        dec = tacotron2.decoder
+
+        self.p_attention_dropout = dec.p_attention_dropout
+        self.p_decoder_dropout = dec.p_decoder_dropout
+        self.prenet = dec.prenet
+
+        self.prenet.infer = prenet_infer
+
+        self.attention_rnn = nn.LSTM(dec.prenet_dim + dec.encoder_embedding_dim,
+                                     dec.attention_rnn_dim, 1)
+        lstmcell2lstm_params(self.attention_rnn, dec.attention_rnn)
+        self.attention_rnn.flatten_parameters()
+
+        self.attention_layer = dec.attention_layer
+
+        self.decoder_rnn = nn.LSTM(dec.attention_rnn_dim + dec.encoder_embedding_dim,
+                                   dec.decoder_rnn_dim, 1)
+        lstmcell2lstm_params(self.decoder_rnn, dec.decoder_rnn)
+        self.decoder_rnn.flatten_parameters()
+
+        self.linear_projection = dec.linear_projection
+        self.gate_layer = dec.gate_layer
+
+
+    def decode(self, decoder_input, in_attention_hidden, in_attention_cell,
+               in_decoder_hidden, in_decoder_cell, in_attention_weights,
+               in_attention_weights_cum, in_attention_context, memory,
+               processed_memory, mask):
+
+        cell_input = torch.cat((decoder_input, in_attention_context), -1)
+
+        _, (out_attention_hidden, out_attention_cell) = self.attention_rnn(
+            cell_input.unsqueeze(0), (in_attention_hidden.unsqueeze(0),
+                                      in_attention_cell.unsqueeze(0)))
+        out_attention_hidden = out_attention_hidden.squeeze(0)
+        out_attention_cell = out_attention_cell.squeeze(0)
+
+        out_attention_hidden = F.dropout(
+            out_attention_hidden, self.p_attention_dropout, False)
+
+        attention_weights_cat = torch.cat(
+            (in_attention_weights.unsqueeze(1),
+             in_attention_weights_cum.unsqueeze(1)), dim=1)
+        out_attention_context, out_attention_weights = self.attention_layer(
+            out_attention_hidden, memory, processed_memory,
+            attention_weights_cat, mask)
+
+        out_attention_weights_cum = in_attention_weights_cum + out_attention_weights
+        decoder_input_tmp = torch.cat(
+            (out_attention_hidden, out_attention_context), -1)
+
+        _, (out_decoder_hidden, out_decoder_cell) = self.decoder_rnn(
+            decoder_input_tmp.unsqueeze(0), (in_decoder_hidden.unsqueeze(0),
+                                             in_decoder_cell.unsqueeze(0)))
+        out_decoder_hidden = out_decoder_hidden.squeeze(0)
+        out_decoder_cell = out_decoder_cell.squeeze(0)
+
+        out_decoder_hidden = F.dropout(
+            out_decoder_hidden, self.p_decoder_dropout, False)
+
+        decoder_hidden_attention_context = torch.cat(
+            (out_decoder_hidden, out_attention_context), 1)
+
+        decoder_output = self.linear_projection(
+            decoder_hidden_attention_context)
+
+        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
+
+        return (decoder_output, gate_prediction, out_attention_hidden,
+                out_attention_cell, out_decoder_hidden, out_decoder_cell,
+                out_attention_weights, out_attention_weights_cum, out_attention_context)
+
+    # @torch.jit.script
+    def forward(self,
+                decoder_input,
+                attention_hidden,
+                attention_cell,
+                decoder_hidden,
+                decoder_cell,
+                attention_weights,
+                attention_weights_cum,
+                attention_context,
+                memory,
+                processed_memory,
+                mask):
+        decoder_input1 = self.prenet.infer(self.prenet, decoder_input)
+        outputs = self.decode(decoder_input1,
+                              attention_hidden,
+                              attention_cell,
+                              decoder_hidden,
+                              decoder_cell,
+                              attention_weights,
+                              attention_weights_cum,
+                              attention_context,
+                              memory,
+                              processed_memory,
+                              mask)
+        return outputs
+
+
+def test_inference(encoder, decoder_iter, postnet):
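+    # Sanity check: run the three wrapped modules end-to-end in PyTorch on a
+    # sample sentence, mirroring the iterative decoding loop used at
+    # inference time.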
+    encoder.eval()
+    decoder_iter.eval()
+    postnet.eval()
+
+    from trt.inference_trt import init_decoder_inputs
+
+    texts = ["Hello World, good day."]
+    sequences, sequence_lengths = prepare_input_sequence(texts)
+
+    print("Running Tacotron2 Encoder")
+    with torch.no_grad():
+        memory, processed_memory, lens = encoder(sequences, sequence_lengths)
+
+    print("Running Tacotron2 Decoder")
+    device = memory.device
+    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=device)
+    not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=device)
+    mel_outputs, gate_outputs, alignments = (torch.zeros(1), torch.zeros(1), torch.zeros(1))
+    gate_threshold = 0.6
+    max_decoder_steps = 1000
+    first_iter = True
+
+    (decoder_input, attention_hidden, attention_cell, decoder_hidden,
+     decoder_cell, attention_weights, attention_weights_cum,
+     attention_context, memory, processed_memory,
+     mask) = init_decoder_inputs(memory, processed_memory, sequence_lengths)
+
+    while True:
+        with torch.no_grad():
+            (mel_output, gate_output,
+             attention_hidden, attention_cell,
+             decoder_hidden, decoder_cell,
+             attention_weights, attention_weights_cum,
+             attention_context) = decoder_iter(decoder_input, attention_hidden, attention_cell, decoder_hidden,
+                                               decoder_cell, attention_weights, attention_weights_cum,
+                                               attention_context, memory, processed_memory, mask)
+
+        if first_iter:
+            mel_outputs = torch.unsqueeze(mel_output, 2)
+            gate_outputs = torch.unsqueeze(gate_output, 2)
+            alignments = torch.unsqueeze(attention_weights, 2)
+            first_iter = False
+        else:
+            mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(mel_output, 2)), 2)
+            gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(gate_output, 2)), 2)
+            alignments = torch.cat((alignments, torch.unsqueeze(attention_weights, 2)), 2)
+
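+        # a sequence is finished once its sigmoid gate output exceeds
+        # gate_threshold; not_finished tracks which batch entries still run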
+        dec = torch.le(torch.sigmoid(gate_output), gate_threshold).to(torch.int32).squeeze(1)
+        not_finished = not_finished*dec
+        mel_lengths += not_finished
+
+        if torch.sum(not_finished) == 0:
+            print("Stopping after", mel_outputs.size(2), "decoder steps")
+            break
+        if mel_outputs.size(2) == max_decoder_steps:
+            print("Warning! Reached max decoder steps")
+            break
+
+        decoder_input = mel_output
+
+    print("Running Tacotron2 PostNet")
+    with torch.no_grad():
+        mel_outputs_postnet = postnet(mel_outputs)
+
+    return mel_outputs_postnet
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='PyTorch Tacotron 2 export to TRT')
+    parser = parse_args(parser)
+    args, _ = parser.parse_known_args()
+
+    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, False)
+
+    opset_version = 10
+
+    # dummy input: a batch with one 50-symbol sequence drawn from the
+    # 148-symbol text vocabulary
+    sequences = torch.randint(low=0, high=148, size=(1, 50),
+                              dtype=torch.long).cuda()
+    sequence_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
+    dummy_input = (sequences, sequence_lengths)
+
+    encoder = Encoder(tacotron2)
+    encoder.eval()
+    with torch.no_grad():
+        encoder(*dummy_input)
+
+    torch.onnx.export(encoder, dummy_input, args.output+"/"+"encoder.onnx",
+                      opset_version=opset_version,
+                      do_constant_folding=True,
+                      input_names=["sequences", "sequence_lengths"],
+                      output_names=["memory", "processed_memory", "lens"],
+                      dynamic_axes={"sequences": {0: "batch_size", 1: "text_seq"},
+                                    "sequence_lengths": {0: "batch_size"},
+                                    "memory": {0: "batch_size", 1: "mem_seq"},
+                                    "processed_memory": {0: "batch_size", 1: "mem_seq"},
+                                    "lens": {0: "batch_size"},
+                                    })
+
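+    # build one step of realistic decoder state so the per-iteration decoder
+    # can be traced with the correct shapes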
+    decoder_iter = DecoderIter(tacotron2)
+    memory = torch.randn((1, sequence_lengths[0], 512)).cuda()  # stand-in encoder outputs
+    memory_lengths = sequence_lengths
+    # initialize decoder states for dummy_input
+    decoder_input = tacotron2.decoder.get_go_frame(memory)
+    mask = get_mask_from_lengths(memory_lengths)
+    (attention_hidden,
+     attention_cell,
+     decoder_hidden,
+     decoder_cell,
+     attention_weights,
+     attention_weights_cum,
+     attention_context,
+     processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)
+    dummy_input = (decoder_input,
+                   attention_hidden,
+                   attention_cell,
+                   decoder_hidden,
+                   decoder_cell,
+                   attention_weights,
+                   attention_weights_cum,
+                   attention_context,
+                   memory,
+                   processed_memory,
+                   mask)
+
+    decoder_iter.eval()
+    with torch.no_grad():
+        decoder_iter(*dummy_input)
+
+    torch.onnx.export(decoder_iter, dummy_input, args.output+"/"+"decoder_iter.onnx",
+                      opset_version=opset_version,
+                      do_constant_folding=True,
+                      input_names=["decoder_input",
+                                   "attention_hidden",
+                                   "attention_cell",
+                                   "decoder_hidden",
+                                   "decoder_cell",
+                                   "attention_weights",
+                                   "attention_weights_cum",
+                                   "attention_context",
+                                   "memory",
+                                   "processed_memory",
+                                   "mask"],
+                      output_names=["decoder_output",
+                                    "gate_prediction",
+                                    "out_attention_hidden",
+                                    "out_attention_cell",
+                                    "out_decoder_hidden",
+                                    "out_decoder_cell",
+                                    "out_attention_weights",
+                                    "out_attention_weights_cum",
+                                    "out_attention_context"],
+                      dynamic_axes={"decoder_input": {0: "batch_size"},
+                                    "attention_hidden": {0: "batch_size"},
+                                    "attention_cell": {0: "batch_size"},
+                                    "decoder_hidden": {0: "batch_size"},
+                                    "decoder_cell": {0: "batch_size"},
+                                    "attention_weights": {0: "batch_size", 1: "seq_len"},
+                                    "attention_weights_cum": {0: "batch_size", 1: "seq_len"},
+                                    "attention_context": {0: "batch_size"},
+                                    "memory": {0: "batch_size", 1: "seq_len"},
+                                    "processed_memory": {0: "batch_size", 1: "seq_len"},
+                                    "mask": {0: "batch_size", 1: "seq_len"},
+                                    "decoder_output": {0: "batch_size"},
+                                    "gate_prediction": {0: "batch_size"},
+                                    "out_attention_hidden": {0: "batch_size"},
+                                    "out_attention_cell": {0: "batch_size"},
+                                    "out_decoder_hidden": {0: "batch_size"},
+                                    "out_decoder_cell": {0: "batch_size"},
+                                    "out_attention_weights": {0: "batch_size", 1: "seq_len"},
+                                    "out_attention_weights_cum": {0: "batch_size", 1: "seq_len"},
+                                    "out_attention_context": {0: "batch_size"}
+                                    })
+
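+    # 80 mel channels; the 620-frame dummy length is arbitrary, as the time
+    # axis is marked dynamic below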
+    postnet = Postnet(tacotron2)
+    dummy_input = torch.randn((1, 80, 620)).cuda()
+    torch.onnx.export(postnet, dummy_input, args.output+"/"+"postnet.onnx",
+                      opset_version=opset_version,
+                      do_constant_folding=True,
+                      input_names=["mel_outputs"],
+                      output_names=["mel_outputs_postnet"],
+                      dynamic_axes={"mel_outputs": {0: "batch_size", 2: "mel_seq"},
+                                    "mel_outputs_postnet": {0: "batch_size", 2: "mel_seq"}})
+
+    mel = test_inference(encoder, decoder_iter, postnet)
+    torch.save(mel, "mel.pt")
+
+if __name__ == '__main__':
+    main()