gkarch 5 лет назад
Родитель
Commit
8d8337196f

+ 4 - 3
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts.py

@@ -40,8 +40,8 @@ def parse_args(parser):
 
     parser.add_argument('-o', '--output', type=str, default="trtis_repo/tacotron/1/model.pt",
                         help='filename for the Tacotron 2 TorchScript model')
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
 
     return parser
 
@@ -54,7 +54,8 @@ def main():
     args = parser.parse_args()
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     args.amp_run, forward_is_infer=True)
+                                     amp_run=args.fp16, cpu_run=False,
+                                     forward_is_infer=True)
     
     jitted_tacotron2 = torch.jit.script(tacotron2)
 

+ 4 - 4
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts_config.py

@@ -44,11 +44,11 @@ def parse_args(parser):
                         help="exports to appropriate directory for TRTIS")
     parser.add_argument("--trtis_max_batch_size",
                         type=int,
-                        default=8,
+                        default=1,
                         help="Specifies the 'max_batch_size' in the TRTIS model config.\
                               See the TRTIS documentation for more info.")
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
     return parser
 
 
@@ -106,7 +106,7 @@ output [
     config_values = {
         "model_name": args.trtis_model_name,
         "max_batch_size": args.trtis_max_batch_size,
-        "fp_type": "TYPE_FP16" if args.amp_run else "TYPE_FP32"
+        "fp_type": "TYPE_FP16" if args.fp16 else "TYPE_FP32"
     }
     
     with open(model_folder + "/config.pbtxt", "w") as file:

+ 3 - 3
PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py

@@ -199,9 +199,9 @@ def export_onnx(parser, args):
                           do_constant_folding=True,
                           input_names=["mel", "z"],
                           output_names=["audio"],
-                          dynamic_axes={"mel":   {2: "mel_seq"},
-                                        "z":     {2: "z_seq"},
-                                        "audio": {1: "audio_seq"}})
+                          dynamic_axes={"mel":   {0: "batch_size", 2: "mel_seq"},
+                                        "z":     {0: "batch_size", 2: "z_seq"},
+                                        "audio": {0: "batch_size", 1: "audio_seq"}})
 
 
 def main():

+ 8 - 9
PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_trt_config.py

@@ -32,7 +32,7 @@ import argparse
 
 def parse_args(parser):
     """
-        Parse commandline arguments. 
+        Parse commandline arguments.
     """
     parser.add_argument("--trtis_model_name",
                         type=str,
@@ -42,8 +42,8 @@ def parse_args(parser):
                         type=int,
                         default=1,
                         help="exports to appropriate directory for TRTIS")
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
     return parser
 
 
@@ -52,13 +52,13 @@ def main():
         description='PyTorch WaveGlow TRTIS config exporter')
     parser = parse_args(parser)
     args = parser.parse_args()
-    
+
     # prepare repository
     model_folder = os.path.join('./trtis_repo', args.trtis_model_name)
     version_folder = os.path.join(model_folder, str(args.trtis_model_version))
     if not os.path.exists(version_folder):
         os.makedirs(version_folder)
-    
+
     # build the config for TRTIS
     config_filename = os.path.join(model_folder, "config.pbtxt")
     config_template = r"""
@@ -84,12 +84,12 @@ output {{
   dims: [-1]
 }}
 """
-    
+
     config_values = {
         "model_name": args.trtis_model_name,
-        "fp_type": "TYPE_FP16" if args.amp_run else "TYPE_FP32"
+        "fp_type": "TYPE_FP16" if args.fp16 else "TYPE_FP32"
     }
-    
+
     with open(model_folder + "/config.pbtxt", "w") as file:
         final_config_str = config_template.format_map(config_values)
         file.write(final_config_str)
@@ -97,4 +97,3 @@ output {{
 
 if __name__ == '__main__':
     main()
-

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/inference.py

@@ -257,8 +257,6 @@ def main():
     DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
     DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time'])})
 
-    alignments = alignments.unfold(1, audios.size(0), audios.size(0)).transpose(0,2)
-
     for i, audio in enumerate(audios):
 
         plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")

+ 4 - 4
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md

@@ -48,7 +48,7 @@ Follow the Tacotron 2 Quick Start Guide (points 1-4) to start the container.
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python exports/export_tacotron2_ts_config.py --amp-run
+python exports/export_tacotron2_ts_config.py --fp16
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Tacotron 2. 
@@ -67,7 +67,7 @@ Move the downloaded model to `trtis_repo/tacotron2/1/model.pt`
 
 To export the Tacotron 2 model using TorchScript, type:
 ```bash
-python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --amp-run
+python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --fp16
 ```
 
 This will save the model as ``trtis_repo/tacotron2/1/model.pt``.
@@ -78,7 +78,7 @@ For WaveGlow, we also need to create the folder structure that will be used by t
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python exports/export_waveglow_trt_config.py --amp-run
+python exports/export_waveglow_trt_config.py --fp16
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Waveglow. 
@@ -106,7 +106,7 @@ cd /workspace/onnx-tensorrt/build && cmake .. -DCMAKE_CXX_FLAGS=-isystem\ /usr/l
 In order to export the model into the ONNX intermediate representation, type:
 
 ```bash
-python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --amp-run --output ./output
+python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --fp16 --output ./output
 ```
 
 This will save the model as `waveglow.onnx` (you can change its name with the flag `--output <filename>`).

+ 0 - 20
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/notebook.ipynb

@@ -148,32 +148,12 @@
     "    return mel, mel_lengths, alignments\n",
     "\n",
     "\n",
-    "def force_to_shape(mel, length):\n",
-    "    ''' preprocessor of waveglow\n",
-    "        :: mel :: numpy array \n",
-    "        :: length :: int \n",
-    "        :: return :: m padded (or trimmed) to length in dimension 1\n",
-    "    '''\n",
-    "    diff = length - mel.shape[1]\n",
-    "    if 0 < diff:\n",
-    "        # pad it\n",
-    "        min_value = mel.min()\n",
-    "        shape = ((0,0),(0,diff))\n",
-    "        ret = np.pad(mel, shape, mode='constant', constant_values=min_value)\n",
-    "    else:\n",
-    "        # trim it\n",
-    "        ret = mel[:,:length]\n",
-    "    ret = ret[:,:,None]\n",
-    "    return ret\n",
-    "\n",
-    "\n",
     "def mel_to_signal(mel, mel_lengths):\n",
     "    ''' calls waveglow\n",
     "        ::mel:: mel spectrogram\n",
     "        ::mel_lengths:: original length of mel spectrogram\n",
     "        ::returns:: waveform\n",
     "    '''\n",
-    "    # padding/trimming mel to dimension 620\n",
     "    mel = mel[:,:,None]\n",
     "    # prepare input/output\n",
     "    input_dict = {}\n",

+ 4 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

@@ -537,7 +537,7 @@ class Decoder(nn.Module):
          processed_memory) = self.initialize_decoder_states(memory)
         mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=memory.device)
         not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=memory.device)
-        
+
         mel_outputs, gate_outputs, alignments = (
             torch.zeros(1), torch.zeros(1), torch.zeros(1))
         first_iter = True
@@ -684,4 +684,7 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = self.postnet(mel_outputs)
         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
 
+        BS = mel_outputs_postnet.size(0)
+        alignments = alignments.unfold(1, BS, BS).transpose(0,2)
+
         return mel_outputs_postnet, mel_lengths, alignments