gkarch 5 лет назад
Родитель
Commit
8d8337196f

+ 4 - 3
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts.py

@@ -40,8 +40,8 @@ def parse_args(parser):
 
     parser.add_argument('-o', '--output', type=str, default="trtis_repo/tacotron/1/model.pt",
                         help='filename for the Tacotron 2 TorchScript model')
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
 
     return parser
 
@@ -54,7 +54,8 @@ def main():
     args = parser.parse_args()
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     args.amp_run, forward_is_infer=True)
+                                     amp_run=args.fp16, cpu_run=False,
+                                     forward_is_infer=True)
     
     jitted_tacotron2 = torch.jit.script(tacotron2)
 

+ 4 - 4
PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_ts_config.py

@@ -44,11 +44,11 @@ def parse_args(parser):
                         help="exports to appropriate directory for TRTIS")
     parser.add_argument("--trtis_max_batch_size",
                         type=int,
-                        default=8,
+                        default=1,
                         help="Specifies the 'max_batch_size' in the TRTIS model config.\
                               See the TRTIS documentation for more info.")
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
     return parser
 
 
@@ -106,7 +106,7 @@ output [
     config_values = {
         "model_name": args.trtis_model_name,
         "max_batch_size": args.trtis_max_batch_size,
-        "fp_type": "TYPE_FP16" if args.amp_run else "TYPE_FP32"
+        "fp_type": "TYPE_FP16" if args.fp16 else "TYPE_FP32"
     }
     
     with open(model_folder + "/config.pbtxt", "w") as file:

+ 3 - 3
PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py

@@ -199,9 +199,9 @@ def export_onnx(parser, args):
                           do_constant_folding=True,
                           input_names=["mel", "z"],
                           output_names=["audio"],
-                          dynamic_axes={"mel":   {2: "mel_seq"},
-                                        "z":     {2: "z_seq"},
-                                        "audio": {1: "audio_seq"}})
+                          dynamic_axes={"mel":   {0: "batch_size", 2: "mel_seq"},
+                                        "z":     {0: "batch_size", 2: "z_seq"},
+                                        "audio": {0: "batch_size", 1: "audio_seq"}})
 
 
 def main():

+ 8 - 9
PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_trt_config.py

@@ -32,7 +32,7 @@ import argparse
 
 def parse_args(parser):
     """
-        Parse commandline arguments. 
+        Parse commandline arguments.
     """
     parser.add_argument("--trtis_model_name",
                         type=str,
@@ -42,8 +42,8 @@ def parse_args(parser):
                         type=int,
                         default=1,
                         help="exports to appropriate directory for TRTIS")
-    parser.add_argument('--amp-run', action='store_true',
-                        help='inference with AMP')
+    parser.add_argument('--fp16', action='store_true',
+                        help='inference with mixed precision')
     return parser
 
 
@@ -52,13 +52,13 @@ def main():
         description='PyTorch WaveGlow TRTIS config exporter')
     parser = parse_args(parser)
     args = parser.parse_args()
-    
+
     # prepare repository
     model_folder = os.path.join('./trtis_repo', args.trtis_model_name)
     version_folder = os.path.join(model_folder, str(args.trtis_model_version))
     if not os.path.exists(version_folder):
         os.makedirs(version_folder)
-    
+
     # build the config for TRTIS
     config_filename = os.path.join(model_folder, "config.pbtxt")
     config_template = r"""
@@ -84,12 +84,12 @@ output {{
   dims: [-1]
 }}
 """
-    
+
     config_values = {
         "model_name": args.trtis_model_name,
-        "fp_type": "TYPE_FP16" if args.amp_run else "TYPE_FP32"
+        "fp_type": "TYPE_FP16" if args.fp16 else "TYPE_FP32"
     }
-    
+
     with open(model_folder + "/config.pbtxt", "w") as file:
         final_config_str = config_template.format_map(config_values)
         file.write(final_config_str)
@@ -97,4 +97,3 @@ output {{
 
 if __name__ == '__main__':
     main()
-

+ 0 - 2
PyTorch/SpeechSynthesis/Tacotron2/inference.py

@@ -257,8 +257,6 @@ def main():
     DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
     DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time'])})
 
-    alignments = alignments.unfold(1, audios.size(0), audios.size(0)).transpose(0,2)
-
     for i, audio in enumerate(audios):
 
         plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")

+ 4 - 4
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/README.md

@@ -48,7 +48,7 @@ Follow the Tacotron 2 Quick Start Guide (points 1-4) to start the container.
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python exports/export_tacotron2_ts_config.py --amp-run
+python exports/export_tacotron2_ts_config.py --fp16
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Tacotron 2. 
@@ -67,7 +67,7 @@ Move the downloaded model to `trtis_repo/tacotron2/1/model.pt`
 
 To export the Tacotron 2 model using TorchScript, type:
 ```bash
-python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --amp-run
+python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --fp16
 ```
 
 This will save the model as ``trtis_repo/tacotron2/1/model.pt``.
@@ -78,7 +78,7 @@ For WaveGlow, we also need to create the folder structure that will be used by t
 Inside the container, type:
 ```bash
 cd /workspace/tacotron2/
-python exports/export_waveglow_trt_config.py --amp-run
+python exports/export_waveglow_trt_config.py --fp16
 ```
 
 This will export the folder structure of the TRTIS repository and the config file of Waveglow. 
@@ -106,7 +106,7 @@ cd /workspace/onnx-tensorrt/build && cmake .. -DCMAKE_CXX_FLAGS=-isystem\ /usr/l
 In order to export the model into the ONNX intermediate representation, type:
 
 ```bash
-python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --amp-run --output ./output
+python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --fp16 --output ./output
 ```
 
 This will save the model as `waveglow.onnx` (you can change its name with the flag `--output <filename>`).

+ 0 - 20
PyTorch/SpeechSynthesis/Tacotron2/notebooks/trtis/notebook.ipynb

@@ -148,32 +148,12 @@
     "    return mel, mel_lengths, alignments\n",
     "\n",
     "\n",
-    "def force_to_shape(mel, length):\n",
-    "    ''' preprocessor of waveglow\n",
-    "        :: mel :: numpy array \n",
-    "        :: length :: int \n",
-    "        :: return :: m padded (or trimmed) to length in dimension 1\n",
-    "    '''\n",
-    "    diff = length - mel.shape[1]\n",
-    "    if 0 < diff:\n",
-    "        # pad it\n",
-    "        min_value = mel.min()\n",
-    "        shape = ((0,0),(0,diff))\n",
-    "        ret = np.pad(mel, shape, mode='constant', constant_values=min_value)\n",
-    "    else:\n",
-    "        # trim it\n",
-    "        ret = mel[:,:length]\n",
-    "    ret = ret[:,:,None]\n",
-    "    return ret\n",
-    "\n",
-    "\n",
     "def mel_to_signal(mel, mel_lengths):\n",
     "    ''' calls waveglow\n",
     "        ::mel:: mel spectrogram\n",
     "        ::mel_lengths:: original length of mel spectrogram\n",
     "        ::returns:: waveform\n",
     "    '''\n",
-    "    # padding/trimming mel to dimension 620\n",
     "    mel = mel[:,:,None]\n",
     "    # prepare input/output\n",
     "    input_dict = {}\n",

+ 4 - 1
PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

@@ -537,7 +537,7 @@ class Decoder(nn.Module):
          processed_memory) = self.initialize_decoder_states(memory)
         mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=memory.device)
         not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=memory.device)
-        
+
         mel_outputs, gate_outputs, alignments = (
             torch.zeros(1), torch.zeros(1), torch.zeros(1))
         first_iter = True
@@ -684,4 +684,7 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = self.postnet(mel_outputs)
         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
 
+        BS = mel_outputs_postnet.size(0)
+        alignments = alignments.unfold(1, BS, BS).transpose(0,2)
+
         return mel_outputs_postnet, mel_lengths, alignments