model.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import argparse
  15. import sys
  16. from os.path import abspath, dirname
  17. sys.path.append(abspath(dirname(__file__)+'/../'))
  18. from common.text import symbols
  19. from inference import load_model_from_ckpt
  20. import models
  21. from torch.utils.data import DataLoader
  22. import torch
  23. import numpy as np
  24. def update_argparser(parser):
  25. ### copy-paste from ./fastpitch/arg_parser.py
  26. io = parser.add_argument_group('io parameters')
  27. io.add_argument('--n-mel-channels', default=80, type=int,
  28. help='Number of bins in mel-spectrograms')
  29. symbols = parser.add_argument_group('symbols parameters')
  30. symbols.add_argument('--n-symbols', default=148, type=int,
  31. help='Number of symbols in dictionary')
  32. symbols.add_argument('--padding-idx', default=0, type=int,
  33. help='Index of padding symbol in dictionary')
  34. symbols.add_argument('--symbols-embedding-dim', default=384, type=int,
  35. help='Input embedding dimension')
  36. text_processing = parser.add_argument_group('Text processing parameters')
  37. text_processing.add_argument('--symbol-set', type=str, default='english_basic',
  38. help='Define symbol set for input text')
  39. in_fft = parser.add_argument_group('input FFT parameters')
  40. in_fft.add_argument('--in-fft-n-layers', default=6, type=int,
  41. help='Number of FFT blocks')
  42. in_fft.add_argument('--in-fft-n-heads', default=1, type=int,
  43. help='Number of attention heads')
  44. in_fft.add_argument('--in-fft-d-head', default=64, type=int,
  45. help='Dim of attention heads')
  46. in_fft.add_argument('--in-fft-conv1d-kernel-size', default=3, type=int,
  47. help='Conv-1D kernel size')
  48. in_fft.add_argument('--in-fft-conv1d-filter-size', default=1536, type=int,
  49. help='Conv-1D filter size')
  50. in_fft.add_argument('--in-fft-output-size', default=384, type=int,
  51. help='Output dim')
  52. in_fft.add_argument('--p-in-fft-dropout', default=0.1, type=float,
  53. help='Dropout probability')
  54. in_fft.add_argument('--p-in-fft-dropatt', default=0.1, type=float,
  55. help='Multi-head attention dropout')
  56. in_fft.add_argument('--p-in-fft-dropemb', default=0.0, type=float,
  57. help='Dropout added to word+positional embeddings')
  58. out_fft = parser.add_argument_group('output FFT parameters')
  59. out_fft.add_argument('--out-fft-n-layers', default=6, type=int,
  60. help='Number of FFT blocks')
  61. out_fft.add_argument('--out-fft-n-heads', default=1, type=int,
  62. help='Number of attention heads')
  63. out_fft.add_argument('--out-fft-d-head', default=64, type=int,
  64. help='Dim of attention head')
  65. out_fft.add_argument('--out-fft-conv1d-kernel-size', default=3, type=int,
  66. help='Conv-1D kernel size')
  67. out_fft.add_argument('--out-fft-conv1d-filter-size', default=1536, type=int,
  68. help='Conv-1D filter size')
  69. out_fft.add_argument('--out-fft-output-size', default=384, type=int,
  70. help='Output dim')
  71. out_fft.add_argument('--p-out-fft-dropout', default=0.1, type=float,
  72. help='Dropout probability for out_fft')
  73. out_fft.add_argument('--p-out-fft-dropatt', default=0.1, type=float,
  74. help='Multi-head attention dropout')
  75. out_fft.add_argument('--p-out-fft-dropemb', default=0.0, type=float,
  76. help='Dropout added to word+positional embeddings')
  77. dur_pred = parser.add_argument_group('duration predictor parameters')
  78. dur_pred.add_argument('--dur-predictor-kernel-size', default=3, type=int,
  79. help='Duration predictor conv-1D kernel size')
  80. dur_pred.add_argument('--dur-predictor-filter-size', default=256, type=int,
  81. help='Duration predictor conv-1D filter size')
  82. dur_pred.add_argument('--p-dur-predictor-dropout', default=0.1, type=float,
  83. help='Dropout probability for duration predictor')
  84. dur_pred.add_argument('--dur-predictor-n-layers', default=2, type=int,
  85. help='Number of conv-1D layers')
  86. pitch_pred = parser.add_argument_group('pitch predictor parameters')
  87. pitch_pred.add_argument('--pitch-predictor-kernel-size', default=3, type=int,
  88. help='Pitch predictor conv-1D kernel size')
  89. pitch_pred.add_argument('--pitch-predictor-filter-size', default=256, type=int,
  90. help='Pitch predictor conv-1D filter size')
  91. pitch_pred.add_argument('--p-pitch-predictor-dropout', default=0.1, type=float,
  92. help='Pitch probability for pitch predictor')
  93. pitch_pred.add_argument('--pitch-predictor-n-layers', default=2, type=int,
  94. help='Number of conv-1D layers')
  95. energy_pred = parser.add_argument_group('energy predictor parameters')
  96. energy_pred.add_argument('--energy-conditioning', type=bool, default=True)
  97. energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int,
  98. help='Pitch predictor conv-1D kernel size')
  99. energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int,
  100. help='Pitch predictor conv-1D filter size')
  101. energy_pred.add_argument('--p-energy-predictor-dropout', default=0.1, type=float,
  102. help='Pitch probability for energy predictor')
  103. energy_pred.add_argument('--energy-predictor-n-layers', default=2, type=int,
  104. help='Number of conv-1D layers')
  105. ###~copy-paste from ./fastpitch/arg_parser.py
  106. parser.add_argument('--checkpoint', type=str,
  107. help='Full path to the FastPitch checkpoint file')
  108. parser.add_argument('--torchscript', action='store_true',
  109. help='Apply TorchScript')
  110. parser.add_argument('--ema', action='store_true',
  111. help='Use EMA averaged model \
  112. (if saved in checkpoints)')
  113. cond = parser.add_argument_group('conditioning parameters')
  114. cond.add_argument('--pitch-embedding-kernel-size', default=3, type=int,
  115. help='Pitch embedding conv-1D kernel size')
  116. cond.add_argument('--energy-embedding-kernel-size', default=3, type=int,
  117. help='Pitch embedding conv-1D kernel size')
  118. cond.add_argument('--speaker-emb-weight', type=float, default=1.0,
  119. help='Scale speaker embedding')
  120. cond.add_argument('--n-speakers', type=int, default=1,
  121. help='Number of speakers in the model.')
  122. cond.add_argument('--pitch-conditioning-formants', default=1, type=int,
  123. help='Number of speech formants to condition on.')
  124. parser.add_argument("--precision", type=str, default="fp32",
  125. choices=["fp32", "fp16"],
  126. help="PyTorch model precision")
  127. parser.add_argument("--output-format", type=str, required=True,
  128. help="Output format")
  129. def get_model(**model_args):
  130. import argparse
  131. args = argparse.Namespace(**model_args)
  132. model_config = models.get_model_config(model_name="FastPitch",
  133. args=args)
  134. jittable = True if 'ts-' in args.output_format else False
  135. model = models.get_model(model_name="FastPitch",
  136. model_config=model_config,
  137. device='cuda',
  138. forward_is_infer=True,
  139. jitable=jittable)
  140. model = load_model_from_ckpt(args.checkpoint, args.ema, model)
  141. if args.precision == "fp16":
  142. model = model.half()
  143. model.eval()
  144. tensor_names = {"inputs": ["INPUT__0"],
  145. "outputs" : ["OUTPUT__0", "OUTPUT__1",
  146. "OUTPUT__2", "OUTPUT__3", "OUTPUT__4"]}
  147. return model, tensor_names