# hubconf.py — torch.hub entry points for NVIDIA DeepLearningExamples models.
  1. import urllib.request
  2. import torch
  3. import os
  4. import sys
  5. # from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
  6. def checkpoint_from_distributed(state_dict):
  7. """
  8. Checks whether checkpoint was generated by DistributedDataParallel. DDP
  9. wraps model in additional "module.", it needs to be unwrapped for single
  10. GPU inference.
  11. :param state_dict: model's state dict
  12. """
  13. ret = False
  14. for key, _ in state_dict.items():
  15. if key.find('module.') != -1:
  16. ret = True
  17. break
  18. return ret
  19. # from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
  20. def unwrap_distributed(state_dict):
  21. """
  22. Unwraps model from DistributedDataParallel.
  23. DDP wraps model in additional "module.", it needs to be removed for single
  24. GPU inference.
  25. :param state_dict: model's state dict
  26. """
  27. new_state_dict = {}
  28. for key, value in state_dict.items():
  29. new_key = key.replace('module.1.', '')
  30. new_key = new_key.replace('module.', '')
  31. new_state_dict[new_key] = value
  32. return new_state_dict
  33. dependencies = ['torch']
  34. def nvidia_ncf(pretrained=True, **kwargs):
  35. """Constructs an NCF model.
  36. For detailed information on model input and output, training recipies, inference and performance
  37. visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
  38. Args:
  39. pretrained (bool, True): If True, returns a model pretrained on ml-20m dataset.
  40. model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
  41. nb_users (int): number of users
  42. nb_items (int): number of items
  43. mf_dim (int, 64): dimension of latent space in matrix factorization
  44. mlp_layer_sizes (list, [256,256,128,64]): sizes of layers of multi-layer-perceptron
  45. dropout (float, 0.5): dropout
  46. """
  47. from PyTorch.Recommendation.NCF import neumf as ncf
  48. fp16 = "model_math" in kwargs and kwargs["model_math"] == "fp16"
  49. force_reload = "force_reload" in kwargs and kwargs["force_reload"]
  50. config = {'nb_users': None, 'nb_items': None, 'mf_dim': 64, 'mf_reg': 0.,
  51. 'mlp_layer_sizes': [256, 256, 128, 64], 'mlp_layer_regs':[0, 0, 0, 0], 'dropout': 0.5}
  52. if pretrained:
  53. if fp16:
  54. checkpoint = 'https://developer.nvidia.com/joc-ncf-fp16-pyt-20190225'
  55. else:
  56. checkpoint = 'https://developer.nvidia.com/joc-ncf-fp32-pyt-20190225'
  57. ckpt_file = os.path.basename(checkpoint)
  58. if not os.path.exists(ckpt_file) or force_reload:
  59. sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
  60. urllib.request.urlretrieve(checkpoint, ckpt_file)
  61. ckpt = torch.load(ckpt_file)
  62. if checkpoint_from_distributed(ckpt):
  63. ckpt = unwrap_distributed(ckpt)
  64. config['nb_users'] = ckpt['mf_user_embed.weight'].shape[0]
  65. config['nb_items'] = ckpt['mf_item_embed.weight'].shape[0]
  66. config['mf_dim'] = ckpt['mf_item_embed.weight'].shape[1]
  67. mlp_shapes = [ckpt[k].shape for k in ckpt.keys() if 'mlp' in k and 'weight' in k and 'embed' not in k]
  68. config['mlp_layer_sizes'] = [mlp_shapes[0][1], mlp_shapes[1][1], mlp_shapes[2][1], mlp_shapes[2][0]]
  69. config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])
  70. else:
  71. if 'nb_users' not in kwargs:
  72. raise ValueError("Missing 'nb_users' argument.")
  73. if 'nb_items' not in kwargs:
  74. raise ValueError("Missing 'nb_items' argument.")
  75. for k,v in kwargs.items():
  76. if k in config.keys():
  77. config[k] = v
  78. config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])
  79. m = ncf.NeuMF(**config)
  80. if fp16:
  81. m.half()
  82. if pretrained:
  83. m.load_state_dict(ckpt)
  84. return m
  85. def nvidia_tacotron2(pretrained=True, **kwargs):
  86. """Constructs a Tacotron 2 model (nn.module with additional infer(input) method).
  87. For detailed information on model input and output, training recipies, inference and performance
  88. visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
  89. Args (type[, default value]):
  90. pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
  91. model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
  92. n_symbols (int, 148): Number of symbols used in a sequence passed to the prenet, see
  93. https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/text/symbols.py
  94. p_attention_dropout (float, 0.1): dropout probability on attention LSTM (1st LSTM layer in decoder)
  95. p_decoder_dropout (float, 0.1): dropout probability on decoder LSTM (2nd LSTM layer in decoder)
  96. max_decoder_steps (int, 1000): maximum number of generated mel spectrograms during inference
  97. """
  98. from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
  99. from PyTorch.SpeechSynthesis.Tacotron2.models import lstmcell_to_float, batchnorm_to_float
  100. from PyTorch.SpeechSynthesis.Tacotron2.tacotron2.text import text_to_sequence
  101. fp16 = "model_math" in kwargs and kwargs["model_math"] == "fp16"
  102. force_reload = "force_reload" in kwargs and kwargs["force_reload"]
  103. if pretrained:
  104. if fp16:
  105. checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp16-pyt-20190306'
  106. else:
  107. checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp32-pyt-20190306'
  108. ckpt_file = os.path.basename(checkpoint)
  109. if not os.path.exists(ckpt_file) or force_reload:
  110. sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
  111. urllib.request.urlretrieve(checkpoint, ckpt_file)
  112. ckpt = torch.load(ckpt_file)
  113. state_dict = ckpt['state_dict']
  114. if checkpoint_from_distributed(state_dict):
  115. state_dict = unwrap_distributed(state_dict)
  116. config = ckpt['config']
  117. else:
  118. config = {'mask_padding': False, 'n_mel_channels': 80, 'n_symbols': 148,
  119. 'symbols_embedding_dim': 512, 'encoder_kernel_size': 5,
  120. 'encoder_n_convolutions': 3, 'encoder_embedding_dim': 512,
  121. 'attention_rnn_dim': 1024, 'attention_dim': 128,
  122. 'attention_location_n_filters': 32,
  123. 'attention_location_kernel_size': 31, 'n_frames_per_step': 1,
  124. 'decoder_rnn_dim': 1024, 'prenet_dim': 256,
  125. 'max_decoder_steps': 1000, 'gate_threshold': 0.5,
  126. 'p_attention_dropout': 0.1, 'p_decoder_dropout': 0.1,
  127. 'postnet_embedding_dim': 512, 'postnet_kernel_size': 5,
  128. 'postnet_n_convolutions': 5, 'decoder_no_early_stopping': False}
  129. for k,v in kwargs.items():
  130. if k in config.keys():
  131. config[k] = v
  132. m = tacotron2.Tacotron2(**config)
  133. if fp16:
  134. m = batchnorm_to_float(m.half())
  135. m = lstmcell_to_float(m)
  136. if pretrained:
  137. m.load_state_dict(state_dict)
  138. m.text_to_sequence = text_to_sequence
  139. return m
  140. def nvidia_waveglow(pretrained=True, **kwargs):
  141. """Constructs a WaveGlow model (nn.module with additional infer(input) method).
  142. For detailed information on model input and output, training recipies, inference and performance
  143. visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
  144. Args:
  145. pretrained (bool): If True, returns a model pretrained on LJ Speech dataset.
  146. model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
  147. """
  148. from PyTorch.SpeechSynthesis.Tacotron2.waveglow import model as waveglow
  149. from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float
  150. fp16 = "model_math" in kwargs and kwargs["model_math"] == "fp16"
  151. force_reload = "force_reload" in kwargs and kwargs["force_reload"]
  152. if pretrained:
  153. if fp16:
  154. checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp16-pyt-20190306'
  155. else:
  156. checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp32-pyt-20190306'
  157. ckpt_file = os.path.basename(checkpoint)
  158. if not os.path.exists(ckpt_file) or force_reload:
  159. sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
  160. urllib.request.urlretrieve(checkpoint, ckpt_file)
  161. ckpt = torch.load(ckpt_file)
  162. state_dict = ckpt['state_dict']
  163. if checkpoint_from_distributed(state_dict):
  164. state_dict = unwrap_distributed(state_dict)
  165. config = ckpt['config']
  166. else:
  167. config = {'n_mel_channels': 80, 'n_flows': 12, 'n_group': 8,
  168. 'n_early_every': 4, 'n_early_size': 2,
  169. 'WN_config': {'n_layers': 8, 'kernel_size': 3,
  170. 'n_channels': 512}}
  171. for k,v in kwargs.items():
  172. if k in config.keys():
  173. config[k] = v
  174. elif k in config['WN_config'].keys():
  175. config['WN_config'][k] = v
  176. m = waveglow.WaveGlow(**config)
  177. if fp16:
  178. m = batchnorm_to_float(m.half())
  179. for mat in m.convinv:
  180. mat.float()
  181. if pretrained:
  182. m.load_state_dict(state_dict)
  183. return m
  184. def nvidia_ssd_processing_utils():
  185. import numpy as np
  186. import skimage
  187. from PyTorch.Detection.SSD.src.utils import dboxes300_coco, Encoder
  188. class Processing:
  189. @staticmethod
  190. def load_image(image_path):
  191. """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial"""
  192. img = skimage.img_as_float(skimage.io.imread(image_path))
  193. if len(img.shape) == 2:
  194. img = np.array([img, img, img]).swapaxes(0, 2)
  195. return img
  196. @staticmethod
  197. def rescale(img, input_height, input_width):
  198. """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial"""
  199. aspect = img.shape[1] / float(img.shape[0])
  200. if (aspect > 1):
  201. # landscape orientation - wide image
  202. res = int(aspect * input_height)
  203. imgScaled = skimage.transform.resize(img, (input_width, res))
  204. if (aspect < 1):
  205. # portrait orientation - tall image
  206. res = int(input_width / aspect)
  207. imgScaled = skimage.transform.resize(img, (res, input_height))
  208. if (aspect == 1):
  209. imgScaled = skimage.transform.resize(img, (input_width, input_height))
  210. return imgScaled
  211. @staticmethod
  212. def crop_center(img, cropx, cropy):
  213. """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial"""
  214. y, x, c = img.shape
  215. startx = x // 2 - (cropx // 2)
  216. starty = y // 2 - (cropy // 2)
  217. return img[starty:starty + cropy, startx:startx + cropx]
  218. @staticmethod
  219. def normalize(img, mean=128, std=128):
  220. img = (img * 256 - mean) / std
  221. return img
  222. @staticmethod
  223. def prepare_tensor(inputs, fp16=False):
  224. NHWC = np.array(inputs)
  225. NCHW = np.swapaxes(np.swapaxes(NHWC, 1, 3), 2, 3)
  226. tensor = torch.from_numpy(NCHW)
  227. tensor = tensor.cuda()
  228. tensor = tensor.float()
  229. if fp16:
  230. tensor = tensor.half()
  231. return tensor
  232. @staticmethod
  233. def prepare_input(img_uri):
  234. img = Processing.load_image(img_uri)
  235. img = Processing.rescale(img, 300, 300)
  236. img = Processing.crop_center(img, 300, 300)
  237. img = Processing.normalize(img)
  238. return img
  239. @staticmethod
  240. def decode_results(predictions):
  241. dboxes = dboxes300_coco()
  242. encoder = Encoder(dboxes)
  243. ploc, plabel = [val.float() for val in predictions]
  244. results = encoder.decode_batch(ploc, plabel, criteria=0.5, max_output=20)
  245. return [[pred.detach().cpu().numpy() for pred in detections] for detections in results]
  246. @staticmethod
  247. def pick_best(detections, threshold=0.3):
  248. bboxes, classes, confidences = detections
  249. best = np.argwhere(confidences > threshold)[:, 0]
  250. return [pred[best] for pred in detections]
  251. @staticmethod
  252. def get_coco_object_dictionary():
  253. import os
  254. file_with_coco_names = "category_names.txt"
  255. if not os.path.exists(file_with_coco_names):
  256. print("Downloading COCO annotations.")
  257. import urllib
  258. import zipfile
  259. import json
  260. import shutil
  261. urllib.request.urlretrieve("http://images.cocodataset.org/annotations/annotations_trainval2017.zip", "cocoanno.zip")
  262. with zipfile.ZipFile("cocoanno.zip", "r") as f:
  263. f.extractall()
  264. print("Downloading finished.")
  265. with open("annotations/instances_val2017.json", 'r') as COCO:
  266. js = json.loads(COCO.read())
  267. class_names = [category['name'] for category in js['categories']]
  268. open("category_names.txt", 'w').writelines([c+"\n" for c in class_names])
  269. os.remove("cocoanno.zip")
  270. shutil.rmtree("annotations")
  271. else:
  272. class_names = open("category_names.txt").readlines()
  273. class_names = [c.strip() for c in class_names]
  274. return class_names
  275. return Processing()
  276. def nvidia_ssd(pretrained=True, **kwargs):
  277. """Constructs an SSD300 model.
  278. For detailed information on model input and output, training recipies, inference and performance
  279. visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
  280. Args:
  281. pretrained (bool, True): If True, returns a model pretrained on COCO dataset.
  282. model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
  283. """
  284. from PyTorch.Detection.SSD.src import model as ssd
  285. fp16 = "model_math" in kwargs and kwargs["model_math"] == "fp16"
  286. force_reload = "force_reload" in kwargs and kwargs["force_reload"]
  287. m = ssd.SSD300()
  288. if fp16:
  289. m = m.half()
  290. def batchnorm_to_float(module):
  291. """Converts batch norm to FP32"""
  292. if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
  293. module.float()
  294. for child in module.children():
  295. batchnorm_to_float(child)
  296. return module
  297. m = batchnorm_to_float(m)
  298. if pretrained:
  299. if fp16:
  300. checkpoint = 'https://developer.nvidia.com/joc-ssd-fp16-pyt-20190225'
  301. else:
  302. checkpoint = 'https://developer.nvidia.com/joc-ssd-fp32-pyt-20190225'
  303. ckpt_file = os.path.basename(checkpoint)
  304. if not os.path.exists(ckpt_file) or force_reload:
  305. sys.stderr.write('Downloading checkpoint from {}\n'.format(checkpoint))
  306. urllib.request.urlretrieve(checkpoint, ckpt_file)
  307. ckpt = torch.load(ckpt_file)
  308. ckpt = ckpt['model']
  309. if checkpoint_from_distributed(ckpt):
  310. ckpt = unwrap_distributed(ckpt)
  311. m.load_state_dict(ckpt)
  312. return m