# deployer_lib.py
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import json
import torch
import argparse
import statistics
from collections import Counter

# Maps a torch dtype to the corresponding data_type string used in a
# Triton model configuration (config.pbtxt).
torch_type_to_triton_type = {
    torch.bool:    'TYPE_BOOL',
    torch.int8:    'TYPE_INT8',
    torch.int16:   'TYPE_INT16',
    torch.int32:   'TYPE_INT32',
    torch.int64:   'TYPE_INT64',
    torch.uint8:   'TYPE_UINT8',
    torch.float16: 'TYPE_FP16',
    torch.float32: 'TYPE_FP32',
    torch.float64: 'TYPE_FP64'
}
# str.format template for the whole Triton config.pbtxt file.
# Doubled braces ({{ }}) are literal braces in the rendered protobuf text.
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
{spec_inputs}
]
output [
{spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
{{
count: {engine_count}
kind: KIND_GPU
gpus: [ {gpu_list} ]
}}
]"""

# Template for one entry of the "input" list.  Each rendered entry ends with
# a comma; write_config() strips the trailing comma after the last entry.
INPUT_TEMPLATE = r"""
{{
name: "input__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""

# Template for one entry of the "output" list (same trailing-comma convention
# as INPUT_TEMPLATE).
OUTPUT_TEMPLATE = r"""
{{
name: "output__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""

# Template for the optimization section enabling CUDA graph capture.
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
cuda {{
graphs: {capture_cuda_graph}
}}
}}"""
  73. def remove_empty_lines(text):
  74. ''' removes empty lines from text, returns the result '''
  75. ret = "".join([s for s in text.strip().splitlines(True) if s.strip()])
  76. return ret
def create_deployer(argv):
    ''' takes a list of arguments, returns a deployer object and the list of unused arguments '''
    parser = argparse.ArgumentParser()
    # required args: exactly one conversion method must be selected
    method = parser.add_mutually_exclusive_group(required=True)
    method.add_argument('--ts-script',
                        action='store_true',
                        help='convert to torchscript using torch.jit.script')
    method.add_argument('--ts-trace',
                        action='store_true',
                        help='convert to torchscript using torch.jit.trace')
    method.add_argument('--onnx',
                        action='store_true',
                        help='convert to onnx using torch.onnx.export')
    # triton related args
    arguments = parser.add_argument_group('triton related flags')
    arguments.add_argument('--triton-no-cuda',
                           action='store_true',
                           help='Use the CPU for tracing.')
    arguments.add_argument('--triton-model-name',
                           type=str,
                           default="model",
                           help="exports to appropriate directory structure for Triton")
    arguments.add_argument("--triton-model-version",
                           type=int,
                           default=1,
                           help="exports to appropriate directory structure for Triton")
    arguments.add_argument("--triton-server-url",
                           type=str,
                           default="localhost:8001",
                           help="exports to appropriate directory structure for Triton")
    arguments.add_argument("--triton-max-batch-size",
                           type=int,
                           default=8,
                           help="Specifies the 'max_batch_size' in the Triton model config.\
                                 See the Triton documentation for more info.")
    arguments.add_argument("--triton-dyn-batching-delay",
                           type=float,
                           default=0,
                           help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
                                 the Triton model config. Use '0' or '-1' to specify static batching.\
                                 See the Triton documentation for more info.")
    arguments.add_argument("--triton-engine-count",
                           type=int,
                           default=1,
                           help="Specifies the 'instance_group' count value in the Triton model config.\
                                 See the Triton documentation for more info.")
    arguments.add_argument('--save-dir', type=str, default='./triton_models', help='Saved model directory')
    # optimization args
    arguments = parser.add_argument_group('optimization flags')
    arguments.add_argument("--capture-cuda-graph",
                           type=int,
                           default=0,
                           help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 0 (automatic). ")
    # remainder args: everything after the recognized flags is forwarded verbatim
    arguments.add_argument('model_arguments', nargs=argparse.REMAINDER, help='arguments that will be ignored by deployer lib and will be forwarded to your deployer script')
    #
    args = parser.parse_args(argv)
    deployer = Deployer(args)
    #
    # REMAINDER captures everything from the first unrecognized token onward;
    # [1:] drops that leading separator token so only the model args remain.
    return deployer, args.model_arguments[1:]
  138. class DeployerLibrary:
  139. def __init__(self, args):
  140. self.args = args
  141. self.platform = None
  142. def set_platform(self, platform):
  143. ''' sets the platform
  144. :: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
  145. '''
  146. self.platform = platform
  147. def prepare_inputs(self, dataloader, device):
  148. ''' load sample inputs to device '''
  149. inputs = []
  150. for batch in dataloader:
  151. if type(batch) is torch.Tensor:
  152. batch_d = batch.to(device)
  153. batch_d = (batch_d,)
  154. inputs.append(batch_d)
  155. else:
  156. batch_d = []
  157. for x in batch:
  158. assert type(x) is torch.Tensor, "input is not a tensor"
  159. batch_d.append(x.to(device))
  160. batch_d = tuple(batch_d)
  161. inputs.append(batch_d)
  162. return inputs
  163. def get_list_of_shapes(self, l, fun):
  164. ''' returns the list of min/max shapes, depending on fun
  165. :: l :: list of tuples of tensors
  166. :: fun :: min or max
  167. '''
  168. tensor_tuple = l[0]
  169. shapes = [list(x.shape) for x in tensor_tuple]
  170. for tensor_tuple in l:
  171. assert len(tensor_tuple) == len(shapes), "tensors with varying shape lengths are not supported"
  172. for i,x in enumerate(tensor_tuple):
  173. for j in range(len(x.shape)):
  174. shapes[i][j] = fun(shapes[i][j], x.shape[j])
  175. return shapes # a list of shapes
  176. def get_tuple_of_min_shapes(self, l):
  177. ''' returns the tuple of min shapes
  178. :: l :: list of tuples of tensors '''
  179. shapes = self.get_list_of_shapes(l, min)
  180. min_batch = 1
  181. shapes = [[min_batch,*shape[1:]] for shape in shapes]
  182. shapes = tuple(shapes)
  183. return shapes # tuple of min shapes
  184. def get_tuple_of_max_shapes(self, l):
  185. ''' returns the tuple of max shapes
  186. :: l :: list of tuples of tensors '''
  187. shapes = self.get_list_of_shapes(l, max)
  188. max_batch = max(2,shapes[0][0])
  189. shapes = [[max_batch,*shape[1:]] for shape in shapes]
  190. shapes = tuple(shapes)
  191. return shapes # tuple of max shapes
  192. def get_tuple_of_opt_shapes(self, l):
  193. ''' returns the tuple of opt shapes
  194. :: l :: list of tuples of tensors '''
  195. counter = Counter()
  196. for tensor_tuple in l:
  197. shapes = [tuple(x.shape) for x in tensor_tuple]
  198. shapes = tuple(shapes)
  199. counter[shapes] += 1
  200. shapes = counter.most_common(1)[0][0]
  201. return shapes # tuple of most common occuring shapes
  202. def get_tuple_of_dynamic_shapes(self, l):
  203. ''' returns a tuple of dynamic shapes: variable tensor dimensions
  204. (for ex. batch size) occur as -1 in the tuple
  205. :: l :: list of tuples of tensors '''
  206. tensor_tuple = l[0]
  207. shapes = [list(x.shape) for x in tensor_tuple]
  208. for tensor_tuple in l:
  209. err_msg = "tensors with varying shape lengths are not supported"
  210. assert len(tensor_tuple) == len(shapes), err_msg
  211. for i,x in enumerate(tensor_tuple):
  212. for j in range(len(x.shape)):
  213. if shapes[i][j] != x.shape[j] or j == 0:
  214. shapes[i][j] = -1
  215. shapes = tuple(shapes)
  216. return shapes # tuple of dynamic shapes
  217. def run_models(self, models, inputs):
  218. ''' run the models on inputs, return the outputs and execution times '''
  219. ret = []
  220. for model in models:
  221. torch.cuda.synchronize()
  222. time_start = time.time()
  223. outputs = []
  224. for input in inputs:
  225. with torch.no_grad():
  226. output = model(*input)
  227. if type(output) is torch.Tensor:
  228. output = [output]
  229. outputs.append(output)
  230. torch.cuda.synchronize()
  231. time_end = time.time()
  232. t = time_end - time_start
  233. ret.append(outputs)
  234. ret.append(t)
  235. return ret
  236. def compute_errors(self, outputs_A, outputs_B):
  237. ''' returns the list of L_inf errors computed over every single output tensor '''
  238. Linf_errors = []
  239. for output_A,output_B in zip(outputs_A,outputs_B):
  240. for x,y in zip(output_A, output_B):
  241. error = (x - y).norm(float('inf')).item()
  242. Linf_errors.append(error)
  243. return Linf_errors
  244. def print_errors(self, Linf_errors):
  245. ''' print various statistcs of Linf errors '''
  246. print()
  247. print("conversion correctness test results")
  248. print("-----------------------------------")
  249. print("maximal absolute error over dataset (L_inf): ", max(Linf_errors))
  250. print()
  251. print("average L_inf error over output tensors: ", statistics.mean(Linf_errors))
  252. print("variance of L_inf error over output tensors: ", statistics.variance(Linf_errors))
  253. print("stddev of L_inf error over output tensors: ", statistics.stdev(Linf_errors))
  254. print()
  255. def write_config(self, config_filename,
  256. input_shapes, input_types,
  257. output_shapes, output_types):
  258. ''' writes Triton config file
  259. :: config_filename :: the file to write the config file into
  260. :: input_shapes :: tuple of dynamic shapes of the input tensors
  261. :: input_types :: tuple of torch types of the input tensors
  262. :: output_shapes :: tuple of dynamic shapes of the output tensors
  263. :: output_types :: tuple of torch types of the output tensors
  264. '''
  265. assert self.platform is not None, "error - platform is not set"
  266. config_template = CONFIG_TEMPLATE
  267. input_template = INPUT_TEMPLATE
  268. optimization_template = MODEL_OPTIMIZATION_TEMPLATE
  269. spec_inputs = r""""""
  270. for i,(shape,typ) in enumerate(zip(input_shapes,input_types)):
  271. d = {
  272. 'num' : str(i),
  273. 'type': torch_type_to_triton_type[typ],
  274. 'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
  275. }
  276. d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
  277. spec_inputs += input_template.format_map(d)
  278. spec_inputs = spec_inputs[:-1]
  279. output_template = OUTPUT_TEMPLATE
  280. spec_outputs = r""""""
  281. for i,(shape,typ) in enumerate(zip(output_shapes,output_types)):
  282. d = {
  283. 'num' : str(i),
  284. 'type': torch_type_to_triton_type[typ],
  285. 'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
  286. }
  287. d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
  288. spec_outputs += output_template.format_map(d)
  289. spec_outputs = spec_outputs[:-1]
  290. batching_str = ""
  291. max_batch_size = self.args.triton_max_batch_size
  292. if (self.args.triton_dyn_batching_delay > 0):
  293. # Use only full and half full batches
  294. pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]
  295. batching_str = r"""
  296. dynamic_batching {{
  297. preferred_batch_size: [{0}]
  298. max_queue_delay_microseconds: {1}
  299. }}""".format(", ".join([str(x) for x in pref_batch_size]),
  300. int(self.args.triton_dyn_batching_delay * 1000.0))
  301. d = {
  302. "capture_cuda_graph": str(self.args.capture_cuda_graph)
  303. }
  304. optimization_str = optimization_template.format_map(d)
  305. config_values = {
  306. "model_name": self.args.triton_model_name,
  307. "platform": self.platform,
  308. "max_batch_size": max_batch_size,
  309. "spec_inputs": spec_inputs,
  310. "spec_outputs": spec_outputs,
  311. "dynamic_batching": batching_str,
  312. "model_optimizations" : optimization_str,
  313. "gpu_list": ", ".join([str(x) for x in range(torch.cuda.device_count())]),
  314. "engine_count": self.args.triton_engine_count
  315. }
  316. # write config
  317. with open(config_filename, "w") as file:
  318. final_config_str = config_template.format_map(config_values)
  319. final_config_str = remove_empty_lines(final_config_str)
  320. file.write(final_config_str)
class Deployer:
    """Drives the end-to-end export: converts the model to TorchScript or
    ONNX, verifies the conversion against sample batches, and writes the
    Triton model-repository layout (versioned model file + config.pbtxt)."""

    def __init__(self, args):
        # args: parsed argparse.Namespace from create_deployer()
        self.args = args
        self.lib = DeployerLibrary(args)

    def deploy(self, dataloader, model):
        ''' deploy the model and test for correctness with dataloader '''
        if self.args.ts_script or self.args.ts_trace:
            self.lib.set_platform("pytorch_libtorch")
            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
            self.to_triton_torchscript(dataloader, model)
        elif self.args.onnx:
            self.lib.set_platform("onnxruntime_onnx")
            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
            self.to_triton_onnx(dataloader, model)
        else:
            # unreachable when args come from create_deployer(): the method
            # group there is required and mutually exclusive
            assert False, "error"
        print("done")

    def to_triton_onnx(self, dataloader, model):
        ''' export the model to onnx and test correctness on dataloader '''
        # imported lazily so the TorchScript path works without onnx installed
        import onnx
        import onnxruntime
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device('cpu')
        else:
            device = torch.device('cuda')
        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "
        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)
        # generate outputs
        outputs = []
        for input in inputs:
            with torch.no_grad():
                output = model(*input)
            if type(output) is torch.Tensor:
                output = [output]
            outputs.append(output)
        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
        # generate input types
        input_types = [x.dtype for x in inputs[0]]
        # generate output types
        output_types = [x.dtype for x in outputs[0]]
        # get input names (Triton convention: input__0, input__1, ...)
        rng = range(len(input_types))
        input_names = ["input__" + str(num) for num in rng]
        # get output names
        rng = range(len(output_types))
        output_names = ["output__" + str(num) for num in rng]
        # prepare save path: <save_dir>/<model_name>/<version>/model.onnx
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, 'model.onnx')
        # get indices of dynamic input and output shapes (-1 marks dynamic dims)
        dynamic_axes = {}
        for input_name, input_shape in zip(input_names, input_shapes):
            dynamic_axes[input_name] = [i for i, x in enumerate(input_shape) if x == -1]
        for output_name, output_shape in zip(output_names, output_shapes):
            dynamic_axes[output_name] = [i for i, x in enumerate(output_shape) if x == -1]
        # export the model
        assert not model.training, "internal error - model should be in eval() mode! "
        with torch.no_grad():
            torch.onnx.export(model, inputs[0], final_model_path, verbose=False,
                              input_names=input_names, output_names=output_names,
                              dynamic_axes=dynamic_axes, opset_version=11)
        # syntactic error check
        converted_model = onnx.load(final_model_path)
        # check that the IR is well formed
        onnx.checker.check_model(converted_model)
        # load the model
        session = onnxruntime.InferenceSession(final_model_path, None)

        class ONNX_model:
            # Thin wrapper that makes an onnxruntime session callable like the
            # original torch model, so both can be fed to lib.run_models().
            def __init__(self, session, input_names, device):
                self.session = session
                self.input_names = input_names
                # NOTE(review): the `device` parameter is not stored here;
                # __call__ below uses the enclosing method's `device` via
                # closure, which happens to be the same object at this call
                # site — confirm before reusing this class elsewhere.

            def to_numpy(self, tensor):
                # detach first when grad is tracked, otherwise .numpy() raises
                return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

            def __call__(self, *inputs):
                # map positional torch inputs to named numpy feeds
                inp = [(input_name, inputs[i]) for i, input_name in enumerate(self.input_names)]
                inp = {input_name: self.to_numpy(x) for input_name, x in inp}
                outputs = self.session.run(None, inp)
                outputs = [torch.from_numpy(output) for output in outputs]
                outputs = [output.to(device) for output in outputs]
                # single-output models return the bare tensor, matching torch
                if len(outputs) == 1:
                    outputs = outputs[0]
                return outputs
        # switch to eval mode
        model_onnx = ONNX_model(session, input_names, device)
        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        models = (model, model_onnx)
        outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(models, inputs)
        # check for errors
        Linf_errors = self.lib.compute_errors(outputs, outputs_onnx)
        self.lib.print_errors(Linf_errors)
        print('time of error check of native model: ', time_model, 'seconds')
        print('time of error check of onnx model: ', time_model_onnx, 'seconds')
        print()
        # write Triton config
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(config_filename,
                              input_shapes, input_types,
                              output_shapes, output_types)

    def to_triton_torchscript(self, dataloader, model):
        ''' export the model to torchscript and test correctness on dataloader '''
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device('cpu')
        else:
            device = torch.device('cuda')
        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "
        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)
        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
        # generate input types
        input_types = [x.dtype for x in inputs[0]]
        # prepare save path: <save_dir>/<model_name>/<version>/model.pt
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, 'model.pt')
        # convert the model (exactly one of ts_trace/ts_script is set by argparse,
        # so model_ts is always bound before the save below)
        with torch.no_grad():
            if self.args.ts_trace:  # trace it
                model_ts = torch.jit.trace(model, inputs[0])
            if self.args.ts_script:  # script it
                model_ts = torch.jit.script(model)
        # save the model
        torch.jit.save(model_ts, final_model_path)
        # load the model
        model_ts = torch.jit.load(final_model_path)
        model_ts.eval()  # WAR for bug : by default, model_ts gets loaded in training mode
        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        assert not model_ts.training, "internal error - converted model should be in eval() mode! "
        models = (model, model_ts)
        outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(models, inputs)
        # check for errors
        Linf_errors = self.lib.compute_errors(outputs, outputs_ts)
        self.lib.print_errors(Linf_errors)
        print('time of error check of native model: ', time_model, 'seconds')
        print('time of error check of ts model: ', time_model_ts, 'seconds')
        print()
        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
        # generate output types
        output_types = [x.dtype for x in outputs[0]]
        # now we build the config for Triton
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(config_filename,
                              input_shapes, input_types,
                              output_shapes, output_types)