|
|
@@ -0,0 +1,911 @@
|
|
|
+#!/usr/bin/python
|
|
|
+
|
|
|
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
|
|
+#
|
|
|
+# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
+# you may not use this file except in compliance with the License.
|
|
|
+# You may obtain a copy of the License at
|
|
|
+#
|
|
|
+# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+#
|
|
|
+# Unless required by applicable law or agreed to in writing, software
|
|
|
+# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+# See the License for the specific language governing permissions and
|
|
|
+# limitations under the License.
|
|
|
+
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import time
|
|
|
+import json
|
|
|
+import torch
|
|
|
+import argparse
|
|
|
+import statistics
|
|
|
+from collections import Counter
|
|
|
+
|
|
|
# Maps a torch dtype to the corresponding data_type string used in Triton
# (TRTIS) model config files.
torch_type_to_triton_type = {
    torch.bool: "TYPE_BOOL",
    torch.int8: "TYPE_INT8",
    torch.int16: "TYPE_INT16",
    torch.int32: "TYPE_INT32",
    torch.int64: "TYPE_INT64",
    torch.uint8: "TYPE_UINT8",
    torch.float16: "TYPE_FP16",
    torch.float32: "TYPE_FP32",
    torch.float64: "TYPE_FP64",
}

# Top-level template for config.pbtxt. Doubled braces ({{ }}) are literal
# braces; single-brace fields are filled via str.format_map in
# DeployerLibrary.write_config.
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
  {spec_inputs}
]
output [
  {spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
  {{
    count: {engine_count}
    kind: KIND_GPU
    gpus: [ {gpu_list} ]
  }}
]"""

# Per-input spec; ends with a trailing comma that write_config strips after
# the last input.
INPUT_TEMPLATE = r"""
{{
    name: "input__{num}"
    data_type: {type}
    dims: {dims}
    {reshape}
}},"""

# Per-output spec; same trailing-comma convention as INPUT_TEMPLATE.
OUTPUT_TEMPLATE = r"""
{{
    name: "output__{num}"
    data_type: {type}
    dims: {dims}
    {reshape}
}},"""

# optimization { ... } section of config.pbtxt (CUDA-graph capture flag and
# optional execution accelerator).
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
  {execution_accelerator}
  cuda {{
    graphs: {capture_cuda_graph}
  }}
}}"""

# TensorRT execution-accelerator snippet for the optimization section.
# NOTE(review): defined and bound in write_config but never substituted into
# the config there — confirm whether it is intentionally disabled.
EXECUTION_ACCELERATOR_TEMPLATE = r"""
  execution_accelerators {{
    gpu_execution_accelerator: [
      {{
        name: "tensorrt"
      }}
    ]
  }},"""
|
|
|
+
|
|
|
+
|
|
|
def remove_empty_lines(text):
    """Strip *text* and return it with every blank line removed."""
    kept = []
    for line in text.strip().splitlines(True):
        # splitlines(True) keeps the line terminators, so joining the
        # surviving lines preserves the original line breaks.
        if line.strip():
            kept.append(line)
    return "".join(kept)
|
|
|
+
|
|
|
+
|
|
|
def create_deployer(argv):
    """Parse *argv* and build a Deployer.

    Returns a tuple ``(deployer, rest)`` where ``rest`` is the list of
    arguments not consumed here, to be forwarded to the caller's own
    model-specific parser.
    """
    parser = argparse.ArgumentParser()
    # required args: exactly one conversion method must be chosen
    method = parser.add_mutually_exclusive_group(required=True)
    method.add_argument(
        "--ts-script",
        action="store_true",
        help="convert to torchscript using torch.jit.script",
    )
    method.add_argument(
        "--ts-trace",
        action="store_true",
        help="convert to torchscript using torch.jit.trace",
    )
    method.add_argument(
        "--onnx", action="store_true", help="convert to onnx using torch.onnx.export"
    )
    method.add_argument(
        "--trt", action="store_true", help="convert to trt using tensorrt"
    )
    # triton related args
    arguments = parser.add_argument_group("triton related flags")
    arguments.add_argument(
        "--triton-no-cuda", action="store_true", help="Use the CPU for tracing."
    )
    arguments.add_argument(
        "--triton-model-name",
        type=str,
        default="model",
        help="exports to appropriate directory structure for TRITON",
    )
    arguments.add_argument(
        "--triton-model-version",
        type=int,
        default=1,
        help="exports to appropriate directory structure for TRITON",
    )
    arguments.add_argument(
        "--triton-max-batch-size",
        type=int,
        default=8,
        help="Specifies the 'max_batch_size' in the TRITON model config.\
            See the TRITON documentation for more info.",
    )
    arguments.add_argument(
        "--triton-dyn-batching-delay",
        type=float,
        default=0,
        help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
            the TRITON model config. Use '0' or '-1' to specify static batching.\
            See the TRITON documentation for more info.",
    )
    arguments.add_argument(
        "--triton-engine-count",
        type=int,
        default=1,
        help="Specifies the 'instance_group' count value in the TRITON model config.\
            See the TRITON documentation for more info.",
    )
    arguments.add_argument(
        "--save-dir", type=str, default="./triton_models", help="Saved model directory"
    )
    # optimization args
    # NOTE(review): this flag uses underscores while every other flag uses
    # dashes — inconsistent, but renaming would break existing invocations.
    arguments = parser.add_argument_group("optimization flags")
    arguments.add_argument(
        "--max_workspace_size",
        type=int,
        default=512 * 1024 * 1024,
        help="set the size of the workspace for trt export",
    )
    arguments.add_argument(
        "--trt-fp16",
        action="store_true",
        help="trt flag ---- export model in mixed precision mode",
    )
    arguments.add_argument(
        "--capture-cuda-graph",
        type=int,
        default=1,
        help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 1. ",
    )

    # remainder args: everything after the recognized flags is captured
    # verbatim for the caller's own parser
    arguments.add_argument(
        "model_arguments",
        nargs=argparse.REMAINDER,
        help="arguments that will be ignored by deployer lib and will be forwarded to your deployer script",
    )

    args = parser.parse_args(argv)
    deployer = Deployer(args)

    # NOTE(review): the leading element of the REMAINDER list is dropped —
    # presumably a "--" separator or script-name token expected by callers;
    # confirm against the calling convention before changing.
    return deployer, args.model_arguments[1:]
|
|
|
+
|
|
|
+
|
|
|
class DeployerLibrary:
    """Shared helpers for the Triton deployment paths (TorchScript, ONNX,
    TensorRT): input staging, shape aggregation, TRT engine building,
    correctness statistics, and config.pbtxt generation."""

    def __init__(self, args):
        # Parsed CLI namespace produced by create_deployer().
        self.args = args
        # Triton platform string; must be set via set_platform() before
        # write_config() is called (asserted there).
        self.platform = None

    def set_platform(self, platform):
        """Set the Triton platform string.

        :: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
        """
        self.platform = platform

    def build_trt_engine(self, model_file, shapes):
        """Build a TensorRT engine from an ONNX file; returns the engine
        (or None if the build failed — callers assert on the result).

        :: model_file :: path to an onnx model
        :: shapes :: list of dicts with "name"/"min"/"opt"/"max" entries,
                     one per network input, used for the optimization profile
        """
        import tensorrt as trt

        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        builder = trt.Builder(TRT_LOGGER)
        # NOTE(review): builder.fp16_mode / builder.max_batch_size are
        # deprecated in newer TensorRT releases (FP16 is also set on the
        # builder config below) — confirm against the installed TRT version.
        builder.fp16_mode = self.args.trt_fp16
        builder.max_batch_size = self.args.triton_max_batch_size

        config = builder.create_builder_config()
        config.max_workspace_size = self.args.max_workspace_size
        if self.args.trt_fp16:
            config.flags |= 1 << int(trt.BuilderFlag.FP16)
        # Single optimization profile covering all dynamic input shapes.
        profile = builder.create_optimization_profile()
        for s in shapes:
            profile.set_shape(s["name"], min=s["min"], opt=s["opt"], max=s["max"])
        config.add_optimization_profile(profile)
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(explicit_batch)

        with trt.OnnxParser(network, TRT_LOGGER) as parser:
            with open(model_file, "rb") as model:
                # Parse errors are printed but do not abort here;
                # build_engine() returns None on failure instead.
                parser.parse(model.read())
                for i in range(parser.num_errors):
                    e = parser.get_error(i)
                    print("||||e", e)
                engine = builder.build_engine(network, config=config)
        return engine

    def load_engine(self, engine_filepath):
        """Deserialize and return a TensorRT engine from *engine_filepath*."""
        import tensorrt as trt

        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        with open(engine_filepath, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        return engine

    def prepare_inputs(self, dataloader, device):
        """Move sample batches to *device*; returns a list of tuples of
        tensors (a lone tensor batch is wrapped into a 1-tuple)."""
        inputs = []
        for batch in dataloader:
            if type(batch) is torch.Tensor:
                batch_d = batch.to(device)
                batch_d = (batch_d,)
                inputs.append(batch_d)
            else:
                batch_d = []
                for x in batch:
                    assert type(x) is torch.Tensor, "input is not a tensor"
                    batch_d.append(x.to(device))
                batch_d = tuple(batch_d)
                inputs.append(batch_d)
        return inputs

    def get_list_of_shapes(self, l, fun):
        """Element-wise reduce the shapes seen across all batches.

        :: l :: list of tuples of tensors
        :: fun :: min or max (applied per dimension)
        """
        tensor_tuple = l[0]
        shapes = [list(x.shape) for x in tensor_tuple]
        for tensor_tuple in l:
            assert len(tensor_tuple) == len(
                shapes
            ), "tensors with varying shape lengths are not supported"
            for i, x in enumerate(tensor_tuple):
                for j in range(len(x.shape)):
                    shapes[i][j] = fun(shapes[i][j], x.shape[j])
        return shapes  # a list of shapes

    def get_tuple_of_min_shapes(self, l):
        """Return the tuple of per-tensor minimum shapes, with the batch
        dimension forced to 1.

        :: l :: list of tuples of tensors """
        shapes = self.get_list_of_shapes(l, min)
        min_batch = 1
        shapes = [[min_batch, *shape[1:]] for shape in shapes]
        shapes = tuple(shapes)
        return shapes  # tuple of min shapes

    def get_tuple_of_max_shapes(self, l):
        """Return the tuple of per-tensor maximum shapes; the batch
        dimension is forced to at least 2 so the TRT profile allows
        batching.

        :: l :: list of tuples of tensors """
        shapes = self.get_list_of_shapes(l, max)
        max_batch = max(2, shapes[0][0])
        shapes = [[max_batch, *shape[1:]] for shape in shapes]
        shapes = tuple(shapes)
        return shapes  # tuple of max shapes

    def get_tuple_of_opt_shapes(self, l):
        """Return the most frequently occurring shape tuple across batches
        (used as the TRT optimization profile's "opt" shape).

        :: l :: list of tuples of tensors """
        counter = Counter()
        for tensor_tuple in l:
            shapes = [tuple(x.shape) for x in tensor_tuple]
            shapes = tuple(shapes)
            counter[shapes] += 1
        shapes = counter.most_common(1)[0][0]
        return shapes  # tuple of most common occuring shapes

    def get_tuple_of_dynamic_shapes(self, l):
        """Return a tuple of shapes where every varying dimension — and the
        batch dimension (j == 0) unconditionally — is marked as -1.

        :: l :: list of tuples of tensors """
        tensor_tuple = l[0]
        shapes = [list(x.shape) for x in tensor_tuple]
        for tensor_tuple in l:
            err_msg = "tensors with varying shape lengths are not supported"
            assert len(tensor_tuple) == len(shapes), err_msg
            for i, x in enumerate(tensor_tuple):
                for j in range(len(x.shape)):
                    if shapes[i][j] != x.shape[j] or j == 0:
                        shapes[i][j] = -1
        shapes = tuple(shapes)
        return shapes  # tuple of dynamic shapes

    def run_models(self, models, inputs):
        """Run every model over all inputs under no_grad.

        Returns a flat list alternating [outputs_0, time_0, outputs_1,
        time_1, ...] — callers unpack it positionally. Timing brackets the
        whole pass with torch.cuda.synchronize().
        """
        ret = []
        for model in models:
            torch.cuda.synchronize()
            time_start = time.time()
            outputs = []
            for input in inputs:
                with torch.no_grad():
                    output = model(*input)
                if type(output) is torch.Tensor:
                    output = [output]
                outputs.append(output)
            torch.cuda.synchronize()
            time_end = time.time()
            t = time_end - time_start
            ret.append(outputs)
            ret.append(t)
        return ret

    def compute_tensor_stats(self, tensor):
        """Return std/mean/max/min of *tensor* as Python floats."""
        return {
            "std": tensor.std().item(),
            "mean": tensor.mean().item(),
            "max": tensor.max().item(),
            "min": tensor.min().item(),
        }

    def compute_errors(self, outputs_A, outputs_B):
        """Return a dict of statistics comparing two runs.

        outputs_A / outputs_B are lists (per batch) of lists of tensors, as
        produced by run_models(); corresponding tensors are concatenated
        along dim 0 before computing stats.
        """
        device = outputs_A[0][0][0].device
        dtype = outputs_A[0][0][0].dtype
        x_values = torch.zeros(0, device=device, dtype=dtype)
        y_values = torch.zeros(0, device=device, dtype=dtype)
        d_values = torch.zeros(0, device=device, dtype=dtype)
        for output_A, output_B in zip(outputs_A, outputs_B):
            for x, y in zip(output_A, output_B):
                d = abs(x - y)
                x_values = torch.cat((x_values, x), 0)
                y_values = torch.cat((y_values, y), 0)
                d_values = torch.cat((d_values, d), 0)
        Error_stats = {
            "Original": self.compute_tensor_stats(x_values),
            "Converted": self.compute_tensor_stats(y_values),
            "Absolute difference": self.compute_tensor_stats(d_values),
        }
        return Error_stats

    def print_errors(self, Error_stats):
        """Pretty-print the error statistics from compute_errors()."""
        print()
        print("conversion correctness test results")
        print("-----------------------------------")
        # Imported lazily so pandas is only required when errors are printed.
        import pandas as pd

        print(pd.DataFrame(Error_stats))

    def write_config(
        self, config_filename, input_shapes, input_types, output_shapes, output_types
    ):
        """Write the Triton (TRTIS) config.pbtxt.

        :: config_filename :: the file to write the config file into
        :: input_shapes :: tuple of dynamic shapes of the input tensors
        :: input_types :: tuple of torch types of the input tensors
        :: output_shapes :: tuple of dynamic shapes of the output tensors
        :: output_types :: tuple of torch types of the output tensors
        """
        assert self.platform is not None, "error - platform is not set"

        config_template = CONFIG_TEMPLATE
        input_template = INPUT_TEMPLATE
        optimization_template = MODEL_OPTIMIZATION_TEMPLATE
        # NOTE(review): bound but never used below — the accelerator slot in
        # the optimization template is filled with "" instead; confirm
        # whether TensorRT acceleration was meant to be emitted here.
        accelerator_template = EXECUTION_ACCELERATOR_TEMPLATE

        # Build the per-input specs; the batch dimension is stripped because
        # Triton adds it implicitly when max_batch_size > 0.
        spec_inputs = r""""""
        for i, (shape, typ) in enumerate(zip(input_shapes, input_types)):
            d = {
                "num": str(i),
                "type": torch_type_to_triton_type[typ],
                "dims": str([1])
                if len(shape) == 1
                else str(list(shape)[1:]),  # first dimension is the batch size
            }
            d["reshape"] = "reshape: { shape: [ ] }" if len(shape) == 1 else ""
            spec_inputs += input_template.format_map(d)
        spec_inputs = spec_inputs[:-1]  # drop the trailing comma

        output_template = OUTPUT_TEMPLATE
        spec_outputs = r""""""
        for i, (shape, typ) in enumerate(zip(output_shapes, output_types)):
            d = {
                "num": str(i),
                "type": torch_type_to_triton_type[typ],
                "dims": str([1])
                if len(shape) == 1
                else str(list(shape)[1:]),  # first dimension is the batch size
            }
            d["reshape"] = "reshape: { shape: [ ] }" if len(shape) == 1 else ""
            spec_outputs += output_template.format_map(d)
        spec_outputs = spec_outputs[:-1]  # drop the trailing comma

        batching_str = ""
        max_batch_size = self.args.triton_max_batch_size

        # NOTE(review): the CLI help says '0' disables dynamic batching, but
        # this condition also emits the dynamic_batching block for 0 (only a
        # negative delay disables it) — confirm which is intended.
        if self.args.triton_dyn_batching_delay >= 0:
            # Use only full and half full batches
            pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]

            if self.args.triton_dyn_batching_delay > 0:
                dyn_batch_delay_str = f"max_queue_delay_microseconds: {int(self.args.triton_dyn_batching_delay * 1000.0)}"
            else:
                dyn_batch_delay_str = ""

            batching_str = r"""
dynamic_batching {{
    preferred_batch_size: [{0}]
    {1}
}}""".format(
                ", ".join([str(x) for x in pref_batch_size]), dyn_batch_delay_str
            )

        accelerator_str = ""

        d = {
            "execution_accelerator": accelerator_str,
            "capture_cuda_graph": str(self.args.capture_cuda_graph),
        }
        optimization_str = optimization_template.format_map(d)

        config_values = {
            "model_name": self.args.triton_model_name,
            "platform": self.platform,
            "max_batch_size": max_batch_size,
            "spec_inputs": spec_inputs,
            "spec_outputs": spec_outputs,
            "dynamic_batching": batching_str,
            "model_optimizations": optimization_str,
            "gpu_list": ", ".join([str(x) for x in range(torch.cuda.device_count())]),
            "engine_count": self.args.triton_engine_count,
        }

        # write config
        with open(config_filename, "w") as file:
            final_config_str = config_template.format_map(config_values)
            final_config_str = remove_empty_lines(final_config_str)
            file.write(final_config_str)
|
|
|
+
|
|
|
+
|
|
|
class Deployer:
    """Top-level driver: converts a PyTorch model to the format selected on
    the command line (TorchScript, ONNX, or TensorRT), verifies the
    conversion against the original model, and writes a Triton model
    repository entry (model file + config.pbtxt)."""

    def __init__(self, args):
        # Parsed CLI namespace from create_deployer().
        self.args = args
        self.lib = DeployerLibrary(args)

    def deploy(self, dataloader, model):
        """Deploy *model* in the format selected by the CLI flags and test
        for correctness against sample batches from *dataloader*."""
        if self.args.ts_script or self.args.ts_trace:
            self.lib.set_platform("pytorch_libtorch")
            print(
                "deploying model "
                + self.args.triton_model_name
                + " in format "
                + self.lib.platform
            )
            self.to_triton_torchscript(dataloader, model)
        elif self.args.onnx:
            self.lib.set_platform("onnxruntime_onnx")
            print(
                "deploying model "
                + self.args.triton_model_name
                + " in format "
                + self.lib.platform
            )
            self.to_triton_onnx(dataloader, model)
        elif self.args.trt:
            self.lib.set_platform("tensorrt_plan")
            print(
                "deploying model "
                + self.args.triton_model_name
                + " in format "
                + self.lib.platform
            )
            self.to_triton_trt(dataloader, model)
        else:
            # Unreachable when args come from create_deployer(): the method
            # group there is mutually exclusive and required.
            assert False, "error"
        print("done")

    def to_triton_trt(self, dataloader, model):
        """Export the model to a TensorRT engine (via an intermediate ONNX
        export) and test correctness on *dataloader*."""
        import tensorrt as trt

        # setup device
        if self.args.triton_no_cuda:
            device = torch.device("cpu")
        else:
            device = torch.device("cuda")

        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "

        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)

        # generate reference outputs with the original model
        outputs = []
        for input in inputs:
            with torch.no_grad():
                output = model(*input)
            if type(output) is torch.Tensor:
                output = [output]
            outputs.append(output)

        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)

        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)

        # generate input types
        input_types = [x.dtype for x in inputs[0]]

        # generate output types
        output_types = [x.dtype for x in outputs[0]]

        # get input names (Triton convention: input__0, input__1, ...)
        rng = range(len(input_types))
        input_names = ["input__" + str(num) for num in rng]

        # get output names (Triton convention: output__0, output__1, ...)
        rng = range(len(output_types))
        output_names = ["output__" + str(num) for num in rng]

        # prepare save path: <save_dir>/<model_name>/<version>/model.plan
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, "model.plan")

        # get indices of dynamic input and output shapes
        dynamic_axes = {}
        for input_name, shape in zip(input_names, input_shapes):
            dynamic_axes[input_name] = [i for i, x in enumerate(shape) if x == -1]
        for output_name, shape in zip(output_names, output_shapes):
            dynamic_axes[output_name] = [i for i, x in enumerate(shape) if x == -1]

        # Export the model to ONNX first. NOTE: the ONNX bytes are written
        # to the .plan path temporarily and overwritten with the serialized
        # TRT engine below.
        with torch.no_grad():
            torch.onnx.export(
                model,
                inputs[0],
                final_model_path,
                verbose=False,
                input_names=input_names,
                output_names=output_names,
                dynamic_axes=dynamic_axes,
                opset_version=11,
            )

        # get min/opt/max shapes for the TRT optimization profile
        min_shapes = self.lib.get_tuple_of_min_shapes(inputs)
        opt_shapes = self.lib.get_tuple_of_opt_shapes(inputs)
        max_shapes = self.lib.get_tuple_of_max_shapes(inputs)

        zipped = zip(input_names, min_shapes, opt_shapes, max_shapes)
        shapes = []
        for name, min_shape, opt_shape, max_shape in zipped:
            d = {"name": name, "min": min_shape, "opt": opt_shape, "max": max_shape}
            shapes.append(d)

        # build trt engine
        engine = self.lib.build_trt_engine(final_model_path, shapes)
        assert engine is not None, " trt export failure "

        # write trt engine (replaces the temporary ONNX file)
        with open(final_model_path, "wb") as f:
            f.write(engine.serialize())

        # reload the engine from disk to validate the serialized artifact
        engine = self.lib.load_engine(final_model_path)

        class TRT_model:
            # Thin callable wrapper giving the TRT engine the same
            # call signature as the original torch model.
            def __init__(self, engine, input_names, output_names, output_types, device):
                self.engine = engine
                self.context = self.engine.create_execution_context()
                self.input_names = input_names
                self.output_names = output_names
                self.output_types = output_types
                self.device = device

            def is_dimension_dynamic(self, dim):
                # TRT reports dynamic dims as -1 (or None).
                return dim is None or dim <= 0

            def is_shape_dynamic(self, shape):
                return any([self.is_dimension_dynamic(dim) for dim in shape])

            def __call__(self, *inputs):
                # get input shapes
                input_shapes = [x.shape for x in inputs]
                # one binding slot per engine binding (inputs + outputs)
                bindings = [None] * self.engine.num_bindings
                # set input shapes, bind input tensors
                zipped = zip(self.input_names, inputs)
                for key, input in zipped:
                    idx = self.engine.get_binding_index(key)
                    bindings[idx] = input.data_ptr()
                    if self.engine.is_shape_binding(idx) and self.is_shape_dynamic(
                        self.context.get_shape(idx)
                    ):
                        # shape tensors are fed by value, not by pointer
                        self.context.set_shape_input(idx, input)
                    elif self.is_shape_dynamic(self.engine.get_binding_shape(idx)):
                        self.context.set_binding_shape(idx, input.shape)
                assert self.context.all_binding_shapes_specified, "trt error"
                assert self.context.all_shape_inputs_specified, "trt error"
                # calculate output shapes, allocate output tensors and bind them
                outputs = []
                zipped = zip(self.output_names, self.output_types)
                for key, dtype in zipped:
                    idx = self.engine.get_binding_index(key)
                    shape = self.context.get_binding_shape(idx)
                    shape = tuple(shape)
                    assert -1 not in shape, "trt error"
                    tensor = torch.zeros(shape, dtype=dtype, device=self.device)
                    outputs.append(tensor)
                    bindings[idx] = outputs[-1].data_ptr()
                # run inference synchronously
                self.context.execute_v2(bindings=bindings)
                # unwrap single-output results, matching torch model behavior
                if len(outputs) == 1:
                    outputs = outputs[0]
                return outputs

        model_trt = TRT_model(engine, input_names, output_names, output_types, device)

        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        models = (model, model_trt)
        outputs, time_model, outputs_trt, time_model_trt = self.lib.run_models(
            models, inputs
        )

        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_trt)
        self.lib.print_errors(Error_stats)
        print("time of error check of native model: ", time_model, "seconds")
        print("time of error check of trt model: ", time_model_trt, "seconds")
        print()

        # write TRTIS config
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(
            config_filename, input_shapes, input_types, output_shapes, output_types
        )

    def name_onnx_nodes(self, model_path):
        """
        Name all unnamed nodes in ONNX model

        parameter model_path: path ONNX model
        return: none

        NOTE(review): relies on the module-level `onnx` / `onnxruntime`
        globals assigned in to_triton_onnx() — calling this before
        to_triton_onnx() raises NameError; confirm intended call order.
        """
        model = onnx.load(model_path)
        node_id = 0
        for node in model.graph.node:
            if len(node.name) == 0:
                node.name = "unnamed_node_%d" % node_id
                node_id += 1
        # This check partially validates model
        onnx.checker.check_model(model)
        onnx.save(model, model_path)
        # Only inference really checks ONNX model for some issues
        # like duplicated node names
        onnxruntime.InferenceSession(model_path, None)

    def to_triton_onnx(self, dataloader, model):
        """Export the model to ONNX and test correctness on *dataloader*."""
        # Deferred imports promoted to module-level globals so that
        # name_onnx_nodes() can use them without importing at module load.
        import onnx as local_onnx

        global onnx
        onnx = local_onnx
        import onnxruntime as local_onnxruntime

        global onnxruntime
        onnxruntime = local_onnxruntime
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device("cpu")
        else:
            device = torch.device("cuda")

        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "

        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)

        # generate reference outputs with the original model
        outputs = []
        for input in inputs:
            with torch.no_grad():
                output = model(*input)
            if type(output) is torch.Tensor:
                output = [output]
            outputs.append(output)

        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)

        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)

        # generate input types
        input_types = [x.dtype for x in inputs[0]]

        # generate output types
        output_types = [x.dtype for x in outputs[0]]

        # get input names (Triton convention: input__0, input__1, ...)
        rng = range(len(input_types))
        input_names = ["input__" + str(num) for num in rng]

        # get output names (Triton convention: output__0, output__1, ...)
        rng = range(len(output_types))
        output_names = ["output__" + str(num) for num in rng]

        # prepare save path: <save_dir>/<model_name>/<version>/model.onnx
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, "model.onnx")

        # get indices of dynamic input and output shapes
        dynamic_axes = {}
        for input_name, input_shape in zip(input_names, input_shapes):
            dynamic_axes[input_name] = [i for i, x in enumerate(input_shape) if x == -1]
        for output_name, output_shape in zip(output_names, output_shapes):
            dynamic_axes[output_name] = [
                i for i, x in enumerate(output_shape) if x == -1
            ]

        # export the model
        assert not model.training, "internal error - model should be in eval() mode! "
        with torch.no_grad():
            torch.onnx.export(
                model,
                inputs[0],
                final_model_path,
                verbose=True,
                input_names=input_names,
                output_names=output_names,
                dynamic_axes=dynamic_axes,
                opset_version=11,
            )

        # syntactic error check
        converted_model = onnx.load(final_model_path)
        # check that the IR is well formed
        onnx.checker.check_model(converted_model)

        # Name unnamed nodes - it helps for some other processing tools
        self.name_onnx_nodes(final_model_path)
        converted_model = onnx.load(final_model_path)

        # load the model
        session = onnxruntime.InferenceSession(final_model_path, None)

        class ONNX_model:
            # Callable wrapper matching the torch model's signature.
            # NOTE(review): the `device` parameter is accepted but not
            # stored; __call__ uses the enclosing function's `device` via
            # closure — works, but the unused parameter is misleading.
            def __init__(self, session, input_names, device):
                self.session = session
                self.input_names = input_names

            def to_numpy(self, tensor):
                return (
                    tensor.detach().cpu().numpy()
                    if tensor.requires_grad
                    else tensor.cpu().numpy()
                )

            def __call__(self, *inputs):
                inp = [
                    (input_name, inputs[i])
                    for i, input_name in enumerate(self.input_names)
                ]
                inp = {input_name: self.to_numpy(x) for input_name, x in inp}
                outputs = self.session.run(None, inp)
                outputs = [torch.from_numpy(output) for output in outputs]
                # `device` is captured from the enclosing to_triton_onnx scope
                outputs = [output.to(device) for output in outputs]
                if len(outputs) == 1:
                    outputs = outputs[0]
                return outputs

        # switch to eval mode
        model_onnx = ONNX_model(session, input_names, device)

        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        models = (model, model_onnx)
        outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(
            models, inputs
        )

        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_onnx)
        self.lib.print_errors(Error_stats)
        print("time of error check of native model: ", time_model, "seconds")
        print("time of error check of onnx model: ", time_model_onnx, "seconds")
        print()

        # write TRTIS config
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(
            config_filename, input_shapes, input_types, output_shapes, output_types
        )

    def to_triton_torchscript(self, dataloader, model):
        """Export the model to TorchScript and test correctness on
        *dataloader*."""
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device("cpu")
        else:
            device = torch.device("cuda")

        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "

        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)

        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)

        # generate input types
        input_types = [x.dtype for x in inputs[0]]

        # prepare save path: <save_dir>/<model_name>/<version>/model.pt
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, "model.pt")

        # Convert the model; deploy() guarantees exactly one of ts_trace /
        # ts_script is set, so model_ts is always bound here.
        with torch.no_grad():
            if self.args.ts_trace:  # trace it
                model_ts = torch.jit.trace(model, inputs[0])
            if self.args.ts_script:  # script it
                model_ts = torch.jit.script(model)

        # save the model
        torch.jit.save(model_ts, final_model_path)

        # load the model
        model_ts = torch.jit.load(final_model_path)
        model_ts.eval()  # WAR for bug : by default, model_ts gets loaded in training mode

        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        assert (
            not model_ts.training
        ), "internal error - converted model should be in eval() mode! "
        models = (model, model_ts)
        outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(
            models, inputs
        )

        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_ts)
        self.lib.print_errors(Error_stats)
        print("time of error check of native model: ", time_model, "seconds")
        print("time of error check of ts model: ", time_model_ts, "seconds")
        print()

        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)

        # generate output types
        output_types = [x.dtype for x in outputs[0]]

        # now we build the config for TRTIS
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(
            config_filename, input_shapes, input_types, output_shapes, output_types
        )
|