#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

- r"""
- To configure model on Triton, you can use `config_model_on_triton.py` script.
- This will prepare layout of Model Repository, including Model Configuration.
```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --model-version 1 \
    --max-batch-size 32 \
    --precision fp16 \
    --backend-accelerator trt \
    --load-model explicit \
    --timeout 120 \
    --verbose
```

If the Triton server for which the model repository is prepared runs in **explicit model control mode**,
use the `--load-model` argument to send a load_model request to the Triton Inference Server.
If the server is listening on a non-default address or port, use the `--server-url` argument to point at the server control endpoint.
To communicate with the Triton server over HTTP instead of gRPC, use an `http://` scheme in `--server-url`.
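
For example, a hypothetical invocation (model name, paths, and endpoint address are placeholders) that deploys a model and explicitly loads it on a server listening on a non-default HTTP endpoint:

```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --load-model explicit \
    --server-url http://127.0.0.1:8000 \
    --timeout 120
```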

To improve inference throughput you can enable
[dynamic batching](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher)
for your model by providing the `--preferred-batch-sizes` and `--max-queue-delay-us` parameters, as in the sketch below.
For models which don't support batching, set `--max-batch-size` to 0.
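
A minimal sketch of such a configuration (the batch sizes and queue delay are illustrative values, not tuned recommendations):

```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --max-batch-size 64 \
    --preferred-batch-sizes 16 32 64 \
    --max-queue-delay-us 100
```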

By default Triton will [automatically obtain inputs and outputs definitions](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#auto-generated-model-configuration),
but for TorchScript and TF GraphDef models the script uses a file with I/O specs. This file is automatically generated
when the model is converted to ScriptModule (either traced or scripted).
If you need to pass a non-default path to the I/O spec file, use the `--io-spec` CLI argument.

The I/O spec file is a YAML file with the following structure:

```yaml
- inputs:
  - name: input
    dtype: float32  # np.dtype name
    shape: [None, 224, 224, 3]
- outputs:
  - name: probabilities
    dtype: float32
    shape: [None, 1001]
  - name: classes
    dtype: int32
    shape: [None, 1]
```

"""

import argparse
import logging
import time

from model_navigator import Accelerator, Format, Precision
from model_navigator.args import str2bool
from model_navigator.log import set_logger, log_dict
from model_navigator.triton import ModelConfig, TritonClient, TritonModelStore

LOGGER = logging.getLogger("config_model")


def _available_enum_values(my_enum):
    return [item.value for item in my_enum]


def main():
    parser = argparse.ArgumentParser(
        description="Create Triton model repository and model configuration", allow_abbrev=False
    )
    parser.add_argument("--model-repository", required=True, help="Path to Triton model repository.")
    parser.add_argument("--model-path", required=True, help="Path to model to configure")

    # TODO: automation
    parser.add_argument(
        "--model-format",
        required=True,
        choices=_available_enum_values(Format),
        help="Format of model to deploy",
    )
    parser.add_argument("--model-name", required=True, help="Model name")
    parser.add_argument("--model-version", default="1", help="Version of model (default 1)")
    parser.add_argument(
        "--max-batch-size",
        type=int,
        default=32,
        help="Maximum batch size allowed for inference. "
        "A max_batch_size value of 0 indicates that batching is not allowed for the model",
    )
    # TODO: automation
    parser.add_argument(
        "--precision",
        type=str,
        default=Precision.FP16.value,
        choices=_available_enum_values(Precision),
        help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization)",
    )

    # Triton Inference Server endpoint
    parser.add_argument(
        "--server-url",
        type=str,
        default="grpc://localhost:8001",
        help="Inference server URL in format protocol://host[:port] (default grpc://localhost:8001)",
    )
    parser.add_argument(
        "--load-model",
        choices=["none", "poll", "explicit"],
        help="Loading model while Triton Server is in given model control mode",
    )
    parser.add_argument(
        "--timeout", default=120, help="Timeout in seconds to wait till model load (default=120)", type=int
    )

    # optimization related
    parser.add_argument(
        "--backend-accelerator",
        type=str,
        choices=_available_enum_values(Accelerator),
        default=Accelerator.TRT.value,
        help="Select Backend Accelerator used to serve model",
    )
    parser.add_argument("--number-of-model-instances", type=int, default=1, help="Number of model instances per GPU")
    parser.add_argument(
        "--preferred-batch-sizes",
        type=int,
        nargs="*",
        help="Batch sizes that the dynamic batcher should attempt to create. "
        "In case --max-queue-delay-us is set and this parameter is not, default value will be --max-batch-size",
    )
    parser.add_argument(
        "--max-queue-delay-us",
        type=int,
        default=0,
        help="Max delay time which dynamic batcher shall wait to form a batch (default 0)",
    )
    parser.add_argument(
        "--capture-cuda-graph",
        type=int,
        default=0,
        help="Use cuda capture graph (used only by TensorRT platform)",
    )
    parser.add_argument("-v", "--verbose", help="Provide verbose logs", type=str2bool, default=False)
    args = parser.parse_args()

    set_logger(verbose=args.verbose)
    log_dict("args", vars(args))

    config = ModelConfig.create(
        model_path=args.model_path,
        # model definition
        model_name=args.model_name,
        model_version=args.model_version,
        model_format=args.model_format,
        precision=args.precision,
        max_batch_size=args.max_batch_size,
        # optimization
        accelerator=args.backend_accelerator,
        gpu_engine_count=args.number_of_model_instances,
        preferred_batch_sizes=args.preferred_batch_sizes or [],
        max_queue_delay_us=args.max_queue_delay_us,
        capture_cuda_graph=args.capture_cuda_graph,
    )

    model_store = TritonModelStore(args.model_repository)
    model_store.deploy_model(model_config=config, model_path=args.model_path)

    if args.load_model != "none":
        client = TritonClient(server_url=args.server_url, verbose=args.verbose)
        client.wait_for_server_ready(timeout=args.timeout)

        if args.load_model == "explicit":
            client.load_model(model_name=args.model_name)

        if args.load_model == "poll":
            time.sleep(15)

        client.wait_for_model(model_name=args.model_name, model_version=args.model_version, timeout_s=args.timeout)


if __name__ == "__main__":
    main()