
[ResNet50/PyT] Triton perf fix

* ResNet50/PyT Triton ONNX Runtime fix with env flag

Scripts were modified to add the missing ORT_TENSORRT_FP16_ENABLE flag for
Triton Inference Server with the ONNX Runtime backend and TensorRT execution provider.

* ResNet50/PyT TensorRT FP16 support fixed

The ONNX-to-TensorRT converter was fixed to force FP16 precision on the
inputs and outputs of TensorRT networks.
Piotr Marcinkiewicz, 4 years ago
Commit 5c33a8289b

+ 6 - 0
PyTorch/Classification/ConvNets/triton/deployment_toolkit/bermuda/onnx2trt_conv.py

@@ -96,6 +96,12 @@ def onnx2trt(
                 LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
             raise RuntimeError("Error during parsing ONNX model (see logs for details)")
 
+        # OnnxParser produces an FP32 TensorRT network here even for an FP16 model,
+        # so we force FP16 on the first input/output
+        if fp16_mode:
+            network.get_input(0).dtype = trt.DataType.HALF
+            network.get_output(0).dtype = trt.DataType.HALF
+
         # optimization
         config = builder.create_builder_config()
         config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
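
For context, here is a minimal sketch of the build path this hunk sits in, assuming the TensorRT Python API of that generation; build_fp16_engine and the explicit-batch setup are illustrative, not the toolkit's actual onnx2trt signature:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_fp16_engine(onnx_path):
    # hypothetical helper mirroring the fixed onnx2trt code path
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            raise RuntimeError("Error during parsing ONNX model")
    # the parser leaves bindings as FP32, so pin the first input/output to FP16
    network.get_input(0).dtype = trt.DataType.HALF
    network.get_output(0).dtype = trt.DataType.HALF
    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)  # equivalent to the bit-shift above
    return builder.build_engine(network, config)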

+ 1 - 0
PyTorch/Classification/ConvNets/triton/scripts/docker/triton_inference_server.sh

@@ -21,6 +21,7 @@ docker run --rm -d \
   -p 8002:8002 \
   --runtime=nvidia \
   -e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
+  -e ORT_TENSORRT_FP16_ENABLE=1 \
   -v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
   --shm-size=1g \
   --ulimit memlock=-1 \
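
For a quick sanity check outside Triton, the same switch is available through ONNX Runtime's TensorRT execution provider options; a minimal sketch, assuming an ONNX Runtime build with the TensorRT EP (the model path and input shape are placeholders):

import numpy as np
import onnxruntime as ort

# trt_fp16_enable is the provider-options counterpart of ORT_TENSORRT_FP16_ENABLE=1
providers = [
    ("TensorrtExecutionProvider", {"trt_fp16_enable": True}),
    "CUDAExecutionProvider",  # fallback for nodes TensorRT cannot handle
]
session = ort.InferenceSession("resnet50.onnx", providers=providers)
batch = np.random.rand(1, 3, 224, 224).astype(np.float32)
outputs = session.run(None, {session.get_inputs()[0].name: batch})
print(outputs[0].shape)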

+ 2 - 2
PyTorch/Classification/ConvNets/triton/scripts/setup_parameters.sh

@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 export PRECISION="fp16"
-export FORMAT="trt"
+export FORMAT="onnx"
 export BATCH_SIZE="1,2,4,8,16,32,64,128"
-export BACKEND_ACCELERATOR="cuda"
+export BACKEND_ACCELERATOR="trt"
 export MAX_BATCH_SIZE="128"
 export NUMBER_OF_MODEL_INSTANCES="1"
 export TRITON_MAX_QUEUE_DELAY="1"
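
The parameter change switches the deployment from a standalone TensorRT plan accelerated on CUDA to an ONNX model served by Triton's ONNX Runtime backend with the TensorRT execution provider. In the generated Triton model configuration this would typically appear as a gpu_execution_accelerator entry; a minimal config.pbtxt sketch, assuming the deployment toolkit emits something along these lines:

optimization {
  execution_accelerators {
    gpu_execution_accelerator : [ {
      name : "tensorrt"
    } ]
  }
}

With this accelerator enabled, the ORT_TENSORRT_FP16_ENABLE=1 flag set in the server container (see the docker script above) controls whether the embedded TensorRT engines run in FP16.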