【猿代码科技】TensorRT保姆级实操手册快速入门学习路线
1 TensorRT基础
TensorRT
TensorRT的
与
TensorRT
TensorRT 安装
2 模型优化
模型
优化
模型
行推理
TensorRT优化策略
1
精度
模式:FP32、FP16
层融合
4
动态张量与静态张量
使用
插件扩展
TensorRT
多模型与多流推理
同时优化多个模型
使用CUDA流进行多流并行推理
用
TensorRT部署
在NVIDIA Jetson、Datacenter GPUs、Edge设备上部署
使用Docker容器
实
的模型推理
特性与
学习内容
的和。
enrator
实项目:
一个的模型(如)化
的进行推理。
TensorRT和。
性能
项目
一个的模型。
性能。
用
学习
TensorRT物体检测
的。
目:实
TensorRT化一个模型
一个实用。
程的型
描述:一个系统,时物体检测行。
# Refresh the apt package index, then install the libnvinfer7 package.
sudo apt-get update
sudo apt-get install libnvinfer7
.deb
# Install the local TensorRT repository package with dpkg.
# NOTE(review): per its file name this .deb is TensorRT 5.0.2.6 (rc) for
# Ubuntu 18.04 / CUDA 10.0, whereas the apt command installs libnvinfer7 --
# confirm which TensorRT version is intended.
sudo dpkg -i nv-tensorrt-repo-ubuntu1804-cuda10.0-trt5.0.2.6-rc-20190227_1-1_amd64.deb
.exe / .msi
"""Smoke test: create a TensorRT builder and an empty network, then print a greeting."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Both objects are context managers, so they are released when the block exits.
with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
    print("TensorRT Hello World!")
hello_tensorrt.py
python hello_tensorrt.py
.onnx
torch.onnx.export()、tf2onnx.convert、SavedModel
"""Export a pretrained torchvision ResNet-18 to ``resnet18.onnx``."""
import torch
import torchvision.models as models

# Initialize model
model = models.resnet18(pretrained=True)
# Switch to inference mode before export so the exported graph uses
# eval-time behavior.
model.eval()

# Dummy input: the exporter traces the model with this tensor.
x = torch.randn(1, 3, 224, 224)

# Export to ONNX format
torch.onnx.export(model, x, "resnet18.onnx", verbose=True)
export_pytorch_to_onnx.py
python export_pytorch_to_onnx.py
resnet18.onnx
"""Parse ``resnet18.onnx`` and build an optimized TensorRT engine from it.

Uses the legacy (TensorRT 7 style) builder attributes.
"""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
# The keyword for create_network is ``flags``; ``common_flags`` is rejected.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

# Load ONNX model
with open("resnet18.onnx", "rb") as f:
    onnx_model = f.read()

# Create a TensorRT builder and network
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(EXPLICIT_BATCH) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256 MiB of scratch space
    builder.max_batch_size = 1

    # Parse ONNX model into TensorRT network; parse() returns False on error.
    if not parser.parse(onnx_model):
        for error_index in range(parser.num_errors):
            print(parser.get_error(error_index))
        raise RuntimeError("Failed to parse resnet18.onnx")

    # Build optimized engine; the builder returns None when the build fails.
    engine = builder.build_cuda_engine(network)
    if engine is None:
        raise RuntimeError("Failed to build TensorRT engine")

    with engine:
        # Inference (use the engine for inference)
        with engine.create_execution_context() as context:
            print("Optimized model successfully!")
optimize_with_tensorrt.py
python optimize_with_tensorrt.py
model.eval()、builder.max_workspace_size
"""Build FP32 and FP16 TensorRT engines from ``resnet18.onnx``."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

# Load ONNX model
with open("resnet18.onnx", "rb") as f:
    onnx_model = f.read()


def build_engine(onnx_model, fp16_mode=False):
    """Build a TensorRT engine from serialized ONNX bytes.

    Args:
        onnx_model: Raw bytes of the ONNX file.
        fp16_mode: When True, allow the builder to use FP16 kernels.

    Returns:
        The value returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 28  # 256 MiB of scratch space
        builder.max_batch_size = 1
        builder.fp16_mode = fp16_mode

        # Parse ONNX model into TensorRT network; parse() returns False on error.
        if not parser.parse(onnx_model):
            for error_index in range(parser.num_errors):
                print(parser.get_error(error_index))
            raise RuntimeError("Failed to parse ONNX model")

        # Build optimized engine
        return builder.build_cuda_engine(network)


# Build FP32 and FP16 engines
engine_fp32 = build_engine(onnx_model, fp16_mode=False)
engine_fp16 = build_engine(onnx_model, fp16_mode=True)
print("Built FP32 and FP16 engines!")
optimize_fp32_fp16.py
python optimize_fp32_fp16.py
context.execute、time
import time

import numpy as np
import pycuda.autoinit  # noqa: F401 -- importing it creates the CUDA context
import pycuda.driver as cuda


def _timed_inference(context, h_input, h_output):
    """Run one inference and return the elapsed execution time in seconds.

    TensorRT bindings must be device addresses, so the host arrays are
    staged through device buffers; only the execute call itself is timed.
    """
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_input, h_input)

    start_time = time.perf_counter()
    # execute_v2 is the synchronous call for explicit-batch engines.
    context.execute_v2(bindings=[int(d_input), int(d_output)])
    elapsed = time.perf_counter() - start_time

    cuda.memcpy_dtoh(h_output, d_output)
    return elapsed


# Create execution context
# NOTE(review): engine_fp32 / engine_fp16 are expected to exist already
# (built earlier in optimize_fp32_fp16.py) -- confirm.
with engine_fp32.create_execution_context() as context_fp32, \
        engine_fp16.create_execution_context() as context_fp16:
    # Create input and output buffer
    h_input = np.random.random((1, 3, 224, 224)).astype(np.float32)
    h_output_fp32 = np.empty((1, 1000), dtype=np.float32)
    h_output_fp16 = np.empty((1, 1000), dtype=np.float32)

    # Run inference and time it
    # NOTE(review): a single un-warmed run is timed, as in the original.
    print("FP32 Inference Time: ", _timed_inference(context_fp32, h_input, h_output_fp32))
    print("FP16 Inference Time: ", _timed_inference(context_fp16, h_input, h_output_fp16))
optimize_fp32_fp16.py
python optimize_fp32_fp16.py
"""Build an INT8 TensorRT engine from ``resnet18.onnx``."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def build_int8_engine(onnx_model_path):
    """Build an INT8 engine from the ONNX file at ``onnx_model_path``.

    Returns:
        The value returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.int8_mode = True
        # YourOwnCalibrator is a placeholder and is not defined in this snippet.
        builder.int8_calibrator = YourOwnCalibrator()  # You need to implement this

        with open(onnx_model_path, 'rb') as model:
            # parse() returns False on error.
            if not parser.parse(model.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse ONNX model")

        return builder.build_cuda_engine(network)


int8_engine = build_int8_engine("resnet18.onnx")
optimize_int8.py
python optimize_int8.py
"""Build a TensorRT engine whose batch dimension is dynamic."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def build_dynamic_engine(onnx_model_path):
    """Build an engine with an optimization profile for batch sizes 1 to 16.

    Returns:
        The value returned by ``builder.build_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            builder.create_builder_config() as config, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_model_path, 'rb') as model:
            # parse() returns False on error.
            if not parser.parse(model.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse ONNX model")

        # Specify that the first dimension is dynamic: (min, opt, max) shapes.
        # The input name is read from the parsed network rather than hard-coded.
        # NOTE(review): assumes the ONNX file was exported with a dynamic
        # batch axis -- confirm.
        profile = builder.create_optimization_profile()
        profile.set_shape(
            network.get_input(0).name,
            (1, 3, 224, 224),
            (8, 3, 224, 224),
            (16, 3, 224, 224),
        )
        config.add_optimization_profile(profile)

        # The profile lives on the config, so the config-aware build call
        # must be used for it to take effect.
        return builder.build_engine(network, config)


dynamic_engine = build_dynamic_engine("resnet18.onnx")
dynamic_input.py
python dynamic_input.py
tensorrt.IPluginV2Ext、enqueue、getOutputDimensions
import tensorrt as trt


class ReLU6Plugin(trt.IPluginV2Ext):
    """Skeleton of a custom ReLU6 plugin layer; the compute part is not implemented.

    NOTE(review): whether ``IPluginV2Ext`` can be implemented from Python, and
    the exact method names/signatures it expects, depend on the TensorRT
    version -- verify against the plugin API reference before relying on it.
    """

    def __init__(self):
        pass

    def get_output_dimension(self, input_dimension, input_tensors):
        """Return ``input_dimension`` unchanged."""
        return input_dimension  # Output dimension is same as input for ReLU6

    def enqueue(self, bindings, input_tensors, output_tensors, stream_handle):
        """Placeholder for the CUDA implementation; currently does nothing."""
        # Your CUDA implementation for ReLU6 comes here
        pass

    # Other necessary methods like clone, getNbOutputs, etc.
enqueue、clone、getNbOutputs
import tensorrt as trt

# Register the custom plugin
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
plugin_creator = trt.get_plugin_registry().get_plugin_creator("ReLU6_TRT", "1", "")

# Create the ReLU6 plugin and add it to the network
# NOTE(review): `ReLU6Plugin`, `network` and `input_tensor` are not defined in
# this snippet; `network` presumably has to come from a builder's
# create_network() call -- confirm how this fragment is meant to be wired up.
relu6_plugin = ReLU6Plugin()
network.add_plugin_v2(inputs=[input_tensor], plugin=relu6_plugin)

# Build the engine and run inference
with trt.Builder(TRT_LOGGER) as builder, \
        builder.build_cuda_engine(network) as engine, \
        engine.create_execution_context() as context:
    # Run inference with the custom ReLU6 plugin
    pass  # placeholder so the block is syntactically complete
add_plugin_v2
"""Build TensorRT engines for a classification and a detection model."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def build_engine(onnx_model_path):
    """Build a TensorRT engine from the ONNX file at ``onnx_model_path``.

    Returns:
        The value returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_model_path, 'rb') as model:
            # parse() returns False on error.
            if not parser.parse(model.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse " + onnx_model_path)
        return builder.build_cuda_engine(network)


classification_engine = build_engine("classification.onnx")
detection_engine = build_engine("detection.onnx")
optimize_models.py
python optimize_models.py
import cuda  # NOTE(review): not used below; confirm this import is needed
import pycuda.driver as cuda_driver
import pycuda.autoinit

stream1 = cuda_driver.Stream()
stream2 = cuda_driver.Stream()

# Create execution context for each engine
# NOTE(review): classification_engine / detection_engine are expected to be
# built beforehand (see optimize_models.py) -- confirm.
context1 = classification_engine.create_execution_context()
context2 = detection_engine.create_execution_context()

# Prepare input and output buffer
# ...
# NOTE(review): h_input1/h_output1/h_input2/h_output2 must be created here.
# Page-locked host arrays (cuda_driver.pagelocked_empty) are presumably
# needed for the copies below to be truly asynchronous -- confirm.


def inference(execution_context, stream, h_input, h_output):
    """Enqueue one inference on ``stream`` without waiting for it to finish.

    Assumes the engine has exactly one input and one output binding
    (TODO confirm). The caller must synchronize ``stream`` before reading
    ``h_output``.

    Returns:
        The device buffers, so the caller can keep them alive until the
        stream has been synchronized.
    """
    # Run inference using TensorRT and CUDA stream
    d_input = cuda_driver.mem_alloc(h_input.nbytes)
    d_output = cuda_driver.mem_alloc(h_output.nbytes)
    cuda_driver.memcpy_htod_async(d_input, h_input, stream)
    execution_context.execute_async_v2(
        bindings=[int(d_input), int(d_output)],
        stream_handle=stream.handle,
    )
    cuda_driver.memcpy_dtoh_async(h_output, d_output, stream)
    return d_input, d_output


# Run inferences in parallel on two CUDA streams
buffers1 = inference(context1, stream1, h_input1, h_output1)
buffers2 = inference(context2, stream2, h_input2, h_output2)

# Wait for both streams before the host outputs are used.
stream1.synchronize()
stream2.synchronize()
multi_stream_inference.py
python multi_stream_inference.py
.plan
# Use TensorRT base image
# NOTE(review): confirm that a `latest` tag exists for this image; pinning an
# explicit release tag is presumably required and more reproducible.
FROM nvcr.io/nvidia/tensorrt:latest

# Copy model and app
COPY ./models /models
COPY ./app /app

# Run inference app
CMD ["python", "/app/inference.py"]
# Build the image from the Dockerfile in the current directory.
docker build -t tensorrt_app .
# Run the container with access to all GPUs.
docker run --gpus all tensorrt_app
--gpus all
"""Build an INT8 engine for ``yolo.onnx`` using a custom entropy calibrator."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


class MyCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 calibrator skeleton that stores the calibration data it is given."""

    def __init__(self, data):
        # The base initializer must be called explicitly when __init__ is
        # overridden, otherwise TensorRT cannot use the object.
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.data = data
        # Other initialization steps

    # Implement other required methods for the calibrator class


# Create calibrator
# NOTE(review): `data` is not defined in this snippet; supply the calibration
# dataset here.
calibrator = MyCalibrator(data)


def build_engine(onnx_model_path):
    """Build an INT8 engine from the ONNX file at ``onnx_model_path``.

    Returns:
        The value returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.int8_mode = True
        builder.int8_calibrator = calibrator

        with open(onnx_model_path, 'rb') as model:
            # parse() returns False on error.
            if not parser.parse(model.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse " + onnx_model_path)

        return builder.build_cuda_engine(network)


engine = build_engine("yolo.onnx")
"""Build an engine that targets a DLA core and create its execution context."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# DLA is selected at build time through the builder config; the execution
# context has no device-type or device-core setters.
# NOTE(review): `network` is not defined in this snippet and must be created
# and populated (e.g. via the ONNX parser) beforehand.
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_builder_config() as config:
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0
    # Let layers that DLA cannot run fall back to the GPU.
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    # NOTE(review): DLA presumably requires FP16 or INT8 precision -- confirm.
    config.set_flag(trt.BuilderFlag.FP16)

    with builder.build_engine(network, config) as engine:
        context = engine.create_execution_context()
        # NOTE(review): the two calls below only apply to dynamic-shape
        # engines; confirm they are needed for this model.
        context.active_optimization_profile = 0
        context.set_binding_shape(0, (1, 3, 300, 300))
        # Run inference using DLA
"""Build an FP16 engine from ``model.onnx`` and attach a profiler for diagnosis."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

# Create builder
with trt.Builder(TRT_LOGGER) as builder:
    # Enable various optimization
    builder.fp16_mode = True
    builder.strict_type_constraints = True

    # Build engine
    with builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open("model.onnx", 'rb') as model_file:
            # parse() returns False on error.
            if not parser.parse(model_file.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse model.onnx")
        engine = builder.build_cuda_engine(network)

if engine is None:
    raise RuntimeError("Failed to build TensorRT engine")

# Diagnose performance: the profiler is a property of the execution context,
# not of the builder.
# MyProfiler is a placeholder and is not defined in this snippet.
context = engine.create_execution_context()
context.profiler = MyProfiler()  # Implement your own profiler
# Enable more optimization
# NOTE(review): `builder` is expected to be an existing trt.Builder.
builder.max_workspace_size = 1 << 30  # 1GB
builder.max_batch_size = 8
"""Build the TensorRT engine for the face recognition model."""
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires a network created with the explicit-batch flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def build_engine(onnx_model_path):
    """Build a TensorRT engine from the ONNX file at ``onnx_model_path``.

    Returns:
        The value returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_model_path, 'rb') as model:
            # parse() returns False on error.
            if not parser.parse(model.read()):
                for error_index in range(parser.num_errors):
                    print(parser.get_error(error_index))
                raise RuntimeError("Failed to parse " + onnx_model_path)
        return builder.build_cuda_engine(network)


face_recognition_engine = build_engine("face_recognition.onnx")
import cv2  # the OpenCV module name is lowercase; `import CV2` fails

# Initialize camera and face recognition engine
# NOTE(review): face_recognition_engine is expected to be built beforehand.
cap = cv2.VideoCapture(0)
context = face_recognition_engine.create_execution_context()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Prepare input and output buffers
        # ...
        # NOTE(review): input_buffers / output_buffers are not defined in this
        # snippet; the bindings list must hold device addresses as ints.

        # Run inference. The synchronous call is used because no CUDA stream
        # is created here and the results are consumed immediately.
        context.execute_v2(bindings=[input_buffers, output_buffers])

        # Process and display results
        # ...
        cv2.imshow('Face Recognition', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and windows even if the loop raises.
    cap.release()
    cv2.destroyAllWindows()
pip install opencv-python
pip install numpy
下载链接、配置文件
导入
"""Smart surveillance demo: YOLOv3 person detection plus Haar-cascade face boxes."""
import cv2  # the OpenCV module name is lowercase; `import CV2` fails
import numpy as np

cap = cv2.VideoCapture(0)
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")  # Download these files
# Ask OpenCV for the output layer names directly; indexing the result of
# getUnconnectedOutLayers() with i[0] breaks on OpenCV versions that return
# a flat array.
output_layers = net.getUnconnectedOutLayersNames()
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')  # Download this XML file


def object_detection(frame, detector=None, detector_outputs=None):
    """Draw a labelled box on ``frame`` around every detected person.

    Args:
        frame: Image as returned by ``cap.read()``; modified in place.
        detector: Network to run; defaults to the module-level ``net``.
        detector_outputs: Output layer names; defaults to the module-level
            ``output_layers``.
    """
    detector = net if detector is None else detector
    detector_outputs = output_layers if detector_outputs is None else detector_outputs

    height, width = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    detector.setInput(blob)
    outs = detector.forward(detector_outputs)

    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            # Keep confident detections of class_id 0 (person) only.
            if confidence <= 0.5 or class_id != 0:
                continue
            # The first four values are scaled by the frame size and treated
            # as box center and width/height.
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            boxes.append([int(center_x - w / 2), int(center_y - h / 2), w, h])
            confidences.append(float(confidence))

    # Non-maximum suppression; flatten() copes with the differently shaped
    # results (including an empty one) that NMSBoxes returns across versions.
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    for i in np.array(indexes, dtype=int).flatten():
        x, y, w, h = boxes[i]
        label = "Person"
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(frame, label + " " + str(round(confidences[i], 2)), (x, y + 30),
                    cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 2)


def face_recognition(frame, cascade=None):
    """Draw a box on ``frame`` around every face the cascade detects.

    Args:
        frame: BGR image; modified in place.
        cascade: Classifier to use; defaults to the module-level
            ``face_cascade``.
    """
    cascade = face_cascade if cascade is None else cascade
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    for (x, y, w, h) in cascade.detectMultiScale(gray, 1.3, 5):
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)


try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        object_detection(frame, net, output_layers)
        face_recognition(frame, face_cascade)
        cv2.imshow('Smart Surveillance', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and windows even if the loop raises.
    cap.release()
    cv2.destroyAllWindows()
smart_surveillance.py、yolov3.weights、yolov3.cfg
python smart_surveillance.py

