TensorRT: A Summary of Dynamic-Batch and Multi-Batch Inference
Multi-batch YOLOv8 inference with TensorRT:
Step 1: Convert the .pt model to ONNX
The exported ONNX has a dynamic batch dimension and dynamic input size.
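The ONNX-to-engine conversion is not shown in this post. A dynamic-shape ONNX can only be built into an engine once an optimization profile pins down the min/opt/max input shapes. Below is a minimal sketch using the TensorRT 7/8 C++ API that the rest of this post uses; gLogger stands for any nvinfer1::ILogger implementation, and the file name and min/opt/max batch values are illustrative:

// Build a dynamic-batch engine from the exported ONNX (sketch; error checks omitted)
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
parser->parseFromFile("yolov8.onnx", static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
// The batch dimension is dynamic, so give TensorRT a min/opt/max range for it
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, 3, 640, 640));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(4, 3, 640, 640));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(8, 3, 640, 640));
config->addOptimizationProfile(profile);

nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
nvinfer1::IHostMemory* serialized = engine->serialize();
// write serialized->data(), serialized->size() to a .engine file

The same conversion can also be done from the command line with trtexec --onnx=yolov8.onnx --minShapes=images:1x3x640x640 --optShapes=images:4x3x640x640 --maxShapes=images:8x3x640x640 --saveEngine=yolov8.engine.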


Step 2: Load the engine, create a context, and fix the input dimensions
nvinfer1::ICudaEngine* engine_infer = engine_runtime->deserializeCudaEngine(data.get(), length, nullptr);
nvinfer1::IExecutionContext* engine_context = engine_infer->createExecutionContext();
int input_index = engine_infer->getBindingIndex("images");   // input binding, e.g. 1x3x640x640
int output_index = engine_infer->getBindingIndex("output0"); // output binding
// With a dynamic batch, the engine reports the input as (-1, 3, width, height)
nvinfer1::Dims inputSize = engine_infer->getBindingDimensions(input_index);
nvinfer1::Dims outputSize = engine_infer->getBindingDimensions(output_index);
std::cout << "input index: " << input_index << ", output index: " << output_index << std::endl;
if (engine_context == nullptr)
{
    std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
}
// Fix the context's input shape to (BATCH_SIZE, 3, 640, 640)
engine_context->setBindingDimensions(0, nvinfer1::Dims4(BATCH_SIZE, 3, 640, 640));
inputSize = engine_context->getBindingDimensions(input_index);
outputSize = engine_context->getBindingDimensions(output_index);
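With dynamic shapes it is easy to forget a binding; before inference you can verify that every dynamic input now has a concrete shape (this check uses the standard IExecutionContext API):

// Sanity check: every dynamic input dimension must be set before execute/enqueue
if (!engine_context->allInputDimensionsSpecified())
{
    std::cerr << "Not all dynamic input dimensions are specified." << std::endl;
}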
Step 3: Preprocessing: batch the input images
for (size_t j = 0; j < BATCH_SIZE; j++)
{
    // Read the image with OpenCV
    cv::Mat image = images[i * BATCH_SIZE + j];
    std::cout << fn[i] << std::endl;
    afterScale = true;
    int step = j * INPUT_SIZE * INPUT_SIZE * 3;
    // preProcess: copy each image into its slot of the batched host buffer
    if (afterScale)
    {   // Option 1: keep boxes at model scale and rescale them after inference
        preprocess(image, h_input);
        memcpy(h_inputs + step, h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));
    }
    else
    {   // Option 2: preprocess returns a scale factor used to map boxes back later
        factor = preprocess(image, h_input, INPUT_W, INPUT_H, 3);
        memcpy(h_inputs + step, h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));
    }
}
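preprocess() itself is not shown in this post. A hypothetical sketch of what the Option 2 overload might do (letterbox resize to the network size, BGR to RGB, 1/255 normalization, HWC to CHW; the exact signature and padding choice here are assumptions, and dst must hold 3 * dstW * dstH floats):

// Hypothetical sketch of the Option 2 preprocess: letterbox to dstW x dstH,
// BGR -> RGB, scale to [0,1], HWC -> CHW; returns the factor to undo the resize
float preprocess(const cv::Mat& image, float* dst, int dstW, int dstH, int /*channels*/)
{
    const float scale = std::min(dstW / (float)image.cols, dstH / (float)image.rows);
    cv::Mat resized;
    cv::resize(image, resized, cv::Size(int(image.cols * scale), int(image.rows * scale)));
    cv::Mat canvas(dstH, dstW, CV_8UC3, cv::Scalar(114, 114, 114)); // gray letterbox padding, top-left aligned
    resized.copyTo(canvas(cv::Rect(0, 0, resized.cols, resized.rows)));
    cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
    for (int c = 0; c < 3; ++c)
        for (int h = 0; h < dstH; ++h)
            for (int w = 0; w < dstW; ++w)
                dst[c * dstH * dstW + h * dstW + w] = canvas.at<cv::Vec3b>(h, w)[c] / 255.0f;
    return 1.0f / scale; // multiply model-scale boxes by this to get original-image pixels
}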
void* buffers[2];
cudaMalloc(&buffers[0], BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));    //<- input
cudaMalloc(&buffers[1], BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float)); //<- output (boxes + class scores)
cudaMemcpy(buffers[0], h_inputs, BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float), cudaMemcpyHostToDevice);
Step 4: Inference
// Synchronous inference
engine_context->executeV2(buffers);
// Asynchronous inference:
//engine_context->enqueueV2(buffers, stream, start);
cudaMemcpy(h_output, buffers[1], BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float), cudaMemcpyDeviceToHost);
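For the asynchronous path, the copies should go through the same CUDA stream as the enqueue. A sketch (stream creation shown for completeness; the inputConsumed event is left as nullptr):

// Asynchronous variant: H2D copy, inference, and D2H copy all on one stream
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(buffers[0], h_inputs, BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float),
                cudaMemcpyHostToDevice, stream);
engine_context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(h_output, buffers[1], BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float),
                cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream); // block until the results are ready
cudaStreamDestroy(stream);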
Step 5: Postprocessing
//postProcess
for (size_t bsi = 0; bsi < BATCH_SIZE; bsi++)
{
    int step = bsi * OUTPUT_SIZE * (NUMS_CLASS + 4);
    const int out_rows = NUMS_CLASS + 4; // rows of the "output0" node
    const int out_cols = OUTPUT_SIZE;    // cols of the "output0" node
    const cv::Mat det_output(out_rows, out_cols, CV_32F, (float*)h_output + step);
    std::vector<cv::Rect> boxes;
    std::vector<int> class_ids;
    std::vector<float> confidences;
    kNmsThresh = 0.3f;
    kConfThresh = 0.2f;
    kClassScore = 0.2f;
    // Option 1: get the bbox directly at the original image scale
    // Output layout is [11, 8400]: each column is one candidate box (up to 8400 boxes);
    // the first 4 rows are cx, cy, ow, oh, the remaining 7 rows are the per-class scores
    for (int i = 0; i < det_output.cols; ++i) {
        const cv::Mat classes_scores = det_output.col(i).rowRange(4, 11); // take out the class scores
        cv::Point class_id_point;
        double score;
        cv::minMaxLoc(classes_scores, nullptr, &score, nullptr, &class_id_point); // find the highest-scoring class and its position
        // score is a confidence in the range 0~1
        if (score > kClassScore) {
            const float cx = det_output.at<float>(0, i);
            const float cy = det_output.at<float>(1, i);
            const float ow = det_output.at<float>(2, i);
            const float oh = det_output.at<float>(3, i);
            cv::Rect box;
            if (afterScale)
            {
                box.x = static_cast<int>(cx);
                box.y = static_cast<int>(cy);
                box.width = static_cast<int>(ow);
                box.height = static_cast<int>(oh);
            }
            else
            {
                //const float scale = std::min(INPUT_H / float(image.rows), INPUT_W / float(image.cols));
                //const float factor = 1 / scale;
                box.x = static_cast<int>((cx - 0.5 * ow) * factor);
                box.y = static_cast<int>((cy - 0.5 * oh) * factor);
                box.width = static_cast<int>(ow * factor);
                box.height = static_cast<int>(oh * factor);
            }
            boxes.push_back(box);
            class_ids.push_back(class_id_point.y); // classes_scores is a column vector, so point.y is the row, i.e. the class id
            confidences.push_back(score);
        }
    }
    // NMS: suppress redundant overlapping boxes with lower confidence
    std::vector<int> indexes;
    cv::dnn::NMSBoxes(boxes, confidences, kConfThresh, kNmsThresh, indexes);
    cv::Mat disImage = disPlayImages[i * BATCH_SIZE + bsi];
    if (!afterScale)
    {
        // Option 1: the boxes are already at original-image scale, draw them directly
        for (size_t i = 0; i < indexes.size(); i++) {
            const int index = indexes[i];
            const int idx = class_ids[index];
            cv::rectangle(disImage, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
            cv::rectangle(disImage, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 20),
                cv::Point(boxes[index].br().x, boxes[index].tl().y), cv::Scalar(0, 255, 255), -1);
            std::string nameScore = class_names[idx] + " " + std::to_string(confidences[index]); // confidence is indexed by box, not by class id
            cv::putText(disImage, nameScore, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }
        std::string savePath = "trt_res/result_" + std::to_string(i) + "_" + std::to_string(bsi) + ".jpg";
        cv::imwrite(savePath, disImage);
    }
    else
    {
        // Option 2: take the bbox at the model's output scale, then map it back to the original image
        std::vector<Bbox> pred_box;
        for (size_t i = 0; i < indexes.size(); i++) {
            const int index = indexes[i];
            Bbox box;
            box.x = boxes[index].x;         // center x at model scale
            box.y = boxes[index].y;         // center y at model scale
            box.w = boxes[index].width;     // box width at model scale
            box.h = boxes[index].height;    // box height at model scale
            box.score = confidences[index]; // confidence is indexed by box, not by class id
            box.classes = (int)class_ids[index];
            pred_box.push_back(box);
        }
        std::vector<Bbox> out = rescale_box(pred_box, disImage.cols, disImage.rows);
        cv::Mat img = renderBoundingBox(disImage, out);
        std::string savePath = "trt_res/result_" + std::to_string(i) + "_" + std::to_string(bsi) + ".jpg";
        cv::imwrite(savePath, img);
        nums++;
    }
} // end of per-batch postprocess loop
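rescale_box() and renderBoundingBox() are helpers from the author's repo and are not shown here. A hypothetical sketch of what rescale_box() might look like, assuming the same top-left letterbox as the preprocess sketch above and center-format (cx, cy, w, h) Bbox input:

// Hypothetical rescale_box: map center-format boxes from model scale (INPUT_W x INPUT_H)
// back to original-image pixels, converting to top-left corner format on the way
std::vector<Bbox> rescale_box(const std::vector<Bbox>& pred, int imgW, int imgH)
{
    const float scale = std::min(INPUT_W / (float)imgW, INPUT_H / (float)imgH);
    std::vector<Bbox> out;
    for (const Bbox& b : pred) {
        Bbox r = b;
        r.x = static_cast<int>((b.x - 0.5f * b.w) / scale);
        r.y = static_cast<int>((b.y - 0.5f * b.h) / scale);
        r.w = static_cast<int>(b.w / scale);
        r.h = static_cast<int>(b.h / scale);
        out.push_back(r);
    }
    return out;
}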
Step 6: Inference speed and GPU memory usage
Case 1: single-batch inference, batchsize = 1
GPU memory: 2.2 GB at idle, 3 GB while running
Average latency: 2.3 ms per image
Case 2: multi-batch inference, batchsize = 4
GPU memory: 2.2 GB at idle, 3.1 GB while running; inference time 4.7 ms per batch, i.e. roughly 1.2 ms per image
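The post does not show how the timings were taken; one simple way to measure the batch latency around executeV2 (a sketch using std::chrono, not necessarily the author's method):

// Time one synchronous inference call and report per-batch and per-image latency
auto t0 = std::chrono::high_resolution_clock::now();
engine_context->executeV2(buffers);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "batch latency: " << ms << " ms, per image: " << ms / BATCH_SIZE << " ms" << std::endl;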



Summary:
Multi-batch inference improves throughput, while single-batch inference minimizes latency and keeps the pipeline real-time. In the numbers above, batchsize = 4 roughly halves the per-image cost (about 1.2 ms vs 2.3 ms) at the price of a higher per-batch latency (4.7 ms vs 2.3 ms). A well-chosen batch size balances throughput against latency; the right value depends on the project's process requirements and on the device itself.
An extension: a project is rarely solved by a single model; often several models are used together. Each model file can be loaded repeatedly into multiple engines, and here each engine is bound to a single context. By running the contexts in multiple concurrent threads and performing asynchronous inference, we can get good real-time behavior and throughput at the same time. The usage pattern is sketched below; different devices can host different numbers of models, so configuring devices and models sensibly for the workload maximizes the benefit.
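A minimal sketch of that pattern, assuming one engine/context/stream/binding set per worker (contexts, bindingsPerCtx, and streams are illustrative names, not from the post):

// One thread per (engine, context) pair, each driving its own CUDA stream
void inferWorker(nvinfer1::IExecutionContext* ctx, void** bindings, cudaStream_t stream)
{
    ctx->enqueueV2(bindings, stream, nullptr); // async enqueue on this worker's stream
    cudaStreamSynchronize(stream);             // wait for this worker's results only
}

std::vector<std::thread> pool;
for (size_t k = 0; k < contexts.size(); ++k)
    pool.emplace_back(inferWorker, contexts[k], bindingsPerCtx[k], streams[k]);
for (auto& t : pool) t.join();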

The code is on my GitHub: https://github.com/dongguazi.