Converting an ONNX model to a TensorRT engine and a walkthrough of the whole inference process

After a deep learning model is trained, the next step is to deploy it to different devices for testing. Models are commonly exchanged between frameworks and platforms through the ONNX intermediate format, which makes them usable across different runtimes. Starting from a model that has already been exported to ONNX, this article analyzes and walks through the entire pipeline of converting the ONNX model into a TensorRT Engine and performing inference with it.

1. Serializing ONNX into a TensorRT Engine

The entire process of serializing an ONNX model into a TensorRT engine can be represented by the figure below.

When developing with the C++ API, you need to include the headers NvInfer.h and NvOnnxParser.h. The C++ interfaces are exposed as interface classes whose names start with I, such as ILogger and IBuilder.

#include "NvInfer.h"
#include "NvOnnxParser.h"

using namespace nvonnxparser;
using namespace nvinfer1;

1.1 Create builder

Before creating the builder, an ILogger instance is needed. There are two common ways to obtain one:
1. Reuse logging.h from the tensorrtx project and use its Logger class

#include "logging.h"

static Logger gLogger;
IBuilder* builder = createInferBuilder(gLogger);

2. Derive your own class from ILogger and instantiate it

class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Only print messages of warning severity or higher
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} logger;

IBuilder* builder = createInferBuilder(logger);

1.2 Create a network

After creating the builder, you need to create a network definition for model optimization:

INetworkDefinition* network = builder->createNetworkV2(0U); // 0U creates an implicit-batch network; ONNX models with a dynamic batch dimension need the explicit-batch flag shown below
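
For ONNX models the network is usually created in explicit-batch mode. A minimal sketch of passing the creation flag, assuming the exported model carries an explicit batch dimension:

// Bitmask expected by createNetworkV2 for an explicit-batch network (required when parsing ONNX with dynamic shapes)
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(explicitBatch);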

1.3 Create a parser

Create an ONNX parser to populate the network definition, read the model file, and report any parsing errors.

IParser* parser = createParser(*network, gLogger);
parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
// Print any errors reported by the parser
for (int32_t i = 0; i < parser->getNbErrors(); ++i)
{
    std::cout << parser->getError(i)->desc() << std::endl;
}
std::cout << "successfully parsed the onnx model" << std::endl;

1.4 Set the necessary parameters and create Engine

IBuilderConfig* config = builder->createBuilderConfig();
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20); // 1 MiB workspace; increase for larger models

// Optimization profile for the (dynamic) batch dimension of the first input
auto profile = builder->createOptimizationProfile();
auto input_tensor = network->getInput(0);
auto input_dims = input_tensor->getDimensions();

input_dims.d[0] = 1;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
input_dims.d[0] = batchSize;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
config->addOptimizationProfile(profile);

#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);
#endif
#ifdef USE_INT8
config->setFlag(BuilderFlag::kINT8);
#endif
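
Note that the precision flags only help on hardware that actually supports the reduced precisions, and kINT8 normally also requires a calibrator. A small sketch of guarding the flags; the calibrator object here is a hypothetical user-provided IInt8EntropyCalibrator2 implementation, not shown:

#ifdef USE_FP16
if (builder->platformHasFastFp16())
    config->setFlag(BuilderFlag::kFP16);
#endif
#ifdef USE_INT8
if (builder->platformHasFastInt8())
{
    config->setFlag(BuilderFlag::kINT8);
    config->setInt8Calibrator(&calibrator); // 'calibrator' is an assumed IInt8EntropyCalibrator2 instance
}
#endif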

1.5 Create Engine and serialize

ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);
IHostMemory* modelStream = engine->serialize();
assert(modelStream != nullptr);

std::ofstream p(engine_path, std::ios::binary);
if (!p)
{
    std::cerr << "could not open plan output file" << std::endl;
    return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
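
After the plan file has been written, the build-time objects are no longer needed. A minimal cleanup sketch, assuming the pre-TensorRT-8 destroy() API used elsewhere in this article:

// Release build-time objects (in TensorRT 8 and later, delete them instead of calling destroy())
parser->destroy();
network->destroy();
config->destroy();
builder->destroy();
engine->destroy();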

2. Reading the serialized TensorRT Engine and performing inference

Once the ONNX model has been converted to an engine and serialized, the time spent building and optimizing the model no longer has to be paid on every run. As shown in the figure below, the entire inference process starts from reading the serialized engine.

2.1 Deserialize the engine

Read the serialized model file and store its contents in trtModelStream.

size_t size{ 0 };
char* trtModelStream{ nullptr };
std::ifstream file(engine_path, std::ios::binary);
if (file.good()) {
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();
}

2.2 Create runtime

Create the runtime using the logger.

IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);

2.3 Create engine

Deserialize trtModelStream with the runtime to create the engine.

ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
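
Once the engine has been created, the host buffer holding the serialized plan is no longer needed and can be released, as the tensorrtx-style samples typically do:

// The plan data has been consumed by deserializeCudaEngine, so free the host copy
delete[] trtModelStream;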

2.4 Create context

IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
runtime->destroy();

2.5 Pre-processing + forward inference + post-processing

Pre-processing

// Convert each image from OpenCV's HWC/BGR layout to CHW/RGB and normalize to [0, 1]
int ImgCount = InputImage.size();
float* input_data = (float*)malloc(ImgCount * 3 * input_h * input_w * sizeof(float));
for (int b = 0; b < ImgCount; b++) {
    cv::Mat img = InputImage.at(b);
    int w = img.cols;
    int h = img.rows;
    int i = 0;
    for (int row = 0; row < h; ++row) {
        uchar* uc_pixel = img.data + row * img.step;
        for (int col = 0; col < input_w; ++col) {
            input_data[b * 3 * input_h * input_w + i] = (float)uc_pixel[2] / 255.0;                         // R
            input_data[b * 3 * input_h * input_w + i + input_h * input_w] = (float)uc_pixel[1] / 255.0;     // G
            input_data[b * 3 * input_h * input_w + i + 2 * input_h * input_w] = (float)uc_pixel[0] / 255.0; // B
            uc_pixel += 3;
            ++i;
        }
    }
}
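
This loop assumes each input image has already been resized to the network input resolution (input_w x input_h). A minimal sketch of that step, assuming a plain resize without letterboxing is acceptable for the model:

// Resize to the network input size before the HWC-to-CHW conversion above
cv::resize(img, img, cv::Size(input_w, input_h));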

Forward inference

void doInference()
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    //assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    //const int inputIndex = 0;
    //const int outputIndex = 1;
    // Create GPU buffers on the device
    cudaMalloc(&buffers[inputIndex], batchSize * 3 * input_h * input_w * sizeof(float));
    cudaMalloc(&buffers[outputIndex], batchSize * output_size * sizeof(float));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * input_h * input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
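
In the tensorrtx samples, doInference is usually given the execution context, host buffers, and batch size as parameters rather than relying on globals. A hypothetical call site under that refactoring, connecting the pre-processing output to the post-processing input:

// Hypothetical refactored signature: void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
std::vector<float> prob(batchSize * output_size); // host output buffer read by the post-processing below
doInference(*context, input_data, prob.data(), batchSize);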

Post-processing

Taking LPRNet as an example, the output is decoded greedily over 18 sequence positions and 68 classes, with the last class index treated as the CTC blank:

std::vector<int> preds;
std::cout << std::endl;
// Greedy decode: take the most likely class at each of the 18 sequence positions
for (int i = 0; i < 18; i++) {
    int maxj = 0;
    for (int j = 0; j < 68; j++) {
        if (prob[i + 18 * j] > prob[i + 18 * maxj]) maxj = j;
    }
    preds.push_back(maxj);
}
// CTC-style collapse: drop repeated characters and the blank class (index 68 - 1)
int pre_c = preds[0];
std::vector<int> no_repeat_blank_label;
for (auto c : preds) {
    if (c == pre_c || c == 68 - 1) {
        if (c == 68 - 1) pre_c = c;
        continue;
    }
    no_repeat_blank_label.push_back(c);
    pre_c = c;
}
// Map class indices to characters
std::string str;
for (auto v : no_repeat_blank_label) {
    str += alphabet[v];
}

The above is an analysis of the whole process of using the TensorRT C++ API to build a TRT engine from ONNX and perform inference. Essentially any workflow that converts an ONNX model into a TensorRT model for inference follows the steps above; this article is recorded here for reference.

–END–