Exporting the OpenMMLab MobileNet-V2 model and running inference with onnxruntime and TensorRT

Export onnx file

Use mmpretrain to export the onnx model of mobilenet-v2:

import torch
from mmpretrain import get_model

# Build MobileNet-V2 from its mmpretrain model name and load the local checkpoint on the CPU.
model = get_model(
    'mobilenet-v2_8xb32_in1k',
    pretrained='mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth',
    device='cpu',
)
# All-zero 1x3x224x224 batch used to trace the graph; named so that it does
# not shadow the builtin `input`.
dummy_input = torch.zeros(1, 3, 224, 224)
out = model(dummy_input)  # forward pass before exporting, as a quick sanity check
# Save under the file name that the onnxruntime and trtexec steps load later
# ("mobilenet_v2.onnx", with an underscore).
torch.onnx.export(model, dummy_input, "mobilenet_v2.onnx", opset_version=11)

If mmdeploy is installed, you can export it as follows:

from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK

img = 'goldfish.jpg'
work_dir = './work_dir/onnx/mobilenet_v2'
save_file = './end2end.onnx'
# Both configs must point at config *files*; a bare directory cannot be loaded.
# Adjust the prefixes to wherever mmdeploy / mmpretrain are checked out.
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'
model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
model_checkpoint = './checkpoints/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth'
device = 'cpu'

# 1. convert model to onnx (written to `save_file` inside `work_dir`)
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, model_checkpoint, device)

# 2. extract pipeline info for sdk use (dump-info)
export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)

onnxruntime inference

Inference via onnxruntime:

import cv2
import numpy as np
import onnxruntime

if __name__ == '__main__':
    img = cv2.imread('goldfish.jpg')
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('goldfish.jpg')

    # Resize so that the shorter edge becomes 256 while keeping the aspect
    # ratio. cv2.resize takes the target size as (width, height).
    height, width = img.shape[:2]
    if height < width:
        img = cv2.resize(img, (int(256 * width / height), 256))
    else:
        img = cv2.resize(img, (256, int(256 * height / width)))

    # Center-crop the largest square and scale it to the 224x224 network input.
    crop_size = min(img.shape[0], img.shape[1])
    left = int((img.shape[1] - crop_size) / 2)
    top = int((img.shape[0] - crop_size) / 2)
    img_crop = img[top:top + crop_size, left:left + crop_size]
    img_crop = cv2.resize(img_crop, (224, 224))

    # BGR -> RGB and HWC -> CHW, then per-channel (R, G, B) normalisation.
    img_crop = img_crop[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32).reshape(3, 1, 1)
    std = np.array([58.395, 57.12, 57.375], dtype=np.float32).reshape(3, 1, 1)
    img_crop = (img_crop - mean) / std
    # Add the batch dimension; make the buffer contiguous because the channel
    # flip and transpose above leave a strided view layout.
    input_tensor = np.ascontiguousarray(np.expand_dims(img_crop, axis=0))

    onnx_session = onnxruntime.InferenceSession("mobilenet_v2.onnx", providers=['CPUExecutionProvider'])

    # Collect the graph's input and output tensor names.
    input_name = [node.name for node in onnx_session.get_inputs()]
    output_name = [node.name for node in onnx_session.get_outputs()]

    # Feed the same preprocessed batch to every declared input.
    input_feed = {name: input_tensor for name in input_name}

    pred = onnx_session.run(output_name, input_feed)
    # Index of the highest score in the first output = predicted class id.
    print(np.argmax(pred[0]))

Inference with mmdeploy:

from mmdeploy.apis import inference_model

# Both configs must point at config *files*; adjust the prefixes to wherever
# mmpretrain / mmdeploy are checked out.
# The variable must be named `model_cfg`: that is the name passed to
# inference_model() below.
model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'
img = 'goldfish.jpg'
backend_files = ['work_dir/onnx/mobilenet_v2/end2end.onnx']
device = 'cpu'

result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)


import cv2
from mmdeploy_runtime import Classifier

# Read the test image and build the SDK classifier from the exported model directory.
img = cv2.imread('goldfish.jpg')
classifier = Classifier(
    model_path='work_dir/onnx/mobilenet_v2',
    device_name='cpu',
)

# Run the classifier and print one "label_id score" line per returned pair.
result = classifier(img)
for prediction in result:
    label_id, score = prediction
    print(label_id, score)

Export engine file

Here the ONNX file is converted to a TensorRT engine with trtexec. The author's TensorRT version number is cut off in this text (it read "TensorRT-…").

# Read mobilenet_v2.onnx with trtexec and save the resulting engine to mobilenet_v2.engine.
./trtexec.exe --onnx=mobilenet_v2.onnx --saveEngine=mobilenet_v2.engine

tensorrt inference

import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit #Responsible for data initialization, memory management, destruction, etc.
import pycuda.driver as cuda #Data transfer between GPU and CPU

if __name__ == '__main__':
    # Create logger: logger
    logger = trt.Logger(trt.Logger.WARNING)
    # Create the runtime and deserialize the engine from the bytes read from disk.
    with open("mobilenet_v2.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        # deserialize_cuda_engine reports failure by returning None.
        raise RuntimeError('failed to deserialize mobilenet_v2.engine')
    # A single execution context serves both the shape queries and the inference call.
    context = engine.create_execution_context()
    # Allocate page-locked host buffers and matching device buffers.
    # Binding 0 is used as the input and binding 1 as the output, in the same
    # order as the `bindings` list passed to execute_async_v2 below.
    h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create cuda stream
    stream = cuda.Stream()

    img = cv2.imread('goldfish.jpg')
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('goldfish.jpg')

    # Resize so that the shorter edge becomes 256 while keeping the aspect
    # ratio. cv2.resize takes the target size as (width, height).
    height, width = img.shape[:2]
    if height < width:
        img = cv2.resize(img, (int(256 * width / height), 256))
    else:
        img = cv2.resize(img, (256, int(256 * height / width)))

    # Center-crop the largest square and scale it to the 224x224 network input.
    crop_size = min(img.shape[0], img.shape[1])
    left = int((img.shape[1] - crop_size) / 2)
    top = int((img.shape[0] - crop_size) / 2)
    img_crop = img[top:top + crop_size, left:left + crop_size]
    img_crop = cv2.resize(img_crop, (224, 224))

    # BGR -> RGB and HWC -> CHW, then per-channel (R, G, B) normalisation.
    img_crop = img_crop[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32).reshape(3, 1, 1)
    std = np.array([58.395, 57.12, 57.375], dtype=np.float32).reshape(3, 1, 1)
    img_crop = (img_crop - mean) / std
    input_tensor = np.expand_dims(img_crop, axis=0)
    # Flatten into the page-locked host buffer that is copied to the GPU.
    np.copyto(h_input, input_tensor.ravel())

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream: the copies and the inference above are
    # asynchronous, so h_output is only valid once the stream has drained.
    stream.synchronize()
    # Index of the highest score in the host output = predicted class id.
    pred = np.argmax(h_output)
    print(pred)

Inference with mmdeploy:

from mmdeploy.apis import inference_model

# Both configs must point at config *files*; adjust the prefixes to wherever
# mmpretrain / mmdeploy are checked out, and use the same TensorRT deploy
# config that was used to build the engine.
model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py'
backend_files = ['work_dir/trt/mobilenet_v2/end2end.engine']
img = 'goldfish.jpg'
device = 'cuda'

result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)


import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('goldfish.jpg')
# Use the TensorRT work directory on the GPU: this section's engine is
# work_dir/trt/mobilenet_v2/end2end.engine and it is run with device 'cuda'.
classifier = Classifier(model_path='work_dir/trt/mobilenet_v2', device_name='cuda')

# Run the classifier and print one "label_id score" line per returned pair.
result = classifier(img)
for label_id, score in result:
    print(label_id, score)