Export the ONNX file
Use mmpretrain to export the ONNX model of MobileNet-V2:
```python
import torch
from mmpretrain import get_model

model = get_model('mobilenet-v2_8xb32_in1k',
                  pretrained='mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth',
                  device='cpu')
model.eval()  # switch to inference mode before exporting
input = torch.zeros(1, 3, 224, 224)
out = model(input)
# use the same file name that the inference scripts below load
torch.onnx.export(model, input, "mobilenet_v2.onnx", opset_version=11)
```
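After exporting, it is worth a quick sanity check that the ONNX graph is valid and numerically matches the PyTorch model. A minimal sketch, assuming `onnx` and `onnxruntime` are installed (the tolerances are illustrative):

```python
import numpy as np
import onnx
import onnxruntime
import torch
from mmpretrain import get_model

# structural check of the exported graph
onnx.checker.check_model(onnx.load("mobilenet_v2.onnx"))

# run the same random input through PyTorch and ONNX Runtime
model = get_model('mobilenet-v2_8xb32_in1k',
                  pretrained='mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth',
                  device='cpu')
model.eval()
dummy = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    torch_out = model(dummy).numpy()

sess = onnxruntime.InferenceSession("mobilenet_v2.onnx",
                                    providers=['CPUExecutionProvider'])
ort_out = sess.run(None, {sess.get_inputs()[0].name: dummy.numpy()})[0]

# small numerical drift between backends is expected
np.testing.assert_allclose(torch_out, ort_out, rtol=1e-3, atol=1e-5)
print('PyTorch and ONNX Runtime outputs match')
```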
If mmdeploy is installed, you can export it as follows:
```python
from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK

img = 'goldfish.jpg'
work_dir = './work_dir/onnx/mobilenet_v2'
save_file = './end2end.onnx'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_static.py'
model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
model_checkpoint = './checkpoints/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth'
device = 'cpu'

# 1. convert model to onnx
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg,
           model_checkpoint, device)

# 2. extract pipeline info for sdk use (dump-info)
export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint,
           device=device)
```
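After this runs, `work_dir` should hold `end2end.onnx` together with the JSON files the SDK reads to rebuild the pipeline (typically `deploy.json`, `pipeline.json`, and `detail.json`); the mmdeploy_runtime examples below point their `model_path` at this directory.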
ONNX Runtime inference
Run inference with ONNX Runtime:
```python
import cv2
import numpy as np
import onnxruntime

if __name__ == '__main__':
    img = cv2.imread('goldfish.jpg')
    # resize the short side to 256, keeping the aspect ratio
    if img.shape[0] < img.shape[1]:  # h < w
        img = cv2.resize(img, (int(256 * img.shape[1] / img.shape[0]), 256))
    else:
        img = cv2.resize(img, (256, int(256 * img.shape[0] / img.shape[1])))

    # center-crop a square, then resize to the 224x224 model input
    crop_size = min(img.shape[0], img.shape[1])
    left = int((img.shape[1] - crop_size) / 2)
    top = int((img.shape[0] - crop_size) / 2)
    img_crop = img[top:top + crop_size, left:left + crop_size]
    img_crop = cv2.resize(img_crop, (224, 224))

    img_crop = img_crop[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)  # BGR2RGB and HWC2CHW
    # normalize with the ImageNet mean/std used by mmpretrain
    img_crop[0, :] = (img_crop[0, :] - 123.675) / 58.395
    img_crop[1, :] = (img_crop[1, :] - 116.28) / 57.12
    img_crop[2, :] = (img_crop[2, :] - 103.53) / 57.375
    input = np.expand_dims(img_crop, axis=0)

    onnx_session = onnxruntime.InferenceSession("mobilenet_v2.onnx",
                                                providers=['CPUExecutionProvider'])

    input_name = [node.name for node in onnx_session.get_inputs()]
    output_name = [node.name for node in onnx_session.get_outputs()]

    input_feed = {}
    for name in input_name:
        input_feed[name] = input

    pred = onnx_session.run(None, input_feed)
    print(np.argmax(pred))
```
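The network outputs raw logits, so `np.argmax` gives a class index but no confidence. A small follow-on sketch (the `softmax` helper is ours, not part of the original script) that turns the `pred` returned above into class probabilities and prints the top 5:

```python
import numpy as np

def softmax(x):
    # numerically stable softmax over the class dimension
    e = np.exp(x - np.max(x))
    return e / e.sum()

# `pred` is the list returned by onnx_session.run() above
logits = np.asarray(pred[0]).squeeze()
probs = softmax(logits)
for idx in np.argsort(probs)[::-1][:5]:
    print(f'class {idx}: {probs[idx]:.4f}')
```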
Use mmdeploy for inference:
```python
from mmdeploy.apis import inference_model

model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_static.py'
img = 'goldfish.jpg'
backend_files = ['work_dir/onnx/mobilenet_v2/end2end.onnx']
device = 'cpu'
result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)
print(result)
```
or
```python
import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('goldfish.jpg')
classifier = Classifier(model_path='work_dir/onnx/mobilenet_v2', device_name='cpu')
result = classifier(img)
for label_id, score in result:
    print(label_id, score)
```
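Note that `model_path` points at the SDK model directory produced by `export2SDK`, not at a bare `.onnx` file: the `Classifier` reads the dumped pipeline files from that directory and performs the resize/crop/normalize preprocessing itself, which is why none of the manual preprocessing code is needed here.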
Export the engine file
Here the ONNX file is converted with trtexec. The TensorRT version used here is TensorRT-8.2.1.8.
```
./trtexec.exe --onnx=mobilenet_v2.onnx --saveEngine=mobilenet_v2.engine
```
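trtexec also accepts precision flags; for example, `--fp16` builds a half-precision engine that is usually faster at a small accuracy cost (a sketch, with an output name of our choosing):

```
./trtexec.exe --onnx=mobilenet_v2.onnx --saveEngine=mobilenet_v2_fp16.engine --fp16
```

Alternatively, mmdeploy's `tools/deploy.py` can convert the checkpoint and produce the `work_dir/trt/mobilenet_v2/end2end.engine` used below in one step (paths assumed to match the layout above):

```
python mmdeploy/tools/deploy.py \
    mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py \
    mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py \
    ./checkpoints/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth \
    goldfish.jpg \
    --work-dir work_dir/trt/mobilenet_v2 \
    --device cuda \
    --dump-info
```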
TensorRT inference
```python
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit  # initializes the CUDA context and handles cleanup
import pycuda.driver as cuda  # data transfer between GPU and CPU

if __name__ == '__main__':
    # create a logger
    logger = trt.Logger(trt.Logger.WARNING)

    # create a runtime and deserialize the engine
    with open("mobilenet_v2.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    # allocate page-locked host memory and device memory
    h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)),
                                    dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)),
                                     dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # create a CUDA stream
    stream = cuda.Stream()

    # preprocessing: identical to the ONNX Runtime script above
    img = cv2.imread('goldfish.jpg')
    if img.shape[0] < img.shape[1]:  # h < w: resize the short side to 256
        img = cv2.resize(img, (int(256 * img.shape[1] / img.shape[0]), 256))
    else:
        img = cv2.resize(img, (256, int(256 * img.shape[0] / img.shape[1])))
    crop_size = min(img.shape[0], img.shape[1])
    left = int((img.shape[1] - crop_size) / 2)
    top = int((img.shape[0] - crop_size) / 2)
    img_crop = img[top:top + crop_size, left:left + crop_size]
    img_crop = cv2.resize(img_crop, (224, 224))
    img_crop = img_crop[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)  # BGR2RGB and HWC2CHW
    img_crop[0, :] = (img_crop[0, :] - 123.675) / 58.395
    img_crop[1, :] = (img_crop[1, :] - 116.28) / 57.12
    img_crop[2, :] = (img_crop[2, :] - 103.53) / 57.375
    input = np.expand_dims(img_crop, axis=0)
    np.copyto(h_input, input.ravel())

    # run inference, reusing the execution context created above
    # transfer input data to the GPU
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # run inference
    context.execute_async_v2(bindings=[int(d_input), int(d_output)],
                             stream_handle=stream.handle)
    # transfer predictions back from the GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # synchronize the stream
    stream.synchronize()

    # the host output is equivalent to the original model's output
    pred = np.argmax(h_output)
    print(pred)
```
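One caveat: `get_binding_shape` and `execute_async_v2` belong to the 8.x binding-based API and match the TensorRT 8.2 version used above; newer TensorRT releases deprecate and eventually remove them in favor of the tensor-based API (`get_tensor_shape`, `set_tensor_address`, `execute_async_v3`), so the script would need updating there.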
Use mmdeploy for inference:
```python
from mmdeploy.apis import inference_model

model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py'
backend_files = ['work_dir/trt/mobilenet_v2/end2end.engine']
img = 'goldfish.jpg'
device = 'cuda'
result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)
print(result)
```
or
```python
import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('goldfish.jpg')
# point at the TensorRT SDK model directory and run on the GPU
classifier = Classifier(model_path='work_dir/trt/mobilenet_v2', device_name='cuda')
result = classifier(img)
for label_id, score in result:
    print(label_id, score)
```
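Because the SDK reads the backend and pipeline from the dumped JSON files, this snippet is identical to the ONNX Runtime one apart from `model_path` and `device_name`; the same `Classifier` code serves both backends.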