1. Prepare the source code and model files: download the official YOLOv7 source code and the pre-trained model.
2. Use the export script from the source code to export the model to ONNX format: python3 export.py --weights yolov7.pt --grid --simplify --topk-all 100 --img-size 640 640 --max-wh 640
3. Model conversion: use the Ascend ATC model conversion tool to convert the ONNX model to OM format. The test device is the Ascend Atlas 300I inference card, so soc_version needs to be set to Ascend310. Conversion command: atc --model=yolov7.onnx --framework=5 --output=yolov7 --soc_version=Ascend310
4. Use the Ascend ACL (AscendCL) interface to write the inference script. ACL supports C++ and Python; here the Python interface is used for testing. Main steps: ACL initialization (setting the device, creating the context), loading the model (obtaining the model ID and model description, allocating input and output memory on the NPU according to the model information), and executing inference (copying the input data, executing inference, copying the output data).
om model inference script:
import acl
import numpy as np


class ACL_inference(object):
    """Thin wrapper around pyACL for running an offline (.om) model on a device.

    Typical use: construct with a device id, call ``init(model_path)``, then
    ``load_input_data(img)`` -> ``execute()`` -> ``process_output()``, and
    finally ``destroy()`` to release everything.
    """

    def __init__(self, device_id):
        """Initialise ACL, bind ``device_id`` and create a context on it.

        Raises:
            RuntimeError: if the device cannot be set or the context cannot
                be created.
        """
        self.device_id = device_id
        # The return code of acl.init() is deliberately not checked.
        # NOTE(review): a second init in the same process presumably returns
        # a non-zero "repeat initialize" code that is harmless - confirm.
        acl.init()
        self._check(acl.rt.set_device(self.device_id), "acl.rt.set_device")
        self.context, ret = acl.rt.create_context(self.device_id)
        self._check(ret, "acl.rt.create_context")

        # aclrtMemcpyKind values used by acl.rt.memcpy.
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        # Allocation policy passed to acl.rt.malloc.
        # NOTE(review): in the AscendCL enum the value 2 appears to be
        # ACL_MEM_MALLOC_NORMAL_ONLY rather than HUGE_ONLY - confirm. The
        # name and value are kept unchanged for compatibility.
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2

        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        # One dict per model input/output:
        # {"buffer": device pointer, "size": bytes, "data": data-buffer handle}
        self.input_data = []
        self.output_data = []

    @staticmethod
    def _check(ret, action):
        """Raise ``RuntimeError`` when an ACL call returned a non-zero code."""
        if ret != 0:
            raise RuntimeError(f"{action} failed with ACL error code {ret}")

    def init(self, model_path):
        """Load the model at ``model_path`` and allocate its I/O buffers.

        Raises:
            RuntimeError: if loading the model, reading its description or
                allocating device memory fails.
        """
        self.model_id, ret = acl.mdl.load_from_file(model_path)
        self._check(ret, "acl.mdl.load_from_file")
        self.model_desc = acl.mdl.create_desc()
        self._check(acl.mdl.get_desc(self.model_desc, self.model_id),
                    "acl.mdl.get_desc")
        self.gen_input_dataset()
        self.gen_output_dataset()

    def _fill_dataset(self, dataset, count, size_of, store):
        """Allocate one device buffer per tensor and register it in ``dataset``.

        ``size_of(model_desc, index)`` returns the byte size of tensor
        ``index``; every allocated buffer is recorded in ``store``.
        """
        for index in range(count):
            buffer_size = size_of(self.model_desc, index)
            buffer, ret = acl.rt.malloc(buffer_size,
                                        self.ACL_MEM_MALLOC_HUGE_ONLY)
            self._check(ret, "acl.rt.malloc")
            data = acl.create_data_buffer(buffer, buffer_size)
            # Record the buffer before registering it so destroy() can free
            # it even if add_dataset_buffer fails.
            store.append({"buffer": buffer, "size": buffer_size, "data": data})
            _, ret = acl.mdl.add_dataset_buffer(dataset, data)
            self._check(ret, "acl.mdl.add_dataset_buffer")

    def gen_output_dataset(self):
        """Create the output dataset with one device buffer per model output."""
        self.load_output_dataset = acl.mdl.create_dataset()
        self._fill_dataset(self.load_output_dataset,
                           acl.mdl.get_num_outputs(self.model_desc),
                           acl.mdl.get_output_size_by_index,
                           self.output_data)

    def gen_input_dataset(self):
        """Create the input dataset with one device buffer per model input."""
        self.load_input_dataset = acl.mdl.create_dataset()
        self._fill_dataset(self.load_input_dataset,
                           acl.mdl.get_num_inputs(self.model_desc),
                           acl.mdl.get_input_size_by_index,
                           self.input_data)

    def process_output(self):
        """Copy every model output from device to host.

        Returns:
            list of numpy arrays, one per model output, each reshaped to the
            dims reported by the model description. The raw bytes are
            interpreted as float32.
            NOTE(review): assumes all outputs are float32 - confirm for
            models with other output dtypes.

        Raises:
            RuntimeError: if querying dims, allocating host memory or the
                device-to-host copy fails.
        """
        inference_result = []
        for i, item in enumerate(self.output_data):
            # get_output_dims returns (dims_dict, ret). The shape must be read
            # from the dict, not by indexing the returned tuple with the
            # output index (that only happened to work for output 0).
            dims, ret = acl.mdl.get_output_dims(self.model_desc, i)
            self._check(ret, "acl.mdl.get_output_dims")
            shape = tuple(dims["dims"])

            size = item["size"]
            buffer_host, ret = acl.rt.malloc_host(size)
            self._check(ret, "acl.rt.malloc_host")
            try:
                # Transfer inference output data from Device to Host.
                ret = acl.rt.memcpy(buffer_host, size, item["buffer"], size,
                                    self.ACL_MEMCPY_DEVICE_TO_HOST)
                self._check(ret, "acl.rt.memcpy (device to host)")
                bytes_out = acl.util.ptr_to_bytes(buffer_host, size)
                # Copy so the array owns its memory before the staging
                # buffer is released below.
                data = np.frombuffer(bytes_out, dtype=np.float32)
                data = data.reshape(shape).copy()
            finally:
                # The staging buffer is only needed for this one transfer.
                acl.rt.free_host(buffer_host)
            inference_result.append(data)
        return inference_result

    def load_input_data(self, img):
        """Copy ``img`` (an object exposing ``tobytes()``) into model input 0.

        Raises:
            ValueError: if ``img`` holds more bytes than the input buffer.
            RuntimeError: if the host-to-device copy fails.
        """
        bytes_data = img.tobytes()
        target = self.input_data[0]
        nbytes = len(bytes_data)
        if nbytes > target["size"]:
            raise ValueError(
                f"input data is {nbytes} bytes but the model input buffer "
                f"holds only {target['size']} bytes")
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # Transfer image data from Host to Device. Only the bytes actually
        # present are copied, so the host buffer is never read past its end.
        ret = acl.rt.memcpy(target["buffer"], target["size"], np_ptr, nbytes,
                            self.ACL_MEMCPY_HOST_TO_DEVICE)
        self._check(ret, "acl.rt.memcpy (host to device)")

    def execute(self):
        """Run the model synchronously on the prepared input/output datasets.

        Raises:
            RuntimeError: if model execution fails.
        """
        ret = acl.mdl.execute(self.model_id, self.load_input_dataset,
                              self.load_output_dataset)
        self._check(ret, "acl.mdl.execute")

    def destroy(self):
        """Release all ACL resources held by this object.

        Teardown is best-effort: return codes are ignored so that a failure
        in one step does not prevent the remaining resources from being
        released.
        """
        for attr, entries in (("load_input_dataset", self.input_data),
                              ("load_output_dataset", self.output_data)):
            for entry in entries:
                data = entry.get("data")
                if data is not None:
                    acl.destroy_data_buffer(data)
                acl.rt.free(entry["buffer"])
            entries.clear()
            dataset = getattr(self, attr)
            if dataset is not None:
                acl.mdl.destroy_dataset(dataset)
                setattr(self, attr, None)

        if self.model_id is not None:
            acl.mdl.unload(self.model_id)
            self.model_id = None
        if self.model_desc is not None:
            acl.mdl.destroy_desc(self.model_desc)
            self.model_desc = None

        if self.context is not None:
            acl.rt.destroy_context(self.context)
            self.context = None
        acl.rt.reset_device(self.device_id)
        acl.finalize()
5. Inference test script. The overall inference process is: image preprocessing, loading the model, loading the input data, running inference, obtaining the output results, and image post-processing.
import cv2
import time

import numpy as np

from NMS import nms, draw

# End-to-end test: preprocess one image, run it through the .om model on the
# NPU, then apply NMS and draw the detections.
# NOTE: ACL_inference (the om model inference class) must be defined in this
# file or imported from the module where it was saved.

time_ = time.time()
image_path = '1.jpg'
img_ = cv2.imread(image_path)
if img_ is None:
    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded.
    raise FileNotFoundError(f'cannot read image: {image_path}')

# Scale factors mapping 640x640 model coordinates back to the original image.
x_scale = img_.shape[1] / 640
y_scale = img_.shape[0] / 640

# OpenCV decodes to BGR, while the official YOLOv7 pipeline feeds RGB to the
# network, so convert before normalising.
img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
img = img / 255.
img = cv2.resize(img, (640, 640))
img = np.transpose(img, (2, 0, 1))  # HWC -> CHW
img = np.expand_dims(img, axis=0)   # add batch dimension
img = img.astype(np.float32)
print(f'image preprocess time cost:{time.time() - time_}')

model_path = './yolov7.om'
device_id = 0
model = ACL_inference(device_id)
try:
    model.init(model_path)

    time0 = time.time()
    model.load_input_data(img)
    print(f'data copy to device time cost:{time.time() - time0}')

    time1 = time.time()
    model.execute()
    print(f'device inference time cost:{time.time() - time1}')

    time2 = time.time()
    # First output tensor, first (only) image in the batch.
    result = model.process_output()[0][0]
    print(f'data copy to host time cost:{time.time() - time2}')
finally:
    # Always release the device/context, even if a step above failed.
    model.destroy()

print(result.shape)
result = nms(result, 0.35, 0.45)
print(result)
result_img = draw(img_, x_scale, y_scale, result)
cv2.imwrite('result.jpg', result_img)
Test image:
Output result:
6. Inference result analysis
Judging from the inference results, the accuracy is basically consistent with normal inference. In terms of performance, the inference time on the NPU is 30 milliseconds, so real-time video inference at up to 30 fps can basically be achieved. The main time cost is in image decoding and data transfer. As a later optimization, the DVPP module on the NPU can be used for image/video decoding, and AIPP image preprocessing can be added during ATC conversion to reduce data preprocessing time.