YOLOv7 Migration to the Ascend NPU: Inference Test

1. Prepare the source code and model files: download the official YOLOv7 source code and the pre-trained model.

2. Use the export script from the source code to export the model to ONNX format: python3 export.py --weights yolov7.pt --grid --simplify --topk-all 100 --img-size 640 640 --max-wh 640

3. Model conversion: use the Ascend atc model conversion tool to convert the ONNX model to OM format. The test device is an Ascend Atlas 300I inference card, so soc_version needs to be set to Ascend310. The conversion command is: atc --model=yolov7.onnx --framework=5 --output=yolov7 --soc_version=Ascend310

4. Use the Ascend ACL interface to write the inference script. ACL supports C++ and Python; here, the Python interface is used for testing. Main steps: ACL initialization (setting the device, creating a context), loading the model (obtaining the model ID and model description, and allocating input and output memory on the NPU according to the model information), and executing inference (copying the input data, running inference, copying the output data).

om model inference script:

import acl
import numpy as np


class ACL_inference(object):
    """Thin wrapper around the pyACL calls needed to run one offline (.om) model.

    Typical use: construct with a device id, call ``init(model_path)``, then
    for each input call ``load_input_data``, ``execute`` and
    ``process_output``; finally call ``destroy`` to release ACL resources.

    ACL calls that report a non-zero status raise ``RuntimeError`` instead of
    being silently ignored (teardown in ``destroy`` is best-effort).
    """

    def __init__(self, device_id):
        """Initialise ACL, select ``device_id`` and create a context on it."""
        self.device_id = device_id
        # Copy-direction values passed to acl.rt.memcpy.
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        # NOTE(review): in the ACL headers the value 2 appears to be the
        # NORMAL_ONLY malloc policy rather than HUGE_ONLY -- confirm. The
        # name and value are kept unchanged because this is a public attribute.
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2
        self.context = None
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        # One {"buffer": device_ptr, "size": nbytes} record per model input/output.
        self.input_data = []
        self.output_data = []

        # The status of acl.init() is deliberately not checked, as in the
        # original code (presumably so a second instance in the same process
        # does not fail on repeated initialisation -- TODO confirm).
        acl.init()
        self._check(acl.rt.set_device(self.device_id), "acl.rt.set_device")
        self.context, ret = acl.rt.create_context(self.device_id)
        self._check(ret, "acl.rt.create_context")

    @staticmethod
    def _check(ret, what):
        """Raise RuntimeError when an ACL call returned a non-zero status."""
        if ret != 0:
            raise RuntimeError(f"{what} failed with ACL error code {ret}")

    def init(self, model_path):
        """Load the model at ``model_path`` and allocate its I/O device buffers."""
        self.model_id, ret = acl.mdl.load_from_file(model_path)
        self._check(ret, "acl.mdl.load_from_file")
        self.model_desc = acl.mdl.create_desc()
        self._check(acl.mdl.get_desc(self.model_desc, self.model_id),
                    "acl.mdl.get_desc")
        self.gen_input_dataset()
        self.gen_output_dataset()

    def _fill_dataset(self, dataset, count, size_by_index, records):
        """Allocate ``count`` device buffers, add them to ``dataset`` and log them in ``records``."""
        for index in range(count):
            buffer_size = size_by_index(self.model_desc, index)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            self._check(ret, "acl.rt.malloc")
            # Record the allocation first so destroy() can free it even if
            # attaching it to the dataset fails below.
            records.append({"buffer": buffer, "size": buffer_size})
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(dataset, data)
            self._check(ret, "acl.mdl.add_dataset_buffer")

    def gen_output_dataset(self):
        """Create the output dataset with one device buffer per model output."""
        self.load_output_dataset = acl.mdl.create_dataset()
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        self._fill_dataset(self.load_output_dataset, output_size,
                           acl.mdl.get_output_size_by_index, self.output_data)

    def gen_input_dataset(self):
        """Create the input dataset with one device buffer per model input."""
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        self._fill_dataset(self.load_input_dataset, input_size,
                           acl.mdl.get_input_size_by_index, self.input_data)

    def process_output(self):
        """Copy every model output back to the host.

        Returns:
            A list with one numpy array per model output, reshaped to the
            dims reported by the model description. The raw bytes are
            interpreted as float32 (assumes all outputs are float32 --
            confirm for models with other output types).
        """
        inference_result = []
        for index, item in enumerate(self.output_data):
            # get_output_dims returns a (dims_dict, ret) pair; the shape lives
            # under the "dims" key of the dict. Indexing the pair with the
            # output index only happened to work for output 0.
            dims, ret = acl.mdl.get_output_dims(self.model_desc, index)
            self._check(ret, "acl.mdl.get_output_dims")
            shape = tuple(dims["dims"])
            size = item["size"]

            buffer_host, ret = acl.rt.malloc_host(size)
            self._check(ret, "acl.rt.malloc_host")
            try:
                # Transfer inference output data from Device to Host.
                ret = acl.rt.memcpy(buffer_host, size, item["buffer"], size,
                                    self.ACL_MEMCPY_DEVICE_TO_HOST)
                self._check(ret, "acl.rt.memcpy (device to host)")
                bytes_out = acl.util.ptr_to_bytes(buffer_host, size)
                # .copy() detaches the result from the staging memory so the
                # host buffer can be released immediately below.
                data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape).copy()
            finally:
                # The staging buffer was previously leaked on every call.
                acl.rt.free_host(buffer_host)
            inference_result.append(data)
        return inference_result

    def load_input_data(self, img):
        """Copy ``img`` (a numpy array) into the first model input buffer on the device.

        Raises:
            RuntimeError: if no input buffer exists yet (``init`` not called)
                or the copy fails.
            ValueError: if ``img`` holds more bytes than the input buffer.
        """
        if not self.input_data:
            raise RuntimeError("no input buffer allocated; call init() first")
        target = self.input_data[0]
        bytes_data = img.tobytes()
        nbytes = len(bytes_data)
        if nbytes > target["size"]:
            raise ValueError(
                f"input has {nbytes} bytes but the model input buffer holds "
                f"only {target['size']} bytes")
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # Transfer image data from Host to Device. Only the bytes that really
        # exist on the host are copied; using the device buffer size as the
        # count would read past the end of a smaller host array.
        ret = acl.rt.memcpy(target["buffer"], target["size"], np_ptr,
                            nbytes, self.ACL_MEMCPY_HOST_TO_DEVICE)
        self._check(ret, "acl.rt.memcpy (host to device)")

    def execute(self):
        """Run the loaded model on the current input dataset (synchronous call)."""
        ret = acl.mdl.execute(self.model_id, self.load_input_dataset,
                              self.load_output_dataset)
        self._check(ret, "acl.mdl.execute")

    def destroy(self):
        """Release datasets, device buffers, the model, the context and ACL.

        Teardown is best-effort: status codes are not checked so that one
        failing release does not prevent the remaining ones.
        """
        for dataset in (self.load_input_dataset, self.load_output_dataset):
            if dataset is None:
                continue
            for index in range(acl.mdl.get_dataset_num_buffers(dataset)):
                acl.destroy_data_buffer(acl.mdl.get_dataset_buffer(dataset, index))
            acl.mdl.destroy_dataset(dataset)
        self.load_input_dataset = None
        self.load_output_dataset = None

        for record in self.input_data + self.output_data:
            acl.rt.free(record["buffer"])
        self.input_data.clear()
        self.output_data.clear()

        if self.model_id is not None:
            acl.mdl.unload(self.model_id)
            self.model_id = None
        if self.model_desc is not None:
            acl.mdl.destroy_desc(self.model_desc)
            self.model_desc = None

        if self.context is not None:
            acl.rt.destroy_context(self.context)
            self.context = None
        acl.rt.reset_device(self.device_id)
        acl.finalize()

5. Inference test script. The overall inference process is: image preprocessing, loading the model, loading the input data, running inference, obtaining the output results, and image post-processing.

import cv2
import time
from NMS import nms, draw
# End-to-end test: preprocess one image, run it through the OM model on the
# NPU, then apply NMS and draw the detections onto the original image.
image_path = '1.jpg'
model_path = './yolov7.om'
device_id = 0

# --- Image preprocessing -----------------------------------------------------
time_ = time.perf_counter()
img_ = cv2.imread(image_path)
if img_ is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError(f'cannot read image: {image_path}')
# Factors that map 640x640 network coordinates back to the original image.
x_scale = img_.shape[1] / 640
y_scale = img_.shape[0] / 640
# OpenCV decodes to BGR; YOLOv7's own preprocessing feeds the network RGB.
img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
# Resize while still uint8 (cheaper than resizing a full-size float64 image),
# then normalise to [0, 1] as float32.
img = cv2.resize(img, (640, 640))
img = img.astype(np.float32) / 255.
img = np.transpose(img, (2, 0, 1))   # HWC -> CHW
img = np.expand_dims(img, axis=0)    # add batch dimension -> NCHW
img = img.astype(np.float32)
print(f'image preprocess time cost:{time.perf_counter() - time_}')

# --- Inference on the NPU ----------------------------------------------------
model = ACL_inference(device_id)
try:
    model.init(model_path)

    time0 = time.perf_counter()
    model.load_input_data(img)
    print(f'data copy to device time cost:{time.perf_counter() - time0}')

    time1 = time.perf_counter()
    model.execute()
    print(f'device inference time cost:{time.perf_counter() - time1}')

    time2 = time.perf_counter()
    # First output tensor, first (only) image of the batch.
    result = model.process_output()[0][0]
    print(f'data copy to host time cost:{time.perf_counter() - time2}')
finally:
    # Always release the device, even when a step above fails.
    model.destroy()

# --- Post-processing ---------------------------------------------------------
print(result.shape)
result = nms(result, 0.35, 0.45)
print(result)
result_img = draw(img_, x_scale, y_scale, result)
cv2.imwrite('result.jpg', result_img)

Test image:

Output result:

6. Inference result analysis

Judging from the inference results, the accuracy is basically in line with normal inference. In terms of performance, inference on the NPU takes about 30 milliseconds, so real-time video inference at up to 30 fps is basically achievable. Most of the remaining time is spent on image decoding and data transfer. As a later optimization, the DVPP module on the NPU can be used for image/video decoding, and AIPP image preprocessing can be added during atc conversion, reducing the data preprocessing time.