[Solved] cuMemcpyHtoDAsync failed: invalid argument

TensorRT inference fails with the following error:

pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
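
The second half of the output is a follow-on symptom: the failed memcpy raises an exception before the context pushed in the inference code can be popped, so PyCUDA finds a context still on the stack at module cleanup. As the message suggests, every push should be matched by a pop even on error paths. A minimal sketch of that discipline with explicit context management (using pycuda.driver directly instead of pycuda.autoinit; the details here are illustrative, not taken from the code below):

import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).make_context()  # creates and pushes a context onto the stack
try:
    # ... allocate buffers and run inference here ...
    pass
finally:
    ctx.pop()  # always pop, even if inference raised, so cleanup finds an empty stack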

The TensorRT inference code:

import sys
sys.path.append('../../tools/')
import cv2
import time

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

print('trt version',trt.__version__)

TRT_LOGGER = trt.Logger()

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\\
" + str(self.host) + "\\
Device:\\
" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

# rewrite softmax in numpy
def softmax(out_np, dim):
    s_value = np.exp(out_np) / np.sum(np.exp(out_np), axis=dim, keepdims=True)
    return s_value


class FaceClassify(object):
    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()


    def get_engine(self):
        # If a serialized engine exists, use it instead of building an engine.
        with open(self.engine_path, 'rb') as f:
            runtime = trt.Runtime(TRT_LOGGER)
            return runtime.deserialize_cuda_engine(f.read())


    def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
        cuda_ctx.push()
        start_all=time.time()
        IN_IMAGE_H, IN_IMAGE_W = self.image_size

        # Input
        img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img_in = cv2.resize(img_in, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)

        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) # (3, 240, 240)
        img_in /= 255.0 # normalize [0, 1]

        # mean = (0.485, 0.456, 0.406)
        mean0 = np.expand_dims(self.MEAN[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean1 = np.expand_dims(self.MEAN[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean2 = np.expand_dims(self.MEAN[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean = np.concatenate((mean0, mean1, mean2), axis=0)

        # std = (0.229, 0.224, 0.225)
        std0 = np.expand_dims(self.STD[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std1 = np.expand_dims(self.STD[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std2 = np.expand_dims(self.STD[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std = np.concatenate((std0, std1, std2), axis=0)

        img_in = ((img_in - mean) / std).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0) # (1, 3, 240, 240)

        img_in = np.ascontiguousarray(img_in)

        start=time.time()
        # dynamic input
        self.context.active_optimization_profile = 0
        origin_inputshape = self.context.get_binding_shape(0)
        origin_inputshape[0], origin_inputshape[1], origin_inputshape[2], origin_inputshape[3] = img_in.shape
        self.context.set_binding_shape(0, origin_inputshape) # For dynamic shapes, set the binding shape in the context to match the actual input size

        inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
        # Do inference
        inputs[0].host = img_in
        trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                   stream=stream, batch_size=1)
        print('infer time',time.time()-start,trt_outputs)
        if cuda_ctx:
            cuda_ctx.pop()


        labels_sm = softmax(trt_outputs, dim=0)
        labels_max = np.argmax(labels_sm, axis=1)
        print('time_a',time.time()-start_all)
        return labels_max.item() ,trt_outputs

if __name__ == '__main__':
    class Params:
        pass

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128 ,128]
    opt.classify_mean = [0.5 ,0.5 ,0.5]
    opt.classify_std = [0.5 ,0.5 ,0.5]

    face =FaceClassify(opt)
    image_src =cv2.imread(r'987.jpg')
    # image_src =cv2.imread(r'F:\project\detect\yolov5\tensorrt\yolo-tensorrt_dll_trt8\sln\x64\Release\16_1.jpg')

    for i in range(10):
        labels_max ,trt_outputs =face.detect(image_src)
    print(trt_outputs)
    print(labels_max)

The cause: the input data had not been converted to float32.

Solution: cast the preprocessed image to float32 before copying it to the device:

img_in = ((img_in - mean) / std).astype(np.float32)
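
As a defensive check, you can verify the dtype and memory layout just before handing the array to do_inference(). This is a minimal sketch that reuses the names img_in and inputs from the code above and assumes the engine was built with a float32 input:

import numpy as np

# Assumes the engine's input binding is float32; adjust if your engine differs.
assert img_in.dtype == np.float32, f"expected float32, got {img_in.dtype}"
assert img_in.flags['C_CONTIGUOUS'], "cuMemcpyHtoDAsync needs a contiguous host array"

inputs[0].host = img_in  # only then pass it on to do_inference()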

You can also refer to other users' answers:

The likely cause is that the input data does not match the buffer allocated for the model's input binding, in one of two ways (a check covering both is sketched after this list):

1. The shape of the input image is wrong, e.g. it is not (N, C, H, W).
2. The dtype of the input image is wrong. This was my case: the model was exported from PyTorch to ONNX and then to TensorRT, and the ONNX input only supports single precision, not float64. I fed TensorRT a float64 image without converting it, which triggered the error; after changing it to float32 it ran reliably.
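
To catch both mismatches before the copy, you can compare the prepared input against the engine's binding metadata, using the same binding APIs the code above already uses (get_binding_dtype, get_binding_shape). The helper name check_input is made up for illustration:

import numpy as np
import tensorrt as trt

def check_input(engine, context, img_in, binding_index=0):
    """Raise a clear error if img_in does not match the engine's input binding."""
    expected_dtype = trt.nptype(engine.get_binding_dtype(binding_index))
    expected_shape = tuple(context.get_binding_shape(binding_index))
    if img_in.dtype != expected_dtype:
        raise TypeError(f"engine expects {np.dtype(expected_dtype)}, got {img_in.dtype}")
    if tuple(img_in.shape) != expected_shape:
        raise ValueError(f"engine expects shape {expected_shape}, got {img_in.shape}")

Calling check_input(self.engine, self.context, img_in) right after set_binding_shape() in detect() would have flagged the float64 input immediately.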

Original link: https://blog.csdn.net/GungnirsPledge/article/details/108428651

The linked article also describes a workaround.