TensorRT raises the following error at runtime:
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
TensorRT inference code:
import sys
sys.path.append('../../tools/')
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

print('trt version', trt.__version__)

TRT_LOGGER = trt.Logger()


class HostDeviceMem(object):
    """Pair a host buffer with the device allocation it is copied to/from."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\\ " + str(self.host) + "\\ Device:\\ " + str(self.device)

    def __repr__(self):
        return self.__str__()


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    """Allocate one host/device buffer pair per engine binding.

    Buffer sizes come from ``context.get_binding_shape`` so that a shape set
    on the context beforehand is taken into account; dtypes come from the
    engine bindings.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs``/``outputs``
        are lists of ``HostDeviceMem``, ``bindings`` is the list of device
        addresses in binding order, and ``stream`` is a new ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Copy inputs to the device, run the engine, and copy outputs back.

    Returns:
        The list of host output buffers, one per entry of ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream so the host buffers are complete before use.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


# rewrite softmax in numpy
def softmax(out_np, dim):
    """Return the softmax of ``out_np`` along axis ``dim``.

    The per-axis maximum is subtracted before exponentiating; this leaves the
    result mathematically unchanged but avoids overflow for large logits.
    """
    logits = np.asarray(out_np)
    shifted = logits - np.max(logits, axis=dim, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=dim, keepdims=True)


class FaceClassify(object):
    """Image classifier backed by a serialized TensorRT engine."""

    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()

    def get_engine(self):
        """Deserialize and return the engine stored at ``self.engine_path``."""
        # Read through a context manager so the file handle is always closed.
        with open(self.engine_path, 'rb') as f:
            serialized_engine = f.read()
        runtime = trt.Runtime(TRT_LOGGER)
        return runtime.deserialize_cuda_engine(serialized_engine)

    def _preprocess(self, image_src):
        """Turn a BGR image into a contiguous float32 (1, 3, H, W) array."""
        in_h, in_w = self.image_size
        img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img_in = cv2.resize(img_in, (in_w, in_h),
                            interpolation=cv2.INTER_LINEAR)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
        img_in /= 255.0  # normalize to [0, 1]
        # Per-channel mean/std broadcast over H and W. Kept in float64 so the
        # arithmetic matches building full (3, H, W) planes with np.ones.
        mean = np.asarray(self.MEAN[:3], dtype=np.float64).reshape(3, 1, 1)
        std = np.asarray(self.STD[:3], dtype=np.float64).reshape(3, 1, 1)
        # The engine input must be float32; a float64 array here is the
        # typical cause of "cuMemcpyHtoDAsync failed: invalid argument".
        img_in = ((img_in - mean) / std).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
        return np.ascontiguousarray(img_in)

    def detect(self, image_src, cuda_ctx=pycuda.autoinit.context):
        """Classify one BGR image.

        Args:
            image_src: BGR image as read by ``cv2.imread``.
            cuda_ctx: CUDA context to push for the duration of the call; pass
                ``None`` to use whatever context is already current.

        Returns:
            ``(label, trt_outputs)``: the index of the highest-scoring class
            of the first engine output, and the list of raw host output
            arrays.
        """
        start_all = time.time()
        img_in = self._preprocess(image_src)

        if cuda_ctx is not None:
            cuda_ctx.push()
        try:
            start = time.time()
            # dynamic input
            self.context.active_optimization_profile = 0
            self.context.set_binding_shape(0, tuple(img_in.shape))
            # Buffers are sized from the context, so they follow the shape
            # that was just set.
            inputs, outputs, bindings, stream = allocate_buffers(
                self.engine, self.context)
            # Copy into the page-locked buffer rather than rebinding .host:
            # this guarantees the dtype and byte size match the device
            # allocation, and raises a clear error on a size mismatch.
            np.copyto(inputs[0].host, img_in.ravel())
            # Do inference
            trt_outputs = do_inference(self.context, bindings=bindings,
                                       inputs=inputs, outputs=outputs,
                                       stream=stream, batch_size=1)
            print('infer time', time.time() - start, trt_outputs)
        finally:
            # Always restore the context stack, even when inference raises;
            # otherwise PyCUDA aborts at interpreter shutdown.
            if cuda_ctx is not None:
                cuda_ctx.pop()

        # Normalise over the class axis. Normalising over axis 0 of a
        # (1, C) array would make every probability 1.0.
        logits = np.asarray(trt_outputs[0]).reshape(1, -1)
        labels_sm = softmax(logits, dim=1)
        labels_max = np.argmax(labels_sm, axis=1)
        print('time_a', time.time() - start_all)
        return labels_max.item(), trt_outputs


if __name__ == '__main__':
    class Params:
        pass

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128, 128]
    opt.classify_mean = [0.5, 0.5, 0.5]
    opt.classify_std = [0.5, 0.5, 0.5]
    face = FaceClassify(opt)
    image_src = cv2.imread(r'987.jpg')
    # image_src =cv2.imread(r'F:\project\detect\yolov5\tensorrt\yolo-tensorrt_dll_trt8\sln\x64\Release\16_1.jpg')
    for _ in range(10):
        labels_max, trt_outputs = face.detect(image_src)
        print(trt_outputs)
        print(labels_max)
Cause: the input data is not of type float32.
Solution:
img_in = ((img_in - mean) / std).astype(np.float32)
You can also refer to explanations from other users:
In my opinion, the cause is that the input data does not match the buffer allocated for the model's input:
The shape of the input image data is wrong; it may not be (N, C, H, W).
The dtype of the input image data is wrong. This was my case: I converted the model from PyTorch to ONNX and then to TensorRT. The ONNX input does not support float64, only single precision, but I did not convert my TensorRT input accordingly and passed in a float64 image, which raised the error. After changing it to float32, it ran stably.
Original link: https://blog.csdn.net/GungnirsPledge/article/details/108428651
The article also has a workaround.