TensorRT runtime error:
TensorRT inference code:
import sys
sys.path.append('../../tools/')
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
print('trt version',trt.__version__)
TRT_LOGGER = trt.Logger()
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
# softmax reimplemented with numpy
def softmax(out_np, dim):
    s_value = np.exp(out_np) / np.sum(np.exp(out_np), axis=dim, keepdims=True)
    return s_value
class FaceClassify(object):
    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()

    def get_engine(self):
        # If a serialized engine exists, use it instead of building an engine.
        with open(self.engine_path, 'rb') as f:
            runtime = trt.Runtime(TRT_LOGGER)
            return runtime.deserialize_cuda_engine(f.read())

    def detect(self, image_src, cuda_ctx=pycuda.autoinit.context):
        cuda_ctx.push()
        start_all = time.time()
        IN_IMAGE_H, IN_IMAGE_W = self.image_size
        # Input
        img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img_in = cv2.resize(img_in, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)  # (3, 240, 240)
        img_in /= 255.0  # normalize to [0, 1]
        # mean = (0.485, 0.456, 0.406)
        mean0 = np.expand_dims(self.MEAN[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean1 = np.expand_dims(self.MEAN[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean2 = np.expand_dims(self.MEAN[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean = np.concatenate((mean0, mean1, mean2), axis=0)
        # std = (0.229, 0.224, 0.225)
        std0 = np.expand_dims(self.STD[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std1 = np.expand_dims(self.STD[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std2 = np.expand_dims(self.STD[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std = np.concatenate((std0, std1, std2), axis=0)
        img_in = ((img_in - mean) / std).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)  # (1, 3, 240, 240)
        img_in = np.ascontiguousarray(img_in)
        start = time.time()
        # Dynamic input: select the optimization profile and set the binding shape to match
        # this input; if the input size differs per call, update the context shape accordingly.
        self.context.active_optimization_profile = 0
        origin_inputshape = self.context.get_binding_shape(0)
        origin_inputshape[0], origin_inputshape[1], origin_inputshape[2], origin_inputshape[3] = img_in.shape
        self.context.set_binding_shape(0, origin_inputshape)
        inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
        # Do inference
        inputs[0].host = img_in
        trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                   stream=stream, batch_size=1)
        print('infer time', time.time() - start, trt_outputs)
        if cuda_ctx:
            cuda_ctx.pop()
        # softmax along the class axis (the list of host outputs stacks to shape (1, num_classes))
        labels_sm = softmax(trt_outputs, dim=1)
        labels_max = np.argmax(labels_sm, axis=1)
        print('time_a', time.time() - start_all)
        return labels_max.item(), trt_outputs
if __name__ == '__main__':
    class Params:
        pass

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128, 128]
    opt.classify_mean = [0.5, 0.5, 0.5]
    opt.classify_std = [0.5, 0.5, 0.5]
    face = FaceClassify(opt)
    image_src = cv2.imread(r'987.jpg')
    # image_src = cv2.imread(r'F:\project\detect\yolov5\tensorrt\yolo-tensorrt_dll_trt8\sln\x64\Release\16_1.jpg')
    for i in range(10):
        labels_max, trt_outputs = face.detect(image_src)
        print(trt_outputs)
        print(labels_max)
Cause: the input data was not cast to float32.
Solution:
img_in = ((img_in - mean) / std).astype(np.float32)
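More generally, this mismatch can be caught before inference by comparing the host array's dtype with the dtype the engine declares for the input binding. A minimal sketch, reusing engine and img_in from the code above (not part of the original post):

# Query the dtype TensorRT expects for binding 0 and cast the host input to match,
# so a float64 (or any other mismatched) array never reaches do_inference.
expected_dtype = trt.nptype(engine.get_binding_dtype(0))  # typically np.float32
if img_in.dtype != expected_dtype:
    img_in = img_in.astype(expected_dtype)
img_in = np.ascontiguousarray(img_in)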
A reader's answer is also worth referring to:
My feeling is that the cause is a mismatch between the input data and the memory allocated for the model's input binding:
The input image shape is wrong, perhaps not (N, C, H, W).
The input image dtype is wrong. This was my case: I converted from PyTorch to ONNX and then to TensorRT, and ONNX inputs only support single-precision floats, not float64. I did not apply that conversion on the TensorRT side and fed in float64 images, hence the error; casting to float32 fixed it.
Original answer: https://blog.csdn.net/GungnirsPledge/article/details/108428651
That article also describes a solution.
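When the pipeline is PyTorch to ONNX to TensorRT, it can also help to confirm that the exported ONNX graph already declares a float32 input. A minimal, hypothetical check (assuming the onnx package and an efficientnet_b1.onnx file, neither of which appears in the original post):

# Inspect the declared dtype of the first graph input in the exported ONNX model.
import onnx
model = onnx.load('efficientnet_b1.onnx')  # hypothetical path
elem_type = model.graph.input[0].type.tensor_type.elem_type
print(onnx.TensorProto.DataType.Name(elem_type))  # 'FLOAT' means float32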
Original article: https://blog.csdn.net/jacke121/article/details/125904060