TensorRT 推理代码:
import sys
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
# Report the installed TensorRT version when the module is loaded.
print('trt version',trt.__version__)
# Module-wide TensorRT logger; handed to trt.Runtime in FaceClassify.get_engine.
TRT_LOGGER = trt.Logger()
class HostDeviceMem(object):
    """Pair a host-side buffer with its matching device-side allocation.

    Attributes:
        host: The host buffer object, stored as given.
        device: The device allocation object, stored as given.
    """

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}"

    def __repr__(self):
        # The debugging representation is deliberately the same as str().
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    """Allocate one host/device buffer pair per engine binding.

    Args:
        engine: TensorRT engine; iterated to enumerate its bindings and
            queried for each binding's dtype and input/output role.
        context: Execution context; queried for each binding's shape by
            index, so the element count follows the context's current shapes.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` holds one integer
        device address per binding in binding order, and ``stream`` is a new
        CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings. Without this the
        # execution call would receive an empty bindings list.
        bindings.append(int(device_mem))
        # Append to the appropriate list: each binding is either an input
        # or an output, never both.
        buffer_pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(buffer_pair)
        else:
            outputs.append(buffer_pair)
    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Run one asynchronous inference pass and return the host outputs.

    Args:
        context: Execution context used to launch the inference.
        bindings: List of device buffer addresses, one per engine binding.
        inputs: List of HostDeviceMem whose ``host`` data is copied to
            ``device`` before execution.
        outputs: List of HostDeviceMem whose ``device`` data is copied back
            to ``host`` after execution.
        stream: CUDA stream on which the copies and the execution are queued.
        batch_size: Batch size forwarded to ``context.execute_async``.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream: everything above is only queued, so the host
    # buffers must not be read until the stream has drained.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
# Softmax re-implemented with numpy
def softmax(out_np, dim):
    """Return the softmax of ``out_np`` along axis ``dim``.

    Args:
        out_np: Array-like of scores.
        dim: Axis along which the values are normalised to sum to 1.

    Returns:
        An ndarray with the same shape as ``out_np``.
    """
    scores = np.asarray(out_np)
    if scores.size:
        # Subtracting the per-axis maximum leaves the result mathematically
        # unchanged but keeps np.exp from overflowing to inf (and the
        # quotient from becoming NaN) on large scores.
        scores = scores - np.max(scores, axis=dim, keepdims=True)
    exps = np.exp(scores)
    return exps / np.sum(exps, axis=dim, keepdims=True)
class FaceClassify(object):
    """Image classifier that runs a serialized TensorRT engine.

    The ``configs`` object passed to the constructor must provide:
        face_classify_engine: Path of the serialized engine file.
        classify_input_size: ``(height, width)`` the image is resized to.
        classify_mean: Per-channel mean (3 values) subtracted after scaling
            the image to [0, 1].
        classify_std: Per-channel std (3 values) the image is divided by.
    """

    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()

    def get_engine(self):
        """Deserialize and return the engine stored at ``self.engine_path``."""
        # Read inside a ``with`` block so the file handle is always closed.
        with open(self.engine_path, 'rb') as f:
            serialized_engine = f.read()
        runtime = trt.Runtime(TRT_LOGGER)
        return runtime.deserialize_cuda_engine(serialized_engine)

    def _preprocess(self, image_src):
        """Convert a BGR image into a normalised float32 (1, 3, H, W) array."""
        in_h, in_w = self.image_size
        img = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (in_w, in_h), interpolation=cv2.INTER_LINEAR)
        img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # HWC -> CHW
        img /= 255.0  # scale to [0, 1]
        # Broadcast the per-channel statistics over H and W instead of
        # materialising full-size planes; the arithmetic stays in float64
        # and is cast down once at the end.
        mean = np.asarray(self.MEAN[:3], dtype=np.float64).reshape(3, 1, 1)
        std = np.asarray(self.STD[:3], dtype=np.float64).reshape(3, 1, 1)
        # The engine input must be single precision, hence the explicit cast.
        img = ((img - mean) / std).astype(np.float32)
        return np.ascontiguousarray(img[np.newaxis, ...])  # add batch axis

    def detect(self, image_src, cuda_ctx=pycuda.autoinit.context):
        """Classify one image.

        Args:
            image_src: BGR image array (as returned by ``cv2.imread``).
            cuda_ctx: CUDA context made current for the duration of the GPU
                work; pass a falsy value to skip the push/pop.

        Returns:
            A tuple ``(label, trt_outputs)``: the index of the highest
            scoring class and the raw list of host output buffers.
        """
        img_in = self._preprocess(image_src)

        if cuda_ctx:
            cuda_ctx.push()
        try:
            # Dynamic input: tell the context the concrete shape of this
            # input before sizing the buffers from the context's shapes.
            self.context.active_optimization_profile = 0
            self.context.set_binding_shape(0, img_in.shape)
            inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
            # Fill the page-locked host buffer rather than rebinding it to a
            # pageable array.
            np.copyto(inputs[0].host, img_in.ravel())
            # Do inference
            start = time.time()
            trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                       stream=stream, batch_size=1)
            print('infer time',time.time()-start,trt_outputs)
        finally:
            # Always undo the push, even if inference raised.
            if cuda_ctx:
                cuda_ctx.pop()

        # The host output is flat; view it as (1, num_classes) so softmax
        # and argmax run over the class axis. Normalising over the length-1
        # axis would turn every probability into 1 and always select class 0.
        logits = np.asarray(trt_outputs[0]).reshape(1, -1)
        labels_sm = softmax(logits, dim=1)
        labels_max = np.argmax(labels_sm, axis=1)
        return labels_max.item(), trt_outputs
if __name__ == '__main__':
    class Params:
        """Bare attribute container standing in for a configuration object."""

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128, 128]
    opt.classify_mean = [0.5, 0.5, 0.5]
    opt.classify_std = [0.5, 0.5, 0.5]
    face = FaceClassify(opt)
    image_src = cv2.imread(r'987.jpg')
    # cv2.imread returns None rather than raising when the file cannot be
    # read; fail here with a clear message instead of deep inside detect().
    if image_src is None:
        raise FileNotFoundError("could not read image '987.jpg'")
    # Run the same image several times.
    for _ in range(10):
        labels_max, trt_outputs = face.detect(image_src)
img_in = ((img_in - mean) / std).astype(np.float32)
输入图片数据shape不对, 可能不是(N, C, H, W)
输入图片数据的dtype不对。我遇到的就是这种情况: 我的模型是由 PyTorch 转 ONNX 再转 TensorRT 的, ONNX 的输入不支持 float64, 只支持单精度(float32)的数据格式; 而我在 TensorRT 推理时没有做这个转换, 输入了 float64 的图片, 所以报错。把输入改成 float32 就稳了。
