OpenCV如何使用CUDA内核对图像应用滤镜？

我想使用CUDA对OpenCV中的一个ROI（取自512x512的Lenna图像）应用滤镜，但我认为在把数据从主机复制到设备时出了问题。
我发现这个Mat对象不是连续的，因此ROI矩阵的行跨度与预期不符：step[0]比cols*elemSize()大得多。
查看结果时，我发现滤镜几乎被应用到了图像的整个宽度上。我尝试调整复制的总字节数，但这只改变了滤镜作用区域的高度。
我希望滤镜至少只作用在黑色矩形（194x194）内部，并且不使用OpenCV的CUDA API（GpuMat）。
这是我目前的代码：

主函数

#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/objdetect.hpp>
#include <iostream>

using namespace cv;
using namespace std;
// Entry point of the question's listing: loads the image, detects faces with a
// Haar cascade and, for every detection, runs myFilter on an adjusted ROI,
// draws both rectangles and writes the result to ../resources/test.jpg.
int main(){
    int THREADS = 8; //Forwarded to myFilter, which uses it for both block dimensions.
    
    //Load Image
    string path = "../resources/lenna.png"; //512x512 size
    Mat img = imread(path);
    //NOTE(review): the result of imread is not checked before it is used below.

    //Load haarcascade
    CascadeClassifier faceCascade;
    faceCascade.load("../resources/haarcascade_frontalface_default.xml");

    //Detection runs on a grayscale copy of the image.
    Mat img_gray;
    cvtColor(img, img_gray, COLOR_BGR2GRAY);

    vector<Rect> faces;
    faceCascade.detectMultiScale(img_gray, faces, 1.1, 10);

    for (int i = 0; i < faces.size(); i++){
        //The int conversion is needed for my final filter (blur).
        //The whole image is converted in place on every iteration and converted
        //back to CV_8UC3 at the end of the iteration.
        img.convertTo(img, CV_32SC3);

        //setROI is not defined in the visible listing.
        Rect R = setROI(faces[i]); //Adjust black rectangle region.

        Point top_left = R.tl();
        Point bot_right = R.br();

        //Width and height of the adjusted rectangle in pixels.
        int w = bot_right.x - top_left.x;
        int h = bot_right.y - top_left.y;

        //faceROI is a view into img, so its rows keep the parent image's step.
        Mat faceROI = img(R);
        //NOTE(review): img was already converted to CV_32SC3 above, so this
        //second conversion looks redundant - confirm.
        faceROI.convertTo(faceROI, CV_32SC3);

        //Filter apply
        myFilter(faceROI, w, h, THREADS);

        //Draws rectangles. Green one is for detected face and black one is for previous adjustment.
        rectangle(img, faces[i].tl(), faces[i].br(), Scalar(0, 255,0), 3);
        rectangle(img, top_left, bot_right, Scalar(0,0,0), 3);

        //Recover original format.
        img.convertTo(img, CV_8UC3);

        //The same output file is rewritten on every iteration.
        imwrite("../resources/test.jpg", img);
    }
    return 0;
}

我的滤镜函数（myFilter）

// Question's host wrapper: copies the ROI to the device with a single flat
// cudaMemcpy, launches myFilterKernel on a 2D grid of THREADS x THREADS blocks
// and copies the buffer back into the ROI.
//   face    - ROI view of the image (non-continuous, see the notes below)
//   w, h    - ROI width and height in pixels
//   THREADS - block edge length (threads per block dimension)
void myFilter(Mat face, int w, int h, int THREADS){
    //It's confirmed that w, h = 194

    /*CUDA WORK*/
    //The byte count is derived from the row step of the parent image, not from the ROI width.
    int faceBytes = face.step[0]*face.rows;  //Should be face.elemSize()*sizeof(int)*w*h = 3*4*194*194, but it gives 3*4*512*194
    //face.isContinuous() gives 0
    //face.rows = face.cols = 194

    int *d_face;
    cudaMalloc<int>(&d_face, faceBytes);

    //Flat copy of faceBytes bytes starting at face.ptr().
    //NOTE(review): because the ROI is not continuous, this range covers full
    //parent-image rows rather than only the ROI pixels - this is the problem
    //the question is about.
    cudaMemcpy(d_face, face.ptr(), faceBytes, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(THREADS, THREADS);
    //NOTE(review): w / threadsPerBlock.x is an integer division, so ceil()
    //receives an already truncated value; for w = 194 and THREADS = 8 this
    //gives 24 blocks (192 threads) per dimension.
    dim3 numBlocks(ceil(w / threadsPerBlock.x), ceil(h / threadsPerBlock.y));

    //face.step1() is passed as the kernel's faceStep argument.
    myFilterKernel<<<numBlocks, threadsPerBlock>>>(d_face, w, h, face.step1());

    //None of the CUDA return codes in this function are checked.
    cudaDeviceSynchronize();
    cudaMemcpy(face.ptr(), d_face, faceBytes, cudaMemcpyDeviceToHost);
    cudaFree(d_face);
}

我的滤镜内核（myFilterKernel）

// Question's kernel: on a 2D launch, each thread inside the width x height
// region doubles face_c consecutive int values starting at its own offset.
//   d_face   - device buffer holding the pixel data as int values
//   width    - region width in pixels
//   height   - region height in pixels
//   faceStep - row step used for indexing, in int elements
__global__ void myFilterKernel(int* d_face, int width, int height, int faceStep){
    int x = blockIdx.x*blockDim.x+threadIdx.x;
    int y = blockIdx.y*blockDim.y+threadIdx.y;

    //NOTE(review): the quotient equals the channel count only when faceStep is
    //width * channels. The caller in this listing passes face.step1() of a
    //non-continuous ROI; with the 512-pixel-wide parent described in the
    //question that would be 1536 / 194 = 7 - confirm.
    int face_c = faceStep /width; //Channel count, it should be 3
    
    //Threads outside the region do nothing.
    if (y < height && x < width){
        //Thread pos
        int face_tid = y * faceStep + (face_c * x); 

        //Filter
        for (int i = 0; i < face_c; i++){
            d_face[face_tid + i] *= 2;
        }
    }
}

当滤镜只作用于某个特定的ROI时，我们可以把ROI从主机复制到设备，在设备上对ROI应用滤镜，再把处理后的ROI从设备复制回主机。
要在主机和设备之间来回复制ROI，可以使用cudaMemcpy2D来代替cudaMemcpy。
使用cudaMemcpy2D时，必须设置“源间距”（spitch）和“目标间距”（dpitch）：

源间距是源数据相邻两行之间以字节为单位的跨距（步长）。
目标间距是目标数据相邻两行之间以字节为单位的跨距（步长）。

在下图中，我们可以只分配、复制和处理小矩形：

<- pitch in bytes: 6144 ->
 ------------------------
|                        |
| face.ptr()             |
|    --> --------        |
|       |        | 194   |
|       |        |height |
|       |        |       |
|        --------        |
|        <-2328 ->       |
|     width in bytes     |
|                        |
 ------------------------

代码示例（无人脸检测）：

#include "cuda_runtime.h"

#include "opencv2/opencv.hpp"
#include <iostream>

#include <stdio.h>

using namespace cv;
using namespace std;

// Doubles every int value of every pixel in a width x height region of d_face.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel (x selects the
// column, y selects the row). Threads that fall outside the region exit
// without touching memory. No shared memory is used.
//
//   d_face   - device buffer holding the region as int values
//   width    - region width in pixels
//   height   - region height in pixels
//   faceStep - row step in int elements; the number of values per pixel is
//              derived from it as faceStep / width
__global__ void myFilterKernel(int* d_face, int width, int height, int faceStep) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Values per pixel (expected to be 3 for a tightly packed 3-channel region).
    const int channels = faceStep / width;

    // Guard clause: the grid may be larger than the region.
    if (row >= height || col >= width) {
        return;
    }

    // First value of the pixel handled by this thread.
    int* pixel = d_face + (row * faceStep + channels * col);

    // The filter itself: multiply each channel value by two.
    for (int c = 0; c < channels; ++c) {
        pixel[c] = pixel[c] * 2;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaSucceeded(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Applies myFilterKernel to a region of interest without copying the whole image.
//
// Only the ROI is allocated on the device: it is copied there row by row with
// cudaMemcpy2D into a tightly packed buffer, filtered in place, and copied
// back into the (non-continuous) ROI of the host image.
//
//   face    - ROI view of a CV_32S image; its rows are face.step[0] bytes apart
//   w, h    - ROI width and height in pixels (must not exceed face.cols / face.rows)
//   THREADS - block edge length; the kernel runs with THREADS x THREADS threads per block
//
// On any CUDA failure a message is printed to stderr and the host image is
// left unmodified. The device buffer is always released.
void myFilter(Mat face, int w, int h, int THREADS){
    // Nothing to do for an empty region or an unusable launch configuration.
    if (face.empty() || w <= 0 || h <= 0 || THREADS <= 0) {
        return;
    }
    // The kernel reads the data as int, and the 2D copies must stay inside the ROI.
    if (face.depth() != CV_32S || w > face.cols || h > face.rows) {
        fprintf(stderr, "myFilter: expected a CV_32S ROI of at least %d x %d pixels\n", w, h);
        return;
    }

    const int channels = face.channels();
    const size_t img_pitch = face.step[0];                //Source stride in bytes - row size of the parent image (512*3*4 = 6144 for the 512x512 example).
    const size_t roi_pitch = (size_t)w * face.elemSize(); //Destination stride in bytes - row size of the ROI (194*3*4 = 2328 for the 194x194 example).

    /*CUDA WORK*/
    const size_t faceBytes = roi_pitch * (size_t)h; //Only the ROI is allocated and copied, not the entire image.

    int *d_face = NULL;
    if (!cudaSucceeded(cudaMalloc(&d_face, faceBytes), "cudaMalloc")) {
        return;
    }

    //2D (ROI) copy from host to device.
    bool ok = cudaSucceeded(
        cudaMemcpy2D(d_face,                    //void *dst
                     roi_pitch,                 //size_t dpitch - tightly packed ROI rows
                     face.ptr(),                //const void *src - first pixel of the ROI
                     img_pitch,                 //size_t spitch - row stride of the parent image
                     roi_pitch,                 //size_t width - bytes copied per row
                     (size_t)h,                 //size_t height - number of rows
                     cudaMemcpyHostToDevice),
        "cudaMemcpy2D (host to device)");

    if (ok) {
        dim3 threadsPerBlock(THREADS, THREADS);
        //Integer ceil-division: ceil(w / THREADS) on integers truncates first and
        //would leave the last rows/columns unprocessed when w or h is not a multiple of THREADS.
        dim3 numBlocks((unsigned int)((w + THREADS - 1) / THREADS),
                       (unsigned int)((h + THREADS - 1) / THREADS));

        //Pass step in units of int32 elements (the device buffer is tightly packed).
        myFilterKernel<<<numBlocks, threadsPerBlock>>>(d_face, w, h, w * channels);

        //Launch-configuration errors surface through cudaGetLastError,
        //execution errors at the next synchronizing call.
        ok = cudaSucceeded(cudaGetLastError(), "myFilterKernel launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "myFilterKernel execution");
    }

    if (ok) {
        //2D copy from device to host, back into the ROI of the original image.
        cudaSucceeded(
            cudaMemcpy2D(face.ptr(),                //void *dst
                         img_pitch,                 //size_t dpitch - row stride of the parent image
                         d_face,                    //const void *src
                         roi_pitch,                 //size_t spitch - tightly packed ROI rows
                         roi_pitch,                 //size_t width - bytes copied per row
                         (size_t)h,                 //size_t height - number of rows
                         cudaMemcpyDeviceToHost),
            "cudaMemcpy2D (device to host)");
    }

    cudaSucceeded(cudaFree(d_face), "cudaFree");
}

// Demo driver (no face detection): loads lenna.png, applies myFilter to a fixed
// 194x194 region, marks the region with a green rectangle, writes the result
// to test.jpg and shows it in a window.
// Returns 0 on success and 1 when the image cannot be loaded or the region
// lies outside the image.
int main()
{
    int THREADS = 8;

    string path = "lenna.png"; //512x512 size
    Mat img = imread(path);
    if (img.empty()) {
        //imread returns an empty matrix when the file is missing or unreadable.
        cerr << "Could not read image: " << path << endl;
        return 1;
    }

    //The int conversion is needed for my final filter (blur).
    img.convertTo(img, CV_32SC3);

    //Rect R = setROI(faces[i]); //Adjust black rectangle region.
    //Clamp the fixed region to the image so that img(R) cannot throw on a smaller input.
    Rect R = cv::Rect(204, 188, 194, 194) & cv::Rect(0, 0, img.cols, img.rows);
    if (R.empty()) {
        cerr << "The region of interest lies outside the image" << endl;
        return 1;
    }

    int w = R.width;
    int h = R.height;

    //faceROI is a view into img (no copy), so the filter modifies img itself.
    Mat faceROI = img(R);
    //faceROI.convertTo(faceROI, CV_32SC3); //Image is already converted to CV_32SC3

    //Filter apply
    myFilter(faceROI, w, h, THREADS);

    //Draws the rectangle around the filtered region.
    rectangle(img, R.tl(), R.br(), Scalar(0, 255, 0), 3);
    //rectangle(img, top_left, bot_right, Scalar(0, 0, 0), 3);

    //Recover original format.
    img.convertTo(img, CV_8UC3);

    if (!imwrite("test.jpg", img)) {
        cerr << "Could not write test.jpg" << endl;
    }

    //Show output image for testing:
    imshow("img", img);
    waitKey();
    destroyAllWindows();

    return 0;
}

输出：

注意事项：

大多数滤镜都需要用到每个像素周围的一小块邻域。

在这种情况下，我们可能需要把比ROI稍大的区域从主机复制到设备（而从设备复制回主机时只复制精确的ROI）。
此时，判断像素是否超出图像边界也会变得更加复杂。

OpenCV如何使用CUDA内核对图像应用滤镜？

1条答案

相关问题

热门标签

最新问答