c++ OpenCL:自驱动程序更新以来多线程环境中出现内部死锁(Nvidia)

bq9c1y66  于 2022-12-01  发布在  其他
关注(0)|答案(1)|浏览(125)

我使用Khronos SDK在Windows上开发OpenCL 3.0应用程序,其中包括使用GPU处理存储在驱动器上的大量数据。为此,我使用了几个CPU线程,这些线程从驱动器读取、处理、发送到GPU,并将结果返回到驱动器上。一年多来,我一直使用这段代码,没有出现任何问题。但是在最近更新了我的Nvidia GPU驱动程序(从460系列版本到最新的517.xx)之后,这个程序突然不能再工作了。我还尝试了几个5XX系列的旧驱动程序,但都没有改变这种行为。
在研究了一下导致这种情况的原因之后,我发现OpenCL调用了lock(即使是那些应该是非阻塞的),并且永远不会返回。如果所有的调用都在一个线程上完成,一切都很好,但是任何后续的线程都不会从它的第一次调用中返回。
举一个简单的例子,只要创建几个线程,每个线程创建一个OpenCL队列,第一个执行的线程就可以正常工作,但是所有其他线程都不会从clCreateCommandQueue调用返回。
我在两台PC上测试了它,一台GTX 1650和一台RTX 3070 TI,经过一周的努力想出解决方案,并在网上搜索类似的问题,我什么也没找到。
感谢阅读。如果有人知道问题可能出在哪里,或者能证实我不是唯一遇到此问题的人,请告诉我。
提前感谢!

**TLDR:** 在任何最新的Nvidia驱动程序上,如果从多个CPU线程调用OpenCL,我的clCreateCommandQueue(以及其他cl调用)就会永远不返回。

j5fpnvbx

j5fpnvbx1#

我这里有一个代码示例。
这并不是最小的,因为只有在cl上下文中出现ocl错误时,我才会得到eba04348所描述的行为。
我已经向nvidia提交了一个bug。

#include <iostream>

#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 110

// https://github.com/KhronosGroup/OpenCL-CLHPP
#include "opencl.hpp"

#include <vector>
#include <thread>

// Bundle of OpenCL state shared between the worker lambdas in main().
// All members are created in f1 and consumed by f2/f3, possibly from
// different std::threads.
struct DeviceData
{
    cl::Context mContextCL;                  // context all objects below live in
    cl::Program mProgramCL;                  // compiled "simple_add" program
    cl::Kernel mKernelCL;                    // kernel handle into mProgramCL
    cl::CommandQueue mQueueCL;               // queue used by every thread
    std::vector<cl::Buffer> mBufferListCL;   // A, B, C, N device buffers
    static constexpr int n = 10000;          // element count of A/B/C vectors
};

/*! Deliberately exhausts device memory to force a CL_MEM_OBJECT_ALLOCATION_FAILURE
 * inside the given context. Please do not use in production code!
 *
 * Per the report above, once an OpenCL error has occurred in the context,
 * subsequent CL calls made from *other* CPU threads hang forever on the
 * affected Nvidia drivers — this function is the trigger for that repro.
 *
 * @param context the context in which to provoke the error
 * @param device the device the context was created for
 * @return true once the allocation failure was successfully provoked;
 *         false means some unrelated OpenCL call failed first and the
 *         repro precondition was NOT established
 */
bool produceError(cl::Context& context, cl::Device& device){
    cl_int error = CL_SUCCESS;

    // 512 MiB of host data; each device buffer below is this size, so the
    // loop runs out of device memory after a handful of iterations.
    std::vector<float> data (512 * 1024 * 1024 / sizeof(float), 17.0f);
    auto const dataSizeInBytes = data.size() * sizeof(float);

    using Buffers = std::vector<cl::Buffer>;
    // Buffers are kept alive in these lists so device memory is never freed.
    Buffers clBufferDstList;
    Buffers clBufferSrcList;

    cl::CommandQueue queue (context, device, 0, &error);
    if (CL_SUCCESS != error)
        return false;

    // Initialize main source buffer, will be cloned many times "inside the device"
    cl::Buffer clMainBufferSrc (context, 0, dataSizeInBytes, nullptr, &error);
    if (CL_SUCCESS != error)
        return false;
    error = queue.enqueueWriteBuffer (clMainBufferSrc, CL_TRUE, 0, dataSizeInBytes, data.data(), nullptr, nullptr);
    if (CL_SUCCESS != error)
        return false;

    // Loop until things break down: allocate two more 512 MiB buffers per
    // iteration and touch them via device-side copies until allocation fails.
    while (true) {
        cl::Buffer clNewBufferSrc(context, 0, dataSizeInBytes, nullptr, &error);
        if (CL_SUCCESS != error)
            return false;
        cl::Buffer clNewBufferDst(context, 0, dataSizeInBytes, nullptr, &error);
        if (CL_SUCCESS != error)
            return false;
        clBufferSrcList.push_back(clNewBufferSrc);
        clBufferDstList.push_back(clNewBufferDst);

        // Copy data to new src and dst buffer - on the device / initialize buffers.
        // CL_MEM_OBJECT_ALLOCATION_FAILURE here is the error we are after;
        // any other failure aborts the repro.
        error = queue.enqueueCopyBuffer(clMainBufferSrc, clNewBufferSrc, 0, 0, dataSizeInBytes);
        if (CL_MEM_OBJECT_ALLOCATION_FAILURE == error)
            break;
        if (CL_SUCCESS != error)
            return false;
        error = queue.enqueueCopyBuffer(clMainBufferSrc, clNewBufferDst, 0, 0, dataSizeInBytes);
        if (CL_MEM_OBJECT_ALLOCATION_FAILURE == error)
            break;
        if (CL_SUCCESS != error)
            return false;
        error = queue.finish();
        if (CL_SUCCESS != error)
            return false;
    }

    // Reached only via the break above, i.e. the allocation failure occurred.
    return true;
}

// Reproduces the reported driver bug in two phases:
//  1) single-threaded run (works even after produceError),
//  2) multi-threaded run after produceError — on the affected Nvidia drivers
//     this hangs in t2 at the first enqueueWriteBuffer() call.
int main() {
    // get all platforms (drivers), e.g. NVIDIA
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);

    if (all_platforms.size()==0) {
        std::cout<<" No platforms found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Platform default_platform=all_platforms[0];
    std::cout << "Using platform: "<<default_platform.getInfo<CL_PLATFORM_NAME>()<<"\n";

    // get default device (CPUs, GPUs) of the default platform
    std::vector<cl::Device> all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
    if(all_devices.size()==0){
        std::cout<<" No devices found. Check OpenCL installation!\n";
        exit(1);
    }

    // NOTE(review): device [0] is used unconditionally here; on machines whose
    // CPU also exposes an OpenCL driver, the GPU may be at index [1] instead —
    // adjust the index to select the GPU when reproducing.
    cl::Device default_device=all_devices[0];
    std::cout<< "Using device: "<<default_device.getInfo<CL_DEVICE_NAME>()<<"\n";

    DeviceData data;
    data.mContextCL = {default_device};

    // f1: builds all per-device state (queue, program, kernel, buffers)
    // inside the shared context.
    auto f1 = [default_device, &data]() {
        // create a queue (a queue of commands that the GPU will execute)
        data.mQueueCL = {data.mContextCL, default_device};

        // create the program that we want to execute on the device
        cl::Program::Sources sources;

        // calculates for each element; C = A + B
        std::string kernel_code =
                "   void kernel simple_add(global const int* A, global const int* B, global int* C, "
                "                          global const int* N) {"
                "       int ID, Nthreads, n, ratio, start, stop;"
                ""
                "       ID = get_global_id(0);"
                "       Nthreads = get_global_size(0);"
                "       n = N[0];"
                ""
                "       ratio = (n / Nthreads);"  // number of elements for each thread
                "       start = ratio * ID;"
                "       stop  = ratio * (ID + 1);"
                ""
                "       for (int i=start; i<stop; i++)"
                "           C[i] = A[i] + B[i];"
                "   }";
        sources.push_back({kernel_code.c_str(), kernel_code.length()});

        data.mProgramCL = {data.mContextCL, sources};
        if (data.mProgramCL.build({default_device}) != CL_SUCCESS) {
            std::cout << "Error building: " << data.mProgramCL.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
            exit(1);
        }

        data.mKernelCL = {data.mProgramCL, "simple_add"};

        // create buffers on device (allocate space on GPU): A, B, C, N(1)
        data.mBufferListCL = {
                {data.mContextCL, CL_MEM_READ_WRITE, sizeof(int) * data.n},
                {data.mContextCL, CL_MEM_READ_WRITE, sizeof(int) * data.n},
                {data.mContextCL, CL_MEM_READ_WRITE, sizeof(int) * data.n},
                {data.mContextCL, CL_MEM_READ_ONLY,  sizeof(int)}
        };
    };

    // f2: uploads inputs, runs the kernel, and waits for completion.
    // In the multi-threaded run this is where the hang occurs (first
    // enqueueWriteBuffer never returns).
    auto f2 = [default_device, &data]() {
        // create things on here (CPU)
        // data.n is constexpr, so these are fixed-size (~40 KiB) stack arrays.
        int A[data.n], B[data.n];
        for (int i = 0; i < data.n; i++) {
            A[i] = i;
            B[i] = data.n - i - 1;
        }

        // apparently OpenCL only likes arrays ...
        // N holds the number of elements in the vectors we want to add
        int const N[1] = {data.n};

        auto& buffer_A = data.mBufferListCL[0];
        auto& buffer_B = data.mBufferListCL[1];
        auto& buffer_C = data.mBufferListCL[2];
        auto& buffer_N = data.mBufferListCL[3];

        // push write commands to queue (blocking writes, CL_TRUE)
        data.mQueueCL.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * data.n, A);
        data.mQueueCL.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * data.n, B);
        data.mQueueCL.enqueueWriteBuffer(buffer_N, CL_TRUE, 0, sizeof(int), N);

        // RUN ZE KERNEL
        // 10 work-items; the kernel divides the n elements evenly among them.
        data.mKernelCL.setArg(0, buffer_A);
        data.mKernelCL.setArg(1, buffer_B);
        data.mKernelCL.setArg(2, buffer_C);
        data.mKernelCL.setArg(3, buffer_N);
        data.mQueueCL.enqueueNDRangeKernel(data.mKernelCL, cl::NullRange, cl::NDRange(10), cl::NullRange);
        data.mQueueCL.finish();
    };

    // f3: reads the result back and prints it (expected: every element == n-1).
    auto f3 = [&data]() {
        auto &buffer_C = data.mBufferListCL[2];
        int C[data.n];
        // read result from GPU to here
        data.mQueueCL.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * data.n, C);

        std::cout << "result: {";
        for (int i = 0; i < data.n; i++) {
            std::cout << C[i] << " ";
        }
        std::cout << "}" << std::endl;
    };

    // First run to show that all is fine if we stay on the main thread

    produceError(data.mContextCL, default_device);

    f1();
    f2();
    f3();

    // Second run where we get stuck in t2, at the first data.mQueueCL.enqueueWriteBuffer() call.
    // It works if we uncomment the call to produceError() below.
    // It also works if we recreate the cl::Context again after the produceError() call.

    // Reset all CL objects, then recreate the context in the same DeviceData.
    data = {};
    data.mContextCL = {default_device};

    produceError(data.mContextCL, default_device);

    // Same three phases as above, but each on its own (joined) thread so
    // every phase issues its first CL call from a fresh CPU thread.
    auto t1 = std::thread(f1);
    auto t1_id = t1.get_id();
    t1.join();
    auto t2 = std::thread(f2);
    auto t2_id = t2.get_id();
    t2.join();
    auto t3 = std::thread(f3);
    auto t3_id = t3.get_id();
    t3.join();

    std::cout << t1_id << std::endl;
    std::cout << t2_id << std::endl;
    std::cout << t3_id << std::endl;
    return 0;
}

相关问题