C++:OpenCL本地内存屏障(barrier)似乎被忽略

1cosmwyk  于 2023-02-26  发布在  其他
关注(0)|答案(1)|浏览(149)

我正在尝试使用OpenCL进行2D卷积。内核为每个输出像素执行一次计算,即把FILTERSIZExFILTERSIZE个乘积相加。我让前几个工作项把过滤器的值加载到本地内存中,并希望所有工作项都等到过滤器加载完成,也就是全部FILTERSIZExFILTERSIZE个值都已写入本地内存之后再继续。为此,我使用了屏障barrier(CLK_LOCAL_MEM_FENCE);。为了检查它是否生效,我让其中一个工作项打印过滤器中的所有值。我预期所有过滤器值都是"1",因为我就是这样初始化它们的。那么,为什么打印出来的值全都是"0"呢?
下面是我的代码:

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 300
#define CL_HPP_MINIMUM_OPENCL_VERSION 300
#include <CL/opencl.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Not referenced by any code visible in this file.
float* workgroupsizes;
float* globalsizes;
// OpenCL objects shared by the functions below: initialize_cl() allocates them
// with `new`, and conv2d() dereferences them, so conv2d() must not be called
// before initialize_cl() has run.
cl::Context* context;
cl::CommandQueue* queue;
cl::Kernel* kernel;



void initialize_cl(int filtersize){ //infos: conv yes or no, height of one outputchannel, number of outputchannels, filter height, number of inputchannels

    try {
            // Get available platforms and devices
            std::vector<cl::Platform> platforms;
            cl::Platform::get(&platforms);
            if (platforms.empty()) {
                std::cerr << "No OpenCL platforms found." << std::endl;

            }
            std::vector<cl::Device> devices;
            platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
            if (devices.empty()) {
                std::cerr << "No OpenCL devices found." << std::endl;

            }

            // Create context and command queue
            context=new cl::Context(devices);
            queue=new cl::CommandQueue(*context, devices[0]);

        

            // Create kernel program
            cl::Program::Sources sources;
            std::string kernelCode;

           
                kernelCode = R"CLC(
             #define KERNSIZE %
            kernel void conv2d(constant float* a, constant float* b,
                               global float* c, const int inc, const int insize, const int outsize) {
                
                int outx = get_global_id(0);
                int outy = get_global_id(1);
                int outc=get_global_id(2);
               
                local float filter[KERNSIZE*KERNSIZE];
                if (outx<KERNSIZE && outy<KERNSIZE){
                    filter[outy*KERNSIZE+outx]=b[KERNSIZE*KERNSIZE*(inc+outc*get_global_size(2))+outy*KERNSIZE+outx];
                    
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                
                if (outx==10&&outy==10){
                for (int x=0;x<KERNSIZE;x++){
                        for (int y=0;y<KERNSIZE;y++){
                            printf("%f\n",filter[y*KERNSIZE+x]);
                    }
                }
                }

                
                

                

                float sum=0;

                for (int x=0;x<KERNSIZE;x++){
                        for (int y=0;y<KERNSIZE;y++){
                            sum+=a[inc*insize*insize+(outy+y)*insize+outx+x] * filter[y*KERNSIZE+x];
                            
                    }
                }
                

                c[outc*outsize*outsize+outy*outsize+outx]+=sum;
            }
        )CLC";
        std::size_t pos = kernelCode.find('%');
        
        
        kernelCode.replace(pos, 1, std::to_string(filtersize));
       

        sources.push_back({kernelCode.c_str(), kernelCode.length()});


        cl::Program program(*context, sources);

        try {
            program.build(devices);
        }
        catch (cl::Error& e) {
            if (e.err() == CL_BUILD_PROGRAM_FAILURE) {

                    // Get the build log for the first device
                std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);

                // Print the build log
                std::cout << "Build log:/n" << build_log << std::endl;
            

            }

            exit(0);
        }

          // Create kernel
        kernel=new cl::Kernel(program, "conv2d");



}catch (const cl::Error& error) {


        std::cerr << "OpenCL error: " << error.what() << " ("
                  << error.err() << ")" << std::endl;
                  exit(0);
    }
    

 
}


/**
 * Runs the `conv2d` kernel once per input channel and copies the accumulated
 * result into `outputlayer`.
 *
 * Requires initialize_cl() to have been called first (it creates the global
 * context, queue and kernel used here). `filtersize` is only used to size the
 * filter buffer; it is expected to match the value the kernel was built with.
 *
 * @param inputsize      Edge length of one square input channel.
 * @param outputsize     Edge length of one square output channel.
 * @param filtersize     Edge length of one square filter.
 * @param inputchannels  Number of input channels (one kernel launch each).
 * @param outputchannels Number of output channels (third NDRange dimension).
 * @param inputlayer     Host input, inputsize*inputsize*inputchannels floats.
 * @param filters        Host filters, filtersize*filtersize*inputchannels*outputchannels floats.
 * @param outputlayer    Host output, outputsize*outputsize*outputchannels floats; overwritten.
 */
void conv2d(int inputsize, int outputsize, int filtersize, int inputchannels, int outputchannels, float* inputlayer, float* filters, float* outputlayer){

    // Reject sizes that would divide by zero below or create empty buffers.
    if (inputsize <= 0 || outputsize <= 0 || filtersize <= 0 || inputchannels <= 0 || outputchannels <= 0) {
        std::cerr << "conv2d: all sizes and channel counts must be positive." << std::endl;
        return;
    }
    // The kernel reads input rows/columns up to (out + filter - 1), so a
    // smaller input would be read out of bounds on the device.
    if (inputsize < outputsize + filtersize - 1) {
        std::cerr << "conv2d: inputsize must be at least outputsize + filtersize - 1." << std::endl;
        return;
    }

    // Byte counts are computed in std::size_t so large layers cannot overflow int.
    const std::size_t inputBytes = static_cast<std::size_t>(inputsize) * inputsize * inputchannels * sizeof(float);
    const std::size_t filterBytes = static_cast<std::size_t>(filtersize) * filtersize * inputchannels * outputchannels * sizeof(float);
    const std::size_t outputBytes = static_cast<std::size_t>(outputsize) * outputsize * outputchannels * sizeof(float);

    // Create buffers
    cl::Buffer bufferA(*context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBytes, inputlayer);
    cl::Buffer bufferB(*context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, filterBytes, filters);
    cl::Buffer bufferC(*context, CL_MEM_READ_WRITE, outputBytes);

    // The kernel accumulates into the output with `+=`, so the buffer must
    // start from a defined value rather than uninitialised device memory.
    (*queue).enqueueFillBuffer(bufferC, 0.0f, 0, outputBytes);

    // Work-group shape: the largest divisors of outputsize whose product stays
    // within the work-group budget. The budget of 256 work-items is carried
    // over from the original code (assumption: the device supports it).
    const int kMaxWorkGroupItems = 256;
    const auto largestDivisorUpTo = [](int value, int limit) {
        int divisor = value < limit ? value : limit;
        while (value % divisor != 0) {
            --divisor; // terminates at 1 at the latest
        }
        return divisor;
    };
    const int wgsize0 = largestDivisorUpTo(outputsize, kMaxWorkGroupItems);
    const int wgsize1 = largestDivisorUpTo(outputsize, kMaxWorkGroupItems / wgsize0);

    // Set kernel arguments (argument 3, the input channel, is set per launch)
    (*kernel).setArg(0, bufferA);
    (*kernel).setArg(1, bufferB);
    (*kernel).setArg(2, bufferC);
    (*kernel).setArg(4, inputsize);
    (*kernel).setArg(5, outputsize);

    std::cout << outputsize << " " << outputchannels << " " << wgsize1 << '\n';

    // Enqueue one launch per input channel; each launch adds its contribution
    // to the output. The local size in the channel dimension stays 1 so a
    // work-group never spans more than one output channel.
    for (int i = 0; i < inputchannels; i++) {
        (*kernel).setArg(3, i);
        (*queue).enqueueNDRangeKernel(*kernel, cl::NullRange,
                                      cl::NDRange(outputsize, outputsize, outputchannels),
                                      cl::NDRange(wgsize0, wgsize1, 1));
        (*queue).finish();
    }

    // Read back result from device (blocking)
    (*queue).enqueueReadBuffer(bufferC, CL_TRUE, 0, outputBytes, outputlayer);
}

下面是主文件:

#include <iostream>
#include "GPUCONV2.cpp"

// Test driver: convolves an all-ones input with an all-ones filter and prints
// the wall-clock time taken by conv2d() in seconds.
int main(){
    // Set up input data
    const int inchannels=1;
    const int outchannels=1;
    const int insize=22;
    const int outsize=20;
    const int filtersize=3;

    // Element counts, computed with integer arithmetic (no std::pow round trip
    // through floating point).
    const int totalsizein=inchannels*insize*insize;
    const int totalsizeout=outchannels*outsize*outsize;
    const int totalsizew=inchannels*outchannels*filtersize*filtersize;

    // std::vector owns the storage, so nothing leaks when main returns.
    std::vector<float> input(totalsizein, 1.0f);
    std::vector<float> output(totalsizeout, 0.0f);
    std::vector<float> weights(totalsizew, 1.0f);

    initialize_cl(filtersize);

    const auto start=std::chrono::high_resolution_clock::now();
    conv2d(insize,outsize,filtersize,inchannels,outchannels,input.data(),weights.data(),output.data());
    const std::chrono::duration<double> dur=std::chrono::high_resolution_clock::now()-start;
    std::cout<<dur.count()<<std::endl;

    /*std::cout << "Result: ";
        for (int i = 0; i < totalsizeout; ++i) {
            std::cout << output[i] << " ";
        }
        std::cout << std::endl;*/
}
jv4diomz

jv4diomz1#

问题是我使用get_global_id来确定是否应该加载过滤器值,因为我的输入图像被分成几个工作组,所以只有第一个工作组加载了过滤器。
我用if语句解决了这个问题:

// Select the work-items whose linearised local index is below KERNSIZE*KERNSIZE
// (one per filter element); requires at least that many work-items per group.
if (get_local_id(0) + get_local_id(1)*get_local_size(0) < KERNSIZE*KERNSIZE)

用它代替了原来的判断条件。

相关问题