我正在尝试使用OpenCL进行2D卷积。内核负责计算一个输出像素,即把FILTERSIZExFILTERSIZE个乘积相加。我让前几个工作项把过滤器值加载到本地内存中,并希望所有工作项都等待,直到过滤器加载完成,即全部FILTERSIZExFILTERSIZE个值都已在本地内存中。为此,我使用了屏障barrier(CLK_LOCAL_MEM_FENCE);。
为了检查它是否工作,我让一个工作项打印过滤器中的所有值。我希望所有过滤器值都是"1",因为我就是这样初始化它们的。那么,为什么所有值都是"0"呢?
下面是我的代码:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 300
#define CL_HPP_MINIMUM_OPENCL_VERSION 300
#include <CL/opencl.hpp>
#include <cmath>
#include <iostream>
// Not referenced anywhere in the code shown here.
float* workgroupsizes;
float* globalsizes;
// OpenCL handles shared by the two functions below: initialize_cl() allocates
// them with `new`, conv2d() dereferences them. Nothing in the visible code
// ever deletes them.
cl::Context* context;
cl::CommandQueue* queue;
cl::Kernel* kernel;
void initialize_cl(int filtersize){ //infos: conv yes or no, height of one outputchannel, number of outputchannels, filter height, number of inputchannels
try {
// Get available platforms and devices
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
std::cerr << "No OpenCL platforms found." << std::endl;
}
std::vector<cl::Device> devices;
platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.empty()) {
std::cerr << "No OpenCL devices found." << std::endl;
}
// Create context and command queue
context=new cl::Context(devices);
queue=new cl::CommandQueue(*context, devices[0]);
// Create kernel program
cl::Program::Sources sources;
std::string kernelCode;
kernelCode = R"CLC(
#define KERNSIZE %
kernel void conv2d(constant float* a, constant float* b,
global float* c, const int inc, const int insize, const int outsize) {
int outx = get_global_id(0);
int outy = get_global_id(1);
int outc=get_global_id(2);
local float filter[KERNSIZE*KERNSIZE];
if (outx<KERNSIZE && outy<KERNSIZE){
filter[outy*KERNSIZE+outx]=b[KERNSIZE*KERNSIZE*(inc+outc*get_global_size(2))+outy*KERNSIZE+outx];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (outx==10&&outy==10){
for (int x=0;x<KERNSIZE;x++){
for (int y=0;y<KERNSIZE;y++){
printf("%f\n",filter[y*KERNSIZE+x]);
}
}
}
float sum=0;
for (int x=0;x<KERNSIZE;x++){
for (int y=0;y<KERNSIZE;y++){
sum+=a[inc*insize*insize+(outy+y)*insize+outx+x] * filter[y*KERNSIZE+x];
}
}
c[outc*outsize*outsize+outy*outsize+outx]+=sum;
}
)CLC";
std::size_t pos = kernelCode.find('%');
kernelCode.replace(pos, 1, std::to_string(filtersize));
sources.push_back({kernelCode.c_str(), kernelCode.length()});
cl::Program program(*context, sources);
try {
program.build(devices);
}
catch (cl::Error& e) {
if (e.err() == CL_BUILD_PROGRAM_FAILURE) {
// Get the build log for the first device
std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
// Print the build log
std::cout << "Build log:/n" << build_log << std::endl;
}
exit(0);
}
// Create kernel
kernel=new cl::Kernel(program, "conv2d");
}catch (const cl::Error& error) {
std::cerr << "OpenCL error: " << error.what() << " ("
<< error.err() << ")" << std::endl;
exit(0);
}
}
void conv2d(int inputsize, int outputsize, int filtersize, int inputchannels, int outputchannels, float* inputlayer, float* filters, float* outputlayer){
// Create buffers
cl::Buffer bufferA(*context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
inputsize*inputsize*inputchannels* sizeof(float), inputlayer);
cl::Buffer bufferB(*context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
filtersize*filtersize*inputchannels*outputchannels*sizeof(float), filters);
cl::Buffer bufferC(*context, CL_MEM_READ_WRITE,
outputsize*outputsize*outputchannels*sizeof(float));
bool notfound=1;
int wgsize1= int(256/outputsize);
while (notfound){
if (outputsize%wgsize1==0){
notfound=0;
}else{
wgsize1--;
}
}
// Set kernel arguments
(*kernel).setArg(0, bufferA);
(*kernel).setArg(1, bufferB);
(*kernel).setArg(2, bufferC);
(*kernel).setArg(4, inputsize);
(*kernel).setArg(5, outputsize);
std::cout<<outputsize<<" "<<outputchannels<<" "<<wgsize1;
// Enqueue kernel for execution
for (int i=0;i<inputchannels;i++){
(*kernel).setArg(3, i);
(*queue).enqueueNDRangeKernel(*kernel, cl::NullRange, cl::NDRange(outputsize, outputsize, outputchannels), cl::NDRange(outputsize, wgsize1, 1));
(*queue).finish();
}
// Read back result from device
(*queue).enqueueReadBuffer(bufferC, CL_TRUE, 0,
outputsize*outputsize*outputchannels*sizeof(float), outputlayer);
}
下面是主文件:
#include <chrono>
#include <iostream>
#include <vector>
#include "GPUCONV2.cpp"
// Test driver: convolves a single 22x22 channel of ones with a 3x3 filter of
// ones into a 20x20 output and prints the elapsed time in seconds.
int main(){
    // Set up input data
    const int inchannels=1;
    const int outchannels=1;
    const int insize=22;
    const int outsize=20;
    const int filtersize=3;
    static_assert(insize == outsize + filtersize - 1,
                  "output size must equal input size minus filter size plus one");

    // Plain integer products instead of std::pow avoid a round trip through double.
    const int totalsizein=inchannels*insize*insize;
    const int totalsizeout=outchannels*outsize*outsize;
    const int totalsizew=inchannels*outchannels*filtersize*filtersize;

    // Vectors own the storage, so nothing leaks when main returns.
    std::vector<float> input(totalsizein, 1.0f);
    std::vector<float> output(totalsizeout, 0.0f);
    std::vector<float> weights(totalsizew, 1.0f);

    initialize_cl(filtersize);

    const auto start=std::chrono::high_resolution_clock::now();
    conv2d(insize,outsize,filtersize,inchannels,outchannels,input.data(),weights.data(),output.data());
    const std::chrono::duration<double> dur=std::chrono::high_resolution_clock::now()-start;
    std::cout<<dur.count()<<std::endl;

    /*std::cout << "Result: ";
    for (int i = 0; i < totalsizeout; ++i) {
        std::cout << output[i] << " ";
    }
    std::cout << std::endl;*/
    return 0;
}
1条答案
按热度按时间jv4diomz1#
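问题是我使用get_global_id来确定某个工作项是否应该加载过滤器值。由于我的输入图像被分成几个工作组,而本地内存是每个工作组各自独立的,所以只有第一个工作组加载了过滤器。我通过修改if语句解决了这个问题,改用get_local_id:
if (get_local_id(0)<KERNSIZE && get_local_id(1)<KERNSIZE)
而不是旧的:
if (outx<KERNSIZE && outy<KERNSIZE)
(写入本地数组和读取过滤器时使用的下标同样应由get_local_id计算。)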