我很惊讶地发现,当着色器寻址较高的索引时,会开始从缓冲区读取到零。我猜这与驱动程序内部寻址的精度有关。我从未遇到过任何内存不足的错误,着色器似乎只是默默地停止访问缓冲区。如果我说错了请纠正我,但我相信CUDA是完全支持64位指针和大容量内存的。
我构建了一个最小可复现示例(MWE,见下文代码),其中我创建了一个2GB的缓冲区。如果缓冲区达到或超过2GB,着色器甚至不会向第一个元素写入任何内容。使用image_load_store在着色器中写入该缓冲区时,最多只有512 MiB能正常工作。改用bindless(无绑定)扩展时效果更好,可以正确写入整个缓冲区,但上限仍然卡在2GB——即使我可以创建更大的缓冲区。bindless看起来使用的是64位寻址,所以我认为这个限制没有理由存在。
如何使用OpenGL创建和使用大于2GB的缓冲区?
我使用的是GTX Titan(6 GB)。
//#include <windows.h>
#include <assert.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <GL/glew.h>
#include <GL/glut.h>
// Vertex shader for the image_load_store path: writes vec4(val) into an
// imageBuffer at index gl_VertexID. gl_Position is a dummy; rasterization
// is disabled by the caller (GL_RASTERIZER_DISCARD).
const char* imageSource =
"#version 440\n"
"uniform layout(rgba32f) imageBuffer data;\n"
"uniform float val;\n"
"void main() {\n"
" imageStore(data, gl_VertexID, vec4(val));\n"
" gl_Position = vec4(0.0);\n"
"}\n";
// Vertex shader for the bindless path: `data` is a raw 64-bit GPU address
// (NV_shader_buffer_load) treated as a vec4*, written at index gl_VertexID.
// gl_Position is a dummy; rasterization is disabled by the caller.
const char* bindlessSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"uniform vec4* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_VertexID] = vec4(val);\n"
" gl_Position = vec4(0.0);\n"
"}\n";
// Compile a single shader stage from source and link it into a new program.
// On compile or link failure, prints the info log and exits with a failure
// status. Returns the linked program object.
// Fixes vs. original: the info-log buffer was leaked (new[] without
// delete[]), GL_LINK_STATUS was never checked, the shader object was never
// deleted after attach, and failure exited with status 0 (success).
GLuint compile(GLenum type, const char* shaderSrc)
{
    GLuint shader = glCreateShader(type);
    glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
    glCompileShader(shader);

    int success = 0;
    int loglen = 0;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
    if (loglen > 0)
    {
        GLchar* log = new GLchar[loglen];
        glGetShaderInfoLog(shader, loglen, &loglen, log);
        if (!success)
            printf("%s\n", log);
        delete[] log; // was leaked in the original
    }
    if (!success)
        exit(EXIT_FAILURE); // was exit(0), which signals success

    GLuint program = glCreateProgram();
    glAttachShader(program, shader);
    glLinkProgram(program);

    // Link can fail even when compilation succeeded (e.g. missing stages,
    // resource limits); the original never checked this.
    int linked = 0;
    glGetProgramiv(program, GL_LINK_STATUS, &linked);
    if (!linked)
    {
        int len = 0;
        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len);
        if (len > 0)
        {
            GLchar* log = new GLchar[len];
            glGetProgramInfoLog(program, len, &len, log);
            printf("%s\n", log);
            delete[] log;
        }
        exit(EXIT_FAILURE);
    }

    // The program keeps its own reference; the shader object is no longer
    // needed once linked.
    glDeleteShader(shader);
    return program;
}
int main(int argc, char**argv)
{
float* check;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
glutCreateWindow("test");
glewInit();
GLsizeiptr bufferSize = 1024 * 1024 * 1024; //1GB
bufferSize *= 2;
bufferSize -= 16;
GLsizeiptr numFloats = bufferSize/sizeof(float);
GLsizeiptr numVec4s = bufferSize/(sizeof(float)*4);
float testVal = 123.123f;
glEnable(GL_RASTERIZER_DISCARD);
float* dat = new float[numFloats];
memset(dat, 0, bufferSize);
//create a buffer with data
GLuint buffer;
glGenBuffers(1, &buffer);
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);
//get a bindless address
GLuint64 address;
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
//make a texture alias for it
GLuint bufferTexture;
glGenTextures(1, &bufferTexture);
glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer); //should be GL_RGBA32F (see update)
glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F); //should be GL_RGBA32F (see update)
//compile the shaders
GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
//initialize buffer
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
//run image_load_store
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), testVal);
glDrawArrays(GL_POINTS, 0, numVec4s);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
for (GLsizeiptr i = 0; i < numFloats; ++i)
{
if (check[i] != testVal)
{
printf("failed image_load_store: dat[%td] = %f (%fMiB)\n", i, check[i], (double)i*sizeof(float)/1024.0/1024.0);
break;
}
}
glUnmapBuffer(GL_TEXTURE_BUFFER);
//initialize buffer
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
//run bindless
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), testVal);
glDrawArrays(GL_POINTS, 0, numVec4s);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
for (GLsizeiptr i = 0; i < numFloats; ++i)
{
if (check[i] != testVal)
{
printf("failed bindless: dat[%td] = %f (%fMiB)\n", i, check[i], (double)i*sizeof(float)/1024.0/1024.0);
break;
}
}
glUnmapBuffer(GL_TEXTURE_BUFFER);
return 0;
}
下面是我得到的输出:
> make && ./a.out
g++ -lGL -lGLEW -lglut main.c
failed image_load_store: dat[134217727] = 0.000000 (511.999996MiB)
更新:
发现了一个错误:GL_R32F内部格式应为GL_RGBA32F,修正后image_load_store可以达到约2GB的标记。程序此时正确执行且无输出,直到缓冲区大小达到2GB或更大——此时image_load_store和bindless都仍然失败。GL_MAX_TEXTURE_BUFFER_SIZE在我的机器上是134217728,这使得RGBA32F格式的最大缓冲区大小正好是2GB。然而,对于超过2GB的大小我仍然有疑问。当然,我可以分配多个缓冲区,但那会带来一堆我不想处理的管理工作和开销。
1条答案
按热度按时间nqwrtyyt1#
您可能需要针对特定的供应商编写代码;对于NVIDIA,您可以使用以下扩展,它们允许您在着色器中使用64位的地址(以及更大的缓冲区):
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_buffer_load.txt
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_buffer_store.txt
基本上,您可以开始使用GLSL内部的指针,并将它们作为64位值从CPU主机向上传递。
返回最大缓冲区大小的方法是
glGetIntegerui64vNV(GL_MAX_SHADER_BUFFER_ADDRESS_NV, &max_shader_buffer_address);
printf("Maximum shader buffer address: %lu\n", max_shader_buffer_address);
在我装有RTX 3070计算机上,它是18446744073709551615。