在CUDA内核中将'threadIdx.x'转换为更宽整数类型的正确方法(printf的%lu格式在CUDA内核中输出异常?)

anauzrmj  于 2023-06-21  发布在  其他
关注(0)|答案(1)|浏览(99)

我必须访问CUDA内核中一个非常大的数组的元素。在某些应用中,数组的大小可以大于INT_MAX。
这些内核基本上都是如下形式。

// Illustrative kernel sketch from the question. It is not compilable as
// written: the return type is omitted and the assigned value is elided.
// Each thread computes a flat index from its block/thread coordinates and
// writes the element of dArr located at index * WIDTH.
__global__ function(double *dArr) {
  // blockIdx.x, blockDim.x and threadIdx.x are all unsigned int, so the
  // multiply and add are evaluated in 32-bit unsigned arithmetic; only the
  // final result is widened to size_t by the initialization.
  size_t index = blockIdx.x * blockDim.x + threadIdx.x;

  // index is a size_t, so this multiplication is carried out in size_t
  // (assuming WIDTH is an ordinary integer constant -- its definition is not
  // shown here).
  dArr[index * WIDTH] = ...; // WIDTH is 256 or 512.
}

根据我的理解,像threadIdx.x这样的CUDA内置变量是unsigned int类型,而且其实际取值上限比普通的unsigned int还要小。
我试图将这些CUDA变量转换为更宽的类型,以便它们可以用作大型数组的索引。
我尝试了几种做法,但我的内核仍然无法处理大型数组。我甚至无法理解下面这段简单代码的输出(其中甚至没有用到大的数值)。

#include <cstdio>

// Debug kernel: every thread prints blockIdx.x, blockDim.x and threadIdx.x,
// each value once through a %lu conversion and once through a %d conversion.
//
// NOTE(review): all six arguments are unsigned int. That matches neither %lu
// (which expects unsigned long) nor %d (which expects int), so the format
// string and the argument list disagree and the printed values are not
// meaningful. This mismatch is the subject of the question; the code is left
// unchanged on purpose.
__global__ void printIndex() {
  printf("blockIdx.x %lu (%d), blockDim.x %lu (%d), threadIdx.x %lu (%d)\n",
          blockIdx.x, blockIdx.x, blockDim.x, blockDim.x, threadIdx.x, threadIdx.x);
  //printf("blockIdx.x %d, blockDim.x %d, threadIdx.x %d \n", blockIdx.x, blockDim.x, threadIdx.x); // this works fine.
}

// Host driver: launches printIndex, waits for the device, then prints a host
// unsigned int with the same %lu / %d conversions for comparison.
int main() {
  // 2 blocks of 64 threads each, i.e. 128 lines of device output.
  printIndex<<<2,64>>>();
  // Wait for the kernel to finish before the host continues.
  // NOTE(review): neither the launch nor this call has its result checked.
  cudaDeviceSynchronize();

  unsigned int ui = 1000;
  // NOTE(review): ui is an unsigned int passed to %lu and %d, the same
  // format/argument mismatch as in the kernel.
  printf("ui %lu (%d) \n", ui, ui); // this is just for the comparison.

  return 0;
}

我选择%lu是为了模拟向更宽类型的类型转换,但结果相当奇怪。(改用%u时一切正常)

blockIdx.x 4294967297 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
ui 1000 (1000)

从最后一行可以看到,主机端用%lu打印ui的结果是正常的,而CUDA内核中的打印结果却非常奇怪。首先,我处理的数值最大只有64,但%lu和%d打印出的结果并不一致。其次,就连%d的输出也不正确:blockIdx.x的值应该是0或1。
我哪里弄错了?用printf查看threadIdx.x的值,正确的做法是什么?如果要把这些变量转换为能够表示超过INT_MAX(或UINT_MAX)的值的更宽类型,正确的转换方式是什么?
我添加了c标记,因为这是关于<cstdio>中的printf

yvfmudvl

yvfmudvl1#

您需要为每个参数使用与其类型匹配的格式说明符。使用面向64位值的格式说明符却传入32位值是无效的(属于未定义行为,谁也不知道这样会引发什么样的越界访问)。
编译器应该会对此发出如下警告:
“参数与相应的格式字符串转换不兼容(应为类型‘unsigned long’,但参数的类型为‘unsigned int’)”
如果想用%lu打印,请将参数转换为size_t,例如(size_t)threadIdx.x。
要将全局线程id计算为64位值,可以使用size_t id = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);。先把每个操作数转换为size_t,乘法和加法才会以64位进行,而不是先以32位计算(可能溢出)之后再扩展结果。

相关问题