#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>    // printf (also usable in device code)
#include <cstdlib>   // rand, srand, exit
#include <ctime>     // time
#include <iostream>  // cout
#include <string>

using namespace std;
class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;

public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }

    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }

    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }

    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
               / 1000000.0;
    }
};
__global__ void HelloWorld()
{
    printf("Hello world\n");
}
// Sink variable: assigning timer readings to a volatile keeps the
// compiler from optimizing the timed regions away.
volatile double dummy = 0;

// Elementwise multiply using a grid-stride loop, so any grid size
// covers all N elements.
__global__ void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = start; i < N; i += stride) {
        output[i] = x[i] * y[i];
    }
}
int main()
{
    MyTimer timer;
    srand(time(NULL));

    // Warm-up launch to initialize the CUDA context before timing anything.
    HelloWorld<<<1, 1>>>();
    cudaDeviceSynchronize();

    timer.startCounter();
    int N = 2000 * 2000;
    float* h_a = new float[N];
    float* h_b = new float[N];
    float* h_c = new float[N];
    float* h_res = new float[N];
    for (int i = 0; i < N; i++) {
        h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_c[i] = h_a[i] * h_b[i];  // reference result computed on the host
    }
    dummy = timer.getCounterMsPrecise();

    timer.startCounter();
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_c, N * sizeof(float));
    dummy = timer.getCounterMsPrecise();
    cout << "cudaMalloc cost = " << dummy << "\n";

    timer.startCounter();
    cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    dummy = timer.getCounterMsPrecise();
    cout << "H2D copy cost = " << dummy << "\n";

    timer.startCounter();
    constexpr int GRID_DIM = 256;
    constexpr int BLOCK_DIM = 256;
    multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
    cudaDeviceSynchronize();
    dummy = timer.getCounterMsPrecise();
    cout << "kernel cost = " << dummy << "\n";

    timer.startCounter();
    cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    dummy = timer.getCounterMsPrecise();
    cout << "D2H copy cost = " << dummy << "\n";

    // Verify the GPU result against the host reference.
    for (int i = 0; i < N; i++) {
        if (h_res[i] != h_c[i]) {
            cout << "error\n";
            exit(1);
        }
    }

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    delete[] h_a; delete[] h_b; delete[] h_c; delete[] h_res;
    return 0;
}
If I use plain cudaMalloc, the output is:
Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768
But if I use cudaMallocManaged instead, it becomes:
Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841
Why is there such a large performance drop? The code already copies the memory to the device manually, so shouldn't it behave exactly the same as regular cudaMalloc-ed device memory?
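(The managed version presumably differs only in the allocation step; a sketch of that assumed change, with the explicit cudaMemcpy calls left untouched:)

float *d_a, *d_b, *d_c;
cudaMallocManaged(&d_a, N * sizeof(float));  // instead of cudaMalloc
cudaMallocManaged(&d_b, N * sizeof(float));
cudaMallocManaged(&d_c, N * sizeof(float));
// ... everything else, including the cudaMemcpy calls, unchanged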
2 Answers

ldioqlga 1#
When using managed memory, "prefetching" does not mean using cudaMemcpy. I don't recommend using cudaMemcpy with managed memory. You won't find any training material that suggests it, and it won't necessarily work the way you think. To prefetch data in a demand-paged managed memory (also called unified memory, or UM) regime, you should actually use cudaMemPrefetchAsync. When I do that, I observe no significant difference in performance between the two cases. To make a sensible comparison, I had to refactor the code somewhat (V100, CUDA 11.4):
Note that this assumes you are in a demand-paging UM regime. If you are not (for example on Maxwell or Kepler devices, or on Windows, or, currently, on Jetson), then you would not use cudaMemPrefetchAsync, and data migration is inextricably tied to kernel launches. Also note the use of CUDA_VISIBLE_DEVICES. On a multi-GPU system, UM can behave in various different ways depending on the system topology and the GPUs in it, which can make an apples-to-apples comparison difficult. Finally, I am not prefetching the data back to the host; if you want to compare that activity, you have already been given some instruction.
xsuvu9jc 2#
When using managed memory, there is an underlying page-migration mechanism between the CPU and the GPU, which is triggered in particular the first time you run the kernel. If you run your kernel several times, the execution time returns to normal.
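One way to see this effect (a sketch reusing the timer, buffers, and kernel from the question) is to time several back-to-back launches; only the first should pay the page-migration cost:

for (int rep = 0; rep < 5; rep++) {
    timer.startCounter();
    multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
    cudaDeviceSynchronize();
    cout << "kernel run " << rep << " cost = " << timer.getCounterMsPrecise() << "\n";
}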