I am teaching myself CUDA. My ultimate goal is to apply it in Fortran, but since many courses/videos are based on C/C++, I often end up doing the same exercise in both languages (which is a good thing). At the moment I am trying to run a basic exercise on the GPU that performs a(i) = b(i) + c(i). For completeness, I am posting both codes for comparison:
- The C code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_common.cuh"
#include "common.h"
//assume grid is 1D and block is 1D then nx = size
__global__ void sum_arrays_1Dgrid_1Dblock(float* a, float* b, float *c, int nx)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
if (gid < nx)
c[gid] = a[gid] + b[gid];
}
void run_sum_array_1d(int argc, char** argv)
{
printf("Runing 1D grid \n");
int size = 1 << 22;
int block_size = 128;
int nx, ny = 0;
if (argc > 2)
size = 1 << atoi(argv[2]);
if (argc > 4)
block_size = 1 << atoi(argv[4]);
unsigned int byte_size = size * sizeof(float);
printf("Input size : %d \n",size);
float * h_a, *h_b, *h_out, *h_ref;
h_a = (float*)malloc(byte_size);
h_b = (float*)malloc(byte_size);
h_out = (float*)malloc(byte_size);
h_ref = (float*)malloc(byte_size);
if (!h_a)
printf("host memory allocation error \n");
for (size_t i = 0; i < size; i++)
{
h_a[i] = i % 10;
h_b[i] = i % 7;
}
clock_t cpu_start, cpu_end;
cpu_start = clock();
sum_array_cpu(h_a, h_b, h_out,size);
cpu_end = clock();
dim3 block( block_size);
dim3 grid((size+block.x -1)/block.x);
printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
grid.x,grid.y,grid.z,block.x,block.y, block.z);
float *d_a, *d_b, *d_c;
gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
gpuErrchk(cudaMemset(d_c,0,byte_size));
clock_t htod_start, htod_end;
htod_start = clock();
gpuErrchk(cudaMemcpy(d_a,h_a,byte_size,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));
htod_end = clock();
clock_t gpu_start, gpu_end;
gpu_start = clock();
sum_arrays_1Dgrid_1Dblock << <grid, block >> > (d_a, d_b, d_c, size);
gpuErrchk(cudaDeviceSynchronize());
gpu_end = clock();
clock_t dtoh_start, dtoh_end;
dtoh_start = clock();
gpuErrchk(cudaMemcpy(h_ref,d_c,byte_size,cudaMemcpyDeviceToHost));
dtoh_end = clock();
compare_arrays(h_out, h_ref,size);
// elapsed time comparison
printf("Sum array CPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(cpu_end - cpu_start)/CLOCKS_PER_SEC));
printf("Sum array GPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(gpu_end - gpu_start)/CLOCKS_PER_SEC));
printf("htod mem transfer time [ms] : %4.6f \n",
(double)((double)1000.0*(htod_end - htod_start)/CLOCKS_PER_SEC));
printf("dtoh mem transfer time [ms] : %4.6f \n",
(double)((double)1000.0*(dtoh_end - dtoh_start)/CLOCKS_PER_SEC));
printf("Total GPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(dtoh_end - htod_start)/CLOCKS_PER_SEC));
cudaFree(d_c);
cudaFree(d_b);
cudaFree(d_a);
free(h_ref);
free(h_out);
free(h_b);
free(h_a);
}
////arguments :
////1 - kernel (0:1D or 1:2D),
////2 - input size (2 pow (x))
////3 - for 2D kernel nx,
////4 - block.x
////5 - block.y
int main(int argc, char** argv)
{
printf("\n----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ \n\n");
printf("argc : %d \n",argc);
for (int i = 0; i < argc; i++)
{
printf("argv : %s \n",argv[i]);
};
run_sum_array_1d(argc, argv);
//query_device();
return 0;
}
- The Fortran code is as follows:
#include "Error.fpp"
MODULE CUDAOps
USE cudafor
USE CUDAUtils
USE CPUOps
IMPLICIT NONE
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE sumArraysGPU_1D(a,b,c,Nsize)
IMPLICIT NONE
!> intent variables
INTEGER, INTENT(IN), DIMENSION(:) :: a,b
INTEGER, INTENT(INOUT), DIMENSION(:) :: c
INTEGER, INTENT(IN), VALUE :: Nsize
!> local variables
INTEGER :: blockId, threadId
! get the blockId
blockId = (blockIdx%x-1)
! get the threadId
threadId = blockId * blockDim%x + threadIdx%x-1
! adjust to let the threadId to start from 1
threadId = threadId + 1
!WRITE(*,*) 'threadId = ',threadId
! set the maximum
IF (threadId <= Nsize) THEN
! perform the sum
c(threadId) = a(threadId) + b(threadId)
END IF
END SUBROUTINE sumArraysGPU_1D
SUBROUTINE runSumArrays1D(xpow,blockSizeX)
IMPLICIT NONE
! intent variables
INTEGER, INTENT(IN) :: xpow,blockSizeX
!> variables declaration
! size of the arrays
INTEGER:: Nsize
! size of the GPU block
INTEGER:: block_size
! other auxiliary variables
INTEGER :: i,j,istat
REAL(KIND=wp) :: t1,t2,time,timeGPU
TYPE(cudaEvent) :: startEvent, stopEvent
! host data allocation
INTEGER, DIMENSION(:), ALLOCATABLE :: h_a, h_b, h_c, gpu_results
! device data allocation
INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a, d_b, d_c
! define the GPU grid and block
TYPE(DIM3) :: grid, tBlock
! define data size and block size along X dimension
Nsize = 2**xpow
block_size = 2**blockSizeX
! allocate memory in host
ALLOCATE(h_a(Nsize))
ALLOCATE(h_b(Nsize))
ALLOCATE(h_c(Nsize))
! allocate memory in device
ALLOCATE(gpu_results(Nsize))
ALLOCATE(d_a(Nsize))
ALLOCATE(d_b(Nsize))
ALLOCATE(d_c(Nsize))
! define block and grid
tBlock = DIM3(block_size,1,1)
grid = DIM3((Nsize/tBlock%x),1,1)
! host data initialization
CALL generateNumberByIntegerDivision(h_a,10,Nsize)
CALL generateNumberByIntegerDivision(h_b,7,Nsize)
WRITE(*,*) 'Kernel is going to be launched with'
WRITE(*,*) 'Nsize = ',Nsize
WRITE(*,*) 'xpow = ',xpow
WRITE(*,*) 'blockSizeX = ',blockSizeX
WRITE(*,*) 'block_size = ',block_size
WRITE(*,*) 'grid.x = ',grid%x
WRITE(*,*) 'grid.y = ',grid%y
WRITE(*,*) 'grid.z = ',grid%z
WRITE(*,*) 'block.x = ',tblock%x
WRITE(*,*) 'block.y = ',tblock%y
WRITE(*,*) 'block.z = ',tblock%z
timeGPU = 0.0_wp
CALL CPU_TIME(t1)
! perform the sum in serial using the CPU
CALL sumArraysCPU(h_a,h_b,h_c)
CALL CPU_TIME(t2)
WRITE(*,*) 'time for the CPU implementation (ms) = ',(t2-t1)*1e3
! initialize CUDA events
!istat = cudaEventCreate(startEvent)
GPU_ERROR(cudaEventCreate(startEvent))
istat = cudaEventCreate(stopEvent)
! copy the source data h_a from CPU to GPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(d_a,h_a,Nsize,cudaMemcpyHostToDevice)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to transfer h_a to GPU (ms) = ',time
timeGPU = timeGPU + time
! copy the source data h_b from CPU to GPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(d_b,h_b,Nsize,cudaMemcpyHostToDevice)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to transfer h_b to GPU (ms) = ',time
timeGPU = timeGPU + time
! perform the sum on the GPU
istat = cudaEventRecord(startEvent,0)
CALL sumArraysGPU_1D<<<grid, tBlock>>>(d_a,d_b,d_c,Nsize)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to perform the sum on GPU (ms) = ',time
timeGPU = timeGPU + time
! copy the data back from GPU to CPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(gpu_results,d_c,Nsize,cudaMemcpyDeviceToHost)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to copy back data from GPU to CPU (ms) = ',time
timeGPU = timeGPU + time
WRITE(*,*) 'Total time to execute GPU (ms) :',timeGPU
!WRITE(*,*) 'h_c = ',h_c
!WRITE(*,*) 'gpu_results = ',gpu_results
! make a formal check of the result component by component
CALL checkArraysCPU(h_c,gpu_results,Nsize)
WRITE(*,*) 'SUM(h_c) = ',SUM(h_c)
WRITE(*,*) 'SUM(gpu_results) = ',SUM(gpu_results)
! deallocate memory in host
DEALLOCATE(h_a)
DEALLOCATE(h_b)
DEALLOCATE(h_c)
! deallocate memory in device
DEALLOCATE(gpu_results)
DEALLOCATE(d_a)
DEALLOCATE(d_b)
DEALLOCATE(d_c)
END SUBROUTINE runSumArrays1D
END MODULE CUDAOps
PROGRAM main
USE CPUOps
USE CUDAOps
IMPLICIT NONE
! declare local variables
INTEGER :: i,xpow,sizeBlockX
! set the default values
xpow = 22
sizeBlockX = 7
! launch the dedicated routines
CALL runSumArrays1D(xpow,sizeBlockX)
STOP
END PROGRAM main
When I run both codes with nvprof using the default options (data size and block size), with the following command:
nvprof ./code.x
I get the following output.
1. For the C code:
----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------
Running 1D grid
Input size : 4194304
Kernel is launched with grid(32768,1,1) and block(128,1,1)
==33351== NVPROF is profiling process 33351, command: ./code_c.x
Arrays are same
Sum array CPU execution time [ms] : 4.850000
Sum array GPU execution time [ms] : 1.610000
htod mem transfer time [ms] : 10.640000
dtoh mem transfer time [ms] : 5.759000
Total GPU execution time [ms] : 18.011000
==33351== Profiling application: ./code_c.x
==33351== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 61.35% 10.715ms 2 5.3577ms 5.3566ms 5.3589ms [CUDA memcpy HtoD]
30.94% 5.4040ms 1 5.4040ms 5.4040ms 5.4040ms [CUDA memcpy DtoH]
5.81% 1.0147ms 1 1.0147ms 1.0147ms 1.0147ms sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
1.90% 331.81us 1 331.81us 331.81us 331.81us [CUDA memset]
API calls: 75.67% 60.242ms 3 20.081ms 55.398us 60.116ms cudaMalloc
20.59% 16.393ms 3 5.4645ms 5.2016ms 5.7578ms cudaMemcpy
2.00% 1.5906ms 1 1.5906ms 1.5906ms 1.5906ms cudaDeviceSynchronize
1.47% 1.1673ms 3 389.10us 186.65us 497.81us cudaFree
0.14% 107.71us 101 1.0660us 88ns 57.578us cuDeviceGetAttribute
0.08% 65.483us 1 65.483us 65.483us 65.483us cuDeviceGetName
0.02% 17.946us 1 17.946us 17.946us 17.946us cudaMemset
0.02% 16.011us 1 16.011us 16.011us 16.011us cudaLaunchKernel
0.01% 8.6300us 1 8.6300us 8.6300us 8.6300us cuDeviceGetPCIBusId
0.00% 1.1600us 3 386ns 146ns 846ns cuDeviceGetCount
0.00% 369ns 2 184ns 94ns 275ns cuDeviceGet
0.00% 246ns 1 246ns 246ns 246ns cuDeviceTotalMem
0.00% 194ns 1 194ns 194ns 194ns cuModuleGetLoadingMode
0.00% 167ns 1 167ns 167ns 167ns cuDeviceGetUuid
2. For the Fortran code:
==38266== NVPROF is profiling process 38266, command: ./code_f.x
Kernel is going to be launched with
Nsize = 4194304
xpow = 22
blockSizeX = 7
block_size = 128
grid.x = 32768
grid.y = 1
grid.z = 1
block.x = 128
block.y = 1
block.z = 1
time for the CPU implementation (ms) = 4.997969
time to transfer h_a to GPU (ms) = 5.680192
time to transfer h_b to GPU (ms) = 5.561248
time to perform the sum on GPU (ms) = 1.510400
time to copy back data from GPU to CPU (ms) = 7.039712
Total time to execute GPU (ms) : 19.79155
Arrays are the same!
SUM(h_c) = 1592097881
SUM(gpu_results) = 1592097881
==38266== Profiling application: ./code_f.x
==38266== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 58.75% 10.911ms 5 2.1822ms 1.2160us 5.4682ms [CUDA memcpy HtoD]
35.16% 6.5297ms 1 6.5297ms 6.5297ms 6.5297ms [CUDA memcpy DtoH]
6.10% 1.1321ms 1 1.1321ms 1.1321ms 1.1321ms cudaops_sumarraysgpu_1d_
API calls: 87.80% 150.69ms 6 25.115ms 2.5020us 150.30ms cudaMalloc
9.95% 17.072ms 6 2.8454ms 4.1870us 7.0309ms cudaMemcpy
1.39% 2.3788ms 6 396.47us 2.2640us 1.1368ms cudaFree
0.72% 1.2281ms 4 307.02us 6.6590us 629.72us cudaEventSynchronize
0.05% 93.254us 101 923ns 92ns 41.961us cuDeviceGetAttribute
0.04% 64.982us 1 64.982us 64.982us 64.982us cuDeviceGetName
0.02% 36.395us 8 4.5490us 1.1180us 13.299us cudaEventRecord
0.02% 31.801us 2 15.900us 873ns 30.928us cudaEventCreate
0.01% 18.638us 1 18.638us 18.638us 18.638us cudaLaunchKernel
0.00% 6.3520us 4 1.5880us 970ns 2.5790us cudaEventElapsedTime
0.00% 4.9980us 1 4.9980us 4.9980us 4.9980us cuDeviceGetPCIBusId
0.00% 1.5290us 3 509ns 165ns 1.1890us cuDeviceGetCount
0.00% 444ns 2 222ns 92ns 352ns cuDeviceGet
0.00% 279ns 1 279ns 279ns 279ns cuModuleGetLoadingMode
0.00% 248ns 1 248ns 248ns 248ns cuDeviceTotalMem
0.00% 164ns 1 164ns 164ns 164ns cuDeviceGetUuid
What I am trying to understand here is why the number of cudaMalloc, cudaMemcpy and cudaFree calls matches what I wrote in the C code, but not in the Fortran code. Specifically, when I allocate 3 device arrays, why does cudaMalloc report 6 calls?
I am trying to work out whether there is an error/bug in my Fortran code, or whether this is normal behaviour and, if so, why. Thanks.
I played around with the allocation statements of the d_a, d_b and d_c arrays in Fortran, and it looks as if the kernel call performs cudaMalloc and memory copies on top of the ones that are already done explicitly.
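(A chronological API trace from nvprof lists every CUDA runtime call in the order it was issued, which makes it possible to match the extra cudaMalloc/cudaMemcpy calls against the ALLOCATE statements, the explicit cudaMemcpy calls and the kernel launch:)
nvprof --print-api-trace ./code_f.x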
---------------- EDIT A further question: if I print some nvprof-specific metrics, for example:
nvprof --metrics gld_efficiency,sm_efficiency,achieved_occupancy ./code.x
1. This is the C output:
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA GeForce MX330 (0)"
Kernel: sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
1 sm_efficiency Multiprocessor Activity 99.50% 99.50% 99.50%
1 achieved_occupancy Achieved Occupancy 0.922875 0.922875 0.922875
2. This is the Fortran output:
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA GeForce MX330 (0)"
Kernel: cudaops_sumarraysgpu_1d_
1 gld_efficiency Global Memory Load Efficiency 67.86% 67.86% 67.86%
1 sm_efficiency Multiprocessor Activity 99.62% 99.62% 99.62%
1 achieved_occupancy Achieved Occupancy 0.877743 0.877743 0.877743
The difference in global memory load efficiency is clearly visible. Could these two issues be related?
1 Answer
A Fortran array, at least in CUDA, carries (at least potentially) metadata that is needed for (CUDA) Fortran device code generation.
This metadata results in two allocations per Fortran array: one for the actual data and one for the metadata.
An example of what the metadata might be is the "width" of the array. (I am using the term Fortran array loosely here; you won't necessarily see this metadata for every kind of device allocation.)
And of course, because the metadata is needed (a decision made here by the CUDA Fortran compiler) and is allocated separately, each array also incurs 2 copy operations and, correspondingly, 2 free operations.
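If you want to see this in isolation, below is a minimal sketch (the file, module and variable names are made up for this illustration, and it assumes the nvfortran/PGI CUDA Fortran toolchain): it allocates a single device array, performs one explicit copy in each direction around a trivial kernel, and then frees everything, so the profiler counts can be compared directly against the single calls written in the source.
! descriptor_check.cuf -- illustrative reproducer, names chosen for this sketch only
MODULE checkKernel
    USE cudafor
    IMPLICIT NONE
CONTAINS
    ATTRIBUTES(GLOBAL) SUBROUTINE scale2(a, Nsize)
        ! assumed-shape dummy argument, as in sumArraysGPU_1D
        INTEGER, INTENT(INOUT), DIMENSION(:) :: a
        INTEGER, INTENT(IN), VALUE :: Nsize
        INTEGER :: i
        ! 1-based global thread index
        i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
        IF (i <= Nsize) a(i) = 2 * a(i)
    END SUBROUTINE scale2
END MODULE checkKernel

PROGRAM descriptor_check
    USE cudafor
    USE checkKernel
    IMPLICIT NONE
    INTEGER, PARAMETER :: Nsize = 2**20
    INTEGER, ALLOCATABLE :: h(:)
    INTEGER, ALLOCATABLE, DEVICE :: d(:)
    INTEGER :: istat
    ALLOCATE(h(Nsize))
    ! one device ALLOCATE in the source
    ALLOCATE(d(Nsize))
    h = 1
    ! one explicit host-to-device copy
    istat = cudaMemcpy(d, h, Nsize, cudaMemcpyHostToDevice)
    CALL scale2<<<(Nsize + 127)/128, 128>>>(d, Nsize)
    istat = cudaDeviceSynchronize()
    ! one explicit device-to-host copy
    istat = cudaMemcpy(h, d, Nsize, cudaMemcpyDeviceToHost)
    WRITE(*,*) 'sanity check (expect 2): ', h(1)
    ! one device DEALLOCATE in the source
    DEALLOCATE(d)
    DEALLOCATE(h)
END PROGRAM descriptor_check
Compiling with something along the lines of nvfortran descriptor_check.cuf -o descriptor_check.x and running nvprof ./descriptor_check.x then shows whether the reported cudaMalloc/cudaMemcpy/cudaFree counts match the single ALLOCATE, the two explicit cudaMemcpy calls and the single device DEALLOCATE in the source, or whether they are doubled by the separate metadata handling described above.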