最近我开始学习CUDA技术,得益于并行技术,它让我的计算速度提高了好几倍。我写的程序要用高斯方法计算方阵的行列式。不幸的是,由于某种原因,它总是告诉我矩阵的行列式等于一,尽管不是。另外,时间也显示不正确。最有可能的是,calculate_determinate函数中有错误,但我无法找到它。
我在Ubuntu 22.02中工作,这是我编译程序的方式:nvcc test2.cu -o test2
The output of the ./test2
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024
__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
int j = threadIdx.x + blockDim.x * blockIdx.x;
double ratio;
int i, k;
double det_local;
// Initialize local determinant to 1 for all threads
if (threadIdx.x == 0) {
det_local = 1.0;
}
__syncthreads();
if (j < size) {
for (i = 0; i < size - 1; i++) {
// Synchronize threads to ensure previous row operations are complete
__syncthreads();
// Perform row operations on matrix elements below the diagonal
if (j > i) {
if (device_matrix[i * size + i]) {
ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
} else {
ratio = 1;
}
for (k = i; k < size; k++) {
device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
}
} else {
break;
}
}
// Multiply local determinant by diagonal element in the last row processed by this thread
det_local *= device_matrix[(j)*size + j];
}
// Thread 0 writes the final determinant to global memory
if (threadIdx.x == 0) {
*det = det_local;
}
}
__host__ double* build_matrix(uint32_t size) {
uint32_t i, j;
double* matrix = (double*)malloc(size * size * sizeof(double));
if (matrix == NULL) {
printf("Memory allocation error in build_matrix");
exit(2);
}
printf("OG matrix:\n");
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
matrix[i * size + j] = rand() % 10;
printf("%.3f ", matrix[i * size + j]);
}
printf("\n");
}
return matrix;
}
__host__ int main() {
uint32_t size, i;
printf("Enter the size of the matrix: ");
scanf("%u", &size);
if (size > MAX_SIZE) {
printf("The matrix is too big");
return 1;
}
srand((unsigned)time(NULL));
double* matrix = build_matrix(size);
double* device_matrix;
double host_det = 1.0;
double* device_det;
// Allocate memory on device
cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
cudaMalloc((void**)&device_det, sizeof(double));
// Copy matrix and determinant from host to device
cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);
// Recording time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//float start2 = clock();
calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
cudaThreadSynchronize();
// float end = clock();
// Recording time
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy determinant back from device to host
cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(device_matrix);
cudaFree(device_det);
free(matrix);
printf("Determinant is : %.3f\n", host_det);
printf("Time elapsed: %.2f\n", elapsedTime);
// printf("Time of execution = %.2f\n", end - start2);
return 0;
}
我尝试了不同的方法,包括改变我形成矩阵的方式,但仍然没有结果。也许有一些经验丰富的CUDA用户可以分享他们的经验?
1条答案
按热度按时间yizd12fk1#
我不清楚你的代码中使用的是什么算法,所以我在google上搜索了一下,找到了这个,然后实现了那个。
示例:
compute-sanitizer
运行你的代码,看看你是否有任何系统/硬件/cuda-setup问题。