ubuntu CUDA矩阵行列式

r3i60tvu  于 2023-04-05  发布在  其他
关注(0)|答案(1)|浏览(224)

最近我开始学习CUDA技术,得益于并行技术,它让我的计算速度提高了好几倍。我写的程序要用高斯方法计算方阵的行列式。不幸的是,由于某种原因,它总是告诉我矩阵的行列式等于一,尽管不是。另外,时间也显示不正确。最有可能的是,calculate_determinate函数中有错误,但我无法找到它。
我在Ubuntu 22.02中工作,这是我编译程序的方式:nvcc test2.cu -o test2
The output of the ./test2

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>

#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024

__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
     int j = threadIdx.x + blockDim.x * blockIdx.x;
    double ratio;
    int i, k;
    double det_local;

    // Initialize local determinant to 1 for all threads
    if (threadIdx.x == 0) {
        det_local = 1.0;
    }
    __syncthreads();

    if (j < size) {
        for (i = 0; i < size - 1; i++) {
            // Synchronize threads to ensure previous row operations are complete
            __syncthreads();

            // Perform row operations on matrix elements below the diagonal
            if (j > i) {
                if (device_matrix[i * size + i]) {
                    ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
                } else {
                    ratio = 1;
                }
                for (k = i; k < size; k++) {
                    device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
                }
            } else {
                break;
            }
        }
        // Multiply local determinant by diagonal element in the last row processed by this thread
        det_local *= device_matrix[(j)*size + j];
    }

    // Thread 0 writes the final determinant to global memory
    if (threadIdx.x == 0) {
        *det = det_local;
    }
}

__host__ double* build_matrix(uint32_t size) {
    uint32_t i, j;
    double* matrix = (double*)malloc(size * size * sizeof(double));
    if (matrix == NULL) {
        printf("Memory allocation error in build_matrix");
        exit(2);
    }

    printf("OG matrix:\n");
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            matrix[i * size + j] = rand() % 10;
            printf("%.3f ", matrix[i * size + j]);
        }
        printf("\n");
    }

    return matrix;
}

__host__ int main() {

    uint32_t size, i;
    printf("Enter the size of the matrix: ");
    scanf("%u", &size);
    if (size > MAX_SIZE) {
        printf("The matrix is too big");
        return 1;
    }

    srand((unsigned)time(NULL));

    double* matrix = build_matrix(size);
    double* device_matrix;
    double host_det = 1.0;
    double* device_det;

    // Allocate memory on device
    cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
    cudaMalloc((void**)&device_det, sizeof(double));

    // Copy matrix and determinant from host to device
    cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);

    //  Recording time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //float start2 = clock();

   
    calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
    cudaThreadSynchronize();

   // float end = clock();

    // Recording time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);

    // Copy determinant back from device to host
    cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);

    // Free memory on device
    cudaFree(device_matrix);
    cudaFree(device_det);
    free(matrix);

    printf("Determinant is : %.3f\n", host_det);
    printf("Time elapsed:  %.2f\n", elapsedTime);
   // printf("Time of execution =  %.2f\n", end - start2);

    return 0;
}

我尝试了不同的方法,包括改变我形成矩阵的方式,但仍然没有结果。也许有一些经验丰富的CUDA用户可以分享他们的经验?

yizd12fk

yizd12fk1#

我不清楚你的代码中使用的是什么算法,所以我在google上搜索了一下,找到了这个,然后实现了那个。

  • 高斯方法有各种各样的问题,比如零值的处理,我不打算涵盖所有的问题,我想有某些输入模式会导致这个方法崩溃,这需要大量的特殊情况来解决;我没有这样做。我假设这是一个学习练习。* 你不应该使用我下面的代码进行严肃的数值工作。*
  • 除了矩阵大小为2和3之外,我没有测试过这段代码。
  • 您选择的同步方法仅适用于1个数据块,因此仅限于最大为1024的矩阵大小。
  • 您可以使用grid-stride loop将其扩展到高于1024,尽管仍然只使用单个块
  • 您可以使用协作组网格同步将其更广泛地扩展到多块。

示例:

$ cat t2234.cu
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define THREAD_SIZE 1024 // must be power-of-2, between 2 and 1024
#define MAX_SIZE THREAD_SIZE

__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
    __shared__ double smem[THREAD_SIZE];
    int col = threadIdx.x; // requires changes to work with more than 1 block
    double ratio;
    int row, tcol;

    for (tcol = 0; tcol < size-1; tcol++)
      for (row = tcol+1; row < size; row++) {
        // Perform row operations on matrix elements to zero out elements below main diagonal
        ratio = device_matrix[row * size + tcol] / device_matrix[tcol * size + tcol];
        __syncthreads();
        device_matrix[row * size + col] -= ratio * device_matrix[tcol * size + col];
        // Synchronize threads to ensure previous row operations are complete
        __syncthreads();
        }
        // compute determinant - parallel sweep product reduction
    smem[threadIdx.x] = device_matrix[threadIdx.x*size+threadIdx.x];
    for (int st = THREAD_SIZE/2; st > 0; st>>=1){
      __syncthreads();
      if ((col<st)&&((col+st)<blockDim.x)) smem[col] *= smem[col+st];}
    if (!threadIdx.x) *det = smem[0];
}

__host__ double* build_matrix(uint32_t size) {
    uint32_t i, j;
    double* matrix = (double*)malloc(size * size * sizeof(double));
    if (matrix == NULL) {
        printf("Memory allocation error in build_matrix");
        exit(2);
    }

    printf("OG matrix:\n");
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            matrix[i * size + j] = rand() % 10;
            if ((i ==0) && (matrix[i*size+j] == 0)) matrix[i*size+j] = 1;
            printf("%.3f ", matrix[i * size + j]);
        }
        printf("\n");
    }

    return matrix;
}

__host__ int main() {

    uint32_t size;
    printf("Enter the size of the matrix: ");
    scanf("%u", &size);
    if (size > MAX_SIZE) {
        printf("The matrix is too big");
        return 1;
    }

    srand((unsigned)time(NULL));

    double* matrix = build_matrix(size);
    double* device_matrix;
    double host_det = 1.0;
    double* device_det;

    // Allocate memory on device
    cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
    cudaMalloc((void**)&device_det, sizeof(double));

    // Copy matrix and determinant from host to device
    cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);

    //  Recording time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //float start2 = clock();


    calculate_determinant <<<1, size >>> (device_matrix, size, device_det);
    cudaDeviceSynchronize();

   // float end = clock();

    // Recording time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);

    // Copy determinant back from device to host
    cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);

    // Free memory on device
    cudaFree(device_matrix);
    cudaFree(device_det);
    free(matrix);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
    printf("Determinant is : %.3f\n", host_det);
    printf("Time elapsed:  %.2f\n", elapsedTime);
   // printf("Time of execution =  %.2f\n", end - start2);

    return 0;
}
$ nvcc -o t2234 t2234.cu
$ ./t2234
Enter the size of the matrix: 3
OG matrix:
5.000 1.000 8.000
1.000 3.000 2.000
9.000 3.000 2.000
Determinant is : -176.000
Time elapsed:  0.05
$
  • 上面报告的结果似乎与在线计算器(如this one)中报告的结果相匹配。
  • 如果你不能得到类似的结果,用compute-sanitizer运行你的代码,看看你是否有任何系统/硬件/cuda-setup问题。
  • 对于开发工作,当您正在学习和尝试调试某些东西时,最好不要让您的代码每次运行时都生成一个新的随机矩阵,而是从一个固定的矩阵开始,直到您可以正常工作。

相关问题