CUDA–cublas–inverse of matrix (0)

There are many ways to use CUDA to solve the inverse of a matrix. You can also write your own kernel function to implement it. I checked the CSDN
Cublas’s method for solving matrix inverses, but the author’s writing is rather cumbersome, and other people who watch and learn will find it difficult to understand. So I
Decided to write one myself. I use the LU decomposition method, and cublas provides the corresponding function. code show as below:
#include <stdio.h>
#include <stdlib.h>
#include<malloc.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cublas_v2.h>

#define cudacall(call) \
    do \
    { \
        cudaError_t err = (call); \
        if(cudaSuccess != err) \
        { \
            fprintf(stderr,"CUDA Error:\\
File = %s\\
Line = %d\\
Reason = %s\\
", __FILE__, __LINE__, cudaGetErrorString(err)); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } \
    while(0)


#define cublascall(call) \
    do \
    { \
        cublasStatus_t status = (call); \
        if(CUBLAS_STATUS_SUCCESS != status) \
        { \
            fprintf(stderr,"CUBLAS Error:\\
File = %s\\
Line = %d\\
Code = %d\\
", __FILE__, __LINE__, status); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
                                                                                                                \
    } \
    while(0)

void invert(float** src, float** dst, int n, const int batchSize)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2( & amp;handle));

    int *P, *INFO;

    cudacall(cudaMalloc( & amp;P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc( & amp;INFO, batchSize * sizeof(int)));

    int lda = n;

    float **A = (float **)malloc(batchSize * sizeof(float *));
    float **A_d, *A_dflat;
    cudacall(cudaMalloc( & amp;A_d, batchSize * sizeof(float *)));
    cudacall(cudaMalloc( & amp;A_dflat, n*n*batchSize * sizeof(float)));
    A[0] = A_dflat;

    for (int i = 1; i < batchSize; i ++ )
        A[i] = A[i - 1] + (n*n);
    cudacall(cudaMemcpy(A_d, A, batchSize * sizeof(float *), cudaMemcpyHostToDevice));
    for (int i = 0; i < batchSize; i ++ )
        cudacall(cudaMemcpy(A_dflat + (i*n*n), src[i], n*n * sizeof(float), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int *INFOh=new int[batchSize];
    //int INFOh[batchSize];
    cudacall(cudaMemcpy(INFOh, INFO, batchSize * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < batchSize; i ++ )
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\\
", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    float **C = (float **)malloc(batchSize * sizeof(float *));
    float **C_d, *C_dflat;
    cudacall(cudaMalloc( & amp;C_d, batchSize * sizeof(float *)));
    cudacall(cudaMalloc( & amp;C_dflat, n*n*batchSize * sizeof(float)));
    C[0] = C_dflat;
    for (int i = 1; i < batchSize; i ++ )
        C[i] = C[i - 1] + (n*n);
    cudacall(cudaMemcpy(C_d, C, batchSize * sizeof(float *), cudaMemcpyHostToDevice));
    cublascall(cublasSgetriBatched(handle, n, (const float **)A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, batchSize * sizeof(int), cudaMemcpyDeviceToHost));


    for (int i = 0; i < batchSize; i ++ )
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\\
", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    for (int i = 0; i < batchSize; i ++ )
        cudacall(cudaMemcpy(dst[i], C_dflat + (i*n*n), n*n * sizeof(float), cudaMemcpyDeviceToHost));
    cudaFree(A_d); cudaFree(A_dflat); free(A);
    cudaFree(C_d); cudaFree(C_dflat); free(C);
    cudaFree(P); cudaFree(INFO); cublasDestroy_v2(handle); delete[]INFOh;
    
}

void test_invert()
{
    const int n = 3;
    const int mybatch = 4;

    //Random matrix with full pivots
    float full_pivot[n*n] = { 0.5, 3, 4,
                                1, 3, 10,
                                4, 9, 16};
    //Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16};

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2};

    float another_full_pivot[n * n] = { 22, 3, 4,
                                        1, 5, 6,
                                        9, 8, 2};
    float *result_flat = (float *)malloc(mybatch*n*n * sizeof(float));
    float **results = (float **)malloc(mybatch * sizeof(float *));
    for (int i = 0; i < mybatch; i + + )
        results[i] = result_flat + (i*n*n);
    float **inputs = (float **)malloc(mybatch * sizeof(float *));
    inputs[0] = zero_pivot;
    inputs[1] = full_pivot;
    inputs[2] = another_zero_pivot;
    inputs[3] = another_full_pivot;

    for (int qq = 0; qq < mybatch; qq + + ) {
        fprintf(stdout, "Input %d:\\
\\
", qq);
        for (int i = 0; i < n; i + + )
        {
            for (int j = 0; j < n; j + + )
                fprintf(stdout, "%f\t", inputs[qq][i*n + j]);
            fprintf(stdout, "\\
");
        }
    }
    fprintf(stdout, "\\
\\
");
    invert(inputs, results, n, mybatch);

    for (int qq = 0; qq < mybatch; qq + + ) {
        fprintf(stdout, "Inverse %d:\\
\\
", qq);
        for (int i = 0; i < n; i + + )
        {
            for (int j = 0; j < n; j + + )
                fprintf(stdout, "%f\t", results[qq][i*n + j]);
            fprintf(stdout, "\\
");
        }
    }
}


int main(void) {

    test_invert();
    return 0;
}
inverse matrix
operation result: