DLUT Operating System Supplementary Experiment Continuation – Example of Matrix Multiplication

In the previous experiment blog I wrote that my computer cannot run the local DPC++ compiler, so I could only write code on DevCloud, and DevCloud does not have a GPU device. Since the matrix multiplication example uses both the CPU and the GPU, I had to borrow my roommate's computer to complete this experiment.

1. Test the CPU and GPU execution time of matrix multiplication

The code is attached below:

#include <chrono>
#include <cmath>    // fabs
#include <cstdio>   // printf
#include <cstdlib>  // rand, RAND_MAX
#include <iostream>
#include <CL/sycl.hpp>
#define random_float() (rand() / double(RAND_MAX))
using namespace std;
using namespace sycl;
// return execution time
double gpu_kernel(float *A, float *B, float *C, int M, int N, int K, int block_size, sycl::queue &q) {
  // define the workgroup size and mapping
  auto grid_rows = (M + block_size - 1) / block_size * block_size;
  auto grid_cols = (N + block_size - 1) / block_size * block_size;
  auto local_ndrange = range<2>(block_size, block_size);
  auto global_ndrange = range<2>(grid_rows, grid_cols);
  double duration = 0.0f;
  auto e = q.submit([&](sycl::handler &h) {
      h.parallel_for<class k_name_t>(
          sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
              int row = index.get_global_id(0);
              int col = index.get_global_id(1);
              float sum = 0.0f;
              for (int i = 0; i < K; i++) {
                sum += A[row * K + i] * B[i * N + col];
              }
              C[row * N + col] = sum;
          });
    });
    e.wait();
    duration += (e.get_profiling_info<info::event_profiling::command_end>() -
                 e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0f / 1000.0f;
    return(duration);
}
// return execution time
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
    double duration = 0.0;
    std::chrono::high_resolution_clock::time_point s, e;
    // Single-threaded computation on the CPU
    s = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < M; i++) {
        for(int j = 0; j < N; j++) {
            float sum = 0.0f;
            for(int k = 0; k < K; k++) {
                sum += cA[i * K + k] * cB[k * N + j];
            }
            cC[i * N + j] = sum;
        }
    }
    e = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration<float, std::milli>(e - s).count();
    return(duration);
}
int verify(float *cpu_res, float *gpu_res, int length){
    int err = 0;
    for(int i = 0; i < length; i++) {
       if( fabs(cpu_res[i] - gpu_res[i]) > 1e-3) {
          err++;
          printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
       }
    }
    return(err);
}
int gemm(const int M,
         const int N,
         const int K,
         const int block_size,
         const int iterations,
         sycl::queue &q) {
  cout << "Problem size: c(" << M << "," << N << ") ="
       << " a(" << M << "," << K << ") *"
       << " b(" << K << "," << N << ")\\
";

  auto A = malloc_shared<float>(M * K, q);
  auto B = malloc_shared<float>(K * N, q);
  auto C = malloc_shared<float>(M * N, q);
  auto C_host = malloc_host<float>(M * N, q);
  // init the A/B/C
  for(int i=0; i < M * K; i++) {
      A[i] = random_float();
  }
  for(int i=0; i < K * N; i++) {
      B[i] = random_float();
  }
  for(int i=0; i < M * N; i++) {
      C[i] = 0.0f;
      C_host[i] = 0.0f;
  }
  double flopsPerMatrixMul
      = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
  double duration_gpu = 0.0f;
  double duration_cpu = 0.0f;
  // GPU computation and timer
  int warmup = 10;
  for (int run = 0; run < iterations + warmup; run++) {
    double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
    if(run >= warmup) duration_gpu += duration;
  }
  duration_gpu = duration_gpu / iterations;
  // CPU computation and timer
  warmup = 2;
  for(int run = 0; run < iterations/2 + warmup; run++) {
      double duration = cpu_kernel(A, B, C_host, M, N, K);
      if(run >= warmup) duration_cpu += duration;
  }
  duration_cpu = duration_cpu / (iterations / 2);  // average over the measured CPU runs
  // Compare the results of CPU and GPU
  int errCode = 0;
  errCode = verify(C_host, C, M*N);
  if(errCode > 0) printf("\nThere are %d errors\n", errCode);
  printf("\nPerformance Flops = %lf, \n"
         "GPU Computation Time = %lf (ms); \n"
         "CPU Computation Time = %lf (ms); \n",
         flopsPerMatrixMul, duration_gpu, duration_cpu);

  free(A, q);
  free(B, q);
  free(C, q);
  free(C_host, q);
  return(errCode);
}

int main() {
  auto propList = sycl::property_list{sycl::property::queue::enable_profiling()};
  queue my_gpu_queue(sycl::gpu_selector_v, propList);
  int errCode = gemm(2000, 2000, 2000, 4, 10, my_gpu_queue);
  return(errCode);
}

Set M=N=K=2000.
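One detail worth pointing out before the results: grid_rows and grid_cols are rounded up to the next multiple of block_size, so if M or N were not a multiple of block_size, the padded ND-range would launch extra work-items that write past the end of C. That cannot happen in this run because 2000 is divisible by 4, but for general sizes the kernel body would need a bounds check, roughly like the following sketch (my addition, not part of the code above):

// Sketch of a bounds-checked kernel body for the padded ND-range;
// only needed when M or N is not a multiple of block_size.
int row = index.get_global_id(0);
int col = index.get_global_id(1);
if (row < M && col < N) {          // skip the padding work-items
    float sum = 0.0f;
    for (int i = 0; i < K; i++)
        sum += A[row * K + i] * B[i * N + col];
    C[row * N + col] = sum;
}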

Local running results:

It can be seen that the GPU execution time is about 1409 ms and the CPU execution time is about 6540 ms; the GPU is roughly 4.6x faster than the single-threaded CPU loop.
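As a rough sanity check, these times can be converted into throughput. A minimal sketch of the arithmetic, using the 1409 ms / 6540 ms measurements above and the same 2*M*N*K FLOP count that gemm() prints:

#include <cstdio>

int main() {
    // 2 * M * N * K floating-point operations for M = N = K = 2000
    const double flops = 2.0 * 2000.0 * 2000.0 * 2000.0;    // 1.6e10
    const double gpu_ms = 1409.0, cpu_ms = 6540.0;          // measured times above
    std::printf("GPU: %.1f GFLOPS\n", flops / (gpu_ms * 1e-3) / 1e9);  // about 11.4
    std::printf("CPU: %.1f GFLOPS\n", flops / (cpu_ms * 1e-3) / 1e9);  // about 2.4
    return 0;
}

So this kernel reaches roughly 11 GFLOPS on the GPU versus about 2.4 GFLOPS for the single-threaded CPU loop.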

2. Change the number of rows and columns of the work-item grid and test the CPU and GPU execution time

The code is attached below:

#include <chrono>
#include <cmath>    // fabs
#include <cstdio>   // printf
#include <cstdlib>  // rand, RAND_MAX
#include <iostream>
#include <CL/sycl.hpp>
#define random_float() (rand() / double(RAND_MAX))
using namespace std;
using namespace sycl;
#define tileY 2
#define tileX 2
// return execution time
double gpu_kernel(float *A, float *B, float *C,
                  int M, int N, int K,
                  int BLOCK, sycl::queue &q) {
  // define the workgroup size and mapping
  auto grid_rows = M / tileY;
  auto grid_cols = N / tileX;
  auto local_ndrange = range<2>(BLOCK, BLOCK);
  auto global_ndrange = range<2>(grid_rows, grid_cols);
  double duration = 0.0f;
  auto e = q.submit([&](sycl::handler &h) {
      h.parallel_for<class k_name_t>(
          sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
              int row = tileY * index.get_global_id(0);
              int col = tileX * index.get_global_id(1);
              float sum[tileY][tileX] = {0.0f};
              float subA[tileY] = {0.0f};
              float subB[tileX] = {0.0f};
              // core computation over the inner dimension K
              for (int k = 0; k < K; k++) {
                // read data into registers
                for(int m = 0; m < tileY; m++) {
                    subA[m] = A[(row + m) * K + k];
                }
                for(int p = 0; p < tileX; p++) {
                    subB[p] = B[k * N + p + col];
                }
                for (int m = 0; m < tileY; m++) {
                  for (int p = 0; p < tileX; p++) {
                    sum[m][p] += subA[m] * subB[p];
                  }
                }
              } // end of K
              // write results back
              for (int m = 0; m < tileY; m++) {
                for (int p = 0; p < tileX; p++) {
                  C[(row + m) * N + col + p] = sum[m][p];
                }
              }
          });
    });
    e.wait();
    duration += (e.get_profiling_info<info::event_profiling::command_end>() -
                 e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0f / 1000.0f;
    return(duration);
}
// return execution time
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
    double duration = 0.0;
    std::chrono::high_resolution_clock::time_point s, e;
    // Single-threaded computation on the CPU
    s = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < M; i++) {
        for(int j = 0; j < N; j++) {
            float sum = 0.0f;
            for(int k = 0; k < K; k++) {
                sum += cA[i * K + k] * cB[k * N + j];
            }
            cC[i * N + j] = sum;
        }
    }
    e = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration<float, std::milli>(e - s).count();

    return(duration);
}
int verify(float *cpu_res, float *gpu_res, int length){
    int err = 0;
    for(int i = 0; i < length; i++) {
       if( fabs(cpu_res[i] - gpu_res[i]) > 1e-3) {
          err++;
          printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
       }
    }
    return(err);
}
int gemm(const int M,
         const int N,
         const int K,
         const int block_size,
         const int iterations,
         sycl::queue &q) {
  cout << "Problem size: c(" << M << "," << N << ") ="
       << " a(" << M << "," << K << ") *"
       << " b(" << K << "," << N << ")\\
";
  auto A = malloc_shared<float>(M * K, q);
  auto B = malloc_shared<float>(K * N, q);
  auto C = malloc_shared<float>(M * N, q);
  auto C_host = malloc_host<float>(M * N, q);
  // init the A/B/C
  for(int i=0; i < M * K; i++) {
      A[i] = random_float();
  }
  for(int i=0; i < K * N; i++) {
      B[i] = random_float();
  }
  for(int i=0; i < M * N; i++) {
      C[i] = 0.0f;
      C_host[i] = 0.0f;
  }
  double flopsPerMatrixMul
      = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
  double duration_gpu = 0.0f;
  double duration_cpu = 0.0f;
  // GPU computation and timer
  int warmup = 10;
  for (int run = 0; run < iterations + warmup; run++) {
    double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
    if(run >= warmup) duration_gpu += duration;
  }
  duration_gpu = duration_gpu / iterations;
  // CPU computation and timer
  warmup = 2;
  for(int run = 0; run < iterations/2 + warmup; run++) {
      double duration = cpu_kernel(A, B, C_host, M, N, K);
      if(run >= warmup) duration_cpu += duration;
  }
  duration_cpu = duration_cpu / (iterations / 2);  // average over the measured CPU runs
  // Compare the results of CPU and GPU
  int errCode = 0;
  errCode = verify(C_host, C, M*N);
  if(errCode > 0) printf("\nThere are %d errors\n", errCode);
  printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
  printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d", block_size, block_size, tileX, tileY);
  printf("\nPerformance Flops = %lf, \n"
         "GPU Computation Time = %lf (ms); \n"
         "CPU Computation Time = %lf (ms); \n",
         flopsPerMatrixMul, duration_gpu, duration_cpu);

  free(A, q);
  free(B, q);
  free(C, q);
  free(C_host, q);
  return(errCode);
}
int main() {
  auto propList = sycl::property_list{sycl::property::queue::enable_profiling()};
  queue my_gpu_queue(sycl::gpu_selector_v, propList);
  int errCode = gemm(512, 512, 512, /* GEMM size, M, N, K */
                     4, /* workgroup size */
                     10, /* repeat time */
                     my_gpu_queue);
  return(errCode);
}

Changing tileX and tileY changes how many output elements each work-item computes, which in turn changes the number of rows and columns of the work-item grid (grid_rows = M / tileY, grid_cols = N / tileX).
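For reference, a minimal sketch of the launch geometry for the configurations tested below, assuming the values from the code above (M = N = 512, BLOCK = 4); each work-item computes a tileY x tileX block of C:

// Illustration only: how tileX/tileY shrink the work-item grid (M = N = 512).
//   tileX = 2, tileY = 2  ->  global range 256 x 256
//   tileX = 2, tileY = 4  ->  global range 128 x 256
//   tileX = 2, tileY = 8  ->  global range  64 x 256
//   tileX = 4, tileY = 2  ->  global range 256 x 128
//   tileX = 8, tileY = 2  ->  global range 256 x  64
auto grid_rows = M / tileY;                        // rows of work-items
auto grid_cols = N / tileX;                        // columns of work-items
auto global_ndrange = sycl::range<2>(grid_rows, grid_cols);
auto local_ndrange  = sycl::range<2>(BLOCK, BLOCK); // must evenly divide the global range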

Run results:

tileX=2,tileY=2

tileX=2,tileY=4

tileX=2,tileY=8

tileX=4,tileY=2

tileX=8,tileY=2

Analysis:

1. Across the different tileX and tileY settings, the CPU execution time is essentially unchanged, which is expected: the tile sizes only affect the GPU kernel, not the single-threaded CPU loop.

2. With tileX fixed, increasing tileY changes the GPU execution time only slightly, so tileY has little impact on GPU execution time; with tileY fixed, increasing tileX changes the GPU execution time much more, so tileX has a much larger impact (one possible reason is that a larger tileX changes how neighbouring work-items access consecutive elements of B and C in memory).

3. When tileX or tileY reaches 8 or more, the GPU and CPU execution times very often increase sharply and verification errors appear. Two sets of error data are listed below: