
Benchmark Matrix Multiply on GPU Environment

The aim of this benchmark is to learn how to analyse C modules that use the CUDA, OpenMP 5, and OpenACC APIs. It is a practical example of how these APIs can be used and of the real speed gains they deliver. The results are impressive for the effort involved, showing the performance available in a supercomputing environment.


Command Line Arguments

Example:

~$ bash START.sh <supercomputer> [--comparison <file>] [--help]

(required) Specifies the name of the supercomputer (one word) on which the benchmark will be executed.

 <supercomputer> - ogbon | airis

 <file> - mm_blas | mm_cublas

~$ bash START.sh ogbon --comparison mm_blas


How to Execute

~$ bash START.sh ogbon --comparison mm_blas


Hierarchy

            |--------------------------|             |------------|
            | TIME | SPEEDUP | MEMORY  |--has-a----->|   RESULTS  |
            |--------------------------|             |------------|
                       ^                                ^  ^
                      /                                 |  |
                   has-a                                |  |
                    /                                   |  |
                   /           |--------------|         |  |
                  /            |    PLOTS     |-is-a----|  |
                 /             |--------------|            |
                /                                        is-a 
               /                                           |
|----------------|                                  |------------|
|     OBJECT     |--has-a-------------------------->| PROFILING  |
|----------------|                                  |------------|

Codes


Sequential

void mm(double *A, double *B, double *C, int n){

  for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
      for(int k = 0; k < n; k++)
        C[i*n+j] += A[i*n+k] * B[k*n+j];

}
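
For reference, a minimal driver of the kind that could time this kernel and report a throughput figure is sketched below; the problem size, the fill values, and the use of omp_get_wtime() as a wall-clock timer (compile with -fopenmp) are assumptions for illustration, not necessarily what START.sh does.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>                         /* omp_get_wtime() as a portable timer */

void mm(double *A, double *B, double *C, int n);    /* sequential kernel above */

int main(void){

  int n = 1024;                                       /* hypothetical problem size */
  double *A = malloc((size_t)n * n * sizeof(double));
  double *B = malloc((size_t)n * n * sizeof(double));
  double *C = calloc((size_t)n * n, sizeof(double)); /* C must start at zero */

  for(int i = 0; i < n * n; i++){ A[i] = 1.0; B[i] = 2.0; }

  double t0 = omp_get_wtime();
  mm(A, B, C, n);
  double t1 = omp_get_wtime();

  /* a dense matrix multiply performs 2*n^3 floating-point operations */
  printf("n=%d  time=%.3f s  %.2f GFLOP/s\n", n, t1 - t0,
         2.0 * n * n * n / (t1 - t0) / 1e9);

  free(A); free(B); free(C);
  return 0;
}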

BLAS

void mm_blas(double *A, double *B, double *C, int size){

  char transa = 'N';   /* do not transpose A */
  char transb = 'N';   /* do not transpose B */
  double alpha = 1.;
  double beta  = 0.;
  int m = size;
  int n = size;
  int k = size;
  int lda = size;
  int ldb = size;
  int ldc = size;

  /* C = alpha*A*B + beta*C through the Fortran BLAS interface */
  dgemm_(&transa, &transb, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);

}
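
Since dgemm_ is the Fortran BLAS symbol, compiling this snippet assumes a prototype along the lines of the sketch below plus linking against a BLAS implementation (e.g. OpenBLAS or MKL). Because Fortran BLAS is column-major while the arrays here are row-major, the call effectively computes B*A rather than A*B; the arithmetic cost is identical, so the timing comparison is unaffected.

/* Assumed prototype for the Fortran BLAS routine; alternatively include a
   vendor header (e.g. <cblas.h>) and use the C interface. */
extern void dgemm_(const char *transa, const char *transb,
                   const int *m, const int *n, const int *k,
                   const double *alpha, const double *A, const int *lda,
                   const double *B, const int *ldb,
                   const double *beta, double *C, const int *ldc);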

CUBLAS

void mm_cublas(double *A_host, double *B_host, double *C_host, int size){

double alpha = 1.;
double beta =  0.;
int m = size;
int n = size; 
int k = size;
int lda = size;
int ldb = size;
int ldc = size;
            
double *A_device;
double *B_device;
double *C_device;
  
cudaMalloc((void**)&A_device, size * size * sizeof(double) ); 
cudaMalloc((void**)&B_device, size * size * sizeof(double) ); 
cudaMalloc((void**)&C_device, size * size * sizeof(double) ); 

cublasHandle_t handle;
cublasCreate(&handle);

cublasSetMatrix(size, size, sizeof(double), A_host, size, A_device, size);
cublasSetMatrix(size, size, sizeof(double), B_host, size, B_device, size);
cublasSetMatrix(size, size, sizeof(double), C_host, size, C_device, size);
  
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A_device, lda, B_device, ldb, &beta, C_device, ldc);
 
cublasGetMatrix(size, size, sizeof(double), C_device, size, C_host, size);

cudaFree(A_device);
cudaFree(B_device);
cudaFree(C_device);
  
cublasDestroy(handle);
   
}
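
The calls above discard the status codes returned by the CUDA runtime and cuBLAS; a minimal error-checking pattern such as the sketch below (the macro names are illustrative, not part of the project) makes failures on the GPU nodes visible immediately.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Hypothetical helper macros: abort with a message if a call fails. */
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if(err_ != cudaSuccess){                                          \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while(0)

#define CUBLAS_CHECK(call)                                            \
  do {                                                                \
    cublasStatus_t st_ = (call);                                      \
    if(st_ != CUBLAS_STATUS_SUCCESS){                                 \
      fprintf(stderr, "cuBLAS error %d at %s:%d\n",                   \
              (int)st_, __FILE__, __LINE__);                          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while(0)

/* Usage: CUDA_CHECK(cudaMalloc(...));  CUBLAS_CHECK(cublasDgemm(...)); */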

OpenMP 5

void mm_omp5(double *A, double *B, double *C, int n){

  int i, j, k;

  /* C is mapped tofrom so its zero-initialised host values are copied to the
     device before the += accumulation (map(from:) alone would leave the
     device copy uninitialised). */
  #pragma omp target data map(to: A[:n*n], B[:n*n], n) map(tofrom: C[:n*n])
  #pragma omp target teams distribute parallel for private(i, j, k)
  for(i = 0; i < n; i++)
    for(j = 0; j < n; j++)
      for(k = 0; k < n; k++)
        C[i*n+j] += A[i*n+k] * B[k*n+j];

}
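
A common refinement, shown below purely as a sketch rather than as what the benchmark ships, is to collapse the two outer loops so that n*n iterations are distributed across the teams and to accumulate into a local variable, which lets C be mapped from the device only.

void mm_omp5_collapse(double *A, double *B, double *C, int n){

  /* Sketch under the assumptions above: collapse(2) spreads n*n (i,j) pairs
     over the teams/threads; the local sum keeps the dot product in a register
     and C is only written, so map(from:) is sufficient. */
  #pragma omp target data map(to: A[:n*n], B[:n*n]) map(from: C[:n*n])
  #pragma omp target teams distribute parallel for collapse(2)
  for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++){
      double sum = 0.0;
      for(int k = 0; k < n; k++)
        sum += A[i*n+k] * B[k*n+j];
      C[i*n+j] = sum;
    }

}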

OpenACC

void mm_openacc(double *A, double *B, double *C, int n){

  int i, j, k;

  /* C is transferred with copy so its zero-initialised host values reach the
     device before the += accumulation (copyout alone would leave the device
     copy uninitialised). */
  #pragma acc data present_or_copyin(A[:n*n], B[:n*n], n) copy(C[:n*n])
  #pragma acc parallel
  #pragma acc loop
  for(i = 0; i < n; i++)
    for(j = 0; j < n; j++)
      for(k = 0; k < n; k++)
        C[i*n+j] += A[i*n+k] * B[k*n+j];

}
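
An analogous OpenACC refinement collapses the outer loops into a single parallel iteration space; the collapse clause and the explicit seq on the inner loop are assumptions about a reasonable schedule, not part of the original code.

void mm_openacc_collapse(double *A, double *B, double *C, int n){

  /* Sketch under the assumptions above: n*n (i,j) pairs are parallelised,
     the k loop runs sequentially inside each pair, and C is only written,
     so copyout is sufficient. */
  #pragma acc data copyin(A[:n*n], B[:n*n]) copyout(C[:n*n])
  #pragma acc parallel loop collapse(2)
  for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++){
      double sum = 0.0;
      #pragma acc loop seq
      for(int k = 0; k < n; k++)
        sum += A[i*n+k] * B[k*n+j];
      C[i*n+j] = sum;
    }

}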

CUDA

__global__ void kernel(double *A, double *B, double *C, int n) {

  /* each thread computes one element of C */
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;

  if(i < n && j < n)
    for(int k = 0; k < n; k++)
      C[i*n+j] += A[i*n+k] * B[k*n+j];

}
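
The kernel above is only the device side; a host wrapper along the lines of the sketch below (the function name and the 16x16 block shape are assumptions) is needed to allocate device buffers, copy the operands, launch the kernel over an n-by-n thread grid, and copy the result back.

#include <cuda_runtime.h>

__global__ void kernel(double *A, double *B, double *C, int n);   /* the kernel above */

void mm_cuda(double *A_host, double *B_host, double *C_host, int n){

  size_t bytes = (size_t)n * n * sizeof(double);
  double *A_device, *B_device, *C_device;

  cudaMalloc((void**)&A_device, bytes);
  cudaMalloc((void**)&B_device, bytes);
  cudaMalloc((void**)&C_device, bytes);

  cudaMemcpy(A_device, A_host, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(B_device, B_host, bytes, cudaMemcpyHostToDevice);
  cudaMemset(C_device, 0, bytes);          /* kernel accumulates, so zero C first */

  dim3 block(16, 16);                                    /* assumed block shape */
  dim3 grid((n + block.x - 1) / block.x,
            (n + block.y - 1) / block.y);

  kernel<<<grid, block>>>(A_device, B_device, C_device, n);
  cudaDeviceSynchronize();                 /* wait for the kernel to finish */

  cudaMemcpy(C_host, C_device, bytes, cudaMemcpyDeviceToHost);

  cudaFree(A_device);
  cudaFree(B_device);
  cudaFree(C_device);

}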

Acknowledgements

This work has been partially supported by the NVIDIA Hardware Grant Program, and was carried out in cooperation with the researchers Silvano Júnior and Raí Bizerra.