To perform Matrix Multiplication using CUDA and check its performance with nvprof.
Hardware – PCs with an NVIDIA GPU and the CUDA NVCC compiler, or Google Colab with the NVCC compiler
- Define Constants: Define the size of the matrices (SIZE) and the size of the CUDA blocks (BLOCK_SIZE).
- Kernel Function: Define a CUDA kernel function matrixMultiply that performs the matrix multiplication.
- In the main function, perform the following steps:
- Initialize Matrices: Initialize the input matrices ‘a’ and ‘b’ with some values.
- Allocate Device Memory: Allocate memory on the GPU for the input matrices ‘a’ and ‘b’, and the output matrix ‘c’.
- Copy Matrices to Device: Copy the input matrices from host (CPU) memory to device (GPU) memory.
- Set Grid and Block Sizes: Set the grid and block sizes for the CUDA kernel launch.
- Start Timer: Start a timer to measure the execution time of the kernel.
- Launch Kernel: Launch the matrixMultiply kernel with the appropriate grid and block sizes, and the input and output matrices as arguments.
- Copy Result to Host: After the kernel execution, copy the result matrix from device memory to host memory.
- Stop Timer: Stop the timer and calculate the elapsed time.
- Print Result: Print the result matrix and the elapsed time; the GPU output can be checked against a host-side reference multiply (see the sketch after this list).
- Free Device Memory: Finally, free the device memory that was allocated for the matrices.
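For the verification step mentioned above, a plain host-side multiply is enough; the helper below is a minimal sketch (the name cpuMatrixMultiply is illustrative, and the indexing mirrors the CUDA kernel that follows):

// Host-side reference: naive triple loop, used only to validate the GPU result
void cpuMatrixMultiply(const int *a, const int *b, int *c, int size)
{
    for (int row = 0; row < size; ++row)
    {
        for (int col = 0; col < size; ++col)
        {
            int sum = 0;
            for (int k = 0; k < size; ++k)
            {
                sum += a[row * size + k] * b[k * size + col];
            }
            c[row * size + col] = sum;
        }
    }
}

Since the host matrices are contiguous row-major arrays, its output can be compared against the GPU result with a memcmp over SIZE * SIZE ints.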
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter
%%writefile matmul.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <sys/time.h>
#ifndef _COMMON_H
#define _COMMON_H
#define CHECK(call)                                                    \
{                                                                      \
    const cudaError_t error = call;                                    \
    if (error != cudaSuccess)                                          \
    {                                                                  \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);         \
        fprintf(stderr, "code: %d, reason: %s\n", error,               \
                cudaGetErrorString(error));                            \
        exit(1);                                                       \
    }                                                                  \
}
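/* CHECK wraps a CUDA runtime call and aborts with the file, line and error
   string if the call fails, e.g. CHECK(cudaMalloc((void **)&dev_a, bytes)); */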
#define CHECK_CUBLAS(call)                                             \
{                                                                      \
    cublasStatus_t err;                                                \
    if ((err = (call)) != CUBLAS_STATUS_SUCCESS)                       \
    {                                                                  \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err,         \
                __FILE__, __LINE__);                                   \
        exit(1);                                                       \
    }                                                                  \
}
#define CHECK_CURAND(call)                                             \
{                                                                      \
    curandStatus_t err;                                                \
    if ((err = (call)) != CURAND_STATUS_SUCCESS)                       \
    {                                                                  \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", err,         \
                __FILE__, __LINE__);                                   \
        exit(1);                                                       \
    }                                                                  \
}
#define CHECK_CUFFT(call)                                              \
{                                                                      \
    cufftResult err;                                                   \
    if ((err = (call)) != CUFFT_SUCCESS)                               \
    {                                                                  \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err,          \
                __FILE__, __LINE__);                                   \
        exit(1);                                                       \
    }                                                                  \
}
#define CHECK_CUSPARSE(call)                                           \
{                                                                      \
    cusparseStatus_t err;                                              \
    if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)                     \
    {                                                                  \
        fprintf(stderr, "Got error %d at %s:%d\n", err,                \
                __FILE__, __LINE__);                                   \
        cudaError_t cuda_err = cudaGetLastError();                     \
        if (cuda_err != cudaSuccess)                                   \
        {                                                              \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",     \
                    cudaGetErrorString(cuda_err));                     \
        }                                                              \
        exit(1);                                                       \
    }                                                                  \
}
// Wall-clock time in seconds, used for host-side timing
inline double seconds()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
#define SIZE 4
#define BLOCK_SIZE 2
// Kernel function to perform matrix multiplication: each thread computes
// one element of the output matrix c = a * b
__global__ void matrixMultiply(int *a, int *b, int *c, int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < size && col < size)  // guard against out-of-range threads
    {
        int sum = 0;
        for (int k = 0; k < size; ++k)
        {
            sum += a[row * size + k] * b[k * size + col];
        }
        c[row * size + col] = sum;
    }
}
int main()
{
    int a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE];
    int *dev_a, *dev_b, *dev_c;
    int bytes = SIZE * SIZE * sizeof(int);  // size of each matrix in bytes

    // Initialize matrices 'a' and 'b'
    for (int i = 0; i < SIZE; ++i)
    {
        for (int j = 0; j < SIZE; ++j)
        {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }
    }
    // Allocate memory on the device
    CHECK(cudaMalloc((void **)&dev_a, bytes));
    CHECK(cudaMalloc((void **)&dev_b, bytes));
    CHECK(cudaMalloc((void **)&dev_c, bytes));

    // Copy input matrices from host to device memory
    CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));
    // Set grid and block sizes (rounding up so SIZE need not be a
    // multiple of BLOCK_SIZE)
    dim3 dimGrid((SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
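    // With SIZE = 4 and BLOCK_SIZE = 2 this launches a 2 x 2 grid of 2 x 2
    // blocks: 16 threads in total, one per element of the 4 x 4 output matrix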
    // Start timer (seconds() is the gettimeofday wrapper defined above)
    double start = seconds();
    // Launch kernel
    matrixMultiply<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, SIZE);
    CHECK(cudaGetLastError());  // catch launch/configuration errors

    // Copy result matrix from device to host memory; this call blocks until
    // the kernel has finished, so the timed section covers kernel + copy
    CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));
    // Stop timer
    double elapsed_time = seconds() - start;
    // Print the result matrix
    printf("Result Matrix:\n");
    for (int i = 0; i < SIZE; ++i)
    {
        for (int j = 0; j < SIZE; ++j)
        {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    // Print the elapsed time
    printf("Elapsed Time: %.6f seconds\n", elapsed_time);
    // Free device memory
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    return 0;
}
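The gettimeofday-based timer measures host wall-clock time, which here includes the device-to-host copy. CUDA events time the GPU work itself; the fragment below is a minimal sketch of how the kernel launch above could be timed instead (it reuses dev_a, dev_b, dev_c, dimGrid, dimBlock and CHECK from the program, and the event variable names are illustrative):

cudaEvent_t start_ev, stop_ev;
CHECK(cudaEventCreate(&start_ev));
CHECK(cudaEventCreate(&stop_ev));

CHECK(cudaEventRecord(start_ev, 0));   // mark start on the default stream
matrixMultiply<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, SIZE);
CHECK(cudaEventRecord(stop_ev, 0));    // mark end after the kernel
CHECK(cudaEventSynchronize(stop_ev));  // wait until the kernel has finished

float ms = 0.0f;
CHECK(cudaEventElapsedTime(&ms, start_ev, stop_ev));  // elapsed time in ms
printf("Kernel time: %.6f ms\n", ms);

CHECK(cudaEventDestroy(start_ev));
CHECK(cudaEventDestroy(stop_ev));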
!nvcc -o matmul matmul.cu
!./matmul
!nvprof ./matmul
!nvprof --print-gpu-trace ./matmul
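Note: nvprof is deprecated and is not supported on GPUs of compute capability 8.0 or newer; if the assigned Colab GPU is too recent for it, Nsight Systems provides equivalent profiling, e.g.:

!nsys profile --stats=true ./matmul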
Thus the program was executed using CUDA to multiply two matrices. A small variation between the host and device elapsed times was observed: the device took 0.000211 s while the host took 0.000216 s.