Refer to the program sumMatrixGPUManaged.cu. Would removing the memsets below affect performance? If you can, check performance with nvprof or nvvp.
To perform Matrix addition with unified memory and check its performance with nvprof.
- Include the required files and library.
- Introduce a function named "initialData","sumMatrixOnHost","checkResult" to return the initialize the data , perform matrix summation on the host and then check the result.
- Create a grid 2D block 2D global function to perform matrix on the GPU.
- Declare the main function and set up the device & data size of matrix , perform memory allocation on host memory & initialize the data at host side then add matrix at host side for result checks followed by invoking kernel at host side. Check the kernel error, and check device for results.Finally free the device global memory and reset device.
- Execute the program and run the terminal.
Developed by: Subramaniya Pillai B
Register number: 212221230109
%%cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <sys/time.h>
#include <cuda.h>
#include <sys/time.h>
void initialData(float *ip, const int size)
{
int i;
for (i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
return;
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A;
float *ib = B;
float *ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
return;
}
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
match = 0;
printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
break;
}
}
if (!match)
{
printf("Arrays do not match.\n\n");
}
}
// grid 2D block 2D
__global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
{
MatC[idx] = MatA[idx] + MatB[idx];
}
}
#ifndef _COMMON_H
#define _COMMON_H
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
}
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t err; \
if ((err = (call)) != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CURAND(call) \
{ \
curandStatus_t err; \
if ((err = (call)) != CURAND_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUFFT(call) \
{ \
cufftResult err; \
if ( (err = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) \
{ \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
inline double seconds()
{
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
int main(int argc, char **argv)
{
printf("%s Starting ", argv[0]);
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("using Device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx, ny;
int ishift = 12;
if (argc > 1) ishift = atoi(argv[1]);
nx = ny = 1 << ishift;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
printf("Matrix size: nx %d ny %d\n", nx, ny);
// malloc host memory
float *A, *B, *hostRef, *gpuRef;
CHECK(cudaMallocManaged((void **)&A, nBytes));
CHECK(cudaMallocManaged((void **)&B, nBytes));
CHECK(cudaMallocManaged((void **)&gpuRef, nBytes); );
CHECK(cudaMallocManaged((void **)&hostRef, nBytes););
// initialize data at host side
double iStart = seconds();
initialData(A, nxy);
initialData(B, nxy);
double iElaps = seconds() - iStart;
printf("initialization: \t %f sec\n", iElaps);
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// add matrix at host side for result checks
iStart = seconds();
sumMatrixOnHost(A, B, hostRef, nx, ny);
iElaps = seconds() - iStart;
printf("sumMatrix on host:\t %f sec\n", iElaps);
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// warm-up kernel, with unified memory all pages will migrate from host to device
sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, 1, 1);
// after warm-up, time with unified memory
iStart = seconds();
sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, nx, ny);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,grid.x, grid.y, block.x, block.y);
// check kernel error
CHECK(cudaGetLastError());
// check device results
checkResult(hostRef, gpuRef, nxy);
// free device global memory
CHECK(cudaFree(A));
CHECK(cudaFree(B));
CHECK(cudaFree(hostRef));
CHECK(cudaFree(gpuRef));
// reset device
CHECK(cudaDeviceReset());
return (0);
}
%%cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <sys/time.h>
#include <cuda.h>
#include <sys/time.h>
void initialData(float *ip, const int size)
{
int i;
for (i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
return;
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A;
float *ib = B;
float *ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
return;
}
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
match = 0;
printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
break;
}
}
if (!match)
{
printf("Arrays do not match.\n\n");
}
}
// grid 2D block 2D
__global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
{
MatC[idx] = MatA[idx] + MatB[idx];
}
}
#ifndef _COMMON_H
#define _COMMON_H
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
}
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t err; \
if ((err = (call)) != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CURAND(call) \
{ \
curandStatus_t err; \
if ((err = (call)) != CURAND_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUFFT(call) \
{ \
cufftResult err; \
if ( (err = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) \
{ \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
inline double seconds()
{
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
int main(int argc, char **argv)
{
printf("%s Starting ", argv[0]);
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("using Device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx, ny;
int ishift = 12;
if (argc > 1) ishift = atoi(argv[1]);
nx = ny = 1 << ishift;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
printf("Matrix size: nx %d ny %d\n", nx, ny);
// malloc host memory
float *A, *B, *hostRef, *gpuRef;
CHECK(cudaMallocManaged((void **)&A, nBytes));
CHECK(cudaMallocManaged((void **)&B, nBytes));
CHECK(cudaMallocManaged((void **)&gpuRef, nBytes); );
CHECK(cudaMallocManaged((void **)&hostRef, nBytes););
// initialize data at host side
double iStart = seconds();
initialData(A, nxy);
initialData(B, nxy);
double iElaps = seconds() - iStart;
printf("initialization: \t %f sec\n", iElaps);
// add matrix at host side for result checks
iStart = seconds();
sumMatrixOnHost(A, B, hostRef, nx, ny);
iElaps = seconds() - iStart;
printf("sumMatrix on host:\t %f sec\n", iElaps);
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// warm-up kernel, with unified memory all pages will migrate from host to device
sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, 1, 1);
// after warm-up, time with unified memory
iStart = seconds();
sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, nx, ny);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,
grid.x, grid.y, block.x, block.y);
// check kernel error
CHECK(cudaGetLastError());
// check device results
checkResult(hostRef, gpuRef, nxy);
// free device global memory
CHECK(cudaFree(A));
CHECK(cudaFree(B));
CHECK(cudaFree(hostRef));
CHECK(cudaFree(gpuRef));
// reset device
CHECK(cudaDeviceReset());
return (0);
}
Thus Matrix addition with unified memory is done successfully.