To implement the kernel reduceUnrolling16 and compare the performance of the kernel reduceUnrolling16 with the kernel reduceUnrolling8 using nvprof.
Hardware – PCs with NVIDIA GPU & CUDA NVCC Google Colab with NVCC Compiler
- Initialization and Memory Allocation
- Define the input size n.
- Allocate host memory (h_idata and h_odata) for input and output data. Input Data Initialization
- Initialize the input data on the host (h_idata) by assigning a value of 1 to each element. Device Memory Allocation
- Allocate device memory (d_idata and d_odata) for input and output data on the GPU. Data Transfer: Host to Device
- Copy the input data from the host (h_idata) to the device (d_idata) using cudaMemcpy. Grid and Block Configuration
- Define the grid and block dimensions for the kernel launch:
- Each block consists of 256 threads.
- Calculate the grid size based on the input size n and block size.
- Start CPU Timer
- Initialize a CPU timer to measure the CPU execution time.
- Compute CPU Sum
- Calculate the sum of the input data on the CPU using a for loop and store the result in sum_cpu.
- Stop CPU Timer
- Record the elapsed CPU time.
- Start GPU Timer
- Initialize a GPU timer to measure the GPU execution time. Kernel Execution
- Launch the reduceUnrolling16 kernel on the GPU with the specified grid and block dimensions. Data Transfer: Device to Host
- Copy the result data from the device (d_odata) to the host (h_odata) using cudaMemcpy.
- Compute GPU Sum
- Calculate the final sum on the GPU by summing the elements in h_odata and store the result in sum_gpu.
- Stop GPU Timer
- Record the elapsed GPU time.
- Print Results
- Display the computed CPU sum, GPU sum, CPU elapsed time, and GPU elapsed time. Memory Deallocation
- Free the allocated host and device memory using free and cudaFree.
- Exit
- Return from the main function.
%%cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#ifndef _COMMON_H
#define _COMMON_H
// Error-checking helpers (inlined copy of a common.h header). Each CHECK_*
// macro evaluates `call`, and on failure prints the source file/line plus the
// library's error code and aborts the process. Only CHECK (CUDA runtime) is
// usable in this file; the cuBLAS/cuRAND/cuFFT/cuSPARSE variants reference
// status types from headers that this file does not include, so they must not
// be expanded here.
// Check a CUDA runtime call (cudaMalloc, cudaMemcpy, ...).
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
}
// Same pattern for cuBLAS status codes (unused in this file).
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t err; \
if ((err = (call)) != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// Same pattern for cuRAND status codes (unused in this file).
#define CHECK_CURAND(call) \
{ \
curandStatus_t err; \
if ((err = (call)) != CURAND_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// Same pattern for cuFFT result codes (unused in this file).
#define CHECK_CUFFT(call) \
{ \
cufftResult err; \
if ( (err = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// Same pattern for cuSPARSE; additionally reports any pending CUDA runtime
// error alongside the cuSPARSE failure (unused in this file).
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) \
{ \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
// Current wall-clock time as fractional seconds since the Unix epoch,
// with microsecond resolution (gettimeofday's return status is ignored).
inline double seconds()
{
    struct timeval now;
    struct timezone tz;
    gettimeofday(&now, &tz);
    return (double)now.tv_sec + (double)now.tv_usec * 1.e-6;
}
#endif // _COMMON_H
// Kernel function declaration
__global__ void reduceUnrolling16(int *g_idata, int *g_odata, unsigned int n);
// Function to calculate elapsed time in milliseconds
// Elapsed wall-clock time between two gettimeofday() samples, returned in
// milliseconds. A negative microsecond difference is handled naturally by
// the signed arithmetic (the fractional part simply subtracts).
double getElapsedTime(struct timeval start, struct timeval end)
{
    long diffSec = end.tv_sec - start.tv_sec;
    long diffUsec = end.tv_usec - start.tv_usec;
    // Seconds plus fractional microseconds, then scaled to milliseconds.
    double elapsed = diffSec + diffUsec / 1e6;
    return elapsed * 1000;
}
// Driver: sums n ones on the CPU and via the reduceUnrolling16 kernel on the
// GPU, timing both with gettimeofday and printing the sums and elapsed times.
int main()
{
    // Problem size: 2^20 int elements, each set to 1, so the expected
    // reduction result equals n.
    unsigned int n = 1 << 20;
    size_t size = n * sizeof(int);

    // Launch configuration first, because the output buffers are sized from
    // it: 256 threads per block, and each block consumes 16 * blockDim.x
    // input elements (the kernel unrolls by 16).
    dim3 blockSize(256); // 256 threads per block
    dim3 gridSize((n + blockSize.x * 16 - 1) / (blockSize.x * 16));

    // Host allocations. The output buffer only needs one partial sum per
    // block (the original over-allocated n ints for it).
    int *h_idata = (int *)malloc(size);
    int *h_odata = (int *)malloc(gridSize.x * sizeof(int));
    if (h_idata == NULL || h_odata == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Initialize input data on the host
    for (unsigned int i = 0; i < n; i++)
    {
        h_idata[i] = 1;
    }

    // Device allocations and upload; every CUDA runtime call is checked.
    int *d_idata, *d_odata;
    cudaError_t err;
    if ((err = cudaMalloc((void **)&d_idata, size)) != cudaSuccess ||
        (err = cudaMalloc((void **)&d_odata, gridSize.x * sizeof(int))) != cudaSuccess ||
        (err = cudaMemcpy(d_idata, h_idata, size, cudaMemcpyHostToDevice)) != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // CPU reference reduction, timed.
    struct timeval start_cpu, end_cpu;
    gettimeofday(&start_cpu, NULL);
    int sum_cpu = 0;
    for (unsigned int i = 0; i < n; i++)
    {
        sum_cpu += h_idata[i];
    }
    gettimeofday(&end_cpu, NULL);
    double elapsedTime_cpu = getElapsedTime(start_cpu, end_cpu);

    // GPU reduction. The timer covers launch + copy-back + final host sum,
    // matching the original measurement.
    struct timeval start_gpu, end_gpu;
    gettimeofday(&start_gpu, NULL);
    reduceUnrolling16<<<gridSize, blockSize>>>(d_idata, d_odata, n);
    // A kernel launch returns no status: check for launch-configuration
    // errors, then synchronize so asynchronous execution errors surface here.
    if ((err = cudaGetLastError()) != cudaSuccess ||
        (err = cudaDeviceSynchronize()) != cudaSuccess)
    {
        fprintf(stderr, "Kernel error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Only gridSize.x partial sums were produced; copy just those back.
    if ((err = cudaMemcpy(h_odata, d_odata, gridSize.x * sizeof(int),
                          cudaMemcpyDeviceToHost)) != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Fold the per-block partial sums on the host.
    int sum_gpu = 0;
    for (unsigned int i = 0; i < gridSize.x; i++)
    {
        sum_gpu += h_odata[i];
    }
    gettimeofday(&end_gpu, NULL);
    double elapsedTime_gpu = getElapsedTime(start_gpu, end_gpu);

    // Print the results and elapsed times
    printf("CPU Sum: %d\n", sum_cpu);
    printf("GPU Sum: %d\n", sum_gpu);
    printf("CPU Elapsed Time: %.2f ms\n", elapsedTime_cpu);
    printf("GPU Elapsed Time: %.2f ms\n", elapsedTime_gpu);

    // Free memory
    free(h_idata);
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return 0;
}
// Sum reduction with 16x unrolling. Each block owns a chunk of
// 16 * blockDim.x consecutive ints in g_idata; each thread first accumulates
// up to 16 elements spaced blockDim.x apart, then the block tree-reduces the
// first blockDim.x slots of its chunk in place and writes one partial sum to
// g_odata[blockIdx.x]. Launch with gridDim.x = ceil(n / (16 * blockDim.x)).
// NOTE: g_idata is modified (in-place reduction in global memory).
__global__ void reduceUnrolling16(int *g_idata, int *g_odata, unsigned int n)
{
    // Set thread ID and this thread's first element in the block's chunk.
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 16 + threadIdx.x;
    // Convert global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x * 16;
    // Unrolled accumulation with a per-load bounds check, so partial tail
    // chunks are summed correctly for any n (the original all-or-nothing
    // guard silently dropped tail data when n was not a multiple of
    // 16 * blockDim.x; results are identical when it is).
    if (idx < n)
    {
        int sum = 0;
        #pragma unroll
        for (int k = 0; k < 16; k++)
        {
            unsigned int j = idx + k * blockDim.x;
            if (j < n)
            {
                sum += g_idata[j];
            }
        }
        g_idata[idx] = sum;
    }
    __syncthreads();
    // In-place tree reduction over the first blockDim.x slots of the chunk.
    // The idx + stride < n guard keeps tail blocks from reading past the
    // end of the array.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride && idx + stride < n)
        {
            idata[tid] += idata[tid + stride];
        }
        // Synchronize within thread block (outside the divergent branch).
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum.
    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
%%cuda
#include <sys/time.h>
#ifndef _COMMON_H
#define _COMMON_H
// Library error-checking macros (copied common.h boilerplate). On failure
// each prints the file/line and error code, then exits. Only CHECK applies
// to this file; the other variants need cuBLAS/cuRAND/cuFFT/cuSPARSE
// headers that are not included here.
// Check a CUDA runtime call's cudaError_t result.
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
}
// cuBLAS status check (unused here).
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t err; \
if ((err = (call)) != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// cuRAND status check (unused here).
#define CHECK_CURAND(call) \
{ \
curandStatus_t err; \
if ((err = (call)) != CURAND_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// cuFFT result check (unused here).
#define CHECK_CUFFT(call) \
{ \
cufftResult err; \
if ( (err = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
// cuSPARSE status check; also reports any pending CUDA runtime error
// (unused here).
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) \
{ \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
// Return the current time of day as fractional seconds since the epoch
// (microsecond precision); gettimeofday's status is deliberately ignored.
inline double seconds()
{
    struct timeval tv;
    struct timezone tz;
    (void)gettimeofday(&tv, &tz);
    double sec = (double)tv.tv_sec;
    double usec = (double)tv.tv_usec;
    return sec + usec * 1.e-6;
}
#endif // _COMMON_H
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
// Kernel function declaration
__global__ void reduceUnrolling16(int *g_idata, int *g_odata, unsigned int n);
// Function to calculate elapsed time in milliseconds
// Convert a start/end pair of gettimeofday() samples into elapsed
// milliseconds. Signed arithmetic makes a smaller end microsecond field
// subtract correctly from the whole-second difference.
double getElapsedTime(struct timeval start, struct timeval end)
{
    long secPart = end.tv_sec - start.tv_sec;
    long usecPart = end.tv_usec - start.tv_usec;
    double elapsedSeconds = secPart + usecPart / 1e6;
    return elapsedSeconds * 1000; // milliseconds
}
// Driver for the unrolled-reduction experiment: computes the sum of n ones
// on the CPU and on the GPU via reduceUnrolling16, reporting both sums and
// wall-clock times so the two can be compared under nvprof.
int main()
{
    // 2^20 ones; the correct reduction result is therefore n itself.
    unsigned int n = 1 << 20;
    size_t size = n * sizeof(int);

    // Kernel geometry drives buffer sizing: each 256-thread block consumes
    // 16 * blockDim.x inputs and emits exactly one partial sum.
    dim3 blockSize(256); // 256 threads per block
    dim3 gridSize((n + blockSize.x * 16 - 1) / (blockSize.x * 16));

    // Host buffers; h_odata holds only gridSize.x partial sums (the
    // original wastefully allocated n ints for it).
    int *h_idata = (int *)malloc(size);
    int *h_odata = (int *)malloc(gridSize.x * sizeof(int));
    if (h_idata == NULL || h_odata == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Initialize input data on the host
    for (unsigned int i = 0; i < n; i++)
    {
        h_idata[i] = 1;
    }

    // Device buffers and upload, with explicit status checks on every call.
    int *d_idata, *d_odata;
    cudaError_t status;
    if ((status = cudaMalloc((void **)&d_idata, size)) != cudaSuccess ||
        (status = cudaMalloc((void **)&d_odata, gridSize.x * sizeof(int))) != cudaSuccess ||
        (status = cudaMemcpy(d_idata, h_idata, size, cudaMemcpyHostToDevice)) != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Timed CPU reference sum.
    struct timeval start_cpu, end_cpu;
    gettimeofday(&start_cpu, NULL);
    int sum_cpu = 0;
    for (unsigned int i = 0; i < n; i++)
    {
        sum_cpu += h_idata[i];
    }
    gettimeofday(&end_cpu, NULL);
    double elapsedTime_cpu = getElapsedTime(start_cpu, end_cpu);

    // Timed GPU path: launch, wait, copy partials back, fold on the host.
    struct timeval start_gpu, end_gpu;
    gettimeofday(&start_gpu, NULL);
    reduceUnrolling16<<<gridSize, blockSize>>>(d_idata, d_odata, n);
    // Launches return no status; surface config errors via cudaGetLastError
    // and execution errors via cudaDeviceSynchronize.
    if ((status = cudaGetLastError()) != cudaSuccess ||
        (status = cudaDeviceSynchronize()) != cudaSuccess)
    {
        fprintf(stderr, "Kernel error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    // Copy back just the gridSize.x partial sums the kernel produced.
    if ((status = cudaMemcpy(h_odata, d_odata, gridSize.x * sizeof(int),
                             cudaMemcpyDeviceToHost)) != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    int sum_gpu = 0;
    for (unsigned int i = 0; i < gridSize.x; i++)
    {
        sum_gpu += h_odata[i];
    }
    gettimeofday(&end_gpu, NULL);
    double elapsedTime_gpu = getElapsedTime(start_gpu, end_gpu);

    // Print the results and elapsed times
    printf("CPU Sum: %d\n", sum_cpu);
    printf("GPU Sum: %d\n", sum_gpu);
    printf("CPU Elapsed Time: %.2f ms\n", elapsedTime_cpu);
    printf("GPU Elapsed Time: %.2f ms\n", elapsedTime_gpu);

    // Free memory
    free(h_idata);
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return 0;
}
// Sum reduction, 16x unrolled, matching the host's grid sizing of
// ceil(n / (16 * blockDim.x)) blocks. BUG FIX: the previous version loaded
// only offsets 0-3 and 8-11 (8 of the 16 elements each thread owns, skipping
// 4-7 and 12-15) while its guard (idx + 7*blockDim.x < n) did not even cover
// its idx + 11*blockDim.x read — producing GPU sum 524288 instead of 1048576
// for n = 2^20. Each thread now accumulates all 16 strided elements with
// per-load bounds checks, so any n is handled correctly.
// NOTE: g_idata is modified (in-place reduction in global memory).
__global__ void reduceUnrolling16(int *g_idata, int *g_odata, unsigned int n)
{
    // Set thread ID and this thread's first element in the block's chunk.
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 16 + threadIdx.x;
    // Convert global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x * 16;
    // Unrolling 16: sum the 16 elements this thread owns, spaced
    // blockDim.x apart, guarding each load against the end of the array.
    if (idx < n)
    {
        int sum = 0;
        #pragma unroll
        for (int k = 0; k < 16; k++)
        {
            unsigned int j = idx + k * blockDim.x;
            if (j < n)
            {
                sum += g_idata[j];
            }
        }
        g_idata[idx] = sum;
    }
    __syncthreads();
    // In-place tree reduction over the chunk's first blockDim.x slots;
    // the idx + stride < n guard protects partial tail blocks.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride && idx + stride < n)
        {
            idata[tid] += idata[tid + stride];
        }
        // Synchronize within thread block (barrier reached by all threads).
        __syncthreads();
    }
    // Write result for this block to global memory
    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
Thus the program has been executed with unrolling by 8 and with unrolling by 16. It is observed that unrolling by 8 executed with less elapsed time than unrolling by 16; the observed values were 524288 and 1048576 respectively.