请问小彭老师,这段GPU代码为什么加速比这么低?
Opened this issue · 4 comments
balleb6545anickk commented
测试环境:
笔记本R7-5800H,3060,Win11,MSVC最新版Release模式。
测试结果:
GPU time: 0.0018809
CPU time: 0.0048002
ratio: 2.55208
我用其它的CUDA程序加速比都能达到10倍左右,这个程序的加速比为什么这么低?
(另外,改成float加速就很快,为什么?如果一定要用double,该怎么改?)
#include <omp.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define TYPE double
#define imgW 2448
#define imgH 2048
#define N imgW *imgH
// Per-pixel rational transform: output[pos] = a(x, y, data) / b(x, y, data).
// Launch: 1D grid, one thread per pixel; threads past width*height exit early.
// para0/para1: at least 6 coefficients each; para2: {cx, cy, sx, sy}.
// NOTE(review): with TYPE == double the dominant cost here is presumably the
// double-precision divide/FMA throughput of consumer GPUs, not the indexing.
// Read-only pointers are marked const __restrict__ so the compiler may use the
// read-only data cache for the broadcast parameter loads.
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output, int width, int height,
                        const TYPE *__restrict__ para0, const TYPE *__restrict__ para1,
                        const TYPE *__restrict__ para2) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= width * height)
        return;
    // Keep row/col in int: storing them in TYPE (double) forces needless
    // int->double conversions, and the multiply-subtract below avoids paying
    // for a second integer division via `%`.
    const int row = pos / width;
    const int col = pos - row * width;
    const TYPE data = input[pos];
    const TYPE x = (col - para2[0]) * para2[2];
    const TYPE y = (row - para2[1]) * para2[3];
    const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
    const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;
    output[pos] = a / b;
}
// CPU reference implementation of GPU_Cal, parallelized over rows with OpenMP.
// Computes output[row*width+col] = a / b with the same rational transform as
// the kernel.  para0/para1: at least 6 coefficients each; para2: {cx, cy, sx, sy}.
void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int row = 0; row < height; ++row) {
        const int base = row * width;
        // y depends only on the row, so compute it once per row.
        const TYPE y = (row - para2[1]) * para2[3];
        for (int col = 0; col < width; ++col) {
            const int idx = base + col;
            const TYPE data = input[idx];
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE num =
                para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE den =
                para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;
            output[idx] = num / den;
        }
    }
}
// Abort with a readable message whenever a CUDA runtime call fails.  Kernel
// launches do not return errors directly, so cudaGetLastError() and
// cudaDeviceSynchronize() are routed through this macro as well.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

int main() {
    // BUG FIX: the macro `N` expands to `imgW *imgH` WITHOUT parentheses, so
    // `(double)i / N` used to parse as `((double)i / imgW) * imgH`.  Use a
    // correctly parenthesized pixel count instead of the macro.
    const int total = imgW * imgH;
    // Host-side buffers.
    std::vector<TYPE> input(total, 2);
    std::vector<TYPE> output(total, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    std::vector<TYPE> para2{1246, 1037, 2448, 2048};  // matches kernel's para2 role
    // Deterministic pseudo-data: input in [0, 1), output pre-filled in [2, 3).
    for (int i = 0; i < total; ++i) {
        input[i] = (double)i / total;
        output[i] = (double)i / total + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }
    // Device buffers.
    TYPE *d_input = nullptr;
    TYPE *d_output = nullptr;
    TYPE *d_para0 = nullptr;
    TYPE *d_para1 = nullptr;
    TYPE *d_para2 = nullptr;
    CUDA_CHECK(cudaMalloc((void **)&d_input, total * sizeof(TYPE)));
    CUDA_CHECK(cudaMalloc((void **)&d_output, total * sizeof(TYPE)));
    CUDA_CHECK(cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE)));
    CUDA_CHECK(cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE)));
    CUDA_CHECK(cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE)));
    CUDA_CHECK(cudaMemcpy(d_input, input.data(), total * sizeof(TYPE), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, output.data(), total * sizeof(TYPE), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_para2, para2.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice));
    // GPU timing: best of 50 launches.  The kernel is asynchronous, so the
    // clock must only stop after cudaDeviceSynchronize().
    const int thread_num = 256;
    const int block_num = (total + thread_num - 1) / thread_num;  // ceil-div
    double gpu_time = std::numeric_limits<double>::max();
    CUDA_CHECK(cudaDeviceSynchronize());
    for (int i = 0; i < 50; ++i) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        CUDA_CHECK(cudaGetLastError());       // catches invalid launch configuration
        CUDA_CHECK(cudaDeviceSynchronize());  // catches asynchronous execution errors
        double t =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, t);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;
    // Copy the GPU result back (std::vector instead of the previously leaked malloc()).
    std::vector<TYPE> h_output(total);
    CUDA_CHECK(cudaMemcpy(h_output.data(), d_output, total * sizeof(TYPE), cudaMemcpyDeviceToHost));
    // CPU timing: best of 50 runs.
    double cpu_time = std::numeric_limits<double>::max();
    for (int i = 0; i < 50; ++i) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para2.data());
        double t =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, t);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;
    // Verify CPU and GPU agree; a pair of NaNs counts as a match.
    for (int i = 0; i < total; ++i) {
        if (h_output[i] != h_output[i] && output[i] != output[i]) {
            continue;  // both NaN
        }
        if (fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_para0));
    CUDA_CHECK(cudaFree(d_para1));
    CUDA_CHECK(cudaFree(d_para2));
    return 0;
}
balleb6545anickk commented
顺便问一下小彭老师,什么时候把CUDA nsight安排上!
balleb6545anickk commented
HJzhang-sjtu commented
README里面写了,小彭老师不回答CUDA优化相关的问题。我来回答下吧,你这个开的block数目太多了,总共就那几十个SM,你开了1万多个block,光调度这些block运行在SM上开销就很大了。可以让一个block计算更多的数据,例如每一个block计算256*256个input数据,每一个block内的thread计算256个数据。
archibate commented
- GPU(特别是消费级显卡)就是对double支持很差的,正常的。正常图形学应用都是float数据,搞AI的甚至巴不得用half。(这就是为什么他们搞科学计算的都不爱用GPU集群,因为科研仿真需要double精度)
- 你这里的主要瓶颈是这个双精度浮点除法,把
a / b
改成a + b
后加速比直接从 2.23 提升到 2.99 了。 i / width
这个整数除法的开销也很大,你试图用一维的blockdim和griddim在运行东西,然后用除法和模运算来模拟出row和col,这是不正确的。应该利用blockDim.x和blockDim.y,避免低效的除法。
1和2的问题可能是你算法需要的,改了你的结果就不对了。3这个问题我给你改下。
// 2D grid-stride version of GPU_Cal: maps threadIdx/blockIdx .x to columns and
// .y to rows directly, avoiding the per-thread integer division/modulo of the
// 1D kernel.  Grid-stride loops keep it correct for any launch configuration.
// para0/para1: at least 6 coefficients each; para2: {cx, cy, sx, sy}.
__global__ void GPU_Cal(TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1,
                        TYPE *para2) {
    for (int row = threadIdx.y + blockIdx.y * blockDim.y; row < height; row += gridDim.y * blockDim.y) {
        for (int col = threadIdx.x + blockIdx.x * blockDim.x; col < width; col += gridDim.x * blockDim.x) {
            int i = row * width + col;
            TYPE data = input[i];
            // BUG FIX vs. the snippet as originally posted: x must be derived
            // from the column and y from the row (matching the 1D kernel and
            // CPU_Cal); the posted version had row/col swapped here, which
            // silently changes the result.
            TYPE x = (col - para2[0]) * para2[2];
            TYPE y = (row - para2[1]) * para2[3];
            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;
            output[i] = a / b;
        }
    }
}