编写一个基于oneAPI的C++/SYCL程序来执行矩阵乘法操作。
本项目基于Intel提供的oneAPI工具集,编写C++/SYCL程序利用基于SYCL的编程模型在GPU上实现矩阵乘法的计算。
#include <sycl/sycl.hpp>

#include <chrono>
#include <cmath>
#include <iostream>
#include <random>
constexpr size_t MATRIX_SIZE = 1024;
constexpr size_t BLOCK_SIZE = 16;
// 随机生成输入矩阵
// Fill `matrix` (rows * cols elements, row-major) with uniformly
// distributed random floats drawn from [0, 1).
void generateRandomMatrix(float* matrix, size_t rows, size_t cols) {
std::random_device seedSource;
std::mt19937 engine(seedSource());
std::uniform_real_distribution<float> uniform01(0.0f, 1.0f);
const size_t total = rows * cols;
for (size_t idx = 0; idx != total; ++idx) {
matrix[idx] = uniform01(engine);
}
}
// 常规串行计算作为baseline
// Reference single-threaded matrix multiplication: result = matrixA * matrixB.
// All three matrices are dense, square (size x size), row-major. Serves as the
// correctness baseline for the SYCL kernel.
//
// `size` was generalized from the hard-coded MATRIX_SIZE; it defaults to
// MATRIX_SIZE so existing callers are unaffected.
void matrixMultiplyScalar(float* result, const float* matrixA, const float* matrixB,
                          size_t size = MATRIX_SIZE) {
for (size_t i = 0; i < size; ++i) {
for (size_t j = 0; j < size; ++j) {
float sum = 0.0f;
// Dot product of row i of A with column j of B.
for (size_t k = 0; k < size; ++k) {
sum += matrixA[i * size + k] * matrixB[k * size + j];
}
result[i * size + j] = sum;
}
}
}
// Functor that launches C = A * B on the device owned by `queue`, using one
// work-item per output element grouped into BLOCK_SIZE x BLOCK_SIZE
// work-groups (MATRIX_SIZE must be a multiple of BLOCK_SIZE).
class MatrixMultiplyKernel {
public:
// Stores the queue and host pointers; no device work happens until operator().
MatrixMultiplyKernel(sycl::queue& queue, float* result, const float* matrixA, const float* matrixB)
: queue_(queue), result_(result), matrixA_(matrixA), matrixB_(matrixB) {}
// Runs the multiplication and blocks until the result is available in
// `result_`: the buffer destructors at scope exit wait for the kernel and
// copy the output back to the host pointer.
void operator()() {
sycl::buffer<float, 2> resultBuffer(result_, sycl::range<2>(MATRIX_SIZE, MATRIX_SIZE));
// Fix: buffers with a const element type (buffer<const float, 2>) are not
// portable SYCL. The SYCL 2020 idiom is a non-const-element buffer
// constructed from a const pointer; such a buffer never writes back.
sycl::buffer<float, 2> matrixABuffer(matrixA_, sycl::range<2>(MATRIX_SIZE, MATRIX_SIZE));
sycl::buffer<float, 2> matrixBBuffer(matrixB_, sycl::range<2>(MATRIX_SIZE, MATRIX_SIZE));
queue_.submit([&](sycl::handler& cgh) {
auto resultAcc = resultBuffer.get_access<sycl::access::mode::write>(cgh);
auto matrixAAcc = matrixABuffer.get_access<sycl::access::mode::read>(cgh);
auto matrixBAcc = matrixBBuffer.get_access<sycl::access::mode::read>(cgh);
// Launch over an nd_range so work-items are grouped into
// BLOCK_SIZE x BLOCK_SIZE work-groups.
cgh.parallel_for<class BlockMatrixMultiplyKernel>(
sycl::nd_range<2>(sycl::range<2>(MATRIX_SIZE, MATRIX_SIZE), sycl::range<2>(BLOCK_SIZE, BLOCK_SIZE)),
[=](sycl::nd_item<2> item) {
size_t globalRow = item.get_global_id(0);
size_t globalCol = item.get_global_id(1);
float sum = 0.0f;
// Dot product of row `globalRow` of A and column `globalCol` of B.
for (size_t k = 0; k < MATRIX_SIZE; k++) {
sum += matrixAAcc[globalRow][k] * matrixBAcc[k][globalCol];
}
// Write the element's result back to global memory.
resultAcc[item.get_global_id()] = sum;
});
});
}
private:
sycl::queue& queue_;     // device queue (not owned)
float* result_;          // output C, MATRIX_SIZE*MATRIX_SIZE, row-major
const float* matrixA_;   // input A (read-only)
const float* matrixB_;   // input B (read-only)
};
// Element-wise comparison of two float arrays of `size` elements.
// Returns true when every pair of elements differs by less than `epsilon`
// in absolute value; an empty range (size == 0) compares equal.
//
// `epsilon` was generalized from the hard-coded tolerance and defaults to
// the original 1e-3f, so existing callers are unaffected.
bool compare(const float* matrixA, const float* matrixB, size_t size, float epsilon = 1e-3f) {
for (size_t i = 0; i < size; ++i) {
// Fix: use std::fabs from <cmath> instead of unqualified C fabs.
if (std::fabs(matrixA[i] - matrixB[i]) >= epsilon) {
return false;
}
}
return true;
}
// Driver: generates two random MATRIX_SIZE x MATRIX_SIZE matrices, multiplies
// them serially (baseline) and on the GPU via SYCL, times both runs, and
// verifies that the two results agree.
int main() {
// Target a GPU; throws sycl::exception if no GPU device is available.
sycl::queue queue(sycl::gpu_selector_v);
constexpr size_t numElems = MATRIX_SIZE * MATRIX_SIZE;
// USM shared allocations are accessible from both host and device.
// Fix: removed the unused `matrixC` allocation the original made and freed
// without ever touching.
float* matrixA = sycl::malloc_shared<float>(numElems, queue);
float* matrixB = sycl::malloc_shared<float>(numElems, queue);
float* resultScalar = sycl::malloc_shared<float>(numElems, queue);
float* resultParallel = sycl::malloc_shared<float>(numElems, queue);
generateRandomMatrix(matrixA, MATRIX_SIZE, MATRIX_SIZE);
generateRandomMatrix(matrixB, MATRIX_SIZE, MATRIX_SIZE);
// Serial baseline timing.
auto startSerial = std::chrono::high_resolution_clock::now();
matrixMultiplyScalar(resultScalar, matrixA, matrixB);
auto endSerial = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> durationSerial = endSerial - startSerial;
std::cout << "Time taken for scalar matrix multiplication:: " << durationSerial.count() << " seconds" << std::endl;
// Device timing: operator() blocks until the result has been copied back,
// so the interval covers the full device round trip.
MatrixMultiplyKernel matrixMultiplyParallel(queue, resultParallel, matrixA, matrixB);
auto startParallel = std::chrono::high_resolution_clock::now();
matrixMultiplyParallel();
auto endParallel = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> durationParallel = endParallel - startParallel;
std::cout << "Time taken for parallel matrix multiplication:: " << durationParallel.count() << " seconds" << std::endl;
// Verify the device result against the serial baseline.
bool match = compare(resultParallel, resultScalar, numElems);
if (match) {
std::cout << "Matrix multiplication successfully run on the device"
<< "\n";
} else {
std::cout
<< "*********************************************Verification Failed. Results are "
"not matched**************************"
<< "\n";
}
// USM allocations must be released through sycl::free with the same queue.
sycl::free(matrixA, queue);
sycl::free(matrixB, queue);
sycl::free(resultScalar, queue);
sycl::free(resultParallel, queue);
return 0;
}
随机生成两个1024×1024的矩阵并进行乘法运算,将并行运算的结果与常规串行运算的结果相比较,输出如下:
Job has been submitted to Intel(R) DevCloud and will execute soon.
Job ID Name User Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
2432120.v-qsvr-1 ...ub-singleuser u207452 00:00:41 R jupyterhub
2432161.v-qsvr-1 ...x_multiply.sh u207452 0 Q batch
Waiting for Output ████████████████████ Done⬇
########################################################################
# Date: Sat 18 Nov 2023 10:13:11 PM PST
# Job ID: 2432161.v-qsvr-1.aidevcloud
# User: u207452
# Resources: cput=75:00:00,neednodes=1:gpu:ppn=2,nodes=1:gpu:ppn=2,walltime=06:00:00
########################################################################
## u207452 is compiling SYCL_Essentials Module2 -- SYCL Program Structure sample - 7 of 7 matrix_multiply.cpp
Time taken for scalar matrix multiplication:: 1.77183 seconds
Time taken for parallel matrix multiplication:: 0.148906 seconds
Matrix multiplication successfully run on the device
########################################################################
# End of output for job 2432161.v-qsvr-1.aidevcloud
# Date: Sat 18 Nov 2023 10:13:23 PM PST
########################################################################
Job Completed in 20 seconds.
可以看到,并行运算与串行运算结果一致,且具有很高的执行效率。
通过本次实验,我初次接触了高性能计算以及Intel提供的高性能计算平台,利用oneAPI这一开发工具套件,上手实操完成了对矩阵乘法的并行优化。此外,我切实感受了SYCL编程模型及相应库函数的强大之处,对我未来的工作大有裨益。