LibreCUDA

LibreCUDA is a project aimed at replacing the CUDA driver API to enable launching CUDA code on Nvidia GPUs without relying on the proprietary CUDA runtime. It achieves this by communicating directly with the hardware via ioctls (specifically what Nvidia's open-gpu-kernel-modules refer to as the rmapi), as well as QMD, Nvidia's MMIO command queue structure. LibreCUDA is capable of uploading CUDA ELF binaries onto the GPU and launching them via the command queue.

Current features

Allocate and free GPU memory, and map that memory so it is accessible by the CPU
Upload CUDA kernels (CUDA ELF binaries)
Set dynamic shared memory for CUDA functions
Launch CUDA kernels
Supports cheap async kernel launches on a single stream
Host-to-device (DMA), device-to-device (compute), and device-to-host (DMA) memcpy
Supports cheap async memcpys on a single stream

Example

Below is an example demonstrating the usage of LibreCUDA:

int main() {
    libreCuInit(0);

    int device_count{};
    libreCuDeviceGetCount(&device_count);
    std::cout << "Device count: " + std::to_string(device_count) << std::endl;

    LibreCUdevice device{};
    libreCuDeviceGet(&device, 0);

    LibreCUcontext ctx{};
    libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device);

    LibreCUmodule module{};

    uint8_t *image;
    size_t n_bytes;
    {
        std::ifstream input("write_float.cubin", std::ios::binary);
        std::vector<uint8_t> bytes(
                (std::istreambuf_iterator<char>(input)),
                (std::istreambuf_iterator<char>()));
        input.close();
        image = new uint8_t[bytes.size()];
        doMemcpy(image, bytes.data(), bytes.size());
        n_bytes = bytes.size();
    }
    libreCuModuleLoadData(&module, image, n_bytes);

    uint32_t num_funcs{};
    libreCuModuleGetFunctionCount(&num_funcs, module);
    std::cout << "Num functions: " << num_funcs << std::endl;

    auto *functions = new LibreCUFunction[num_funcs];
    libreCuModuleEnumerateFunctions(functions, num_funcs, module);

    for (size_t i = 0; i < num_funcs; i++) {
        LibreCUFunction func = functions[i];
        const char *func_name{};
        libreCuFuncGetName(&func_name, func);
        std::cout << "  function \"" << func_name << "\"" << std::endl;
    }

    delete[] functions;

    LibreCUFunction func{};
    libreCuModuleGetFunction(&func, module, "write_float");

    LibreCUstream stream{};
    libreCuStreamCreate(&stream, 0);

    void *float_dst_va{};
    libreCuMemAlloc(&float_dst_va, sizeof(float), true);

    float float_value = 3.1415f;
    void *float_src_va{};
    libreCuMemAlloc(&float_src_va, sizeof(float), true);
    *(float *) (float_src_va) = float_value;

    std::cout << "Src value: " << float_value << std::endl;
    std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl;

    void *params[] = {
            &float_dst_va, // dst
            &float_src_va // src
    };
    libreCuLaunchKernel(func,
                        1, 1, 1,
                        1, 1, 1,
                        0,
                        stream,
                        params, sizeof(params) / sizeof(void *),
                        nullptr
    );

    libreCuStreamCommence(stream);
    
    libreCuStreamAwait(stream);
    std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl;

    libreCuMemFree(float_dst_va);
    libreCuStreamDestroy(stream);
    libreCuModuleUnload(module);
    libreCuCtxDestroy(ctx);
    return 0;
}

Outputs

Device count: 1
Num functions: 1
  function "write_float"
Src value: 3.1415
Dst value (pre exec): 0
Dst value (post exec): 3.1415

How to Use

The recommended way to use LibreCUDA is to clone the LibreCUDA repository and link against the driverapi library in CMake:

git clone --recurse https://github.com/mikex86/LibreCuda.git

Add the repository as a CMake subdirectory

add_subdirectory(LibreCuda)

Link against the driver api library

target_link_libraries(YourTarget PRIVATE driverapi)

Include headers

#include <librecuda.h>

Project Status

The project is in its early stages and currently implements only rudimentary CUDA functions. It is not yet ready for production use.

Contributing

Contributions are welcome! Please submit issues and pull requests to help improve LibreCUDA.

License

This project is licensed under the MIT License.