openucx/ucx

cuda_copy_md.c:489 UCX WARN cuPointerSetAttribute error with CUDA VMM API


Describe the bug

The server uses the CUDA virtual memory management API (cuMemAddressReserve, cuMemCreate, ...) to create a memory region, and UCX issues the following warning:
cuda_copy_md.c:489 UCX WARN cuPointerSetAttribute(0x7f85c0000000, SYNC_MEMOPS) error: operation not supported.

Can the RNDV protocol support memory allocated by the CUDA VMM API?
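
For reference, the warning corresponds to UCX trying to enable synchronous memory operations on the pointer it is asked to register (as the log from cuda_copy_md.c shows). Below is a minimal standalone sketch of that call, not UCX code; dptr is assumed to be an address returned by the vmm_alloc() shown further down:

#include <cuda.h>
#include <stdio.h>

static void try_sync_memops(CUdeviceptr dptr) {
    // Same attribute UCX sets in cuda_copy_md.c; on a cuMemCreate/cuMemMap (VMM)
    // allocation this is expected to fail with "operation not supported",
    // which UCX then reports as the WARN quoted above.
    unsigned int value = 1;
    CUresult res = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr);
    if (res != CUDA_SUCCESS) {
        const char *str = NULL;
        cuGetErrorString(res, &str);
        printf("cuPointerSetAttribute(SYNC_MEMOPS) error: %s\n", str ? str : "unknown");
    }
}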

Steps to Reproduce

  • UCX version
# Library version: 1.17.0
# Library path: /home/xxx/ucx-1.17.0/install/lib/libucs.so.0
# API headers version: 1.17.0
# Git branch '', revision 
# Configured with: --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-gtest --enable-examples --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --without-xpmem --without-java --with-cuda=/usr/local/cuda-11.7 --with-gdrcopy --prefix=/home/xxx/ucx-1.17.0/install

Setup and versions

  • OS version: Ubuntu 20.04.6 LTS x86_64
  • GPU type: NVIDIA Tesla V100-PCIE-32GB
  • CUDA version: CUDA 11.7
  • CUDA driver version: 535.129.03

Additional information

  • VMM API used to create the memory region:
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h>

cudaError_t vmm_alloc(void **ptr, size_t size, int currentDevice, CUdeviceptr start_add) {
    // Describe a pinned device allocation on currentDevice, marked GPUDirect RDMA capable.
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = currentDevice;
    prop.allocFlags.gpuDirectRDMACapable = 1;

    size_t granularity = 0;
    CUresult result;
    // Query the minimum allocation granularity for these allocation properties.
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) {
        printf("cudaErrorMemoryAllocation error %d\n",result);
        return cudaErrorMemoryAllocation;
    }

    // Round the requested size up to a multiple of the allocation granularity.
    size = ((size - 1) / granularity + 1) * granularity;

    CUdeviceptr dptr;
    // Allocate an address range reservation.
    result = cuMemAddressReserve(&dptr, size, 0, start_add + granularity, 0);
    if (result != CUDA_SUCCESS) {
        printf("cuMemAddressReserve error %d\n",result);
        return cudaErrorMemoryAllocation;
    }
    printf("dptr = %p\n",dptr);

    CUmemGenericAllocationHandle allocationHandle;
    // Create a CUDA memory handle representing a memory allocation of a given size described by the given properties.
    result = cuMemCreate(&allocationHandle, size, &prop, 0);
    if (result != CUDA_SUCCESS) {
        printf("cuMemCreate error %d\n",result);
        return cudaErrorMemoryAllocation;
    }

    // Maps an allocation handle to a reserved virtual address range.
    // cuMemMap can only create mappings on VA range reservations that are not currently mapped.
    result = cuMemMap(dptr, size, 0, allocationHandle, 0);
    if (result != CUDA_SUCCESS) {
        printf("cuMemMap error %d\n",result);
        return cudaErrorMemoryAllocation;
    }

    CUmemAccessDesc accessDescriptor;
    accessDescriptor.location.id   = prop.location.id;
    accessDescriptor.location.type = prop.location.type;
    accessDescriptor.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    // Set the access flags for each location specified in desc for the given virtual address range.
    // Any new mapping to this virtual address will need to have access granted through cuMemSetAccess, as all mappings start with no accessibility setup.
    result = cuMemSetAccess(dptr, size, &accessDescriptor, 1);
    if (result != CUDA_SUCCESS) {
        printf("cuMemSetAccess error %d\n",result);
        return cudaErrorMemoryAllocation;
    }
    *ptr = (void *)dptr;

    return cudaSuccess;
}
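
For completeness, a hypothetical teardown counterpart is sketched below. It is not part of the reproducer and assumes the caller kept the rounded size and the CUmemGenericAllocationHandle, which vmm_alloc above does not return:

cudaError_t vmm_free(void *ptr, size_t size, CUmemGenericAllocationHandle handle) {
    CUdeviceptr dptr = (CUdeviceptr)ptr;
    // Unmap the VA range before releasing the backing physical allocation,
    // then free the address reservation itself.
    if (cuMemUnmap(dptr, size) != CUDA_SUCCESS ||
        cuMemRelease(handle) != CUDA_SUCCESS ||
        cuMemAddressFree(dptr, size) != CUDA_SUCCESS) {
        return cudaErrorMemoryAllocation;
    }
    return cudaSuccess;
}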

VMM is currently not supported. It should be fixed by #9867
cc @Akshay-Venkatesh