NVIDIA/nvcomp

[QST] It seems the high-level API doesn't support selecting which GPU to run on

TenWoods opened this issue · 6 comments

I created a compression manager and compressed my data with:

CascadedManager nvcomp_manager{chunk_size, nvcompBatchedCascadedDefaultOpts, resource->stream, device_id};
CompressionConfig comp_config = nvcomp_manager.configure_compression(resource->data_length);
resource->resizeDeviceCompMemory(comp_config.max_compressed_buffer_size);
nvcomp_manager.compress(resource->data_pointer, resource->device_comp_result, comp_config);
CHECK_ERROR(cudaStreamSynchronize(resource->stream), __FILE__, __LINE__);

I passed a device ID to select which GPU the compression runs on, but it always seems to run on GPU 0. Is there something wrong with the way I'm using it?

Just to double-check, is device_id the one associated with resource->stream? nvCOMP uses the provided stream for execution, which is associated with a particular device, so device_id needs to be the same device.
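To make that concrete, here's a rough sketch of the pattern I'd expect to work, with the stream, the buffers, and the manager all tied to the same device (the names and sizes below are illustrative, and error checking is omitted):

const int device_id = 1;
const size_t chunk_size = 1 << 16;
const size_t input_length = 1000000;

cudaSetDevice(device_id);                  // make device 1 the current device

cudaStream_t stream;
cudaStreamCreate(&stream);                 // this stream is now associated with device 1

// Buffers handed to the manager must also be allocated on device 1.
uint8_t* device_input;
cudaMalloc(&device_input, input_length);

// device_id must name the same device that the stream and the buffers belong to.
CascadedManager nvcomp_manager{chunk_size, nvcompBatchedCascadedDefaultOpts, stream, device_id};

The key point is that cudaSetDevice runs before the stream and the buffers are created, so everything the manager touches lives on the device it is told about.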

It's a little weird. I used the high-level API example code and added the device selection in the main function:

int main()
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }
    // ...
    for (int i = 0; i < 100000; i++)
    {
        comp_decomp_with_single_manager_with_checksums (device_input_ptrs, input_buffer_len);
    }
}

I also did the same device selection at the beginning of comp_decomp_with_single_manager_with_checksums, and I changed gpu_num to 1 in that function. But the program reported an error when running:

terminate called after throwing an instance of 'std::runtime_error'
  what():  Encountered Cuda Error: 2: 'out of memory'.

This is the same error I get when I change my resource->stream and device_id to the same device.
Here is my GPU environment:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01   Driver Version: 450.172.01   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   41C    P0    33W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   41C    P0    34W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   41C    P0    35W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla P100-SXM2...  On   | 00000000:87:00.0 Off |                    0 |
| N/A   40C    P0    34W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

Sorry, I don't think I have enough information to reproduce this error. Would you be able to provide the rest of the code you're using that's hitting this error?

Sorry for the insufficient information; here is my test code.

#include <iostream>
#include <random>
#include <assert.h>

#include "nvcomp/lz4.hpp"
#include "nvcomp.hpp"
#include "nvcomp/nvcompManagerFactory.hpp"

using namespace nvcomp;

#define CUDA_CHECK(cond)                                                       \
  do {                                                                         \
    cudaError_t err = cond;                                                    \
    if (err != cudaSuccess) {                                                  \
      std::cerr << "Failure" << std::endl;                                     \
      exit(1);                                                                 \
    }                                                                          \
  } while (false)

/**
 * In this example, we:
 *  1) construct an nvcompManager with checksum support enabled
 *  2) compress the input data
 *  3) decompress the input data
 */
void comp_decomp_with_single_manager_with_checksums(uint8_t* device_input_ptrs, const size_t input_buffer_len)
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }
    int current_device;
    cudaGetDevice(&current_device);
    std::cout << current_device << std::endl;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const int chunk_size = 1 << 16;
    nvcompType_t data_type = NVCOMP_TYPE_CHAR;

    // manager constructed with checksum mode as final argument
    nvcompBatchedLZ4Opts_t format_opts{data_type};
    LZ4Manager nvcomp_manager{chunk_size, format_opts, stream, deviceID, ComputeAndVerify};
    CompressionConfig comp_config = nvcomp_manager.configure_compression(input_buffer_len);

    uint8_t* comp_buffer;
    CUDA_CHECK(cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size));

    // Checksums are computed and stored for uncompressed and compressed buffers during compression
    nvcomp_manager.compress(device_input_ptrs, comp_buffer, comp_config);

    DecompressionConfig decomp_config = nvcomp_manager.configure_decompression(comp_buffer);
    uint8_t* res_decomp_buffer;
    CUDA_CHECK(cudaMalloc(&res_decomp_buffer, decomp_config.decomp_data_size));

    // Checksums are computed for compressed and decompressed buffers and verified against those
    // stored during compression
    nvcomp_manager.decompress(res_decomp_buffer, comp_buffer, decomp_config);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /*
     * After synchronizing the stream, the nvcomp status can be checked to see if
     * the checksums were successfully verified. Provided no unrelated nvcomp errors occurred,
     * if the checksums were successfully verified, the status will be nvcompSuccess. Otherwise,
     * it will be nvcompErrorBadChecksum.
     */
    nvcompStatus_t final_status = *decomp_config.get_status();
    if(final_status == nvcompErrorBadChecksum) {
        throw std::runtime_error("One or more checksums were incorrect.\n");
    }

    CUDA_CHECK(cudaFree(comp_buffer));
    CUDA_CHECK(cudaFree(res_decomp_buffer));

    CUDA_CHECK(cudaStreamDestroy(stream));
}

int main()
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }

    // Initialize a random array of chars
    const size_t input_buffer_len = 1000000;
    std::vector<uint8_t> uncompressed_data(input_buffer_len);

    std::mt19937 random_gen(42);

    // char specialization of std::uniform_int_distribution is
    // non-standard, and isn't available on MSVC, so use short instead,
    // but with the range limited, and then cast below.
    std::uniform_int_distribution<short> uniform_dist(0, 255);
    for (size_t ix = 0; ix < input_buffer_len; ++ix) {
        uncompressed_data[ix] = static_cast<uint8_t>(uniform_dist(random_gen));
    }

    int current_device;
    cudaGetDevice(&current_device);
    std::cout << "Data on GPU " << current_device << std::endl;

    uint8_t* device_input_ptrs;
    CUDA_CHECK(cudaMalloc(&device_input_ptrs, input_buffer_len));
    CUDA_CHECK(cudaMemcpy(device_input_ptrs, uncompressed_data.data(), input_buffer_len, cudaMemcpyDefault));

    for (int i = 0; i < 100000; i++)
    {
        comp_decomp_with_single_manager_with_checksums(device_input_ptrs, input_buffer_len);
    }
    CUDA_CHECK(cudaFree(device_input_ptrs));
    return 0;
}
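
For reference, a small diagnostic I can drop between loop iterations to see which GPU is actually running out of memory (this is just a sketch, not part of the example above):

// Diagnostic sketch: report free memory on every visible device.
int device_count = 0;
cudaGetDeviceCount(&device_count);
for (int d = 0; d < device_count; ++d)
{
    cudaSetDevice(d);
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "GPU " << d << ": " << free_bytes / (1024 * 1024) << " MiB free of "
              << total_bytes / (1024 * 1024) << " MiB" << std::endl;
}
cudaSetDevice(deviceID); // restore the device used by the example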

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.