NVIDIA/nvcomp

[QST] It seems the high-level API doesn't support selecting which GPU to run on

TenWoods opened this issue · 6 comments

I created a compression manager and compressed my data with:

CascadedManager nvcomp_manager{chunk_size, nvcompBatchedCascadedDefaultOpts, resource->stream, device_id};
CompressionConfig comp_config = nvcomp_manager.configure_compression(resource->data_length);
resource->resizeDeviceCompMemory(comp_config.max_compressed_buffer_size);
nvcomp_manager.compress(resource->data_pointer, resource->device_comp_result, comp_config);
CHECK_ERROR(cudaStreamSynchronize(resource->stream), __FILE__, __LINE__);

I passed a device ID to select which GPU the compression runs on, but it always seems to run on GPU 0. Is there something wrong with the way I'm using it?

Just to double-check, is device_id the one associated with resource->stream? nvCOMP uses the provided stream for execution, which is associated with a particular device, so device_id needs to be the same device.
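To make that concrete, here's a rough sketch of the pattern I'd expect to work, with the stream, the buffers, and the manager all tied to the same device (the names and sizes below are illustrative, and error checking is omitted):

const int device_id = 1;
const size_t chunk_size = 1 << 16;
const size_t input_length = 1000000;

cudaSetDevice(device_id);                  // make device 1 the current device

cudaStream_t stream;
cudaStreamCreate(&stream);                 // this stream is now associated with device 1

// Buffers handed to the manager must also be allocated on device 1.
uint8_t* device_input;
cudaMalloc(&device_input, input_length);

// device_id must name the same device that the stream and the buffers belong to.
CascadedManager nvcomp_manager{chunk_size, nvcompBatchedCascadedDefaultOpts, stream, device_id};

The key point is that cudaSetDevice runs before the stream and the buffers are created, so everything the manager touches lives on the device it is told about.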

It's a little weird. I used the high-level API example code and added the device selection in the main function:

int main()
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }
    // ...
    for (int i = 0; i < 100000; i++)
    {
        comp_decomp_with_single_manager_with_checksums (device_input_ptrs, input_buffer_len);
    }
}

I also did the same device selection at the beginning of comp_decomp_with_single_manager_with_checksums, and I changed gpu_num to 1 in that function. But the program reported an error when running:

terminate called after throwing an instance of 'std::runtime_error'
  what():  Encountered Cuda Error: 2: 'out of memory'.

This is the same error I get when I change my resource->stream and device_id to the same device.
Here is my GPU environment:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01   Driver Version: 450.172.01   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   41C    P0    33W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   41C    P0    34W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   41C    P0    35W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla P100-SXM2...  On   | 00000000:87:00.0 Off |                    0 |
| N/A   40C    P0    34W / 300W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

Sorry, I don't think I have enough information to reproduce this error. Would you be able to provide the rest of the code you're using that's hitting this error?

Sorry for the insufficient information; here is my test code.

#include <iostream>
#include <random>
#include <assert.h>

#include "nvcomp/lz4.hpp"
#include "nvcomp.hpp"
#include "nvcomp/nvcompManagerFactory.hpp"

using namespace nvcomp;

#define CUDA_CHECK(cond)                                                       \
  do {                                                                         \
    cudaError_t err = cond;                                                    \
    if (err != cudaSuccess) {                                                  \
      std::cerr << "Failure" << std::endl;                                     \
      exit(1);                                                                 \
    }                                                                          \
  } while (false)

/**
 * In this example, we:
 *  1) construct an nvcompManager with checksum support enabled
 *  2) compress the input data
 *  3) decompress the input data
 */
void comp_decomp_with_single_manager_with_checksums(uint8_t* device_input_ptrs, const size_t input_buffer_len)
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }
    int current_device;
    cudaGetDevice(&current_device);
    std::cout << current_device << std::endl;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const int chunk_size = 1 << 16;
    nvcompType_t data_type = NVCOMP_TYPE_CHAR;

    // manager constructed with checksum mode as final argument
    nvcompBatchedLZ4Opts_t format_opts{data_type};
    LZ4Manager nvcomp_manager{chunk_size, format_opts, stream, deviceID, ComputeAndVerify};
    CompressionConfig comp_config = nvcomp_manager.configure_compression(input_buffer_len);

    uint8_t* comp_buffer;
    CUDA_CHECK(cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size));

    // Checksums are computed and stored for uncompressed and compressed buffers during compression
    nvcomp_manager.compress(device_input_ptrs, comp_buffer, comp_config);

    DecompressionConfig decomp_config = nvcomp_manager.configure_decompression(comp_buffer);
    uint8_t* res_decomp_buffer;
    CUDA_CHECK(cudaMalloc(&res_decomp_buffer, decomp_config.decomp_data_size));

    // Checksums are computed for compressed and decompressed buffers and verified against those
    // stored during compression
    nvcomp_manager.decompress(res_decomp_buffer, comp_buffer, decomp_config);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /*
     * After synchronizing the stream, the nvcomp status can be checked to see if
     * the checksums were successfully verified. Provided no unrelated nvcomp errors occurred,
     * if the checksums were successfully verified, the status will be nvcompSuccess. Otherwise,
     * it will be nvcompErrorBadChecksum.
     */
    nvcompStatus_t final_status = *decomp_config.get_status();
    if(final_status == nvcompErrorBadChecksum) {
        throw std::runtime_error("One or more checksums were incorrect.\n");
    }

    CUDA_CHECK(cudaFree(comp_buffer));
    CUDA_CHECK(cudaFree(res_decomp_buffer));

    CUDA_CHECK(cudaStreamDestroy(stream));
}

int main()
{
    int deviceCount;
    int deviceID = 1;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0 && deviceID < deviceCount)
    {
        std::cout << '\t' << "Device ID: " << deviceID << std::endl;
        cudaSetDevice(deviceID);
    }
    else
    {
        std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
    }

    // Initialize a random array of chars
    const size_t input_buffer_len = 1000000;
    std::vector<uint8_t> uncompressed_data(input_buffer_len);

    std::mt19937 random_gen(42);

    // char specialization of std::uniform_int_distribution is
    // non-standard, and isn't available on MSVC, so use short instead,
    // but with the range limited, and then cast below.
    std::uniform_int_distribution<short> uniform_dist(0, 255);
    for (size_t ix = 0; ix < input_buffer_len; ++ix) {
        uncompressed_data[ix] = static_cast<uint8_t>(uniform_dist(random_gen));
    }

    int current_device;
    cudaGetDevice(&current_device);
    std::cout << "Data on GPU " << current_device << std::endl;

    uint8_t* device_input_ptrs;
    CUDA_CHECK(cudaMalloc(&device_input_ptrs, input_buffer_len));
    CUDA_CHECK(cudaMemcpy(device_input_ptrs, uncompressed_data.data(), input_buffer_len, cudaMemcpyDefault));

    for (int i = 0; i < 100000; i++)
    {
        comp_decomp_with_single_manager_with_checksums(device_input_ptrs, input_buffer_len);
    }
    CUDA_CHECK(cudaFree(device_input_ptrs));
    return 0;
}
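
For reference, a small diagnostic I can drop between loop iterations to see which GPU is actually running out of memory (this is just a sketch, not part of the example above):

// Diagnostic sketch: report free memory on every visible device.
int device_count = 0;
cudaGetDeviceCount(&device_count);
for (int d = 0; d < device_count; ++d)
{
    cudaSetDevice(d);
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "GPU " << d << ": " << free_bytes / (1024 * 1024) << " MiB free of "
              << total_bytes / (1024 * 1024) << " MiB" << std::endl;
}
cudaSetDevice(deviceID); // restore the device used by the example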

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.