[QST] It seems the high-level API doesn't support selecting which GPU to run on
TenWoods opened this issue · 6 comments
I created a compress manager and compressed my data by
CascadedManager nvcomp_manager{chunk_size , nvcompBatchedCascadedDefaultOpts, resource->stream, device_id};
CompressionConfig comp_config = nvcomp_manager.configure_compression(resource->data_length);
resource->resizeDeviceCompMemory(comp_config.max_compressed_buffer_size);
nvcomp_manager.compress(resource->data_pointer, resource->device_comp_result, comp_config);
CHECK_ERROR(cudaStreamSynchronize(resource->stream), __FILE__, __LINE__);
I have passed device id
to select which GPU runs the compression, but it seems to always run on GPU 0. Is there something wrong with the way I use it?
Just to double-check, is device_id
the one associated with resource->stream
? nvCOMP uses the provided stream for execution, which is associated with a particular device, so device_id
needs to be the same device.
It's a little weird, I used the example code of high-level API and added the device selection in main function:
int main()
{
int deviceCount;
int deviceID = 1;
cudaGetDeviceCount(&deviceCount);
if (deviceCount > 0 && deviceID < deviceCount)
{
std::cout << '\t' << "Device ID: " << deviceID << std::endl;
cudaSetDevice(deviceID);
}
else
{
std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
}
.
.
.
.
for (int i = 0; i < 100000; i++)
{
comp_decomp_with_single_manager_with_checksums (device_input_ptrs, input_buffer_len);
}
}
I also did the same device selection at the beginning of function comp_decomp_with_single_manager_with_checksums
. And I modified the gpu_num
to 1 in function comp_decomp_with_single_manager_with_checksums
. But the program reported an error when running:
terminate called after throwing an instance of 'std::runtime_error'
what(): Encountered Cuda Error: 2: 'out of memory'.
This error is the same as when I change my resource->stream
and device_id
to the same device.
Here is my GPU environment:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01 Driver Version: 450.172.01 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 41C P0 33W / 300W | 2MiB / 16280MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:08:00.0 Off | 0 |
| N/A 41C P0 34W / 300W | 2MiB / 16280MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 41C P0 35W / 300W | 2MiB / 16280MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:87:00.0 Off | 0 |
| N/A 40C P0 34W / 300W | 2MiB / 16280MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
Sorry, I don't think I have enough information to reproduce this error. Would you be able to provide the rest of the code you're using that's hitting this error?
Sorry for insufficient information, here is my test code.
#include <iostream>
#include <random>
#include <assert.h>
#include <iostream>
#include "nvcomp/lz4.hpp"
#include "nvcomp.hpp"
#include "nvcomp/nvcompManagerFactory.hpp"
using namespace nvcomp;
// Abort on any failed CUDA runtime call, reporting the actual error.
// The original printed only "Failure", which hides the error code, the
// message (e.g. "out of memory"), and the location — exactly the details
// needed to diagnose the issue reported in this thread.
#define CUDA_CHECK(cond) \
  do { \
    cudaError_t err = cond; \
    if (err != cudaSuccess) { \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
                << ": " << cudaGetErrorString(err) \
                << " (" << static_cast<int>(err) << ")" << std::endl; \
      exit(1); \
    } \
  } while (false)
/**
* In this example, we:
* 1) construct an nvcompManager with checksum support enabled
* 2) compress the input data
* 3) decompress the input data
*/
/**
 * Compress then decompress `device_input_ptrs` (of `input_buffer_len` bytes)
 * with the nvCOMP high-level LZ4 manager, with checksum computation and
 * verification enabled.
 *
 * Preconditions: `device_input_ptrs` is a device allocation of at least
 * `input_buffer_len` bytes, resident on (or accessible from) the device
 * selected below.
 *
 * NOTE(review): this function creates a stream, a manager, and two device
 * buffers on every call. The caller invokes it 100000 times in a loop, so
 * any allocation that is not released promptly (or any stream-ordered free
 * that never completes) accumulates — a likely source of the reported
 * 'out of memory' error. Hoisting the stream/manager/buffers out of the
 * loop is the recommended structure.
 */
void comp_decomp_with_single_manager_with_checksums(uint8_t* device_input_ptrs, const size_t input_buffer_len)
{
  int deviceCount = 0;
  int deviceID = 1;
  CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
  if (deviceCount > 0 && deviceID < deviceCount)
  {
    std::cout << '\t' << "Device ID: " << deviceID << std::endl;
    // The stream created below is bound to the current device, and the
    // device_id handed to the manager must match that device.
    CUDA_CHECK(cudaSetDevice(deviceID));
  }
  else
  {
    std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
  }
  int current_device = -1;
  // Original text was garbled ("¤t_device" — mangled "&current_device").
  CUDA_CHECK(cudaGetDevice(&current_device));
  std::cout << current_device << std::endl;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  const int chunk_size = 1 << 16;
  nvcompType_t data_type = NVCOMP_TYPE_CHAR;
  // Manager constructed with checksum mode as the final argument; it executes
  // on `stream`, so deviceID must be the device `stream` was created on.
  nvcompBatchedLZ4Opts_t format_opts{data_type};
  LZ4Manager nvcomp_manager{chunk_size, format_opts, stream, deviceID, ComputeAndVerify};
  CompressionConfig comp_config = nvcomp_manager.configure_compression(input_buffer_len);
  uint8_t* comp_buffer;
  CUDA_CHECK(cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size));
  // Checksums are computed and stored for uncompressed and compressed buffers
  // during compression.
  nvcomp_manager.compress(device_input_ptrs, comp_buffer, comp_config);
  DecompressionConfig decomp_config = nvcomp_manager.configure_decompression(comp_buffer);
  uint8_t* res_decomp_buffer;
  CUDA_CHECK(cudaMalloc(&res_decomp_buffer, decomp_config.decomp_data_size));
  // Checksums are computed for compressed and decompressed buffers and
  // verified against those stored during compression.
  nvcomp_manager.decompress(res_decomp_buffer, comp_buffer, decomp_config);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  /*
   * After synchronizing the stream, the nvcomp status can be checked to see if
   * the checksums were successfully verified. Provided no unrelated nvcomp
   * errors occurred, if the checksums were successfully verified, the status
   * will be nvcompSuccess. Otherwise, it will be nvcompErrorBadChecksum.
   */
  nvcompStatus_t final_status = *decomp_config.get_status();
  // Free device resources BEFORE a potential throw so a checksum failure
  // doesn't leak the buffers and the stream (the original threw first).
  CUDA_CHECK(cudaFree(comp_buffer));
  CUDA_CHECK(cudaFree(res_decomp_buffer));
  CUDA_CHECK(cudaStreamDestroy(stream));
  if (final_status == nvcompErrorBadChecksum) {
    throw std::runtime_error("One or more checksums were incorrect.\n");
  }
}
/**
 * Driver: selects device 1 (falling back with a warning if unavailable),
 * fills a 1 MB host buffer with pseudo-random bytes, copies it to the
 * current device, then runs the compress/decompress round-trip repeatedly.
 */
int main()
{
  int deviceCount = 0;
  int deviceID = 1;
  CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
  if (deviceCount > 0 && deviceID < deviceCount)
  {
    std::cout << '\t' << "Device ID: " << deviceID << std::endl;
    CUDA_CHECK(cudaSetDevice(deviceID));
  }
  else
  {
    std::cerr << '\t' << "Bad device ID " << deviceID << std::endl;
  }
  // Initialize a random array of chars (fixed seed for reproducibility).
  const size_t input_buffer_len = 1000000;
  std::vector<uint8_t> uncompressed_data(input_buffer_len);
  std::mt19937 random_gen(42);
  // char specialization of std::uniform_int_distribution is
  // non-standard, and isn't available on MSVC, so use short instead,
  // but with the range limited, and then cast below.
  std::uniform_int_distribution<short> uniform_dist(0, 255);
  for (size_t ix = 0; ix < input_buffer_len; ++ix) {
    uncompressed_data[ix] = static_cast<uint8_t>(uniform_dist(random_gen));
  }
  int current_device = -1;
  // Original text was garbled ("¤t_device" — mangled "&current_device").
  CUDA_CHECK(cudaGetDevice(&current_device));
  std::cout << "Data on GPU " << current_device << std::endl;
  uint8_t* device_input_ptrs;
  CUDA_CHECK(cudaMalloc(&device_input_ptrs, input_buffer_len));
  CUDA_CHECK(cudaMemcpy(device_input_ptrs, uncompressed_data.data(), input_buffer_len, cudaMemcpyDefault));
  // NOTE(review): each iteration re-creates the stream, manager, and device
  // buffers inside the callee — see the note on that function.
  for (int i = 0; i < 100000; i++)
  {
    comp_decomp_with_single_manager_with_checksums(device_input_ptrs, input_buffer_len);
  }
  CUDA_CHECK(cudaFree(device_input_ptrs));
  return 0;
}
This issue has been labeled inactive-30d
due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d
if there is no activity in the next 60 days.
This issue has been labeled inactive-90d
due to no recent activity in the past 90 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.