
running with cuda 11.3 and torch 1.10


Hi @myownskyW7 @yhcao6, I tried running detectron2-v3det with torch 1.10.0 + cu113 on an RTX 4090 GPU. It raises the following error during training, which looks like a GPU-architecture incompatibility:

[04/10 19:12:42 d2.data.build]: Using training sampler RepeatFactorTrainingSampler
[04/10 19:12:43 d2.data.common]: Serializing 118804 elements to byte tensors and concatenating them all ...
[04/10 19:12:43 d2.data.common]: Serialized dataset takes 71.87 MiB
[04/10 19:12:44 detectron2]: Starting training from iteration 0
/root/miniconda3/lib/python3.8/site-packages/torch/functional.py:445: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at  ../aten/src/ATen/native/TensorShape.cpp:2157.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
/root/miniconda3/lib/python3.8/site-packages/torch/optim/lr_scheduler.py:129: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`.  Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
  warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
Traceback (most recent call last):
  File "tools/train_detic.py", line 292, in <module>
    launch(
  File "/root/autodl-tmp/detectron2-v3det/detectron2/engine/launch.py", line 82, in launch
    main_func(*args)
  File "tools/train_detic.py", line 271, in main
    do_train(cfg, model, resume=args.resume)
  File "tools/train_detic.py", line 178, in do_train
    loss_dict = model(data)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/autodl-tmp/detectron2-v3det/projects/Detic/detic/modeling/meta_arch/custom_rcnn.py", line 157, in forward
    proposals, proposal_losses = self.proposal_generator(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/autodl-tmp/detectron2-v3det/projects/Detic/centernet/modeling/dense_heads/centernet.py", line 210, in forward
    losses = self.losses(
  File "/root/autodl-tmp/detectron2-v3det/projects/Detic/centernet/modeling/dense_heads/centernet.py", line 302, in losses
    agn_pos_loss, agn_neg_loss = binary_heatmap_focal_loss_jit(
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_clamp_neg_add_14842003191666628678(double vgamma_2, double vbeta_2, float* ttargets_1, float* tv_, double vsigmoid_clamp_2, double vv__, float* aten_mul_1, float* aten_mul, float* aten_pow_1, float* aten_log, float* aten_add_1, float* aten_pow, float* aten_add, float* aten_clamp) {
{
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<68200ll ? 1 : 0) {
    float tv__1 = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    aten_clamp[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1);
    float ttargets_1_1 = __ldg(ttargets_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (0.f - ttargets_1_1) + 1.f;
    aten_pow[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = powf((0.f - ttargets_1_1) + 1.f, (float)(vbeta_2));
    aten_add_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f;
    aten_log[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f);
    aten_pow_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2));
    aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f)) * (powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2)));
    aten_mul_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f)) * (powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2)))) * (powf((0.f - ttargets_1_1) + 1.f, (float)(vbeta_2)));
  }}
}
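
For reference, this is what the environment reports on our machine (the values in the comments are what I expect for the cu113 wheel on a 4090; I am quoting them from memory):

```python
import torch

print(torch.__version__)                   # 1.10.0+cu113
print(torch.version.cuda)                  # 11.3
print(torch.cuda.get_device_capability())  # (8, 9) on an RTX 4090
```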

Since the only GPUs we have are RTX 4090s (compute capability 8.9, which the NVRTC shipped with CUDA 11.3 does not accept as a --gpu-architecture value, hence the error above), could you please help?
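
In case it is useful to others hitting this, the crash goes away for me if I stop TorchScript from compiling fused kernels through NVRTC, so that binary_heatmap_focal_loss_jit falls back to eager execution. This is a workaround sketch, not a proper fix; the two torch._C calls below exist in torch 1.10 but are internal, not public API:

```python
import torch

# Workaround sketch: disable TorchScript kernel fusion on GPU so the scripted
# loss is interpreted eagerly instead of being compiled through NVRTC
# (which rejects sm_89 on CUDA 11.3). Run this before training starts,
# e.g. near the top of tools/train_detic.py.
torch._C._jit_set_texpr_fuser_enabled(False)   # turn off the NNC/TensorExpr fuser
torch._C._jit_override_can_fuse_on_gpu(False)  # also forbid the legacy GPU fuser
```

This trades a bit of speed for never invoking NVRTC; the loss still runs on the GPU through ordinary ATen kernels.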