running with cuda 11.3 and torch 1.10
Closed this issue · 0 comments
twangnh commented
Hi @myownskyW7 @yhcao6, I tried running detectron-v3det on torch 1.10.0 with cu113 on a 4090 GPU. It raises the following error during training, which seems to be caused by an incompatibility with the GPU architecture:
[04/10 19:12:42 d2.data.build]: Using training sampler RepeatFactorTrainingSampler
[04/10 19:12:43 d2.data.common]: Serializing 118804 elements to byte tensors and concatenating them all ...
[04/10 19:12:43 d2.data.common]: Serialized dataset takes 71.87 MiB
[04/10 19:12:44 detectron2]: Starting training from iteration 0
/root/miniconda3/lib/python3.8/site-packages/torch/functional.py:445: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:2157.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
/root/miniconda3/lib/python3.8/site-packages/torch/optim/lr_scheduler.py:129: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
Traceback (most recent call last):
File "tools/train_detic.py", line 292, in <module>
launch(
File "/root/autodl-tmp/detectron2-v3det/detectron2/engine/launch.py", line 82, in launch
main_func(*args)
File "tools/train_detic.py", line 271, in main
do_train(cfg, model, resume=args.resume)
File "tools/train_detic.py", line 178, in do_train
loss_dict = model(data)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/detectron2-v3det/projects/Detic/detic/modeling/meta_arch/custom_rcnn.py", line 157, in forward
proposals, proposal_losses = self.proposal_generator(
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/detectron2-v3det/projects/Detic/centernet/modeling/dense_heads/centernet.py", line 210, in forward
losses = self.losses(
File "/root/autodl-tmp/detectron2-v3det/projects/Detic/centernet/modeling/dense_heads/centernet.py", line 302, in losses
agn_pos_loss, agn_neg_loss = binary_heatmap_focal_loss_jit(
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)
nvrtc compilation failed:
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)
template<typename T>
__device__ T maximum(T a, T b) {
return isnan(a) ? a : (a > b ? a : b);
}
template<typename T>
__device__ T minimum(T a, T b) {
return isnan(a) ? a : (a < b ? a : b);
}
extern "C" __global__
void fused_clamp_neg_add_14842003191666628678(double vgamma_2, double vbeta_2, float* ttargets_1, float* tv_, double vsigmoid_clamp_2, double vv__, float* aten_mul_1, float* aten_mul, float* aten_pow_1, float* aten_log, float* aten_add_1, float* aten_pow, float* aten_add, float* aten_clamp) {
{
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<68200ll ? 1 : 0) {
float tv__1 = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
aten_clamp[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1);
float ttargets_1_1 = __ldg(ttargets_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (0.f - ttargets_1_1) + 1.f;
aten_pow[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = powf((0.f - ttargets_1_1) + 1.f, (float)(vbeta_2));
aten_add_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f;
aten_log[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f);
aten_pow_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2));
aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f)) * (powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2)));
aten_mul_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((logf((0.f - ((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1))) + 1.f)) * (powf((tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1)>(float)(vv__) ? (float)(vv__) : (tv__1<(float)(vsigmoid_clamp_2) ? (float)(vsigmoid_clamp_2) : tv__1), (float)(vgamma_2)))) * (powf((0.f - ttargets_1_1) + 1.f, (float)(vbeta_2)));
}}
}
As we only have 4090 GPUs, which need at least CUDA 11.3, could you please help?