[DCNv4 error] RuntimeError: falseINTERNAL ASSERT FAILED at src/cuda/dcnv4_col2im_cuda.cuh:470
billbliss3 opened this issue · 1 comments
I installed DCNv4 version 1.0.0.post2 via pip:
pip install DCNv4==1.0.0.post2
My config file is as follows:
img_backbone=dict(
type='InternImage',
core_op='DCNv3',
## large
init_cfg=dict(type='Pretrained', checkpoint='ckpts/cascade_flash_internimage_l_fpn_3x_coco.pth'),
# init_cfg=dict(type='Pretrained', checkpoint='ckpts/mask2former_flash_internimage_l_640_160k_ade20k_ss.pth'),
channels=160,
depths=[5, 5, 22, 5],
groups=[10, 20, 40, 80],
offset_scale=2.0,
##
use_dcn_v4_op=True,
out_indices=(2, 3),
mlp_ratio=4.,
drop_path_rate=0.4,
norm_layer='LN',
layer_scale=1.0,
with_cp=use_checkpoint,
post_norm=True,
dw_kernel_size=3),
But I get the following error log:
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 138, in run
iter_runner(iter_loaders[i], **kwargs)
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 68, in train
self.call_hook('after_train_iter')
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 309, in call_hook
getattr(hook, fn_name)(self)
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py", line 56, in after_train_iter
runner.outputs['loss'].backward()
File "/opt/conda/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 138, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 340, in wrapper
outputs = fn(ctx, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 111, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/DCNv4/functions/dcnv4_func.py", line 125, in backward
ext.dcnv4_backward(*args)
RuntimeError: falseINTERNAL ASSERT FAILED at "/tmp/pip-install-3xkvdwi/dcnv4_443530a10fe9416eb8a6d1a2a10d577b/src/cuda/dcnv4_col2im_cuda.cuh":470, please report a bug to PyTorch. kernel launch error
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 847) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
If I change the config to 'internimage_t_1k_224', it works fine:
type='InternImage',
core_op='DCNv3',
channels=64,
depths=[4, 4, 18, 4],
groups=[4, 8, 16, 32],
mlp_ratio=4.,
drop_path_rate=0.2,
norm_layer='LN',
layer_scale=1.0,
offset_scale=1.0,
post_norm=False,
with_cp=False,
out_indices=(2, 3),
use_dcn_v4_op=True,
init_cfg=dict(type='Pretrained', checkpoint='ckpts/internimage_t_1k_224.pth')