OpenGVLab/InternImage

[DCNv4 error] RuntimeError: falseINTERNAL ASSERT FAILED at src/cuda/dcnv4_col2im_cuda.cuh:470

billbliss3 opened this issue · 1 comment

I installed DCNv4 version 1.0.0.post2 via pip:

pip install DCNv4==1.0.0.post2

My config file is as follows:

img_backbone=dict(
    type='InternImage',
    core_op='DCNv3',
    ## large
    init_cfg=dict(type='Pretrained', checkpoint='ckpts/cascade_flash_internimage_l_fpn_3x_coco.pth'),
    # init_cfg=dict(type='Pretrained', checkpoint='ckpts/mask2former_flash_internimage_l_640_160k_ade20k_ss.pth'),
    channels=160,
    depths=[5, 5, 22, 5],
    groups=[10, 20, 40, 80],
    offset_scale=2.0,
    ##
    use_dcn_v4_op=True,
    out_indices=(2, 3),
    mlp_ratio=4.,
    drop_path_rate=0.4,
    norm_layer='LN',
    layer_scale=1.0,
    with_cp=use_checkpoint,
    post_norm=True,
    dw_kernel_size=3),

but I get the following error log:

File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 138, in run
iter_runner(iter_loaders[i], **kwargs)
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 68, in train
self.call_hook('after_train_iter')
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 309, in call_hook
getattr(hook, fn_name)(self)
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py", line 56, in after_train_iter
runner.outputs['loss'].backward()
File "/opt/conda/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 138, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 340, in wrapper
outputs = fn(ctx, *args)
File "/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 111, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/DCNv4/functions/dcnv4_func.py", line 125, in backward
ext.dcnv4_backward(*args)
RuntimeError: falseINTERNAL ASSERT FAILED at "/tmp/pip-install-3xkvdwi_/dcnv4_443530a10fe9416eb8a6d1a2a10d577b/src/cuda/dcnv4_col2im_cuda.cuh":470, please report a bug to PyTorch. kernel launch error
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 847) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

If I change the config to 'internimage_t_1k_224', it works fine:

  type='InternImage',
    core_op='DCNv3',
    channels=64,
    depths=[4, 4, 18, 4],
    groups=[4, 8, 16, 32],
    mlp_ratio=4.,
    drop_path_rate=0.2,
    norm_layer='LN',
    layer_scale=1.0,
    offset_scale=1.0,
    post_norm=False,
    with_cp=False,
    out_indices=(2, 3),
    use_dcn_v4_op=True,
    init_cfg=dict(type='Pretrained', checkpoint='ckpts/internimage_t_1k_224.pth')