Train D2Det on custom dataset
Opened this issue · 1 comment
phongnhhn92 commented
Hi, I have been trying to train D2Det with my custom dataset, which is already in the COCO format. My dataset has 7 classes, so I have edited num_classes = 7 in the bbox_head dict(). The other thing I have changed is the data dict() for my dataset.
However, I always have this CUDA error:
/home/phong/miniconda3/envs/d2/bin/python /home/phong/data/Zalo2020/D2Det/tools/train.py ../configs/D2Det/zalo.py
2020-11-05 00:57:30,306 - mmdet - INFO - Environment info:
------------------------------------------------------------
sys.platform: linux
Python: 3.6.12 |Anaconda, Inc.| (default, Sep 8 2020, 23:10:56) [GCC 7.3.0]
CUDA available: True
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 10.1, V10.1.105
GPU 0: GeForce RTX 2080 Ti
GCC: gcc (Ubuntu 8.4.0-3ubuntu2) 8.4.0
PyTorch: 1.2.0
PyTorch compiling details: PyTorch built with:
- GCC 7.3
- Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
- Intel(R) MKL-DNN v0.18.1 (Git Hash 7de7e5d02bf687f971e7668963649728356e0c20)
- OpenMP 201511 (a.k.a. OpenMP 4.5)
- NNPACK is enabled
- CUDA Runtime 10.0
- NVCC architecture flags: -gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_50,code=compute_50
- CuDNN 7.6.2
- Magma 2.5.1
- Build settings: BLAS=MKL, BUILD_NAMEDTENSOR=OFF, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -DUSE_FBGEMM -DUSE_QNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Wno-stringop-overflow, DISABLE_NUMA=1, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=True, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON,
TorchVision: 0.4.0
OpenCV: 4.4.0
MMCV: 0.4.3
MMDetection: 1.1.0+a76781a
MMDetection Compiler: GCC 8.4
MMDetection CUDA Compiler: 10.1
------------------------------------------------------------
2020-11-05 00:57:30,306 - mmdet - INFO - Distributed training: False
2020-11-05 00:57:30,306 - mmdet - INFO - Config:
/home/phong/data/Zalo2020/D2Det/configs/D2Det/zalo.py
# model settings
model = dict(
type='D2Det',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(
type='DeformRoIPoolingPack',
out_size=7,
sample_per_part=2,
out_channels=256,
no_trans=False,
group_size=1,
trans_std=0.1),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCBBoxHead',
with_reg=False,
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=7,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0)),
reg_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
D2Det_head=dict(
type='D2DetHead',
num_convs=8,
in_channels=256,
norm_cfg=dict(type='GN', num_groups=36),
MASK_ON=False))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_radius=1,
pos_weight=-1,
max_num_reg=192,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.03, nms=dict(type='nms', iou_thr=0.5), max_per_img=125))
# dataset settings
dataset_type = 'CocoDataset'
data_root = '../data/zalo/za_traffic_2020/traffic_train/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=1,
workers_per_gpu=1,
train=dict(
type=dataset_type,
ann_file=data_root + 'train_traffic_sign_dataset.json',
img_prefix=data_root + 'images/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'train_traffic_sign_dataset.json',
img_prefix=data_root + 'images_val/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'train_traffic_sign_dataset.json',
img_prefix=data_root + 'images_val/',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 80,
step=[20, 23])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/D2Det_detection_r50_fpn_2x'
load_from = None
resume_from = None
workflow = [('train', 1)]
2020-11-05 00:57:30,641 - mmdet - INFO - load model from: torchvision://resnet50
2020-11-05 00:57:30,781 - mmdet - WARNING - The model and loaded state dict do not match exactly
unexpected key in source state_dict: fc.weight, fc.bias
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
2020-11-05 00:57:33,651 - mmdet - INFO - Start running, host: phong@phong-Server, work_dir: /home/phong/data/Zalo2020/D2Det/tools/work_dirs/D2Det_detection_r50_fpn_2x
2020-11-05 00:57:33,651 - mmdet - INFO - workflow: [('train', 1)], max: 24 epochs
Traceback (most recent call last):
File "/home/phong/data/Zalo2020/D2Det/tools/train.py", line 144, in <module>
main()
File "/home/phong/data/Zalo2020/D2Det/tools/train.py", line 140, in main
meta=meta)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/apis/train.py", line 111, in train_detector
meta=meta)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/apis/train.py", line 233, in _non_dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/mmcv/runner/runner.py", line 359, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/mmcv/runner/runner.py", line 263, in train
self.model, data_batch, train_mode=True, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/apis/train.py", line 75, in batch_processor
losses = model(**data)
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
return self.module(*inputs[0], **kwargs[0])
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/core/fp16/decorators.py", line 49, in new_func
return old_func(*args, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/models/detectors/base.py", line 137, in forward
return self.forward_train(img, img_meta, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/models/detectors/D2Det.py", line 157, in forward_train
x[:self.reg_roi_extractor.num_inputs], pos_rois)
File "/home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/core/fp16/decorators.py", line 127, in new_func
return old_func(*args, **kwargs)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/models/roi_extractors/single_level.py", line 96, in forward
target_lvls = self.map_roi_levels(rois, num_levels)
File "/home/phong/data/Zalo2020/D2Det/tools/../mmdet/models/roi_extractors/single_level.py", line 70, in map_roi_levels
(rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1))
RuntimeError: CUDA error: device-side assert triggered
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [0,0,0], thread: [0,0,0] Assertion `cur_target >= 0 && cur_target < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [0,0,0], thread: [1,0,0] Assertion `cur_target >= 0 && cur_target < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [0,0,0], thread: [2,0,0] Assertion `cur_target >= 0 && cur_target < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [0,0,0], thread: [3,0,0] Assertion `cur_target >= 0 && cur_target < n_classes` failed.
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: device-side assert triggered (insert_events at /pytorch/c10/cuda/CUDACachingAllocator.cpp:569)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x33 (0x7f920ac67273 in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x10f1e (0x7f920ae9ef1e in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x146b1 (0x7f920aea26b1 in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: c10::TensorImpl::release_resources() + 0x74 (0x7f920ac55354 in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #4: <unknown function> + 0x1c6b44 (0x7f9254ec5b44 in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #5: <unknown function> + 0x40686b (0x7f925510586b in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0x406896 (0x7f9255105896 in /home/phong/miniconda3/envs/d2/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #7: <unknown function> + 0x19b9fe (0x55b4d8ad99fe in /home/phong/miniconda3/envs/d2/bin/python)
frame #8: <unknown function> + 0xf2da8 (0x55b4d8a30da8 in /home/phong/miniconda3/envs/d2/bin/python)
frame #9: <unknown function> + 0xf2ec7 (0x55b4d8a30ec7 in /home/phong/miniconda3/envs/d2/bin/python)
frame #10: <unknown function> + 0xf2787 (0x55b4d8a30787 in /home/phong/miniconda3/envs/d2/bin/python)
frame #11: <unknown function> + 0xf2617 (0x55b4d8a30617 in /home/phong/miniconda3/envs/d2/bin/python)
frame #12: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #13: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #14: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #15: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #16: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #17: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #18: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #19: <unknown function> + 0xf262d (0x55b4d8a3062d in /home/phong/miniconda3/envs/d2/bin/python)
frame #20: PyDict_SetItem + 0x3da (0x55b4d8a774ba in /home/phong/miniconda3/envs/d2/bin/python)
frame #21: PyDict_SetItemString + 0x4f (0x55b4d8a7e4df in /home/phong/miniconda3/envs/d2/bin/python)
frame #22: PyImport_Cleanup + 0x99 (0x55b4d8ae3d49 in /home/phong/miniconda3/envs/d2/bin/python)
frame #23: Py_FinalizeEx + 0x61 (0x55b4d8b4e061 in /home/phong/miniconda3/envs/d2/bin/python)
frame #24: Py_Main + 0x35e (0x55b4d8b583ae in /home/phong/miniconda3/envs/d2/bin/python)
frame #25: main + 0xee (0x55b4d8a2243e in /home/phong/miniconda3/envs/d2/bin/python)
frame #26: __libc_start_main + 0xf3 (0x7f9264de10b3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #27: <unknown function> + 0x1c3d0b (0x55b4d8b01d0b in /home/phong/miniconda3/envs/d2/bin/python)
Process finished with exit code 134 (interrupted by signal 6: SIGABRT)
Is this something related to the annotations that I am using? One thing I have noticed is that in the COCO dataset the bounding box information is stored as float numbers, but mine is stored as int numbers. Could that be the cause of the error?
JialeCao001 commented
@phongnhhn92 I am not sure about the problem. Can you print the size of RoI?