aim-uofa/Poseur

How to train on my own dataset?

trapqueenxx opened this issue · 3 comments

Hello, I have trained on my own dataset with 29 keypoints, and there is an error in poseur_head.py: the size of enc_outputs is [32, 17, 2] (32 is the batch size, 17 is the number of keypoints), which doesn't match my 29 keypoints. Then I found that in the PoseurTransformer_v3 class in transformer.py, num_joints is initialized to 17, so I changed it to 29 and started training. But the resulting epoch AP is 0, and the loss and accuracy in the training log are unusual, as shown below:

INFO - Epoch [1][50/1446] lr: 9.890e-05, eta: 4 days, 5:02:52, time: 1.198, data_time: 0.115, memory: 23176, enc_rle_loss: 319.5232, dec_rle_loss_0: 1515.8930, dec_rle_loss_1: 1094.4419, dec_rle_loss_2: 917.2394, dec_rle_loss_3: 1053.5126, dec_rle_loss_4: 977.9670, dec_rle_loss_5: 1089.1097, enc_coord_acc: 0.0002, dec_coord_acc: 0.0027, loss: 6967.6868

I don't know how to train on my own dataset.

Hi, @trapqueenxx can you provide the whole config file here?

Hi, @trapqueenxx can you provide the whole config file here?

Thank you for your response. My config is below.

# ---------------------------------------------------------------------------
# Runtime settings
# ---------------------------------------------------------------------------
log_level = 'INFO'
load_from = '../models/poseur_256x192_w32_6dec_coco.pth'
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(
    interval=10,
    metric='mAP',
    key_indicator='AP',
    rle_score=True)

# ---------------------------------------------------------------------------
# Optimizer and learning-rate schedule
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    weight_decay=1e-4,
    paramwise_cfg=dict(
        custom_keys={
            # 'backbone': dict(lr_mult=0.1),
            'sampling_offsets': dict(lr_mult=0.1),
            'reference_points': dict(lr_mult=0.1),
            # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0),
        }))

optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])

# ---------------------------------------------------------------------------
# Keypoint channel layout: 29 keypoints, indices 0..28 in natural order
# ---------------------------------------------------------------------------
channel_cfg = dict(
    num_output_channels=29,
    dataset_joints=29,
    dataset_channel=[list(range(29))],
    inference_channel=list(range(29)))

# Embedding width shared by the neck, positional encoding and transformer.
emb_dim = 256

norm_cfg = dict(type='BN', requires_grad=True)

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = dict(
    type='Poseur',
    # NOTE(review): `pretrained` reuses the same checkpoint path as
    # `load_from` -- confirm that this is intended.
    pretrained=load_from,
    backbone=dict(
        type='HRNet',
        norm_cfg=norm_cfg,
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256),
                multiscale_output=True))),
    neck=dict(
        type='ChannelMapper',
        in_channels=[32, 64, 128, 256],
        kernel_size=1,
        out_channels=emb_dim,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32)),
    keypoint_head=dict(
        type='Poseur_noise_sample',
        in_channels=512,
        num_queries=29,
        num_reg_fcs=2,
        num_joints=channel_cfg['num_output_channels'],
        with_box_refine=True,
        loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_hp_keypoint=dict(
            type='JointsMSELoss', use_target_weight=True, loss_weight=10),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=emb_dim // 2,
            normalize=True,
            offset=-0.5),
        transformer=dict(
            type='PoseurTransformer_v3',
            query_pose_emb=True,
            embed_dims=emb_dim,
            encoder=dict(
                type='DetrTransformerEncoder_zero_layer',
                num_layers=0,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    ffn_cfgs=dict(embed_dims=emb_dim),
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        num_levels=4,
                        num_points=4,
                        embed_dims=emb_dim),
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer_grouped',
                    ffn_cfgs=dict(embed_dims=emb_dim),
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=emb_dim,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention_post_value',
                            num_levels=4,
                            num_points=4,
                            embed_dims=emb_dim),
                    ],
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        as_two_stage=True,
        use_heatmap_loss=False),
    train_cfg=dict(image_size=[192, 256]),
    test_cfg=dict(
        image_size=[192, 256],
        flip_test=True,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

# ---------------------------------------------------------------------------
# Dataset settings
# ---------------------------------------------------------------------------
# NOTE(review): image_size here is [288, 384] while model.train_cfg and
# model.test_cfg use [192, 256] -- confirm that the mismatch is intended.
data_cfg = dict(
    image_size=[288, 384],
    heatmap_size=[72, 96],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    # use_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    det_bbox_thr=0.0,
    use_gt_bbox=True,
    bbox_file='')

# ---------------------------------------------------------------------------
# Data pipelines
# ---------------------------------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation',
        rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        target_type='wo_mask',
        type='TopDownGenerateCoordAndHeatMapTarget',
        encoding='MSRA',
        sigma=2),
    dict(
        type='Collect',
        keys=[
            'img', 'coord_target', 'coord_target_weight', 'hp_target',
            'hp_target_weight'
        ],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs'
        ]),
]

# The test pipeline is the very same list object as the validation pipeline.
test_pipeline = val_pipeline

# ---------------------------------------------------------------------------
# Data loaders: train / val / test splits all share `data_cfg`
# ---------------------------------------------------------------------------
data = dict(
    samples_per_gpu=32,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_train.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=train_pipeline),
    val=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=val_pipeline),
    test=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=test_pipeline))

fp16 = dict(loss_scale='dynamic')

@trapqueenxx I have updated a config file for the MPII dataset, so you can use it as a reference for your own dataset.