
About loss function

Opened this issue · 1 comments


I want to replace the softmax-based multinomial cross-entropy loss in the affordance detection branch with a sigmoid-based binary cross-entropy loss. How can I do this?

I tried changing the train.prototxt file as follows:
`layer {
name: "mask_score"
type: "Convolution"
bottom: "mask_deconv3" #
top: "mask_score"
param { lr_mult: 1.0 decay_mult: 1.0 }
param { lr_mult: 2.0 decay_mult: 0 }
convolution_param {
#num_output: 10 # 9 affordance classes + 1 background
#num_output: 1# output will be 1x1x14x14 --> for using SigmoidCrossEntropyLoss
num_output: 2# output will be 1x2x14x14 --> for using Softmax. Actually, binomial cross-entropy loss
#(sigmoid + cross entropy) = logistic regression = two classes softmax regression
kernel_size: 1 pad: 0
weight_filler {type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "loss_mask"
type: "SoftmaxWithLoss"
#bottom: "mask_score_reshape"
bottom: "mask_score"
bottom: "mask_targets"
top: "loss_mask"
loss_weight: 3
loss_param {
ignore_label: -1
normalize: true
#normalize: false
propagate_down: true # backprop to prediction
propagate_down: false # don't backprop to labels

and set base_lr = 1e-10 (a larger base_lr doesn't work). But the loss fluctuates randomly: sometimes it is as large as 100 and sometimes as small as 6. I can't see any downward trend in the loss.

Hi @lsj910128, have you solved this? I'm trying to do the same thing but cannot figure out how. This is my prototxt file; note that I added an extra Python layer (BinaryMaskLayer) to convert the class values in a 2D array into binary values:

name: "VGG_ILSVRC_16_layers"
layer {
name: 'input-data'
type: 'Python'
top: 'data'
top: 'im_info'
top: 'gt_boxes'
top: 'seg_mask_inds'
top: 'flipped'
python_param {
module: 'roi_data_layer.layer'
layer: 'RoIDataLayer'
param_str: "'num_classes': 3" # 2 obj categories + 1 background
layer {
name: "conv1_1"
type: "Convolution"
bottom: "data"
top: "conv1_1"
param {
lr_mult: 0
decay_mult: 0
param {
lr_mult: 0
decay_mult: 0
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
layer {
name: "relu1_1"
type: "ReLU"
bottom: "conv1_1"
top: "conv1_1"
layer {
name: "conv1_2"
type: "Convolution"
bottom: "conv1_1"
top: "conv1_2"
param {
lr_mult: 0
decay_mult: 0
param {
lr_mult: 0
decay_mult: 0
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
layer {
name: "relu1_2"
type: "ReLU"
bottom: "conv1_2"
top: "conv1_2"
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1_2"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
layer {
name: "conv2_1"
type: "Convolution"
bottom: "pool1"
top: "conv2_1"
param {
lr_mult: 0
decay_mult: 0
param {
lr_mult: 0
decay_mult: 0
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
layer {
name: "relu2_1"
type: "ReLU"
bottom: "conv2_1"
top: "conv2_1"
layer {
name: "conv2_2"
type: "Convolution"
bottom: "conv2_1"
top: "conv2_2"
param {
lr_mult: 0
decay_mult: 0
param {
lr_mult: 0
decay_mult: 0
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
layer {
name: "relu2_2"
type: "ReLU"
bottom: "conv2_2"
top: "conv2_2"
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2_2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
layer {
name: "conv3_1"
type: "Convolution"
bottom: "pool2"
top: "conv3_1"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
layer {
name: "relu3_1"
type: "ReLU"
bottom: "conv3_1"
top: "conv3_1"
layer {
name: "conv3_2"
type: "Convolution"
bottom: "conv3_1"
top: "conv3_2"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
layer {
name: "relu3_2"
type: "ReLU"
bottom: "conv3_2"
top: "conv3_2"
layer {
name: "conv3_3"
type: "Convolution"
bottom: "conv3_2"
top: "conv3_3"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
layer {
name: "relu3_3"
type: "ReLU"
bottom: "conv3_3"
top: "conv3_3"
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3_3"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
layer {
name: "conv4_1"
type: "Convolution"
bottom: "pool3"
top: "conv4_1"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu4_1"
type: "ReLU"
bottom: "conv4_1"
top: "conv4_1"
layer {
name: "conv4_2"
type: "Convolution"
bottom: "conv4_1"
top: "conv4_2"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu4_2"
type: "ReLU"
bottom: "conv4_2"
top: "conv4_2"
layer {
name: "conv4_3"
type: "Convolution"
bottom: "conv4_2"
top: "conv4_3"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu4_3"
type: "ReLU"
bottom: "conv4_3"
top: "conv4_3"
layer {
name: "pool4"
type: "Pooling"
bottom: "conv4_3"
top: "pool4"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
layer {
name: "conv5_1"
type: "Convolution"
bottom: "pool4"
top: "conv5_1"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu5_1"
type: "ReLU"
bottom: "conv5_1"
top: "conv5_1"
layer {
name: "conv5_2"
type: "Convolution"
bottom: "conv5_1"
top: "conv5_2"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu5_2"
type: "ReLU"
bottom: "conv5_2"
top: "conv5_2"
layer {
name: "conv5_3"
type: "Convolution"
bottom: "conv5_2"
top: "conv5_3"
param {
lr_mult: 1
param {
lr_mult: 2
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
layer {
name: "relu5_3"
type: "ReLU"
bottom: "conv5_3"
top: "conv5_3"

#========= RPN ============

layer {
name: "rpn_conv/3x3"
type: "Convolution"
bottom: "conv5_3"
top: "rpn/output"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 }
bias_filler { type: "constant" value: 0 }
layer {
name: "rpn_relu/3x3"
type: "ReLU"
bottom: "rpn/output"
top: "rpn/output"

layer {
name: "rpn_cls_score"
type: "Convolution"
bottom: "rpn/output"
top: "rpn_cls_score"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
#num_output: 24
num_output: 30 # 2(bg/fg) * 15(n_anchors)
kernel_size: 1 pad: 0 stride: 1
weight_filler { type: "gaussian" std: 0.01 }
bias_filler { type: "constant" value: 0 }

layer {
name: "rpn_bbox_pred"
type: "Convolution"
bottom: "rpn/output"
top: "rpn_bbox_pred"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
#num_output: 48 # 4 * 12(anchors)
num_output: 60 # 4 * 15(anchors)
kernel_size: 1 pad: 0 stride: 1
weight_filler { type: "gaussian" std: 0.01 }
bias_filler { type: "constant" value: 0 }

layer {
bottom: "rpn_cls_score"
top: "rpn_cls_score_reshape"
name: "rpn_cls_score_reshape"
type: "Reshape"
reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }

layer {
name: 'rpn-data'
type: 'Python'
bottom: 'rpn_cls_score'
bottom: 'gt_boxes'
bottom: 'im_info'
bottom: 'data'
top: 'rpn_labels'
top: 'rpn_bbox_targets'
top: 'rpn_bbox_inside_weights'
top: 'rpn_bbox_outside_weights'
python_param {
module: 'rpn.anchor_target_layer'
layer: 'AnchorTargetLayer'
#param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]"
param_str: "'feat_stride': 16 \n'scales': !!python/tuple [2, 4, 8, 16, 32]"

layer {
name: "rpn_loss_cls"
type: "SoftmaxWithLoss"
bottom: "rpn_cls_score_reshape"
bottom: "rpn_labels"
propagate_down: 1
propagate_down: 0
top: "rpn_cls_loss"
loss_weight: 1
loss_param {
ignore_label: -1
normalize: true

layer {
name: "rpn_loss_bbox"
type: "SmoothL1Loss"
bottom: "rpn_bbox_pred"
bottom: "rpn_bbox_targets"
bottom: 'rpn_bbox_inside_weights'
bottom: 'rpn_bbox_outside_weights'
top: "rpn_loss_bbox"
loss_weight: 1
smooth_l1_loss_param { sigma: 3.0 }

#========= RoI Proposal ============

layer {
name: "rpn_cls_prob"
type: "Softmax"
bottom: "rpn_cls_score_reshape"
top: "rpn_cls_prob"

layer {
name: 'rpn_cls_prob_reshape'
type: 'Reshape'
bottom: 'rpn_cls_prob'
top: 'rpn_cls_prob_reshape'
#reshape_param { shape { dim: 0 dim: 24 dim: -1 dim: 0 } }
reshape_param { shape { dim: 0 dim: 30 dim: -1 dim: 0 } }

layer {
name: 'proposal'
type: 'Python'
bottom: 'rpn_cls_prob_reshape'
bottom: 'rpn_bbox_pred'
bottom: 'im_info'
top: 'rpn_rois'
python_param {
module: 'rpn.proposal_layer'
layer: 'ProposalLayer'
#param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]"
param_str: "'feat_stride': 16 \n'scales': !!python/tuple [2, 4, 8, 16, 32]"

layer {
name: 'roi-data'
type: 'Python'
bottom: 'rpn_rois'
bottom: 'gt_boxes'
bottom: 'im_info'
bottom: 'seg_mask_inds'
bottom: 'flipped'
top: 'rois'
top: 'labels'
top: 'bbox_targets'
top: 'bbox_inside_weights'
top: 'bbox_outside_weights'
top: 'mask_targets'
top: 'rois_pos'
top: 'label_for_mask'
python_param {
module: 'rpn.proposal_target_layer_ppsigmoid'
layer: 'ProposalTargetLayer'
param_str: "'num_classes': 3"

#========= RCNN ============

layer {
name: "roi_pool5"
#type: "ROIPooling"
#type: "ROIAlignment2"
type: "ROIAlignment"
bottom: "conv5_3" #bottom[0]
bottom: "rois" #bottom[1]
top: "pool5"
#roi_pooling_param {
#roi_alignment2_param {
roi_alignment_param {
pooled_w: 7
pooled_h: 7
spatial_scale: 0.0625
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
param {
lr_mult: 2
inner_product_param {
num_output: 4096
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
param {
lr_mult: 2
inner_product_param {
num_output: 4096
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
layer {
name: "cls_score"
type: "InnerProduct"
bottom: "fc7"
top: "cls_score"
param {
lr_mult: 1
param {
lr_mult: 2
inner_product_param {
weight_filler {
type: "gaussian"
std: 0.01
bias_filler {
type: "constant"
value: 0
layer {
name: "bbox_pred"
type: "InnerProduct"
bottom: "fc7"
top: "bbox_pred"
param {
lr_mult: 1
param {
lr_mult: 2
inner_product_param {
num_output: 12 # = 4 * 3, i.e., box coordinate for each class
weight_filler {
type: "gaussian"
std: 0.001
bias_filler {
type: "constant"
value: 0
layer {
name: "loss_cls"
type: "SoftmaxWithLoss"
bottom: "cls_score"
bottom: "labels"
propagate_down: 1
propagate_down: 0
top: "loss_cls"
loss_weight: 3
layer {
name: "loss_bbox"
type: "SmoothL1Loss"
bottom: "bbox_pred"
bottom: "bbox_targets"
bottom: "bbox_inside_weights"
bottom: "bbox_outside_weights"
top: "loss_bbox"
loss_weight: 2

##############Mask branch####################################
layer {
name: "roi_pool5_2"
#type: "ROIPooling"
#type: "ROIAlignment2"
type: "ROIAlignment"
bottom: "conv5_3"
bottom: "rois_pos"
top: "pool5_2"
#roi_pooling_param {
pooled_w: 7
pooled_h: 7
spatial_scale: 0.0625 # 1/16

Conv-Relu 1

layer {
name: "pool5_2_conv"
type: "Convolution"
bottom: "pool5_2"
top: "pool5_2_conv"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv_relu"
type: "ReLU"
bottom: "pool5_2_conv"
top: "pool5_2_conv_relu"

Conv-Relu 2

layer {
name: "pool5_2_conv2"
type: "Convolution"
bottom: "pool5_2_conv_relu"
top: "pool5_2_conv2"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1#kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv2_relu"
type: "ReLU"
bottom: "pool5_2_conv2"
top: "pool5_2_conv2_relu"

Deconv 1

layer {
name: "mask_deconv1"
type: "Deconvolution"
#bottom: "pool5_2_conv_relu"
bottom: "pool5_2_conv2_relu"
top: "mask_deconv1"
param { lr_mult: 1 decay_mult: 1.0 }
param { lr_mult: 2 decay_mult: 0}
convolution_param {
num_output: 256
#pad: 1 stride: 2 kernel_size: 4 # 14x14
#pad: 1 stride: 3 kernel_size: 6 # 22x22
pad: 1 stride: 4 kernel_size: 8 # 30x30
group: 256 #apply independently
weight_filler { type: "bilinear" }
#bias_filler { type: "constant" value: 1 }

Conv-Relu 3

layer {
name: "pool5_2_conv3"
type: "Convolution"
bottom: "mask_deconv1"
top: "pool5_2_conv3"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1#kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv3_relu"
type: "ReLU"
bottom: "pool5_2_conv3"
top: "pool5_2_conv3_relu"

Conv-Relu 4

layer {
name: "pool5_2_conv4"
type: "Convolution"
bottom: "pool5_2_conv3_relu"
top: "pool5_2_conv4"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1#kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv4_relu"
type: "ReLU"
bottom: "pool5_2_conv4"
top: "pool5_2_conv4_relu"

Deconv 2

layer {
name: "mask_deconv2"
type: "Deconvolution"
bottom: "pool5_2_conv4_relu"
top: "mask_deconv2"
param { lr_mult: 1 decay_mult: 1.0 }
param { lr_mult: 2 decay_mult: 0}
convolution_param {
num_output: 256
#pad: 1 stride: 2 kernel_size: 4 # 28x28
#pad: 1 stride: 8 kernel_size: 16 # 490x490
pad: 1 stride: 4 kernel_size: 8
group: 256 #apply independently
weight_filler { type: "bilinear" }
#bias_filler { type: "constant" value: 1 }

Conv-Relu 5

layer {
name: "pool5_2_conv5"
type: "Convolution"
bottom: "mask_deconv2"
top: "pool5_2_conv5"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1#kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv5_relu"
type: "ReLU"
bottom: "pool5_2_conv5"
top: "pool5_2_conv5_relu"

Conv-Relu 6

layer {
name: "pool5_2_conv6"
type: "Convolution"
bottom: "pool5_2_conv5_relu"
top: "pool5_2_conv6"
param { lr_mult: 1.0 decay_mult: 1.0}
param { lr_mult: 2.0 decay_mult: 0}
convolution_param {
num_output: 512
kernel_size: 3 pad: 1 stride: 1#kernel_size: 1 pad: 0 #kernel_size: 3 pad: 1 stride: 1
weight_filler { type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: "pool5_2_conv6_relu"
type: "ReLU"
bottom: "pool5_2_conv6"
top: "pool5_2_conv6_relu"

Deconv 3

layer {
name: "mask_deconv3"
type: "Deconvolution"
bottom: "pool5_2_conv6_relu"
top: "mask_deconv3"
param { lr_mult: 1 decay_mult: 1.0 }
param { lr_mult: 2 decay_mult: 0}
convolution_param {
num_output: 256
pad: 1 stride: 2 kernel_size: 4
#pad: 1 stride: 8 kernel_size: 16
#pad: 1 stride: 4 kernel_size: 8
group: 256 #apply independently
weight_filler { type: "bilinear" }
#bias_filler { type: "constant" value: 1 }

layer {
name: "mask_score"
type: "Convolution"
bottom: "mask_deconv3" #
top: "mask_score"
param { lr_mult: 1.0 decay_mult: 1.0 }
param { lr_mult: 2.0 decay_mult: 0 }
convolution_param {
num_output: 3 # 2 classes + 1 background
kernel_size: 1 pad: 0
weight_filler {type: "gaussian" std: 0.01 } #weight_filler { type: "xavier" }
bias_filler { type: "constant" value: 0 }

layer {
name: 'binary-mask'
type: 'Python'
bottom: 'mask_score'
bottom: 'mask_targets' #from lib/rpn/proposal_target_layer.py roi-data
bottom: 'label_for_mask' #from lib/rpn/proposal_target_layer.py roi-data
top: 'mask_score2'
top: 'binary_mask'
python_param {
module: 'rpn.binary_mask'
layer: 'BinaryMaskLayer'
param_str: "'num_classes': 3"

layer {
name: "loss_mask"
type: "SigmoidCrossEntropyLoss"
bottom: 'mask_score2'
bottom: "binary_mask"
top: "loss_mask"
loss_weight: 0.0003
propagate_down: true # backprop to prediction
propagate_down: false # don't backprop to labels