Question of training

Question

Question of training

sdp369 opened this issue 3 years ago · 5 comments

I am trying to train the model on a single GPU, but the process terminates with the following error:

Traceback (most recent call last):
File "train.py", line 127, in <module>
mp.spawn(main, nprocs=config['n_gpu'], args=(config['n_gpu'], config, args.resume, args.test))
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/train.py", line 98, in main
trainer.train()
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/base/base_trainer.py", line 115, in train
results = self._valid_epoch(epoch)
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/trainer.py", line 145, in _valid_epoch
for batch_idx, (data, target) in enumerate(tbar):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/tqdm/std.py", line 1178, in __iter__
for obj in iterable:
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
data = self._next_data()
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
return self._process_data(data)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
data.reraise()
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
data = fetcher.fetch(index)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
return [default_collate(samples) for samples in transposed]
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)

RuntimeError: stack expects each tensor to be equal size, but got [3, 366, 500] at entry 0 and [3, 335, 500] at entry 1

Answer 1 · 2021-12-04T14:57:57.000Z

Can you send out your execution command and json config file?

Answer 2 · 2021-12-05T01:34:33.000Z

> Can you send out your execution command and json config file?

Execution command:
nohup python3 train.py --config configs/voc_cac_deeplabv3+_resnet50_1over8_datalist0.json &

JSON config file:
{
"name": "CAC",
"experim_name": "voc_cac_deeplabv3+_resnet50_1over8_datalist0",
"dataset": "voc",
"data_dir": "../dataset/voc_seg_deeplab/data/VOCtrainval_11-May-2012",
"datalist": 0,
"n_gpu": 1,
"n_labeled_examples": 1323,
"diff_lrs": true,
"ramp_up": 0.1,
"unsupervised_w": 30,
"ignore_index": 255,
"lr_scheduler": "Poly",
"use_weak_lables":false,
"weakly_loss_w": 0.4,
"pretrained": true,
"random_seed": 42,

"model":{
    "supervised": false,
    "semi": true,
    "supervised_w": 1,

    "sup_loss": "CE",

    "layers": 50,
    "downsample": true,
    "proj_final_dim": 128,
    "out_dim": 256,
    "backbone": "deeplab_v3+",
    "pos_thresh_value": 0.75,
    "weight_unsup": 0.1,
    "epoch_start_unsup": 5,
    "selected_num": 6400,
    "temp": 0.1,
    "step_save": 1,
    "stride": 8
},


"optimizer": {
    "type": "SGD",
    "args":{
        "lr": 0.01,
        "weight_decay": 1e-4,
        "momentum": 0.9
    }
},

"train_supervised": {
    "batch_size": 8,
    "crop_size": 320,
    "shuffle": true,
    "base_size": 400,
    "scale": true,
    "augment": true,
    "flip": true,
    "rotate": false,
    "blur": false,
    "split": "train_supervised",
    "num_workers": 8
},

"train_unsupervised": {
    "batch_size": 8,
    "crop_size": 320,
    "shuffle": true,
    "base_size": 400,
    "scale": true,
    "augment": true,
    "flip": true,
    "rotate": false,
    "blur": false,
    "split": "train_unsupervised",
    "num_workers": 8,
    "iou_bound": [0.1, 1.0],
    "stride": 8
},

"val_loader": {
    "batch_size": 2,
    "val": true,
    "split": "val",
    "shuffle": false,
    "num_workers": 4
},

"trainer": {
    "epochs": 80,
    "save_dir": "saved/",
    "save_period": 1,

    "monitor": "max Mean_IoU",
    "early_stop": 100,
    
    "tensorboardX": true,
    "log_dir": "saved/",
    "log_per_iter": 20,

    "val": true,
    "val_per_epochs": 1
}

}

Answer 3 · 2021-12-05T07:40:58.000Z

I think you should change the entry config["val_loader"]["batch_size"] to 1, because the validation images of PASCAL VOC are not of the same size.

Answer 4 · 2021-12-05T09:32:50.000Z

> I think you should change the entry config["val_loader"]["batch_size"] to 1, because the validation images of PASCAL VOC are not of the same size.

Thank you! Your advice works. By the way, what does the 'n' in the shape comments in 'model.py', lines 160-165, mean?

        output_feat1 = torch.cat(output_feature_list1, 0) #[n, c]
        output_feat2 = torch.cat(output_feature_list2, 0) #[n, c]
        pseudo_label1_overlap = torch.cat(pseudo_label_list1, 0) #[n,]
        pseudo_label2_overlap = torch.cat(pseudo_label_list2, 0) #[n,]
        pseudo_logits1_overlap = torch.cat(pseudo_logits_list1, 0) #[n,]
        pseudo_logits2_overlap = torch.cat(pseudo_logits_list2, 0) #[n,]

Answer 5 · 2021-12-05T12:11:15.000Z

`n` is the number of features within the overlapping region between the two feature maps, counted over the whole current batch of images after flattening.