Question of training
sdp369 opened this issue · 5 comments
I am trying to train the model on a single GPU, but the process terminates with the following error:
Traceback (most recent call last):
File "train.py", line 127, in <module>
mp.spawn(main, nprocs=config['n_gpu'], args=(config['n_gpu'], config, args.resume, args.test))
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/train.py", line 98, in main
trainer.train()
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/base/base_trainer.py", line 115, in train
results = self._valid_epoch(epoch)
File "/media/ders/sundingpeng/paper_code/Context-Aware-Consistency-master-2/trainer.py", line 145, in _valid_epoch
for batch_idx, (data, target) in enumerate(tbar):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/tqdm/std.py", line 1178, in __iter__
for obj in iterable:
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
data = self._next_data()
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
return self._process_data(data)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
data.reraise()
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
data = fetcher.fetch(index)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
return [default_collate(samples) for samples in transposed]
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/home/ders/anaconda3/envs/sdp/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [3, 366, 500] at entry 0 and [3, 335, 500] at entry 1
Can you send out your execution command and json config file?
Can you send out your execution command and json config file?
execution command:
nohup python3 train.py --config configs/voc_cac_deeplabv3+_resnet50_1over8_datalist0.json &
json config file:
{
"name": "CAC",
"experim_name": "voc_cac_deeplabv3+_resnet50_1over8_datalist0",
"dataset": "voc",
"data_dir": "../dataset/voc_seg_deeplab/data/VOCtrainval_11-May-2012",
"datalist": 0,
"n_gpu": 1,
"n_labeled_examples": 1323,
"diff_lrs": true,
"ramp_up": 0.1,
"unsupervised_w": 30,
"ignore_index": 255,
"lr_scheduler": "Poly",
"use_weak_lables":false,
"weakly_loss_w": 0.4,
"pretrained": true,
"random_seed": 42,
"model":{
"supervised": false,
"semi": true,
"supervised_w": 1,
"sup_loss": "CE",
"layers": 50,
"downsample": true,
"proj_final_dim": 128,
"out_dim": 256,
"backbone": "deeplab_v3+",
"pos_thresh_value": 0.75,
"weight_unsup": 0.1,
"epoch_start_unsup": 5,
"selected_num": 6400,
"temp": 0.1,
"step_save": 1,
"stride": 8
},
"optimizer": {
"type": "SGD",
"args":{
"lr": 0.01,
"weight_decay": 1e-4,
"momentum": 0.9
}
},
"train_supervised": {
"batch_size": 8,
"crop_size": 320,
"shuffle": true,
"base_size": 400,
"scale": true,
"augment": true,
"flip": true,
"rotate": false,
"blur": false,
"split": "train_supervised",
"num_workers": 8
},
"train_unsupervised": {
"batch_size": 8,
"crop_size": 320,
"shuffle": true,
"base_size": 400,
"scale": true,
"augment": true,
"flip": true,
"rotate": false,
"blur": false,
"split": "train_unsupervised",
"num_workers": 8,
"iou_bound": [0.1, 1.0],
"stride": 8
},
"val_loader": {
"batch_size": 2,
"val": true,
"split": "val",
"shuffle": false,
"num_workers": 4
},
"trainer": {
"epochs": 80,
"save_dir": "saved/",
"save_period": 1,
"monitor": "max Mean_IoU",
"early_stop": 100,
"tensorboardX": true,
"log_dir": "saved/",
"log_per_iter": 20,
"val": true,
"val_per_epochs": 1
}
}
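I think you should change the entry config["val_loader"]["batch_size"]
to 1, because the validation images of PASCAL VOC are not all the same size. With a batch size of 2, the default collate function tries to stack two images of different sizes into one tensor, which raises the "stack expects each tensor to be equal size" error shown above.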
I think you should change the entry
config["val_loader"]["batch_size"]
to 1, because the validation images of PASCAL VOC are not of the same size.
Thank you! Your advice works. By the way, what does the 'n' mean in the comments in 'model.py', lines 160-165?
output_feat1 = torch.cat(output_feature_list1, 0) #[n, c]
output_feat2 = torch.cat(output_feature_list2, 0) #[n, c]
pseudo_label1_overlap = torch.cat(pseudo_label_list1, 0) #[n,]
pseudo_label2_overlap = torch.cat(pseudo_label_list2, 0) #[n,]
pseudo_logits1_overlap = torch.cat(pseudo_logits_list1, 0) #[n,]
pseudo_logits2_overlap = torch.cat(pseudo_logits_list2, 0) #[n,]
'n' is the number of features within the overlapping region between the two feature maps, counted over the whole current batch of images (the batch is flattened).