donnyyou/torchcv

ValueError: Expected more than 1 value per channel when training, got input size 1

zhaoliu5254 opened this issue · 2 comments

My configuration: PyTorch 1.4.0 (py3.7_cuda10.1.243_cudnn7.6.3_0) and two RTX 2080 Ti 11 GB GPUs. After modifying NGPUS and --gpu in run_fs_pspnet_cityscapes_seg.sh, the following error is reported:

2020-04-11 12:23:19,529 INFO [runner_helper.py, 38] Converting syncbn model...
2020-04-11 12:23:22,108 INFO [controller.py, 28] Training start...
Traceback (most recent call last):
File "main.py", line 185, in
Controller.train(runner)
File "/home/zl/zhaoliu/fam/lib/runner/controller.py", line 46, in train
runner.train()
File "/home/zl/zhaoliu/fam/runner/seg/fcn_segmentor.py", line 86, in train
out = self.seg_net(data_dict)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
Traceback (most recent call last):
File "main.py", line 185, in
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
Controller.train(runner)
File "/home/zl/zhaoliu/fam/lib/runner/controller.py", line 46, in train
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/nets/pspnet.py", line 95, in forward
x = self.ppm(x)
runner.train() File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call

File "/home/zl/zhaoliu/fam/runner/seg/fcn_segmentor.py", line 86, in train
out = self.seg_net(data_dict)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/nets/pspnet.py", line 52, in forward
ppm_out.append(F.interpolate(pool_scale(x), (input_size[2], input_size[3]),
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
input = module(input)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/nets/pspnet.py", line 95, in forward
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
x = self.ppm(x)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
input = module(input)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/nets/pspnet.py", line 52, in forward
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 473, in forward
ppm_out.append(F.interpolate(pool_scale(x), (input_size[2], input_size[3]),
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
self.eps, exponential_average_factor, process_group, world_size)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/_functions.py", line 13, in forward
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size 1
input = module(input)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 473, in forward
self.eps, exponential_average_factor, process_group, world_size)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/_functions.py", line 13, in forward
raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size 1
Traceback (most recent call last):
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/distributed/launch.py", line 263, in
main()
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/distributed/launch.py", line 259, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/zl/.conda/envs/yolact-env/bin/python', '-u', 'main.py', '--local_rank=1', '--config_file', 'configs/seg/cityscapes/base_fcn_cityscapes_seg.conf', '--phase', 'train', '--gpu', '0', '1', '--train_batch_size', '1', '--val_batch_size', '1', '--backbone', 'deepbase_resnet101_d8', '--model_name', 'pspnet', '--drop_last', 'y', '--syncbn', 'y', '--dist', 'y', '--data_dir', '/home/zl/zhaoliu/fam/DataSet/CityScapes', '--loss_type', 'dsnce_loss', '--max_iters', '40000', '--checkpoints_name', 'fs_pspnet_cityscapes_segtag', '--pretrained', './pretrained_models/3x3resnet101-imagenet.pth']' returned non-zero exit status 1.

When you don't use sync BN, you should make sure the batch size is greater than 1 per GPU.

Thanks! However, I did use sync BN, and the same error is still reported when train_batch_size is set to 1:
ValueError: Expected more than 1 value per channel when training, got input size 1.

Error when train_batch_size is set to 2:
2020-05-04 23:03:39,946 INFO [runner_helper.py, 38] Converting syncbn model...
2020-05-04 23:03:42,564 INFO [controller.py, 28] Training start...
Traceback (most recent call last):
File "main.py", line 185, in
Controller.train(runner)
File "/home/zl/zhaoliu/fam/lib/runner/controller.py", line 46, in train
runner.train()
File "/home/zl/zhaoliu/fam/runner/seg/fcn_segmentor.py", line 86, in train
out = self.seg_net(data_dict)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/nets/pspnet.py", line 100, in forward
mode="bilinear", align_corners=False)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/functional.py", line 2530, in interpolate
return torch._C._nn.upsample_bilinear2d(input, _output_size(2), align_corners)
RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 10.76 GiB total capacity; 9.54 GiB already allocated; 84.75 MiB free; 9.79 GiB reserved in total by PyTorch)
Traceback (most recent call last):
File "main.py", line 185, in
Controller.train(runner)
File "/home/zl/zhaoliu/fam/lib/runner/controller.py", line 46, in train
runner.train()
File "/home/zl/zhaoliu/fam/runner/seg/fcn_segmentor.py", line 88, in train
loss_dict = self.loss(out)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/loss/loss.py", line 38, in forward
out_dict[key] = self.func_listint(item['type'].float().mean().item())
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/zl/zhaoliu/fam/model/seg/loss/ce_loss.py", line 25, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/functional.py", line 2021, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/nn/functional.py", line 1317, in log_softmax
ret = input.log_softmax(dim)
RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 1; 10.76 GiB total capacity; 9.62 GiB already allocated; 60.19 MiB free; 9.91 GiB reserved in total by PyTorch)
Traceback (most recent call last):
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/distributed/launch.py", line 263, in
main()
File "/home/zl/.conda/envs/yolact-env/lib/python3.7/site-packages/torch/distributed/launch.py", line 259, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/zl/.conda/envs/yolact-env/bin/python', '-u', 'main.py', '--local_rank=1', '--config_file', 'configs/seg/cityscapes/base_fcn_cityscapes_seg.conf', '--phase', 'train', '--gpu', '0', '1', '--train_batch_size', '2', '--val_batch_size', '1', '--backbone', 'deepbase_resnet101_d8', '--model_name', 'pspnet', '--drop_last', 'y', '--syncbn', 'y', '--dist', 'y', '--data_dir', '/home/zl/zhaoliu/fam/DataSet/CityScapes', '--loss_type', 'dsnce_loss', '--max_iters', '40000', '--checkpoints_name', 'fs_pspnet_cityscapes_segtag', '--pretrained', './pretrained_models/3x3resnet101-imagenet.pth']' returned non-zero exit status 1.