Tensors Found on Different Devices
mwalczyk opened this issue · 4 comments
Hi,
I'm trying to run the train_wineholder.sh
script on my machine. It works fine for the first 500 iterations, but immediately after the 500th iteration, it pauses, eventually throwing the following error related to tensors existing on different devices:
Start of training:
2020-11-16 10:31:25 | INFO | fairseq.distributed_utils | distributed init (rank 1): tcp://localhost:14275
2020-11-16 10:31:25 | INFO | fairseq.distributed_utils | distributed init (rank 0): tcp://localhost:14275
2020-11-16 10:31:27 | INFO | fairseq.distributed_utils | initialized host lightbox-desktop as rank 0
2020-11-16 10:31:27 | INFO | fairseq.distributed_utils | initialized host lightbox-desktop as rank 1
2020-11-16 10:31:27 | INFO | fairnr_cli.train | Namespace(L1=False, adam_betas='(0.9, 0.999)', adam_eps=1e-08, all_gather_list_size=16384, alpha_weight=1.0, arch='nsvf_base', background_depth=5.0, background_stop_gradient=True, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_suffix='', chunk_size=64, clip_norm=0.0, color_weight=128.0, cpu=False, criterion='srn_loss', curriculum=0, data='/code/nsvf/datasets/Synthetic_NSVF/Wineholder', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', density_embed_dim=128, depth_weight=0.0, depth_weight_decay=None, deterministic_step=False, device_id=0, disable_validation=False, discrete_regularization=True, distributed_backend='nccl', distributed_init_method='tcp://localhost:14275', distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=2, distributed_wrapper='DDP', empty_cache_freq=0, end_learning_rate=0.0, eval_lpips=False, fast_stat_sync=False, feature_embed_dim=256, feature_layers=1, find_unused_parameters=False, fix_batches_to_gpus=False, fixed_validation_seed=None, force_anneal=None, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, half_voxel_size_at='5000,25000,75000', initial_boundingbox='/code/nsvf/datasets/Synthetic_NSVF/Wineholder/bbox.txt', inputs_to_density='emb:6:32', inputs_to_texture='feat:0:256, ray:4', keep_best_checkpoints=-1, keep_interval_updates=5, keep_last_epochs=-1, load_depth=False, load_mask=False, localsgd_frequency=3, log_format='simple', log_interval=1, lr=[0.001], lr_scheduler='polynomial_decay', max_epoch=0, max_hits=60, max_sentences=1, max_sentences_valid=1, max_tokens=None, max_tokens_valid=None, max_update=150000, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_color=-1, min_loss_scale=0.0001, min_lr=-1, model_parallel_size=1, no_background_loss=False, 
no_epoch_checkpoints=False, no_last_checkpoints=False, no_load_binary=False, no_preload=True, no_progress_bar=False, no_sampling_at_reader=True, no_save=False, no_save_optimizer_state=False, no_seed_provided=False, nprocs_per_node=2, num_workers=0, object_id_path=None, optimizer='adam', optimizer_overrides='{}', output_valid=None, patience=-1, pixel_per_view=2048.0, power=1.0, profile=False, pruning_every_steps=2500, pruning_rerun_train_set=False, pruning_th=0.5, pruning_with_train_stats=False, quantization_config_path=None, raymarching_stepsize=0.01, raymarching_stepsize_ratio=0.125, raymarching_tolerance=0, reduce_step_size_at='5000,25000,75000', rendering_args=None, rendering_every_steps=None, required_batch_size_multiple=8, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', sampling_at_center=1.0, sampling_on_bbox=False, sampling_on_mask=1.0, sampling_patch_size=1, sampling_skipping_size=1, save_dir='/code/nsvf/checkpoints/Wineholder/nsvf_basev1', save_interval=1, save_interval_updates=500, scoring='bleu', seed=2, sentence_avg=False, skip_invalid_size_inputs_valid_test=False, slowmo_algorithm='LocalSGD', slowmo_momentum=None, stop_time_hours=0, subsample_valid=-1, task='single_object_rendering', tensorboard_logdir='/code/nsvf/checkpoints/Wineholder/tensorboard/nsvf_basev1', test_views='0', texture_embed_dim=256, texture_layers=3, threshold_loss_scale=None, tokenizer=None, total_num_update=150000, tpu=False, train_subset='train', train_views='0..100', transparent_background='1.0,1.0,1.0', update_freq=[1], use_bmuf=False, use_octree=True, use_old_adam=False, user_dir='fairnr', valid_chunk_size=64, valid_subset='valid', valid_view_per_batch=1, valid_view_resolution='800x800', valid_views='100..200', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, vgg_level=2, vgg_weight=0.0, view_per_batch=2, view_resolution='800x800', virtual_epoch_steps=5000, 
voxel_embed_dim=32, voxel_path=None, voxel_size=0.25, warmup_updates=0, weight_decay=0.0)
2020-11-16 10:31:27 | INFO | fairnr_cli.train | NSVFModel(
(reader): ImageReader()
(encoder): SparseVoxelEncoder(
(values): Embedding(1170, 32)
)
(field): RaidanceField(
(bg_color): BackgroundField()
(den_filters): ModuleDict(
(emb): NeRFPosEmbLinear(Cat(32, Sinusoidal (in=32, out=384, angular=False)))
)
(tex_filters): ModuleDict(
(feat): Identity()
(ray): NeRFPosEmbLinear(Sinusoidal (in=3, out=24, angular=True))
)
(feature_field): ImplicitField(
(net): Sequential(
(0): FCLayer(
(net): Sequential(
(0): Linear(in_features=416, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(1): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(2): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
)
)
(predictor): SignedDistanceField(
(hidden_layer): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=128, bias=True)
(1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(output_layer): Linear(in_features=128, out_features=1, bias=True)
)
(renderer): TextureField(
(net): Sequential(
(0): FCLayer(
(net): Sequential(
(0): Linear(in_features=280, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(1): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(2): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(3): FCLayer(
(net): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
)
)
(4): Linear(in_features=256, out_features=3, bias=True)
)
)
)
(raymarcher): VolumeRenderer()
)
2020-11-16 10:31:27 | INFO | fairnr_cli.train | model nsvf_base, criterion SRNLossCriterion
2020-11-16 10:31:27 | INFO | fairnr_cli.train | num. model params: 582737 (num. trained: 582724)
2020-11-16 10:31:27 | INFO | fairseq.utils | ***********************CUDA enviroments for all 2 workers***********************
2020-11-16 10:31:27 | INFO | fairseq.utils | rank 0: capabilities = 6.1 ; total memory = 7.929 GB ; name = GeForce GTX 1080
2020-11-16 10:31:27 | INFO | fairseq.utils | rank 1: capabilities = 6.1 ; total memory = 7.921 GB ; name = GeForce GTX 1080
2020-11-16 10:31:27 | INFO | fairseq.utils | ***********************CUDA enviroments for all 2 workers***********************
2020-11-16 10:31:27 | INFO | fairnr_cli.train | training on 2 GPUs
2020-11-16 10:31:27 | INFO | fairnr_cli.train | max tokens per GPU = None and max sentences per GPU = 1
2020-11-16 10:31:27 | INFO | fairseq.trainer | no existing checkpoint found /code/nsvf/checkpoints/Wineholder/nsvf_basev1/checkpoint_last.pt
2020-11-16 10:31:27 | INFO | fairseq.trainer | loading train data for epoch 1
/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py:397: UserWarning: The `check_reduction` argument in `DistributedDataParallel` module is deprecated. Please avoid using it.
warnings.warn(
/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py:397: UserWarning: The `check_reduction` argument in `DistributedDataParallel` module is deprecated. Please avoid using it.
warnings.warn(
Building EasyOctree done. total #nodes = 1881, terminal #nodes = 864 (time taken 0.254881 s)
Building EasyOctree done. total #nodes = 1881, terminal #nodes = 864 (time taken 0.260331 s)
/code/nsvf/env/lib/python3.8/site-packages/fairseq/utils.py:304: UserWarning: amp_C fused kernels unavailable, disabling multi_tensor_l2norm; you may get better performance by installing NVIDIA's apex library
warnings.warn(
/code/nsvf/env/lib/python3.8/site-packages/fairseq/utils.py:304: UserWarning: amp_C fused kernels unavailable, disabling multi_tensor_l2norm; you may get better performance by installing NVIDIA's apex library
Then at iter 500:
2020-11-16 10:41:12 | INFO | train_inner | epoch 001: 500 / 5000 loss=23.507, vox=0.125, stp=0.016, tvo=0.127, asf=68.201, ash=68.201, nvo=864, color=0.182, alpha=0.228, wps=2.1, ups=1.07, wpb=2, bsz=2, num_updates=500, lr=0.000996667, gnorm=171.876, train_wall=1, wall=464
Traceback (most recent call last):
File "train.py", line 20, in <module>
cli_main()
File "/code/nsvf/fairnr_cli/train.py", line 353, in cli_main
torch.multiprocessing.spawn(
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/code/nsvf/fairnr_cli/train.py", line 338, in distributed_main
main(args, init_distributed=True)
File "/code/nsvf/fairnr_cli/train.py", line 104, in main
should_end_training = train(args, trainer, task, epoch_itr)
File "/media/lightbox/Extra/anaconda/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/code/nsvf/fairnr_cli/train.py", line 204, in train
valid_losses = validate_and_save(args, trainer, task, epoch_itr, valid_subsets)
File "/code/nsvf/fairnr_cli/train.py", line 245, in validate_and_save
valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
File "/code/nsvf/fairnr_cli/train.py", line 302, in validate
trainer.valid_step(sample)
File "/media/lightbox/Extra/anaconda/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 631, in valid_step
raise e
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 615, in valid_step
_loss, sample_size, logging_output = self.task.valid_step(
File "/code/nsvf/fairnr/tasks/neural_rendering.py", line 306, in valid_step
images = model.visualize(sample, shape=0, view=0)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 26, in decorate_context
return func(*args, **kwargs)
File "/code/nsvf/fairnr/models/fairnr_model.py", line 126, in visualize
images = {
File "/code/nsvf/fairnr/models/fairnr_model.py", line 127, in <dictcomp>
tag: recover_image(width=width, **images[tag])
File "/code/nsvf/fairnr/data/data_utils.py", line 264, in recover_image
img = ((img - min_val) / (max_val - min_val)).clamp(min=0, max=1)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
The only change I've made to the training script is reducing --view-per-batch
to 1. Do you have any idea what the issue might be?
I'm running this on Ubuntu 20.04 with two GeForce GTX 1080 GPUs and CUDA version 10.1. Let me know if I can provide any further info at this time! Thanks so much!
Hmm, I have never seen this error before. Did you try using one GPU first? For instance, by setting `CUDA_VISIBLE_DEVICES=0`?
Hey thanks for getting back to me so quickly. I can try that!
For what it's worth, I was able to get it to work by adding the following lines to fairnr/data/data_utils.py in the function recover_image():
if torch.is_tensor(min_val):
min_val = min_val.float().to('cpu')
if torch.is_tensor(max_val):
max_val = max_val.float().to('cpu')
But I have no idea whether that is reasonable. I printed out the images variable in fairnr_model.py (function visualize()) and noticed that in that dictionary, there were two entries that weren't torch tensors, which I think was causing issues:
'render_normal/0_0:HWC': {
'img': tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.],
...,
[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]], device='cuda:0'),
'min_val': -1, <---- This entry
'max_val': 1 <---- This entry
},
With that change, training proceeds past the 500th iteration and then runs 50 iterations on the "valid" subset, but it fails after about 12 of those iterations with a CUDA OOM error (pasted below for reference). Are there any guidelines on the minimum amount of VRAM required to run the training? Across the two GPUs, I believe I have 16 GB free in total (about 8 GB per GPU). Alternatively, are there other ways to lower VRAM usage during training?
In the meantime, I will try your suggestion of setting the CUDA_VISIBLE_DEVICES env variable. Thanks!
Traceback (most recent call last):
File "train.py", line 20, in <module>
cli_main()
File "/code/nsvf/fairnr_cli/train.py", line 353, in cli_main
torch.multiprocessing.spawn(
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 615, in valid_step
_loss, sample_size, logging_output = self.task.valid_step(
File "/code/nsvf/fairnr/tasks/neural_rendering.py", line 303, in valid_step
loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/tasks/fairseq_task.py", line 361, in valid_step
loss, sample_size, logging_output = criterion(model, sample)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/fairnr/criterions/rendering_loss.py", line 42, in forward
net_output = model(**sample)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/fairnr/models/fairnr_model.py", line 77, in forward
results = self._forward(ray_start, ray_dir, **kwargs)
File "/code/nsvf/fairnr/models/nsvf.py", line 78, in _forward
samples = self.encoder.ray_sample(intersection_outputs)
File "/code/nsvf/fairnr/modules/encoder.py", line 354, in ray_sample
sampled_idx, sampled_depth, sampled_dists = uniform_ray_sampling(
File "/code/nsvf/fairnr/clib/__init__.py", line 213, in forward
max_len = sampled_idx.ne(-1).sum(-1).max()
RuntimeError: CUDA out of memory. Tried to allocate 1.02 GiB (GPU 1; 7.92 GiB total capacity; 3.34 GiB already allocated; 638.38 MiB free; 6.34 GiB reserved in total by PyTorch)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/code/nsvf/env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/code/nsvf/fairnr_cli/train.py", line 338, in distributed_main
main(args, init_distributed=True)
File "/code/nsvf/fairnr_cli/train.py", line 104, in main
should_end_training = train(args, trainer, task, epoch_itr)
File "/media/lightbox/Extra/anaconda/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/code/nsvf/fairnr_cli/train.py", line 204, in train
valid_losses = validate_and_save(args, trainer, task, epoch_itr, valid_subsets)
File "/code/nsvf/fairnr_cli/train.py", line 245, in validate_and_save
valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
File "/code/nsvf/fairnr_cli/train.py", line 302, in validate
trainer.valid_step(sample)
File "/media/lightbox/Extra/anaconda/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 630, in valid_step
return self.valid_step(sample, raise_oom=True)
File "/media/lightbox/Extra/anaconda/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 631, in valid_step
raise e
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/trainer.py", line 615, in valid_step
_loss, sample_size, logging_output = self.task.valid_step(
File "/code/nsvf/fairnr/tasks/neural_rendering.py", line 303, in valid_step
loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
File "/code/nsvf/env/lib/python3.8/site-packages/fairseq/tasks/fairseq_task.py", line 361, in valid_step
loss, sample_size, logging_output = criterion(model, sample)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/fairnr/criterions/rendering_loss.py", line 42, in forward
net_output = model(**sample)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/code/nsvf/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/code/nsvf/fairnr/models/fairnr_model.py", line 77, in forward
results = self._forward(ray_start, ray_dir, **kwargs)
File "/code/nsvf/fairnr/models/nsvf.py", line 78, in _forward
samples = self.encoder.ray_sample(intersection_outputs)
File "/code/nsvf/fairnr/modules/encoder.py", line 354, in ray_sample
sampled_idx, sampled_depth, sampled_dists = uniform_ray_sampling(
File "/code/nsvf/fairnr/clib/__init__.py", line 200, in forward
sampled_idx, sampled_depth, sampled_dists = _ext.uniform_ray_sampling(
RuntimeError: CUDA out of memory. Tried to allocate 522.00 MiB (GPU 1; 7.92 GiB total capacity; 5.06 GiB already allocated; 120.38 MiB free; 6.84 GiB reserved in total by PyTorch)
I have the same error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Ubuntu 20.04 with GeForce RTX 3090 (24GB memory) CUDA Version: 11.1. Thanks.
Some tensors are initialized on the CPU and never moved to the GPU.