RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED on Pong
norbertkeresztes opened this issue · 1 comment
norbertkeresztes commented
Describe the bug
Running the PPO on Pong example from the guides results in a CUDNN_STATUS_EXECUTION_FAILED error.
To Reproduce
- OS and environment: Ubuntu 20.04.1
- SLM Lab git SHA (run `git rev-parse HEAD` to get it): faca82c
- spec file used: slm_lab/spec/benchmark/ppo/ppo_pong.json
- run `python run_lab.py slm_lab/spec/benchmark/ppo/ppo_pong.json ppo_pong train`
Additional context
Only 4 CPU cores are used, and they are running at 100%. It seems to use all four of the GPUs (RTX 3090).
Error logs
Replaced the start of the paths with ...
Process Process-5:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Process Process-3:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../SLM-Lab/slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Process Process-4:
Traceback (most recent call last):
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File ".../envs/lab/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File ".../SLM-Lab/slm_lab/experiment/control.py", line 26, in mp_run_session
metrics = session.run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 118, in run
self.run_rl()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 104, in run_rl
action = self.agent.act(state)
File ".../SLM-Lab/slm_lab/agent/__init__.py", line 43, in act
action = self.algorithm.act(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 102, in act
action = self.action_policy(state, self, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 137, in default
pdparam = calc_pdparam(state, algorithm, body)
File ".../SLM-Lab/slm_lab/agent/algorithm/policy_util.py", line 85, in calc_pdparam
pdparam = algorithm.calc_pdparam(state)
File ".../SLM-Lab/slm_lab/agent/algorithm/actor_critic.py", line 172, in calc_pdparam
out = super().calc_pdparam(x, net=net)
File ".../SLM-Lab/slm_lab/agent/algorithm/reinforce.py", line 96, in calc_pdparam
pdparam = net(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../SLM-Lab/slm_lab/agent/net/conv.py", line 176, in forward
x = self.conv_model(x)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File ".../envs/lab/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Traceback (most recent call last):
File "run_lab.py", line 99, in <module>
main()
File "run_lab.py", line 91, in main
get_spec_and_run(*args)
File "run_lab.py", line 75, in get_spec_and_run
run_spec(spec, lab_mode)
File "run_lab.py", line 58, in run_spec
Trial(spec).run()
File ".../SLM-Lab/slm_lab/experiment/control.py", line 182, in run
metrics = analysis.analyze_trial(self.spec, session_metrics_list)
File ".../SLM-Lab/slm_lab/experiment/analysis.py", line 266, in analyze_trial
trial_metrics = calc_trial_metrics(session_metrics_list, info_prepath)
File ".../SLM-Lab/slm_lab/experiment/analysis.py", line 186, in calc_trial_metrics
frames = session_metrics_list[0]['local']['frames']
IndexError: list index out of range
kengz commented
Hi, I did some digging and found this issue: pytorch/pytorch#17543 (comment).
Are you using an RTX 2080 Ti with CUDA 9?