NaN while training MAPPO on RWARE
hughjazzman opened this issue · 6 comments
Awesome work! I'm just having trouble running MAPPO on the RWARE environment. My run.json
is below. The main error is a ValueError raised by NaN action probabilities
partway through training. I am wondering whether using the latest version of rware
instead of the one used in the paper might be the problem, and if so, why. The command run was:
python3 src/main.py --config=mappo --env-config=gymma with env_args.time_limit=500 env_args.key="rware:rware-tiny-2ag-v1"
{
"artifacts": [],
"command": "my_main",
"experiment": {
"base_dir": "/home/x/epymarl/src",
"dependencies": [
"munch==2.5.0",
"numpy==1.21.6",
"PyYAML==5.3.1",
"sacred==0.8.0",
"torch==1.12.1",
"wqmix==0.1.0"
],
"mainfile": "main.py",
"name": "pymarl",
"repositories": [
{
"commit": "f047072aedc9a128d28b01ba42ef3381dcad2328",
"dirty": true,
"url": "https://github.com/uoe-agents/epymarl.git"
},
{
"commit": "f047072aedc9a128d28b01ba42ef3381dcad2328",
"dirty": true,
"url": "https://github.com/uoe-agents/epymarl.git"
}
],
"sources": [
[
"main.py",
"_sources/main_663aa94901b58b1db134be7c43e7a0df.py"
],
[
"run.py",
"_sources/run_4f45b371acea3e76064abb7e3b6103b1.py"
]
]
},
"fail_trace": [
"Traceback (most recent call last):\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/sacred/stdout_capturing.py\", line 163, in tee_output_fd\n yield out # let the caller do their printing\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/sacred/run.py\", line 238, in __call__\n self.result = self.main_function(*args)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/sacred/config/captured_function.py\", line 42, in captured_function\n result = wrapped(*args, **kwargs)\n",
" File \"src/main.py\", line 36, in my_main\n run(_run, config, _log)\n",
" File \"/home/x/epymarl/src/run.py\", line 55, in run\n run_sequential(args=args, logger=logger)\n",
" File \"/home/x/epymarl/src/run.py\", line 185, in run_sequential\n episode_batch = runner.run(test_mode=False)\n",
" File \"/home/x/epymarl/src/runners/parallel_runner.py\", line 104, in run\n actions = self.mac.select_actions(self.batch, t_ep=self.t, t_env=self.t_env, bs=envs_not_terminated, test_mode=test_mode)\n",
" File \"/home/x/epymarl/src/controllers/basic_controller.py\", line 23, in select_actions\n chosen_actions = self.action_selector.select_action(agent_outputs[bs], avail_actions[bs], t_env, test_mode=test_mode)\n",
" File \"/home/x/epymarl/src/components/action_selectors.py\", line 73, in select_action\n m = Categorical(agent_inputs)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/torch/distributions/categorical.py\", line 64, in __init__\n super(Categorical, self).__init__(batch_shape, validate_args=validate_args)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/torch/distributions/distribution.py\", line 56, in __init__\n f\"Expected parameter {param} \"\n",
"ValueError: Expected parameter probs (Tensor of shape (10, 2, 5)) of distribution Categorical(probs: torch.Size([10, 2, 5])) to satisfy the constraint Simplex(), but found invalid values:\ntensor([[[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]],\n\n [[nan, nan, nan, nan, nan],\n [nan, nan, nan, nan, nan]]], device='cuda:0', grad_fn=<DivBackward0>)\n",
"\nDuring handling of the above exception, another exception occurred:\n\n",
"Traceback (most recent call last):\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/contextlib.py\", line 130, in __exit__\n self.gen.throw(type, value, traceback)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/site-packages/sacred/stdout_capturing.py\", line 175, in tee_output_fd\n tee_stdout.wait(timeout=1)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/subprocess.py\", line 1019, in wait\n return self._wait(timeout=timeout)\n",
" File \"/home/x/anaconda3/envs/x/lib/python3.7/subprocess.py\", line 1645, in _wait\n raise TimeoutExpired(self.args, timeout)\n",
"subprocess.TimeoutExpired: Command '['tee', '-a', '/tmp/tmpz5nib2ap']' timed out after 1 seconds\n"
],
"heartbeat": "2022-09-21T15:11:48.887817",
"host": {
"ENV": {},
"cpu": "Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz",
"gpus": {
"driver_version": "471.41",
"gpus": [
{
"model": "NVIDIA GeForce RTX 2070",
"persistence_mode": false,
"total_memory": 8192
}
]
},
"hostname": "x",
"os": [
"Linux",
"Linux-5.10.60.1-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid"
],
"python_version": "3.7.13"
},
"meta": {
"command": "my_main",
"options": {
"--beat-interval": null,
"--capture": null,
"--comment": null,
"--debug": false,
"--enforce_clean": false,
"--file_storage": null,
"--force": false,
"--help": false,
"--loglevel": null,
"--mongo_db": null,
"--name": null,
"--pdb": false,
"--print-config": false,
"--priority": null,
"--queue": false,
"--s3": null,
"--sql": null,
"--tiny_db": null,
"--unobserve": false,
"COMMAND": null,
"UPDATE": [
"env_args.time_limit=500",
"env_args.key=rware:rware-tiny-2ag-v1"
],
"help": false,
"with": true
}
},
"resources": [],
"result": null,
"start_time": "2022-09-21T11:53:23.830946",
"status": "FAILED",
"stop_time": "2022-09-21T15:11:48.976631"
Edit: I ran it again with the exact environment used in the paper (https://github.com/uoe-agents/robotic-warehouse) but still got the same error.
[INFO 17:09:58] my_main t_env: 7055000 / 20050000
[INFO 17:09:58] my_main Estimated time left: 7 hours, 20 minutes, 15 seconds. Time passed: 4 hours, 0 seconds
[INFO 17:11:32] my_main Recent Stats | t_env: 7100000 | Episode: 14200
advantage_mean: -0.0393 agent_grad_norm: 0.1806 critic_grad_norm: 18.5048 critic_loss: 2.0751
ep_length_mean: 500.0000 pg_loss: 0.0272 pi_max: 0.6237 q_taken_mean: 21.0140
return_mean: 0.1333 return_std: 0.3700 target_mean: 20.9746 td_error_abs: 0.6127
test_ep_length_mean: 500.0000 test_return_mean: 0.0500 test_return_std: 0.2126
[DEBUG 17:11:33] pymarl Stopping Heartbeat
[ERROR 17:11:34] pymarl Failed after 4:01:40!
Traceback (most recent calls WITHOUT Sacred internals):
File "src/main.py", line 36, in my_main
run(_run, config, _log)
File "/home/x/epymarl/src/run.py", line 55, in run
run_sequential(args=args, logger=logger)
File "/home/x/epymarl/src/run.py", line 185, in run_sequential
episode_batch = runner.run(test_mode=False)
File "/home/x/epymarl/src/runners/parallel_runner.py", line 104, in run
actions = self.mac.select_actions(self.batch, t_ep=self.t, t_env=self.t_env, bs=envs_not_terminated, test_mode=test_mode)
File "/home/x/epymarl/src/controllers/basic_controller.py", line 23, in select_actions
chosen_actions = self.action_selector.select_action(agent_outputs[bs], avail_actions[bs], t_env, test_mode=test_mode)
File "/home/x/epymarl/src/components/action_selectors.py", line 73, in select_action
m = Categorical(agent_inputs)
File "/home/x/anaconda3/envs/qmix/lib/python3.7/site-packages/torch/distributions/categorical.py", line 64, in __init__
super(Categorical, self).__init__(batch_shape, validate_args=validate_args)
File "/home/x/anaconda3/envs/qmix/lib/python3.7/site-packages/torch/distributions/distribution.py", line 56, in __init__
f"Expected parameter {param} "
ValueError: Expected parameter probs (Tensor of shape (10, 2, 5)) of distribution Categorical(probs: torch.Size([10, 2, 5])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]],
[[nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan]]], device='cuda:0', grad_fn=<DivBackward0>)
During handling of the above exception, another exception occurred:
Traceback (most recent calls WITHOUT Sacred internals):
File "/home/x/anaconda3/envs/qmix/lib/python3.7/contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "/home/x/anaconda3/envs/qmix/lib/python3.7/subprocess.py", line 1019, in wait
return self._wait(timeout=timeout)
File "/home/x/anaconda3/envs/qmix/lib/python3.7/subprocess.py", line 1645, in _wait
raise TimeoutExpired(self.args, timeout)
subprocess.TimeoutExpired: Command '['tee', '-a', '/tmp/tmpr0gvavq4']' timed out after 1 seconds
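In case it helps anyone narrow this down, below is a small guard I sketched out (my own throwaway check, not epymarl code) that could be dropped in right before the Categorical construction shown in the traceback, so the run fails with a clearer message as soon as the policy outputs go non-finite instead of failing deep inside torch.distributions.

```python
# Throwaway sketch, not part of epymarl: a defensive check one could place just
# before `m = Categorical(agent_inputs)` in src/components/action_selectors.py
# (line 73 in the traceback above).
import torch


def assert_valid_probs(agent_inputs: torch.Tensor, t_env: int) -> None:
    """Raise a descriptive error as soon as the action probabilities go non-finite."""
    bad = ~torch.isfinite(agent_inputs)
    if bad.any():
        raise RuntimeError(
            f"{bad.sum().item()} non-finite entries out of {agent_inputs.numel()} "
            f"in the action probabilities at t_env={t_env}"
        )


# Intended usage inside select_action, right before building the distribution:
#     assert_valid_probs(agent_inputs, t_env)
#     m = Categorical(agent_inputs)
```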
me too
Hello,
did you use the hyperparameters from the paper, or the defaults in the .yaml files?
I hit the same bug, with the same rware config as in the README.
@papoudakis Could you share the pip list
output from your environment?
I would like to know your package versions, such as torch.
Hello,
I have updated the mappo.yaml config. We had accidentally set the standardise_returns hyperparameter to True when it should have been False. Could you please check again whether the issue persists? Please make sure that you use the hyperparameters mentioned in the paper.
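If you want to double-check which value your checkout actually picks up, a quick throwaway check is below (just a sketch; it assumes you run it from the repository root, that src/config/algs/mappo.yaml is the file loaded by --config=mappo, and that PyYAML is installed, which the dependency list above shows).

```python
# Throwaway sketch: print the value of standardise_returns that the MAPPO
# config in this checkout will use.
import yaml

with open("src/config/algs/mappo.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg.get("standardise_returns"))  # should print False after the fix
```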
Thanks, the issue is fixed with the given solution:
standardise_returns: False