Torch CoPO error seems to occur when an agent is done
weirme opened this issue · 2 comments
weirme commented
The following error occurred after training for about 80 epochs; it seems to happen because some agent is done and has been removed from the dict.
My MetaDrive version is 0.2.5 — could someone kindly give me some suggestions on this issue?
Failure # 1 (occurred at 2023-07-10_17-02-34)
ray::CoPOTrainer.train() (pid=356776, ip=10.0.0.10, repr=CoPOTrainer)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/tune/trainable/trainable.py", line 367, in train
raise skipped from exception_cause(skipped)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/tune/trainable/trainable.py", line 364, in train
result = self.step()
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/algorithms/algorithm.py", line 749, in step
results, train_iter_ctx = self._run_one_training_iteration()
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/algorithms/algorithm.py", line 2623, in _run_one_training_iteration
results = self.training_step()
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/algo_copo.py", line 524, in training_step
worker_set=self.workers, max_env_steps=self.config.train_batch_size
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in synchronous_parallel_sample
lambda w: w.sample(), local_worker=False, healthy_only=True
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 696, in foreach_worker
handle_remote_call_result_errors(remote_results, self._ignore_worker_failures)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 73, in handle_remote_call_result_errors
raise r.get()
ray.exceptions.RayTaskError(KeyError): ray::RolloutWorker.apply() (pid=356826, ip=10.0.0.10, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f0627d6c710>)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/utils/actor_manager.py", line 183, in apply
raise e
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/utils/actor_manager.py", line 174, in apply
return func(self, *args, **kwargs)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in <lambda>
lambda w: w.sample(), local_worker=False, healthy_only=True
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py", line 900, in sample
batches = [self.input_reader.next()]
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
batches = [self.get_data()]
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
item = next(self._env_runner)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
base_env.send_actions(actions_to_send)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/env/multi_agent_env.py", line 615, in send_actions
raise e
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/env/multi_agent_env.py", line 608, in send_actions
obs, rewards, dones, infos = env.step(agent_dict)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 309, in step
o, r, d, i = super(LCFEnv, self).step(actions)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 96, in step
self._update_distance_map(dones=d)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 143, in _update_distance_map
if hasattr(self, "vehicles_including_just_terminated"):
File "/home/weirme/Repo/metadrive/metadrive/envs/base_env.py", line 490, in vehicles_including_just_terminated
ret.update(self.agent_manager.just_terminated_agents)
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 276, in just_terminated_agents
for agent_name, v_name in self._agents_finished_this_frame.items()
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 276, in <dictcomp>
for agent_name, v_name in self._agents_finished_this_frame.items()
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 289, in get_agent
object_name = self.agent_to_object(agent_name)
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 316, in agent_to_object
return self._agent_to_object[agent_id]
KeyError: 'agent24'
Failure # 2 (occurred at 2023-07-10_17-02-49)
ray::CoPOTrainer.train() (pid=364049, ip=10.0.0.10, repr=CoPOTrainer)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/tune/trainable/trainable.py", line 367, in train
raise skipped from exception_cause(skipped)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/tune/trainable/trainable.py", line 364, in train
result = self.step()
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/algorithms/algorithm.py", line 749, in step
results, train_iter_ctx = self._run_one_training_iteration()
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/algorithms/algorithm.py", line 2623, in _run_one_training_iteration
results = self.training_step()
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/algo_copo.py", line 524, in training_step
worker_set=self.workers, max_env_steps=self.config.train_batch_size
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in synchronous_parallel_sample
lambda w: w.sample(), local_worker=False, healthy_only=True
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 696, in foreach_worker
handle_remote_call_result_errors(remote_results, self._ignore_worker_failures)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 73, in handle_remote_call_result_errors
raise r.get()
ray.exceptions.RayTaskError(KeyError): ray::RolloutWorker.apply() (pid=364112, ip=10.0.0.10, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f2d5fbd7d50>)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/utils/actor_manager.py", line 183, in apply
raise e
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/utils/actor_manager.py", line 174, in apply
return func(self, *args, **kwargs)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in <lambda>
lambda w: w.sample(), local_worker=False, healthy_only=True
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py", line 900, in sample
batches = [self.input_reader.next()]
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
batches = [self.get_data()]
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
item = next(self._env_runner)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
base_env.send_actions(actions_to_send)
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/env/multi_agent_env.py", line 615, in send_actions
raise e
File "/home/weirme/miniconda3/envs/copo/lib/python3.7/site-packages/ray/rllib/env/multi_agent_env.py", line 608, in send_actions
obs, rewards, dones, infos = env.step(agent_dict)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 309, in step
o, r, d, i = super(LCFEnv, self).step(actions)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 96, in step
self._update_distance_map(dones=d)
File "/home/weirme/Repo/CoPO/copo_code/copo/torch_copo/utils/env_wrappers.py", line 143, in _update_distance_map
if hasattr(self, "vehicles_including_just_terminated"):
File "/home/weirme/Repo/metadrive/metadrive/envs/base_env.py", line 490, in vehicles_including_just_terminated
ret.update(self.agent_manager.just_terminated_agents)
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 276, in just_terminated_agents
for agent_name, v_name in self._agents_finished_this_frame.items()
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 276, in <dictcomp>
for agent_name, v_name in self._agents_finished_this_frame.items()
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 289, in get_agent
object_name = self.agent_to_object(agent_name)
File "/home/weirme/Repo/metadrive/metadrive/manager/agent_manager.py", line 316, in agent_to_object
return self._agent_to_object[agent_id]
KeyError: 'agent14'
weirme commented
This issue was resolved with MetaDrive 0.3.0.1.
pengzhenghao commented
Oh, I remember this bug. It is very tricky when one agent is terminated. Previously, we cleared out the agent immediately, so no observer
could retrieve its data afterward.
Happy to see it is solved.