[Bug Report] pettingzoo.atari pong_v3: rewards are never > 0
xiezhipeng-git opened this issue · 9 comments
Describe the bug
With pettingzoo.atari pong_v3, the rewards never go above 0 (they appear to always be zero) when running the code below.
Name: pettingzoo
Version: 1.23.1
Code example
import random
import numpy as np
from pettingzoo.atari import (
    basketball_pong_v3,
    boxing_v2,
    combat_plane_v2,
    combat_tank_v2,
    double_dunk_v3,
    entombed_cooperative_v3,
    flag_capture_v2,
    foozpong_v3,
    ice_hockey_v2,
    joust_v3,
    mario_bros_v3,
    maze_craze_v3,
    othello_v3,
    pong_v3,
    quadrapong_v4,
    space_invaders_v2,
    space_war_v2,
    surround_v2,
    tennis_v3,
    video_checkers_v4,
    volleyball_pong_v3,
    warlords_v3,
    wizard_of_wor_v3,
)
from pettingzoo.butterfly import (
    cooperative_pong_v5,
    knights_archers_zombies_v10,
    pistonball_v6,
)
from pettingzoo.classic import (
    chess_v5,
    connect_four_v3,
    gin_rummy_v4,
    go_v5,
    hanabi_v4,
    leduc_holdem_v4,
    rps_v2,
    texas_holdem_no_limit_v6,
    texas_holdem_v4,
    tictactoe_v3,
)
from pettingzoo.mpe import (
    simple_adversary_v3,
    simple_crypto_v3,
    simple_push_v3,
    simple_reference_v3,
    simple_speaker_listener_v4,
    simple_spread_v3,
    simple_tag_v3,
    simple_v3,
    simple_world_comm_v3,
)
from pettingzoo.sisl import multiwalker_v9, pursuit_v4, waterworld_v4
all_prefixes = ["atari", "classic", "butterfly", "mpe", "sisl"]
manual_environments = {
"butterfly/knights_archers_zombies",
"butterfly/pistonball",
"butterfly/cooperative_pong",
"sisl/pursuit",
}
all_environments = {
"atari/basketball_pong_v3": basketball_pong_v3,
"atari/boxing_v2": boxing_v2,
"atari/combat_tank_v2": combat_tank_v2,
"atari/combat_plane_v2": combat_plane_v2,
"atari/double_dunk_v3": double_dunk_v3,
"atari/entombed_cooperative_v3": entombed_cooperative_v3,
"atari/flag_capture_v2": flag_capture_v2,
"atari/foozpong_v3": foozpong_v3,
"atari/joust_v3": joust_v3,
"atari/ice_hockey_v2": ice_hockey_v2,
"atari/maze_craze_v3": maze_craze_v3,
"atari/mario_bros_v3": mario_bros_v3,
"atari/othello_v3": othello_v3,
"atari/pong_v3": pong_v3,
"atari/quadrapong_v4": quadrapong_v4,
"atari/space_invaders_v2": space_invaders_v2,
"atari/space_war_v2": space_war_v2,
"atari/surround_v2": surround_v2,
"atari/tennis_v3": tennis_v3,
"atari/video_checkers_v4": video_checkers_v4,
"atari/volleyball_pong_v3": volleyball_pong_v3,
"atari/wizard_of_wor_v3": wizard_of_wor_v3,
"atari/warlords_v3": warlords_v3,
"classic/chess_v5": chess_v5,
"classic/rps_v2": rps_v2,
"classic/connect_four_v3": connect_four_v3,
"classic/tictactoe_v3": tictactoe_v3,
"classic/leduc_holdem_v4": leduc_holdem_v4,
"classic/texas_holdem_v4": texas_holdem_v4,
"classic/texas_holdem_no_limit_v6": texas_holdem_no_limit_v6,
"classic/gin_rummy_v4": gin_rummy_v4,
"classic/go_v5": go_v5,
"classic/hanabi_v4": hanabi_v4,
"butterfly/knights_archers_zombies_v10": knights_archers_zombies_v10,
"butterfly/pistonball_v6": pistonball_v6,
"butterfly/cooperative_pong_v5": cooperative_pong_v5,
"mpe/simple_adversary_v3": simple_adversary_v3,
"mpe/simple_crypto_v3": simple_crypto_v3,
"mpe/simple_push_v3": simple_push_v3,
"mpe/simple_reference_v3": simple_reference_v3,
"mpe/simple_speaker_listener_v4": simple_speaker_listener_v4,
"mpe/simple_spread_v3": simple_spread_v3,
"mpe/simple_tag_v3": simple_tag_v3,
"mpe/simple_world_comm_v3": simple_world_comm_v3,
"mpe/simple_v3": simple_v3,
"sisl/multiwalker_v9": multiwalker_v9,
"sisl/waterworld_v4": waterworld_v4,
"sisl/pursuit_v4": pursuit_v4,
}
def getRandomeActions(env, observations, terminations, truncations):
    # actions = {
    #     agent: env.action_space(agent).sample()
    #     for agent in env.agents
    #     if not (terminations[agent] or truncations[agent])
    # }
    actions = {}
    if observations is None:
        # build the observations from scratch if none were passed in
        observations = {}
    for agent in env.agents:
        if terminations[agent] or truncations[agent]:
            # action = None
            print("an agent has terminated or truncated")
            pass
        obs = None
        if agent in observations:
            obs = observations[agent]
        else:
            if hasattr(env, "observe"):
                obs = env.observe(agent)
        if isinstance(obs, dict) and "action_mask" in obs:
            if obs["action_mask"].max() != 0:
                action = random.choice(np.flatnonzero(obs["action_mask"]).tolist())
                actions.update({agent: action})
            else:
                action = None
                actions.update({agent: action})
        else:
            action = env.action_space(agent).sample()
            actions.update({agent: action})
        observations.update({agent: obs})
    return actions, observations
def getRandomAndStep(env, step=0):
    rewards = {}
    terminations = {}
    truncations = {}
    for agent in env.agent_iter():
        obs, reward, termination, truncation, _ = env.last()
        if termination or truncation:
            action = None
        elif isinstance(obs, dict) and "action_mask" in obs:
            action = random.choice(np.flatnonzero(obs["action_mask"]).tolist())
        else:
            action = env.action_space(agent).sample()
        # print(len(env.agents), termination, truncation)
        # if step == 0:
        #     action = 3565
        env.step(action)
        # obs, reward, termination, truncation, _ = env.last()
        # # print(len(env.agents), agent, termination, truncation, reward)
        # if termination or truncation:
        #     action = None
        #     break
        rewards.update({agent: reward})
        terminations.update({agent: termination})
        truncations.update({agent: truncation})
    return rewards, terminations, truncations
def random_demo(env, render=True, episodes=1, parallel=True):
    """Runs an env object with random actions."""
    total_reward = 0
    completed_episodes = 0
    lastAgentNum = 1
    # use getRandomeActions uniformly / parallel
    # parallel = True
    while completed_episodes < episodes:
        observations = env.reset()
        # nowNum = 0
        # for agent in env.agent_iter():
        #     nowNum += 1
        # if nowNum != lastAgentNum:
        #     print("number of agents changed", nowNum, lastAgentNum)
        # lastAgentNum = nowNum
        terminations = {agent: False for agent in env.possible_agents}
        truncations = {agent: False for agent in env.possible_agents}
        rewards = None
        test_cycles = 2000  # allows environment to do more than max_cycles if it so wishes
        if render:
            env.render()
        for step in range(test_cycles):
            if parallel:
                # observations also come from here; they may be turn-based, be careful
                actions, observations = getRandomeActions(
                    env, observations, terminations, truncations
                )
                if hasattr(env, "agent_selection"):
                    action = actions[env.agent_selection]
                    env.step(action)
                    rewards = env.env.rewards
                    terminations = env.env.terminations
                    truncations = env.env.truncations
                    infos = env.env.infos
                else:
                    observations, rewards, terminations, truncations, infos = env.step(actions)
                if rewards is not None:
                    for agent in rewards:
                        # print(agent, rewards[agent])
                        total_reward += rewards[agent] / float(len(rewards))
                # terminations is the equivalent of done
                if all([x or y for x, y in zip(terminations.values(), truncations.values())]):
                    print("finished early")
                    break
            else:
                # observations also come from here; they may be turn-based, be careful
                rewards, terminations, truncations = getRandomAndStep(env, step)
                if rewards is not None:
                    for agent in rewards:
                        # print(agent, rewards[agent])
                        total_reward += rewards[agent] / float(len(rewards))
                if all([x or y for x, y in zip(terminations.values(), truncations.values())]):
                    break
        if rewards is not None:
            print("Episode", completed_episodes, ":", rewards, total_reward)
        completed_episodes += 1
    if render:
        env.close()
    print("Average total reward", total_reward / episodes)
    return total_reward
if __name__ == "__main__":
    # pip install autorom
    # AutoROM
    # Using environments in PettingZoo is very similar to Gym, i.e. you initialize an environment via:
    # model = multiwalker_v9
    model = pong_v3
    # model = chess_v5
    # model = all_environments["atari/foozpong_v3"]
    # model = all_environments["sisl/multiwalker_v9"]
    # model = all_environments["atari/mario_bros_v3"]
    # from pettingzoo.butterfly import pistonball_v6
    # env = pistonball_v6.env(render_mode="human")
    # env = chess_v5.env(render_mode="human")
    # env = multiwalker_v9.env(render_mode="human")
    if hasattr(model, "parallel_env"):
        env = model.parallel_env(render_mode="human")
        parallel = True
    else:
        env = model.env(render_mode="human")
        parallel = False
    # for now, use the parallel path whenever it is available
    random_demo(env, render=True, episodes=10, parallel=parallel)
    # random_demo(env, render=True, episodes=10)
System info
Describe how PettingZoo was installed (pip, source, ...)
pip
Version of PettingZoo (from pettingzoo.__version__)
1.23.1
What OS/version you're using. Note that while we will accept PRs to improve Window's support, we do not officially support it.
wsl:ubuntu-22.04 (windows11)
Python version
3.10.6
Additional context
No response
Checklist
- I have checked that there is no similar issue in the repo
Hi, thanks for the bug report. Your code is a bit complicated, so I tried to reproduce the problem in as simple an example as possible, and I found the rewards were working fine. The following code prints any rewards which are nonzero.
env = pong_v3.env()
env.reset(seed=42)
turn = 0
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    if reward != 0:
        print(f"TURN {turn} | {agent}: {reward}")
    if termination or truncation:
        break
    else:
        if "action_mask" in info:
            mask = info["action_mask"]
        elif isinstance(observation, dict) and "action_mask" in observation:
            mask = observation["action_mask"]
        else:
            mask = None
        action = env.action_space(agent).sample(mask)  # this is where you would insert your policy
    env.step(action)
    turn += 1
env.close()
This should have an output similar to this:
TURN 266 | first_0: 1
TURN 267 | second_0: -1
TURN 548 | first_0: 1
TURN 549 | second_0: -1
TURN 830 | first_0: 1
TURN 831 | second_0: -1
However, I see that our average_total_reward utility doesn't actually work with zero-sum games, which most of our environments are, so I think I will make a PR adding a utility that tracks the scores and returns them after running games. As a basic way to track scores, try this code:
env = pong_v3.env()
env.reset(seed=42)
total_rewards = {agent: 0 for agent in env.agents}
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    # When the enemy scores, your agent gets a -1 reward, but your agent's total score stays the same
    # The environment ends when one player reaches 20 points
    if reward > 0:
        total_rewards[agent] += reward
    if termination or truncation:
        print(f"Final rewards: {total_rewards}")
        break
    else:
        if "action_mask" in info:
            mask = info["action_mask"]
        elif isinstance(observation, dict) and "action_mask" in observation:
            mask = observation["action_mask"]
        else:
            mask = None
        action = env.action_space(agent).sample(mask)  # this is where you would insert your policy
    env.step(action)
env.close()
This will tally up all scores received by agents over the course of the game:
Final rewards: {'first_0': 14, 'second_0': 20}
I found that the cause of the problem was test_cycles = 2000. 2000 cycles were not enough for Pong to complete a round, but when I changed it to 5000, I saw rewards of (1, -1). I had assumed the rewards would be the same as the displayed numbers, so I thought the rewards were always zero. In that case, let's close this issue.
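For reference, here is a minimal sketch (adapted from the snippet above; the seed and the step counter are my own additions) that counts how many agent_iter() steps pass before the first point is scored under random play. Note that each agent_iter() step is one agent's turn, so two steps correspond to roughly one game cycle:

from pettingzoo.atari import pong_v3

env = pong_v3.env()
env.reset(seed=42)
steps = 0
for agent in env.agent_iter():
    obs, reward, termination, truncation, info = env.last()
    if reward != 0:
        # first point scored; report how long it took
        print(f"first nonzero reward after {steps} steps: {agent} -> {reward}")
        break
    if termination or truncation:
        break
    env.step(env.action_space(agent).sample())
    steps += 1
env.close()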
The cause of the problem has been found, but it is still a problem. If a Pong game has to complete a whole round before the rewards of 1 and -1 are received, then training this Pong game would be roughly twice as complex as n squared, so I have reopened this issue.
Not sure where you're getting N squared for training, but I think you're misunderstanding how the rewards work.
Running the code that I sent, you can put a breakpoint inside the `if termination or truncation:` block and view it with human rendering, and you will see that it plays a single game to 20 points. Each time an agent scores, the rewards for that timestep are 1 for the agent who scored and -1 for the other agent. We know this is true because my code adds up these individual rewards to calculate the total reward, and that total is the same as you see on the screen. Here are some screenshots just to show what I mean:
The last reward value you see will just be whoever scores the winning point (the reward in the above screenshot), so you are correct that when the game is complete the rewards will be (1, -1), but you are incorrect that there is only a single reward at the end of the round; as I said above, each point that is scored results in a reward of (-1, 1) or vice versa.
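If you want to double check this without the screenshots, here is a quick sketch (my own addition, using unmasked random actions) that sums every per-step reward for each agent over a whole game. Each agent's net total should come out as its own points minus the opponent's points, which matches the score difference shown on screen:

from pettingzoo.atari import pong_v3

env = pong_v3.env()
env.reset(seed=42)
net = {agent: 0 for agent in env.agents}
for agent in env.agent_iter():
    obs, reward, termination, truncation, info = env.last()
    net[agent] += reward  # +1 when this agent scored, -1 when the opponent did
    action = None if termination or truncation else env.action_space(agent).sample()
    env.step(action)
env.close()
# illustrative only, exact numbers vary per run; e.g. a 14-20 game nets {'first_0': -6, 'second_0': 6}
print(net)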
@elliottower You are right. During the game there are rewards for whoever scores. My confusion is that this is not really a zero-sum game (the displayed score only goes up for the scorer), so a reward of (1, 0) seemed reasonable to me. Okay, let's change the title.
Talked with some other devs internally, and the way it currently works is as intended. The reward allows the agent to tell between three possible scenarios: you score (+1), nobody scores (0), and the opponent scores (-1). Two players cannot score in the same timestep, so there are no other options to consider. If the rewards returned were (1, 0) rather than (1, -1), then an agent would have no way of telling whether the opponent scored. Giving a negative reward provides negative feedback so the agent can learn not to let the opponent score again in the future.
As a side note, the way you calculate the final score is entirely up to you. You could tally up the points and only count the game as a 1 if your agent wins and a -1 if it loses. Or you could use the actual values, calculated by tallying like I showed above (e.g., 20 points to 12 points). I am considering copying this wrapper into PettingZoo so we have some better utilities to track statistics and total rewards and such; if you think this would be helpful, please let me know.
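For example, if you went the win/loss route, something like this would do it (just a sketch; game_outcome is a hypothetical helper, and the input is a tally dict like the total_rewards produced above):

# Sketch: turn per-point tallies (e.g. {'first_0': 14, 'second_0': 20})
# into +1 for the winner and -1 for the loser (Pong to 20 cannot tie).
def game_outcome(total_rewards):
    best = max(total_rewards.values())
    return {agent: 1 if score == best else -1 for agent, score in total_rewards.items()}

print(game_outcome({"first_0": 14, "second_0": 20}))  # {'first_0': -1, 'second_0': 1}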
This wrapper is useful. However, the reason I was initially confused is that the scores displayed in Pong are inconsistent with the rewards the environment provides. For AI learning, it seems that (+1, -1) and (+1, 0) would work out about the same, because during training only a single reward value is provided and the sign of the reward may have little impact (this would require dedicated experiments to validate). Therefore, I still believe that aligning the rewards with the scores displayed in the game would be the best approach, but the current solution is also acceptable.
Are you suggesting that the scores be returned as the totals? So when the final point is scored, the winning agent receives a reward of 20? That would likely mess with the learning, as the scale of rewards would change so much over the course of the game. Otherwise, if you are suggesting that the rewards be (+1, 0) rather than (+1, -1), we could do that, but I see no strong reason to: at best it's the same, and at worst it loses some information. I'll also note that if you use the parallel wrappers, the rewards are returned for both agents each timestep, so having (+1, -1) makes the most sense for that scenario.
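To illustrate that last point, here's a quick sketch of the standard parallel loop (my own example, not taken from the code in this issue): rewards for both agents come back together every timestep, so scoring shows up as a matched (+1, -1) pair in one dict.

from pettingzoo.atari import pong_v3

env = pong_v3.parallel_env()
env.reset(seed=42)  # depending on your PettingZoo version, reset may also return infos
while env.agents:
    # random action for every live agent, stepped simultaneously
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
    if any(r != 0 for r in rewards.values()):
        print(rewards)  # e.g. {'first_0': 1, 'second_0': -1}
env.close()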
And yeah, I was going to (or already did?) link that wrapper; I am planning to implement it as something in PettingZoo as well. I talked with the Gymnasium devs a bit about how it should be implemented. If you're interested in helping out with that, we're always looking for new contributors.
Appreciate you opening the issue, definitely good to take a look into things like this and ensure they all make sense and are done the right way.
Otherwise, if you are suggesting that the rewards be (+1, 0) rather than (+1, -1), we could do that, but I see no strong reason to: at best it's the same, and at worst it loses some information. I'll also note that if you use the parallel wrappers, the rewards are returned for both agents each timestep, so having (+1, -1) makes the most sense for that scenario.
This makes a lot of sense. I originally thought it would be better to return (1, 0) and have the user add up the scores themselves to match the display, but what you said seems more reasonable.