Cognitive-AI-Systems/pogema

About integrating Sample Factory

Closed this issue · 3 comments

Hello, I have run into some difficulties while integrating Sample Factory. Could you provide the test code that Pogema uses for its Sample Factory integration? Thank you very much.
Here is part of my code; I am not sure whether I am using it correctly:

from typing import Literal, Optional

import numpy as np
from pydantic import BaseModel
from gymnasium import Wrapper
from gymnasium.spaces import Box, Dict
from pogema import pogema_v0, GridConfig, AnimationMonitor, AnimationConfig
# HeuristicPlanner is my own planner module; its import is omitted here.


class EnvConfigs(BaseModel):
    on_target: Literal['finish', 'nothing', 'restart'] = 'restart'  # terminated
    env_seed: Optional[int] = 3
    size: int = 5
    density: float = 0.3
    num_agents: int = 1
    obs_radius: int = 2
    collision_system: Literal['block_both', 'priority', 'soft'] = 'soft'
    observation_type: Literal['POMAPF', 'MAPF', 'default'] = 'POMAPF'
    integration: Literal['SampleFactory', 'PyMARL', 'rllib', 'gym', 'PettingZoo'] = 'SampleFactory'
    max_episode_steps: int = 128  # truncated
    animation_monitor: bool = False
    auto_reset: bool = False
    extra_reward: list = [0.01, 0.0]

class CustomEnv(Wrapper):

    def __init__(self, full_env_name, cfg, render_mode: Optional[str] = None):
        self.env_name = full_env_name
        self.cfg = cfg
        self.env = self.create_random_env()
        self.re_plan = HeuristicPlanner()
        self.extra_reward = [0.0 for i in range(self.cfg.num_agents)]
        self.to_concat = ['obstacles', 'agents']
        super().__init__(self.env)
        self.observation_space = self.update_observation_space()

    def update_observation_space(self):
        observation_space = Dict({
            'obs': Box(0.0, 1.0, (2, self.cfg.obs_radius * 2 + 1, self.cfg.obs_radius * 2 + 1), dtype=np.float32),
            'xy': Box(-1024, 1024, (2,), dtype=np.int64),
            'target_xy': Box(-1024, 1024, (2,), dtype=np.int64)
        })
        return observation_space

"""environment={'grid_config': {'MOVES': [[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 'FREE': 0, 'OBSTACLE': 1, 
'empty_outside': True, 'on_target': 'restart', 'seed': None, 'size': 20, 'density': 0.3, 'num_agents': 16, 
'obs_radius': 5, 'agents_xy': None, 'targets_xy': None, 'collision_system': 'soft', 'persistent': False, 
'observation_type': 'POMAPF', 'map': None, 'integration': 'SampleFactory', 
'max_episode_steps': 512, 'auto_reset': False}"""
def create_random_env(self):
    env_cfg = GridConfig(
        on_target=self.cfg.on_target,
        # seed=self.cfg.env_seed,
        size=self.cfg.size,
        density=self.cfg.density,
        num_agents=self.cfg.num_agents,
        obs_radius=self.cfg.obs_radius,
        collision_system=self.cfg.collision_system,
        observation_type=self.cfg.observation_type,
        integration=self.cfg.integration,
        max_episode_steps=self.cfg.max_episode_steps,
        auto_reset=self.cfg.auto_reset
    )
    env = pogema_v0(grid_config=env_cfg)
    env = EnvEnhancedWrapper(env)
    if self.cfg.animation_monitor:      # False
        env = AnimationMonitor(env, AnimationConfig(show_lines=True))
    return env

    def add_extra_reward(self, reward):
        for k, r in enumerate(self.extra_reward):
            reward[k] += r
        return reward

    def reshape_observation(self, observations):
        xy = list()
        target_xy = list()

        for k in range(self.cfg.num_agents):
            if np.all(self.re_plan.next_better_point[k] == np.array(observations[k]['xy'], dtype=int)):
                self.extra_reward[k] = 0.01
            elif np.all(self.re_plan.next_better_point[k] == np.array([-1, -1], dtype=int)):
                self.extra_reward[k] = 0.0
            else:
                self.extra_reward[k] = -0.01
            xy.append(observations[k]['xy'])
            target_xy.append(observations[k]['target_xy'])

        # mark the planned path in the local obstacle map with -1
        paths = self.re_plan.get_path(xy, target_xy)
        for i, path in enumerate(paths):
            observations[i]['obstacles'][(self.cfg.obs_radius, self.cfg.obs_radius)] = -1
            for point in path:
                abs_offset = (point[0] - xy[i][0], point[1] - xy[i][1])
                obstacles_point = (self.cfg.obs_radius + abs_offset[0], self.cfg.obs_radius + abs_offset[1])
                if obstacles_point[0] >= 0 and obstacles_point[1] >= 0:     # avoid negative indices, which would wrap around
                    try:
                        observations[i]['obstacles'][obstacles_point] = -1
                    except IndexError:
                        pass

        # assemble the final observation dict expected by the observation space
        for agent_idx, obs in enumerate(observations):
            obs['obstacles'][obs['obstacles'] != 0] *= -1
            main_obs = np.concatenate([obs[key][None] for key in self.to_concat])
            for key in self.to_concat:
                del obs[key]
            for key in obs:
                obs[key] = np.array(obs[key], dtype=np.int64)
            observations[agent_idx]['obs'] = main_obs.astype(np.float32)

        return observations

    def step(self, actions):
        observations, reward, done, tr, info = self.env.step(actions)
        return self.reshape_observation(observations), self.add_extra_reward(reward), done, tr, info

    def reset(self, **kwargs):
        observations, info = self.env.reset(**kwargs)
        self.extra_reward = [0.0 for i in range(self.cfg.num_agents)]
        self.re_plan.reset(self.env.get_global_obstacles(), self.env.get_global_agents_xy(), self.cfg.obs_radius)
        return self.reshape_observation(observations), info

class EnvEnhancedWrapper(Wrapper):
    def get_global_obstacles(self):
        return self.grid.get_obstacles().astype(int)

    def get_global_agents_xy(self):
        return self.grid.get_agents_xy()

Judging from the training results, something seems to be wrong.

Hello, there could be many possible problems in your code.

As a working example of the latest SampleFactory integration (SF 2.0), I suggest looking at https://github.com/AIRI-Institute/learn-to-follow, which also demonstrates how to modify the environment and shape the reward function. For preliminary experiments, I recommend using FollowerLite.
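Roughly, the registration pattern with SF 2.0 looks like the sketch below. It is not the exact code from learn-to-follow; the env name, grid parameters, and entry point are placeholders, and the Pogema env is created with `integration="SampleFactory"` so it returns the per-agent format Sample Factory expects.

```python
# Minimal sketch: register a Pogema environment with Sample Factory 2.0 and launch training.
import sys

from pogema import pogema_v0, GridConfig
from sample_factory.cfg.arguments import parse_full_cfg, parse_sf_args
from sample_factory.envs.env_utils import register_env
from sample_factory.train import run_rl


def make_pogema_env(full_env_name, cfg=None, env_config=None, render_mode=None):
    # Grid parameters here are placeholders; integration="SampleFactory" enables the SF-compatible interface.
    grid_config = GridConfig(
        size=20,
        density=0.3,
        num_agents=16,
        obs_radius=5,
        max_episode_steps=512,
        integration="SampleFactory",
    )
    return pogema_v0(grid_config=grid_config)


def main():
    register_env("pogema-v0", make_pogema_env)  # the name is arbitrary, but must match --env
    parser, _ = parse_sf_args(evaluation=False)
    cfg = parse_full_cfg(parser)
    return run_rl(cfg)


if __name__ == "__main__":
    sys.exit(main())
```

Launching it with `--env=pogema-v0 --experiment=test_pogema` (plus the usual SF hyperparameters) should start training; learn-to-follow builds on the same pieces and adds the environment modifications and reward shaping on top.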

I believe the problem has been resolved; therefore, I am closing the issue.