KeyError occurs when loading custom dataset
Opened this issue · 12 comments
I am trying to load a dataset created from a Minigrid-type environment using the following code:
import os
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from gymnasium import spaces
from stable_baselines3 import PPO
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import minari
from minari import DataCollectorV0
def collate_fn(batch):
    """Collate a list of episodes into a single padded batch dictionary.

    The per-episode scalars (``id``, ``seed``, ``total_timesteps``) are
    gathered into 1-D tensors. The per-step arrays are converted to tensors
    and padded along the time axis to the longest episode in the batch, with
    the batch dimension first.

    Args:
        batch: Sequence of episode objects exposing the attributes ``id``,
            ``seed``, ``total_timesteps``, ``observations``, ``actions``,
            ``rewards``, ``terminations`` and ``truncations``.

    Returns:
        A dict with one tensor per field listed above.
    """

    def _pad(sequences):
        # pad_sequence fills the shorter episodes with its default padding
        # value so that every episode shares one time dimension.
        return torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(seq) for seq in sequences],
            batch_first=True,
        )

    return {
        "id": torch.Tensor([x.id for x in batch]),
        "seed": torch.Tensor([x.seed for x in batch]),
        "total_timesteps": torch.Tensor([x.total_timesteps for x in batch]),
        "observations": _pad(x.observations for x in batch),
        "actions": _pad(x.actions for x in batch),
        "rewards": _pad(x.rewards for x in batch),
        "terminations": _pad(x.terminations for x in batch),
        "truncations": _pad(x.truncations for x in batch),
    }
# Fix the torch RNG seed so the shuffled batch order is reproducible.
torch.manual_seed(42)
minari_testset = minari.load_dataset("MinigridRandomWall-6Spots-v0")
# The loader must consume the dataset bound on the line above; the name
# `minari_dataset` used previously was never defined (NameError).
dataloader = DataLoader(minari_testset, batch_size=64, shuffle=True, collate_fn=collate_fn)
for batch in dataloader:
    print("Observation shape: " + str(batch['observations'].shape))
    print("Action shape: " + str(batch['actions'].shape))
    print("Reward shape: " + str(batch['rewards'].shape))
    # NOTE(review): the collate_fn defined above returns no "infos" key, so
    # this lookup fails with KeyError once a batch is actually produced.
    print("Timestep shape " + str(batch["infos"]["timestep"].shape))
When I run the code, I get this error:
minari_testset = minari.load_dataset("MinigridRandomWall-6Spots-v0")
File "/home/justin/Minari/minari/storage/local.py", line 22, in load_dataset
return MinariDataset(data_path)
File "/home/justin/Minari/minari/dataset/minari_dataset.py", line 133, in __init__
self._data = MinariStorage(data)
File "/home/justin/Minari/minari/dataset/minari_storage.py", line 22, in __init__
flatten_observations = f.attrs["flatten_observation"].item()
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "/usr/local/lib/python3.8/dist-packages/h5py/_hl/attrs.py", line 56, in __getitem__
attr = h5a.open(self._id, self._e(name))
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5a.pyx", line 80, in h5py.h5a.open
KeyError: "Can't open attribute (can't locate attribute in name index)"
When I created the dataset, I used an ImgObsWrapper
for the environment. Could that be the source of the problem?
Thanks for the report! We'll look into this.
@JustinS6626 I think we will also need the code used to create the dataset in order to debug this.
Thanks for getting back to me so soon! Posting the code would be difficult, since it includes some proprietary elements. However, if there is any chance that you would be able to post a worked example of creating a dataset using a Minigrid environment, I think that would solve the problem.
Just a quick update, I tried creating a dataset with the empty minigrid environment, just to see if the problem was with my custom environment. When I tried to create the dataset for the 8x8 empty minigrid, I got the following error:
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 560
warnings.warn(
UserWarning: `code_permalink` is set to None. For reproducibility purposes it is highly recommended to link your dataset to versioned code.
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 583
warnings.warn(
UserWarning: `minari_version` is set to None. The compatible dataset version specifier for Minari will be set to ~=0.4.
Traceback (most recent call last):
File "/usr/lib/python3.11/idlelib/run.py", line 578, in runcode
exec(code, self.locals)
File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 143, in <module>
record_episodes(model, grid_name, max_steps, collector_env, data_name)
File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 131, in record_episodes
dataset = minari.create_dataset_from_collector_env(collector_env=data_collect,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 630, in create_dataset_from_collector_env
collector_env.save_to_disk(
File "/usr/local/lib/python3.11/dist-packages/minari/data_collector/data_collector.py", line 495, in save_to_disk
observation_space_str = serialize_space(self.dataset_observation_space)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/functools.py", line 909, in wrapper
return dispatch(args[0].__class__)(*args, **kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/serialization.py", line 50, in _serialize_dict
result["subspaces"][key] = serialize_space(space.spaces[key], to_string=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/functools.py", line 909, in wrapper
return dispatch(args[0].__class__)(*args, **kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/serialization.py", line 14, in serialize_space
raise NotImplementedError(f"No serialization method available for {space}")
NotImplementedError: No serialization method available for MissionSpace(<function EmptyEnv._gen_mission at 0x7fce50fc3420>, None)
I tried to resolve the issue by using an ImgObsWrapper
for the environment, but I got the following error:
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 560
warnings.warn(
UserWarning: `code_permalink` is set to None. For reproducibility purposes it is highly recommended to link your dataset to versioned code.
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 583
warnings.warn(
UserWarning: `minari_version` is set to None. The compatible dataset version specifier for Minari will be set to ~=0.4.
Traceback (most recent call last):
File "/usr/lib/python3.11/idlelib/run.py", line 578, in runcode
exec(code, self.locals)
File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 143, in <module>
record_episodes(model, grid_name, max_steps, collector_env, data_name)
File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 131, in record_episodes
dataset = minari.create_dataset_from_collector_env(collector_env=data_collect,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 641, in create_dataset_from_collector_env
return MinariDataset(data_path)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 156, in __init__
self.spec = MinariDatasetSpec(
^^^^^^^^^^^^^^^^^^
File "<string>", line 12, in __init__
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 111, in __post_init__
self.env_name, self.dataset_name, self.version = parse_dataset_id(
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 39, in parse_dataset_id
version = int(version)
^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
I am not sure if this is because the current Minari implementation is not set up to handle Minigrid environments, or because I am doing something wrong.
Here is a toy example of what I am trying to do:
import os
import gymnasium as gym
import numpy as np
from minigrid.wrappers import *
import minari
from minari import DataCollectorV0, StepDataCallback
env_name = "MiniGrid-Empty-8x8-v0"
# NOTE(review): a later reply in this thread states that the dataset ID parser
# expects "<env name>-<dataset name>-<version number>"; this ID has no
# dataset-name part.
data_name = "MinigridEmpty-v0"
# Reuse a locally stored dataset with this ID when one exists.
if data_name in minari.list_local_datasets():
    dataset = minari.load_dataset(data_name)
else:
    dataset = None
env = gym.make(env_name, max_episode_steps=100, render_mode="human")
# Wrap the environment so the mission entry is not part of the observation
# (the MissionSpace is what the serializer rejected in the traceback above).
env = ImgObsWrapper(env)
class MinigridDataCallback(StepDataCallback):
    """Step callback that also stores the environment's step counter.

    The base callback builds the step dictionary; this subclass adds the
    current ``step_count`` of the environment under ``infos["timestep"]``.
    """

    def __call__(self, env, obs, info, action=None, rew=None, terminated=None, truncated=None):
        step_data = super().__call__(env, obs, info, action, rew, terminated, truncated)
        # Read the counter from the base environment rather than relying on
        # wrappers forwarding unknown attributes to the env they wrap.
        step_data["infos"]["timestep"] = env.unwrapped.step_count
        return step_data
# Clear the module-level handle; this overwrites whatever the
# list_local_datasets() check above assigned to `dataset`.
dataset = None
# Wrap the environment in the collector. The callback is passed as a class,
# not as an instance.
collector_env = DataCollectorV0(env, step_data_callback=MinigridDataCallback, record_infos=True)
def record_episodes(step_total, data_collect, data_name):
    """Step the collector with random actions, then save a dataset.

    Args:
        step_total: Number of environment steps to take. With ``0`` the
            environment is only reset and no dataset is created.
        data_collect: Collector-wrapped environment; must provide ``reset``
            and ``step``.
        data_name: Dataset ID handed to
            ``minari.create_dataset_from_collector_env``.
    """
    data_collect.reset(seed=123)
    dataset = None
    for n_step in range(step_total):
        print("Current step: " + str(n_step))
        act = np.random.randint(3)
        _, _, terminated, truncated, _ = data_collect.step(act)
        if truncated or terminated:
            # Start a new episode with the same seed as the first one.
            data_collect.reset(seed=123)
        # True only on the final step, so the data is written once at the end.
        if (n_step + 1) % step_total == 0:
            if dataset is None:
                dataset = minari.create_dataset_from_collector_env(
                    collector_env=data_collect,
                    dataset_id=data_name,
                    algorithm_name="RandomGen",
                )
            else:
                dataset.update_dataset_from_collector_env(data_collect)
# Total number of environment steps to record.
max_steps = 500
record_episodes(max_steps, collector_env, data_name)
I would really appreciate it if, based on that example, you could provide some guidance on how to save and load a dataset successfully.
Just a quick update, I tried creating a dataset with the empty minigrid environment, just to see if the problem was with my custom environment. When I tried to create the dataset for the 8x8 empty minigrid, I got the following error:
Warning (from warnings module): File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 560 warnings.warn( UserWarning: `code_permalink` is set to None. For reproducibility purposes it is highly recommended to link your dataset to versioned code. Warning (from warnings module): File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 583 warnings.warn( UserWarning: `minari_version` is set to None. The compatible dataset version specifier for Minari will be set to ~=0.4. Traceback (most recent call last): File "/usr/lib/python3.11/idlelib/run.py", line 578, in runcode exec(code, self.locals) File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 143, in <module> record_episodes(model, grid_name, max_steps, collector_env, data_name) File "/home/justin/darthmallocsarchive-svn/trunk/DissertationExperiment/CausalTensornet/RandomWallMinigridDatagen.py", line 131, in record_episodes dataset = minari.create_dataset_from_collector_env(collector_env=data_collect, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 630, in create_dataset_from_collector_env collector_env.save_to_disk( File "/usr/local/lib/python3.11/dist-packages/minari/data_collector/data_collector.py", line 495, in save_to_disk observation_space_str = serialize_space(self.dataset_observation_space) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/functools.py", line 909, in wrapper return dispatch(args[0].__class__)(*args, **kw) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/dist-packages/minari/serialization.py", line 50, in _serialize_dict result["subspaces"][key] = serialize_space(space.spaces[key], to_string=False) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/functools.py", line 909, in wrapper return dispatch(args[0].__class__)(*args, **kw) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/dist-packages/minari/serialization.py", line 14, in serialize_space raise NotImplementedError(f"No serialization method available for {space}") NotImplementedError: No serialization method available for MissionSpace(<function EmptyEnv._gen_mission at 0x7fce50fc3420>, None)
This error occurs because there is no serialization method for MissionSpace. You can write custom serialization and deserialization functions and register them with serialize_space
and deserialize_space
in minari.serialization
(check https://github.com/Farama-Foundation/Minari/blob/main/minari/serialization.py to understand more); we are planning to write a tutorial about it.
Or, a simpler fix is to overwrite the observation space when creating the DataCollectorV0 and replace the mission with a Text space.
@younik Thank you very much for getting back to me about that! Are there any examples of how to overwrite the observation space for the DataCollector and replace the mission space with a Text space?
Here is an example:
import gymnasium as gym
import minigrid
import minari
from minari import DataCollectorV0
from gymnasium.spaces.text import alphanumeric
env = gym.make("MiniGrid-Fetch-5x5-N2-v0")
# Upper bound on the mission length accepted by the Text space below.
max_len = len("you must fetch a purple ball")
# Same observation layout as the environment, with the mission entry replaced
# by a Text space.
obs_space = gym.spaces.Dict({
    "direction": env.observation_space["direction"],
    "image": env.observation_space["image"],
    "mission": gym.spaces.Text(
        max_length=max_len,
        # `alphanumeric` is a frozenset of characters. Join it instead of
        # calling str() on it, which would add the repr's punctuation
        # ("frozenset({', })") to the allowed characters.
        charset="".join(sorted(alphanumeric)) + " ",
    ),
})
env = DataCollectorV0(env, record_infos=True, max_buffer_steps=1_000_000, observation_space=obs_space)
Thank you very much! I think that part of the code might be fixed. My implementation now looks like this:
import os
import gymnasium as gym
import numpy as np
from minigrid.wrappers import *
import minari
from minari import DataCollectorV0, StepDataCallback
from gymnasium.spaces.text import alphanumeric
env_name = "MiniGrid-Empty-8x8-v0"
# NOTE(review): a later reply in this thread states that the dataset ID parser
# expects "<env name>-<dataset name>-<version number>"; this ID has no
# dataset-name part.
data_name = "MinigridEmpty-v0"
# Reuse a locally stored dataset with this ID when one exists.
if data_name in minari.list_local_datasets():
    dataset = minari.load_dataset(data_name)
else:
    dataset = None
env = gym.make(env_name, max_episode_steps=100, render_mode="human")
# Upper bound on the mission length accepted by the Text space below.
max_len = len("you must reach the target area")
#env = ImgObsWrapper(env)
# Same observation layout as the environment, with the mission entry replaced
# by a Text space.
obs_space = gym.spaces.Dict({
    "direction": env.observation_space["direction"],
    "image": env.observation_space["image"],
    "mission": gym.spaces.Text(
        max_length=max_len,
        # `alphanumeric` is a frozenset of characters. Join it instead of
        # calling str() on it, which would add the repr's punctuation
        # ("frozenset({', })") to the allowed characters.
        charset="".join(sorted(alphanumeric)) + " ",
    ),
})
class MinigridDataCallback(StepDataCallback):
    """Step callback that also stores the environment's step counter.

    The base callback builds the step dictionary; this subclass adds the
    current ``step_count`` of the environment under ``infos["timestep"]``.
    """

    def __call__(self, env, obs, info, action=None, rew=None, terminated=None, truncated=None):
        step_data = super().__call__(env, obs, info, action, rew, terminated, truncated)
        # Read the counter from the base environment rather than relying on
        # wrappers forwarding unknown attributes to the env they wrap.
        step_data["infos"]["timestep"] = env.unwrapped.step_count
        return step_data
# Clear the module-level handle; this overwrites whatever the
# list_local_datasets() check above assigned to `dataset`.
dataset = None
# Wrap the environment in the collector, passing the replacement observation
# space built above. The callback is passed as a class, not as an instance.
collector_env = DataCollectorV0(env, step_data_callback=MinigridDataCallback, record_infos=True, observation_space=obs_space)
def record_episodes(step_total, data_collect, data_name):
    """Step the collector with random actions, then save a dataset.

    Args:
        step_total: Number of environment steps to take. With ``0`` the
            environment is only reset and no dataset is created.
        data_collect: Collector-wrapped environment; must provide ``reset``
            and ``step``.
        data_name: Dataset ID handed to
            ``minari.create_dataset_from_collector_env``.
    """
    data_collect.reset(seed=123)
    dataset = None
    for n_step in range(step_total):
        print("Current step: " + str(n_step))
        act = np.random.randint(3)
        _, _, terminated, truncated, _ = data_collect.step(act)
        if truncated or terminated:
            # Start a new episode with the same seed as the first one.
            data_collect.reset(seed=123)
        # True only on the final step, so the data is written once at the end.
        if (n_step + 1) % step_total == 0:
            if dataset is None:
                dataset = minari.create_dataset_from_collector_env(
                    collector_env=data_collect,
                    dataset_id=data_name,
                    algorithm_name="RandomGen",
                )
            else:
                dataset.update_dataset_from_collector_env(data_collect)
# Total number of environment steps to record.
max_steps = 500
record_episodes(max_steps, collector_env, data_name)
Now, however, I am getting a different error:
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 560
warnings.warn(
UserWarning: `code_permalink` is set to None. For reproducibility purposes it is highly recommended to link your dataset to versioned code.
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 565
warnings.warn(
UserWarning: `author` is set to None. For longevity purposes it is highly recommended to provide an author name.
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 570
warnings.warn(
UserWarning: `author_email` is set to None. For longevity purposes it is highly recommended to provide an author email, or some other obvious contact information.
Warning (from warnings module):
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 583
warnings.warn(
UserWarning: `minari_version` is set to None. The compatible dataset version specifier for Minari will be set to ~=0.4.
Traceback (most recent call last):
File "/usr/lib/python3.11/idlelib/run.py", line 578, in runcode
exec(code, self.locals)
File "/home/justin/minari_test.py", line 60, in <module>
record_episodes(max_steps, collector_env, data_name)
File "/home/justin/minari_test.py", line 51, in record_episodes
dataset = minari.create_dataset_from_collector_env(collector_env=data_collect,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/utils.py", line 641, in create_dataset_from_collector_env
return MinariDataset(data_path)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 156, in __init__
self.spec = MinariDatasetSpec(
^^^^^^^^^^^^^^^^^^
File "<string>", line 12, in __init__
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 111, in __post_init__
self.env_name, self.dataset_name, self.version = parse_dataset_id(
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/minari/dataset/minari_dataset.py", line 39, in parse_dataset_id
version = int(version)
^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Based on that, I am wondering where the version should be specified. My Minari version is 0.4.1.
So our dataset ID parser was fussing because it expects the format "<env name>-<dataset name>-<version number>". You can use "MinigridEmpty-random-v0".
Thank you very much! That fixes the problem, except for one small issue. I am also trying to save and load the timestep number of each step the agent takes, as you probably noticed in the MinigridDataCallback
class. I am trying to test out loading the data as shown in the code below:
import os
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from gymnasium import spaces
from stable_baselines3 import PPO
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import minari
from minari import DataCollectorV0
def collate_fn(batch):
    """Collate a list of episodes into a single padded batch dictionary.

    The per-episode scalars (``id``, ``seed``, ``total_timesteps``) are
    gathered into 1-D tensors. The per-step arrays are converted to tensors
    and padded along the time axis to the longest episode in the batch, with
    the batch dimension first. Only the ``"image"`` entry of each episode's
    observations is used.

    Args:
        batch: Sequence of episode objects exposing ``id``, ``seed``,
            ``total_timesteps``, ``observations`` (indexable by ``"image"``),
            ``actions``, ``rewards``, ``terminations``, ``truncations`` and
            ``infos`` (indexable by ``"timestep"``).

    Returns:
        A dict with one tensor per field; the step counters are returned
        under the key ``"timesteps"``.

    Raises:
        AttributeError: If an episode object has no ``infos`` attribute.
    """

    def _pad(sequences):
        # pad_sequence fills the shorter episodes with its default padding
        # value so that every episode shares one time dimension.
        return torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(seq) for seq in sequences],
            batch_first=True,
        )

    # Debug aid: show which episode type the DataLoader hands over.
    print(type(batch[0]))
    return {
        "id": torch.Tensor([x.id for x in batch]),
        "seed": torch.Tensor([x.seed for x in batch]),
        "total_timesteps": torch.Tensor([x.total_timesteps for x in batch]),
        "observations": _pad(x.observations["image"] for x in batch),
        "actions": _pad(x.actions for x in batch),
        "rewards": _pad(x.rewards for x in batch),
        "terminations": _pad(x.terminations for x in batch),
        "truncations": _pad(x.truncations for x in batch),
        "timesteps": _pad(x.infos["timestep"] for x in batch),
    }
# Fix the torch RNG seed so the shuffled batch order is reproducible.
torch.manual_seed(42)
#dataname = "MinigridRandomWall-6Spots-v0"
dataname = "MinigridEmpty-8x8-v0"
minari_testset = minari.load_dataset(dataname)
dataloader = DataLoader(minari_testset, batch_size=64, shuffle=True, collate_fn=collate_fn)
# Print the shape of each padded field for every batch.
for batch in dataloader:
    print("Observation shape: " + str(batch['observations'].shape))
    print("Action shape: " + str(batch['actions'].shape))
    print("Reward shape: " + str(batch['rewards'].shape))
    print("Timestep shape " + str(batch["timesteps"].shape))
When I use that approach, however, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.11/idlelib/run.py", line 578, in runcode
exec(code, self.locals)
File "/home/justin/MinariLoadTest.py", line 55, in <module>
for batch in dataloader:
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 634, in __next__
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 678, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
return self.collate_fn(data)
^^^^^^^^^^^^^^^^^^^^^
File "/home/justin/MinariLoadTest.py", line 43, in collate_fn
[torch.as_tensor(x.infos["timestep"]) for x in batch],
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/justin/MinariLoadTest.py", line 43, in <listcomp>
[torch.as_tensor(x.infos["timestep"]) for x in batch],
^^^^^^^
AttributeError: 'EpisodeData' object has no attribute 'infos'
What would be the correct way to add timestep numbering to my dataset?
Unfortunately, our EpisodeData
dataclass doesn't have an infos field. We will address this in the next release, and provide a correct way to retrieve infos.