lmdb_cache
is a Python library leveraging LMDB for efficient and fast data handling, ideal for machine learning workflows. It simplifies the process of storing and retrieving large datasets using LMDB.
- Efficient Serialization: Serialize anything: it utilizes
dill
for object serialization and deserialization. - Two-Stage Data Handling:
- Stage 1: Use
dump2lmdb
to dump large datasets into an LMDB database. This is the slowest part. - Stage 2: Once the dataset is dumped, use
LMDBReadDict
to retrieve data. It supports parallel retrieval with multiprocessing
for high-throughput applications.
- Designed to be used within ML training pipelines: can be integrated with PyTorch
Dataset
and DataLoader, making it ideal for multi-process data loading in machine learning pipelines.
python3 -m pip install git+https://github.com/Red-Eyed/lmdb_cache.git
from pathlib import Path

from lmdb_cache import dump2lmdb

# Target location of the LMDB database on disk.
db_path = Path("/path/to/lmdb/database")

# (key, value) pairs to store; values can be any serializable object.
data_iterable = [(idx, f"data_{idx}") for idx in range(1000)]

dump2lmdb(db_path, data_iterable)
from pathlib import Path

from lmdb_cache import LMDBReadDict

# Open the database previously written by dump2lmdb.
db_path = Path("/path/to/lmdb/database")
lmdb_dict = LMDBReadDict(db_path)

# Fetch each record by the key it was stored under.
for i in range(1000):
    data = lmdb_dict[i]
    print(f"Key: {i}, Data: {data}")
from pathlib import Path

import torch
from torch.utils.data import DataLoader, Dataset

from lmdb_cache import LMDBReadDict, dump2lmdb
class LMDBDataset(Dataset):
    """PyTorch ``Dataset`` backed by an LMDB database written with ``dump2lmdb``.

    Wraps ``LMDBReadDict`` so the dumped records can be consumed through a
    ``DataLoader`` (per the library's README, reads are intended to support
    multi-process data loading).
    """

    def __init__(self, lmdb_path):
        """Open the LMDB database at *lmdb_path* for reading."""
        # LMDBReadDict provides dict-like, read-only access to the records.
        self.lmdb_dict = LMDBReadDict(lmdb_path)

    def __len__(self):
        """Return the number of records stored in the database."""
        return len(self.lmdb_dict)

    def __getitem__(self, idx):
        """Return the record stored under key *idx*."""
        return self.lmdb_dict[idx]
# Usage
# Stage 1: dump the dataset once -- this is the slow part.
db_path = Path("/path/to/lmdb/database")
data_iterable = [(i, f"data_{i}") for i in range(1000)]
dump2lmdb(db_path, data_iterable)

# Stage 2: read it back as many times as needed.
lmdb_dataset = LMDBDataset(db_path)
data_loader = DataLoader(lmdb_dataset, batch_size=32, shuffle=True)
for batch in data_loader:
    # Process your batch
    ...