google-research/t5x

download pile dataset is not working

hyoo opened this issue · 1 comments

hyoo commented

It seems that https://the-eye.eu/public/AI/pile/ is not reachable.

Is there any other reliable alternative location to fetch data?

hyoo commented

I learned that the dataset has been removed due to a DMCA claim. There is an alternative location: https://huggingface.co/datasets/EleutherAI/the_pile_deduplicated/tree/main

The data there is stored in Parquet format, so you will need to read from Parquet (instead of jsonl.zst) and convert to TFRecord.
Here is an example:

import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf
import io
import zstandard
import jsonlines
import os
import time
from itertools import chain
import glob

# Human-readable description handed to tfds.core.DatasetInfo in ThePile._info.
_DESCRIPTION = """
The Pile is a large, diverse, open source language modelling data set
that consists of many smaller datasets combined together.
The objective is to obtain text from as many modalities as possible to
ensure that models trained using The Pile will have much broader generalization abilities.
We are currently developing Version 1, with an ultimate goal of 1 TiB of English text.
After the completion of Version 1, our next goal is a fully-multilingual, 10TiB text dataset.
"""
# Citation text handed to DatasetInfo; currently contains only a newline.
_CITATION = """
"""
# One ThePileConfig is created for each mode listed here (see BUILDER_CONFIGS).
_DATASET_MODES = ["lm"]
# Version shared by every builder config.
_VERSION = tfds.core.Version('1.0.0')

# Unconditionally overrides TFDS_DATA_DIR for this process with a
# machine-specific path; adjust it for your environment.
# NOTE(review): this assignment runs after `import tensorflow_datasets`;
# confirm that TFDS still picks up the variable when set at this point.
os.environ['TFDS_DATA_DIR']="/mnt/resource_nvme/the_pile"

class PileReader:
    """Iterate over the rows of one or more parquet files as plain dicts.

    Each row is yielded as a ``{column_name: value}`` dictionary, file by
    file, in the order the files were given.
    """

    def __init__(self, filenames, para_joiner='\n\n'):
        """Store the files to read.

        Args:
            filenames: A single parquet source, or a list/tuple of them.
                Anything that is not a list or tuple is treated as one
                source and wrapped in a list.
            para_joiner: Stored on the instance for callers; it is not used
                by the reading logic in this class.
        """
        if isinstance(filenames, tuple):
            # A tuple of paths used to be wrapped as one bogus "filename";
            # treat it the same way as a list.
            filenames = list(filenames)
        elif not isinstance(filenames, list):
            filenames = [filenames]
        self.filenames = filenames
        self.para_joiner = para_joiner

    def _read_fn(self, filename):
        """Yield every row of ``filename`` as a dict.

        The file is loaded into a DataFrame only when the generator is first
        advanced, so only the file currently being consumed is read.
        """
        print(filename)
        df = pd.read_parquet(filename)
        for _, row in df.iterrows():
            yield row.to_dict()

    def __iter__(self):
        """Return an iterator that chains the rows of all files in order."""
        print(self.filenames)
        # A generator expression is enough here: chain.from_iterable pulls
        # one per-file generator at a time, so no intermediate list is needed.
        return chain.from_iterable(
            self._read_fn(filename) for filename in self.filenames
        )

class ThePileConfig(tfds.core.BuilderConfig):
    """Builder configuration for The Pile, named after its dataset mode."""

    def __init__(self, *, mode=None, **kwargs):
        """Initialise the config, using ``mode`` as the config name.

        Args:
            mode: Keyword-only; passed to the base class as ``name``.
            **kwargs: Forwarded unchanged to ``tfds.core.BuilderConfig``.
        """
        super().__init__(name=mode, description="The Pile dataset", **kwargs)

class ThePile(tfds.core.GeneratorBasedBuilder):
    """TFDS builder that exposes local Pile parquet files as a ``train`` split.

    Nothing is downloaded: the parquet files are expected to already exist
    under ``./data`` relative to the current working directory.
    """

    # One builder config per entry in _DATASET_MODES, all sharing _VERSION.
    BUILDER_CONFIGS = [
        ThePileConfig(version=_VERSION, mode=mode) for mode in _DATASET_MODES
    ]

    def _info(self) -> tfds.core.DatasetInfo:
        """Describe the dataset: a single ``text`` feature per example."""
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict({
                'text': tfds.features.Text()
            }),
            supervised_keys=("text", "text"),
            homepage='https://github.com/EleutherAI/The-Pile',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Build the ``train`` split from every ``./data/*.parquet`` file.

        Args:
            dl_manager: Supplied by TFDS; no download is requested from it
                in this method.

        Returns:
            A dict mapping ``'train'`` to the example generator.

        Raises:
            FileNotFoundError: If no parquet file matches ``./data/*.parquet``.
        """
        # NOTE(review): nothing is downloaded in this method, so disabling
        # SSL verification looks unnecessary; confirm and consider removing.
        dl_manager.verify_ssl = False
        # glob returns matches in arbitrary order. Sorting keeps the example
        # keys (which embed the running index) reproducible across runs.
        dl_paths = sorted(glob.glob('./data/*.parquet'))
        print(dl_paths)
        if not dl_paths:
            # Fail early with an actionable message rather than building a
            # split from zero files.
            raise FileNotFoundError(
                "No parquet files found matching './data/*.parquet' "
                "(relative to the current working directory)."
            )
        return {
            'train': self._generate_examples(dl_paths),
        }

    def _generate_examples(self, paths):
        """Yield ``(key, example)`` pairs for every row in ``paths``.

        Args:
            paths: Parquet file path(s) handed to ``PileReader``.

        Yields:
            Tuples of a key of the form ``'<index>_the_pile'`` and a dict
            holding the row's ``text`` value. Falsy rows are skipped but
            still consume an index.
        """
        pipeline = PileReader(paths)
        for x, result in enumerate(pipeline):
            if result:
                idx = f'{x}_the_pile'
                yield idx, {'text': result['text']}

if __name__ == '__main__':
    # When run as a script, ask TFDS to load the dataset registered by the
    # ThePile builder defined above.
    # NOTE(review): presumably the first call also prepares the dataset
    # under TFDS_DATA_DIR — confirm against the TFDS version in use.
    ds = tfds.load('ThePile')