poloclub/diffusiondb

DatasetGenerationError: An error occurred while generating the dataset | ValueError: NaTType does not support utcoffset

ivanmkc opened this issue · 4 comments

dataset = load_dataset('poloclub/diffusiondb', '2m_random_50k', split="all")

Gives error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~/env/lib/python3.8/site-packages/datasets/builder.py:1626, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1625 example = self.info.features.encode_example(record) if self.info.features is not None else record
-> 1626 writer.write(example, key)
   1627 num_examples_progress_update += 1

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:488, in ArrowWriter.write(self, example, key, writer_batch_size)
    486     self.hkey_record = []
--> 488 self.write_examples_on_file()

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:551, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    550 typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
--> 551 arrays.append(pa.array(typed_sequence))
    552 inferred_features[col] = typed_sequence.get_inferred_type()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:236, in pyarrow.lib.array()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:110, in pyarrow.lib._handle_arrow_array_protocol()

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:189, in TypedSequence.__arrow_array__(self, type)
    188     trying_cast_to_python_objects = True
--> 189     out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
    190 # use smaller integer precisions if possible

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:320, in pyarrow.lib.array()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:39, in pyarrow.lib._sequence_to_array()

File ~/env/lib/python3.8/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/env/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.pyx:67, in pandas._libs.tslibs.nattype._make_error_func.f()

ValueError: NaTType does not support utcoffset

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File ~/env/lib/python3.8/site-packages/datasets/builder.py:1635, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1634 num_shards = shard_id + 1
-> 1635 num_examples, num_bytes = writer.finalize()
   1636 writer.close()

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:582, in ArrowWriter.finalize(self, close_stream)
    581     self.hkey_record = []
--> 582 self.write_examples_on_file()
    583 # If schema is known, infer features even if no examples were written

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:551, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    550 typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
--> 551 arrays.append(pa.array(typed_sequence))
    552 inferred_features[col] = typed_sequence.get_inferred_type()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:236, in pyarrow.lib.array()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:110, in pyarrow.lib._handle_arrow_array_protocol()

File ~/env/lib/python3.8/site-packages/datasets/arrow_writer.py:189, in TypedSequence.__arrow_array__(self, type)
    188     trying_cast_to_python_objects = True
--> 189     out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
    190 # use smaller integer precisions if possible

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:320, in pyarrow.lib.array()

File ~/env/lib/python3.8/site-packages/pyarrow/array.pxi:39, in pyarrow.lib._sequence_to_array()

File ~/env/lib/python3.8/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/env/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.pyx:67, in pandas._libs.tslibs.nattype._make_error_func.f()

ValueError: NaTType does not support utcoffset

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[2], line 5
      2 from datasets import load_dataset
      4 # Load the dataset with the `large_random_1k` subset
----> 5 dataset = load_dataset('poloclub/diffusiondb', '2m_random_50k', split="all")

File ~/env/lib/python3.8/site-packages/datasets/load.py:1782, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
   1779 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1781 # Download and prepare data
-> 1782 builder_instance.download_and_prepare(
   1783     download_config=download_config,
   1784     download_mode=download_mode,
   1785     verification_mode=verification_mode,
   1786     try_from_hf_gcs=try_from_hf_gcs,
   1787     num_proc=num_proc,
   1788 )
   1790 # Build dataset for splits
   1791 keep_in_memory = (
   1792     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1793 )

File ~/env/lib/python3.8/site-packages/datasets/builder.py:872, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    870     if num_proc is not None:
    871         prepare_split_kwargs["num_proc"] = num_proc
--> 872     self._download_and_prepare(
    873         dl_manager=dl_manager,
    874         verification_mode=verification_mode,
    875         **prepare_split_kwargs,
    876         **download_and_prepare_kwargs,
    877     )
    878 # Sync info
    879 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/env/lib/python3.8/site-packages/datasets/builder.py:1649, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
   1648 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1649     super()._download_and_prepare(
   1650         dl_manager,
   1651         verification_mode,
   1652         check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
   1653         or verification_mode == VerificationMode.ALL_CHECKS,
   1654         **prepare_splits_kwargs,
   1655     )

File ~/env/lib/python3.8/site-packages/datasets/builder.py:967, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    963 split_dict.add(split_generator.split_info)
    965 try:
    966     # Prepare split will record examples associated to the split
--> 967     self._prepare_split(split_generator, **prepare_split_kwargs)
    968 except OSError as e:
    969     raise OSError(
    970         "Cannot find data file. "
    971         + (self.manual_download_instructions or "")
    972         + "\nOriginal error:\n"
    973         + str(e)
    974     ) from None

File ~/env/lib/python3.8/site-packages/datasets/builder.py:1488, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1486 gen_kwargs = split_generator.gen_kwargs
   1487 job_id = 0
-> 1488 for job_id, done, content in self._prepare_split_single(
   1489     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1490 ):
   1491     if done:
   1492         result = content

File ~/env/lib/python3.8/site-packages/datasets/builder.py:1644, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1642     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1643         e = e.__context__
-> 1644     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1646 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset
Generating train split: 13999 examples [00:45, 540.25 examples/s]

Hi @ivanmkc, do you have the same error when loading dataset = load_dataset('poloclub/diffusiondb', 'large_first_1k')?

Hi, this issue still exists. I encountered this error when loading the large_text_only dataset using the datasets library.

fecet commented

Same here.

Trying to read the "timestamp" feature in part221 raises this error.

xiaohk commented

Thanks for letting me know. I think 3ae2737 should fix this issue. Let me know if it doesn't work.