Inaccessible examples
nicodomschke opened this issue · 4 comments
nicodomschke commented
Hi,
I wanted to run some of the examples, but it seems like the Dropbox links inside
only lead to a Dropbox page saying "That didn't work for some reason".
Trying any get_xxx_data() function leads to something like this:
---------------------------------------------------------------------------
ParserError Traceback (most recent call last)
Cell In[4], line 1
----> 1 data = get_lipophilicity_data()
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/gptchem/data.py:225, in get_lipophilicity_data()
221 def get_lipophilicity_data() -> pd.DataFrame:
222 """Return the Lipophilicity data parsed from ChEMBL [chembl]_"""
223 return (
224 pystow.module("gptchem")
--> 225 .ensure_csv(
226 "lipophilicity",
227 url="https://www.dropbox.com/s/secesuqvqrdexz4/lipophilicity.csv?dl=1",
228 read_csv_kwargs=dict(sep=","),
229 )
230 .reset_index(drop=True)
231 )
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/pystow/impl.py:632, in Module.ensure_csv(self, url, name, force, download_kwargs, read_csv_kwargs, *subkeys)
627 import pandas as pd
629 path = self.ensure(
630 *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
631 )
--> 632 return pd.read_csv(path, **_clean_csv_kwargs(read_csv_kwargs))
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/pandas/io/parsers/readers.py:626, in _read(filepath_or_buffer, kwds)
623 return parser
625 with parser:
--> 626 return parser.read(nrows)
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1923, in TextFileReader.read(self, nrows)
1916 nrows = validate_integer("nrows", nrows)
1917 try:
1918 # error: "ParserBase" has no attribute "read"
1919 (
1920 index,
1921 columns,
1922 col_dict,
-> 1923 ) = self._engine.read( # type: ignore[attr-defined]
1924 nrows
1925 )
1926 except Exception:
1927 self.close()
File ~/.conda/envs/zstruct-llm/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:234, in CParserWrapper.read(self, nrows)
232 try:
233 if self.low_memory:
--> 234 chunks = self._reader.read_low_memory(nrows)
235 # destructive to chunks
236 data = _concatenate_chunks(chunks)
File parsers.pyx:838, in pandas._libs.parsers.TextReader.read_low_memory()
File parsers.pyx:905, in pandas._libs.parsers.TextReader._read_rows()
File parsers.pyx:874, in pandas._libs.parsers.TextReader._tokenize_rows()
File parsers.pyx:891, in pandas._libs.parsers.TextReader._check_tokenize_status()
File parsers.pyx:2061, in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 3
kjappelbaum commented
Indeed, that is a bug. Let me move the data somewhere else. I am busy with something else this week, but I plan to spend one day maintaining this next week.
kjappelbaum commented
Thanks for raising this issue!
kjappelbaum commented
I'll upload them onto Zenodo to keep them in a stable place
kjappelbaum commented
The datasets are on Hugging Face (https://huggingface.co/datasets/kjappelbaum/gptchem/tree/main), but I still need to adjust the code in the library. Sorry for the delay on this!