DagsHub/streaming-client

`InvalidPathFormatError` when calling `ds.all()`

Closed this issue · 2 comments

My code:

len(ds.all().dataframe)

My repo:

https://test.dagshub.com/yonomitt/LAION-Aesthetics-V2-6.5plus/src/main/data

My error:

---------------------------------------------------------------------------
InvalidPathFormatError                    Traceback (most recent call last)
Cell In[15], line 1
----> 1 len(ds.all().dataframe)

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/client/dataclasses.py:77, in QueryResult.dataframe(self)
     75 for e in self.entries:
     76     names.append(e.path)
---> 77     urls.append(e.download_url(self.datasource))
     78     metadata_keys.update(e.metadata.keys())
     80 res = pd.DataFrame({"name": names, "dagshub_download_url": urls})

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/client/dataclasses.py:21, in Datapoint.download_url(self, ds)
     20 def download_url(self, ds: "Datasource"):
---> 21     return ds.source.raw_path(self)

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource_state.py:93, in DatasourceState.raw_path(self, path)
     89 """
     90 Returns the url for the download path of a specified path
     91 """
     92 path = self._extract_path(path).strip("/")
---> 93 return self.root_raw_path + "/" + path

File ~/.miniforge3/envs/dagstest/lib/python3.10/functools.py:981, in cached_property.__get__(self, instance, owner)
    979 val = cache.get(self.attrname, _NOT_FOUND)
    980 if val is _NOT_FOUND:
--> 981     val = self.func(instance)
    982     try:
    983         cache[self.attrname] = val

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource_state.py:111, in DatasourceState.root_raw_path(self)
    104 @cached_property
    105 def root_raw_path(self):
    106     """
    107     Returns the root raw path of the dataset for downloading files
    108     This is just a "prefix" of the datasource relative to the repo.
    109     In order to build a path of an entity you need to concatenate the path to this root
    110     """
--> 111     return self._root_path("raw")

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource_state.py:115, in DatasourceState._root_path(self, path_type)
    113 def _root_path(self, path_type):
    114     assert path_type in ["raw", "content"]
--> 115     parts = self.path_parts()
    116     if self.source_type == DatasourceType.BUCKET:
    117         path_elems = [parts["schema"], parts["bucket"]]

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource_state.py:145, in DatasourceState.path_parts(self)
    143 match = regex.fullmatch(self.path)
    144 if match is None:
--> 145     raise InvalidPathFormatError(f"{self.path} is not valid path format for type {self.source_type}.\n"
    146                                  f"Expected format: {expected_formats[self.source_type]}")
    147 return match.groupdict()

InvalidPathFormatError: repo://yonomitt/LAION-Aesthetics-V2-6.5plus/data is not valid path format for type DatasourceType.REPOSITORY.
Expected format: repo://owner/reponame/prefix

Aha, another regex issue. Thank you very much, Yono; I will fix it <3

Fixed in the latest couple of commits, but I haven't bumped the version yet.