Error using `to_dataset_dict()` with an OPeNDAP URL as path
tlogan2000 opened this issue · 3 comments
tlogan2000 commented
I am having errors loading data via to_dataset_dict
when the path field contains an OPeNDAP URL
Installed intake_esm version: 2021.8.17
This works fine
import xarray as xr
import intake
cat = intake.open_esm_datastore("https://pavics.ouranos.ca/catalog/climex.json") # TEST_USE_PROD_DATA
print(cat.df.head())
url = cat.df.path[0]
ds = xr.open_dataset(url, chunks=dict(realization=1, time=365))
However try to use to_dataset_dict()
results in an error:
dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365)))
traceback:
---------------------------------------------------------------------------
ClientResponseError Traceback (most recent call last)
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:383, in HTTPFileSystem._info(self, url, **kwargs)
381 try:
382 info.update(
--> 383 await _file_info(
384 url,
385 size_policy=policy,
386 session=session,
387 **self.kwargs,
388 **kwargs,
389 )
390 )
391 if info.get("size") is not None:
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:734, in _file_info(url, session, size_policy, **kwargs)
733 async with r:
--> 734 r.raise_for_status()
736 # TODO:
737 # recognise lack of 'Accept-Ranges',
738 # or 'Accept-Ranges': 'none' (not 'bytes')
739 # to mean streaming only, no random access => return None
File /opt/conda/envs/birdy/lib/python3.8/site-packages/aiohttp/client_reqrep.py:1004, in ClientResponse.raise_for_status(self)
1003 self.release()
-> 1004 raise ClientResponseError(
1005 self.request_info,
1006 self.history,
1007 status=self.status,
1008 message=self.reason,
1009 headers=self.headers,
1010 )
ClientResponseError: 400, message='Bad Request', url=URL('https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/datasets/simulations/climex/day_climex-crcm5_historical+rcp85.ncml')
The above exception was the direct cause of the following exception:
FileNotFoundError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365)))
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/core.py:922, in esm_datastore.to_dataset_dict(self, zarr_kwargs, cdf_kwargs, preprocess, storage_options, progressbar, aggregate)
918 future_tasks = [
919 executor.submit(_load_source, key, source) for key, source in sources.items()
920 ]
921 for i, task in enumerate(concurrent.futures.as_completed(future_tasks)):
--> 922 key, ds = task.result()
923 self._datasets[key] = ds
924 if self.progressbar:
File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/_base.py:437, in Future.result(self, timeout)
435 raise CancelledError()
436 elif self._state == FINISHED:
--> 437 return self.__get_result()
439 self._condition.wait(timeout)
441 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/_base.py:389, in Future.__get_result(self)
387 if self._exception:
388 try:
--> 389 raise self._exception
390 finally:
391 # Break a reference cycle with the exception in self._exception
392 self = None
File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/thread.py:57, in _WorkItem.run(self)
54 return
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/core.py:908, in esm_datastore.to_dataset_dict.<locals>._load_source(key, source)
907 def _load_source(key, source):
--> 908 return key, source.to_dask()
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:89, in ESMDataSource.to_dask(self)
87 def to_dask(self):
88 """Return xarray object (which will have chunks)"""
---> 89 self._load_metadata()
90 return self._ds
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake/source/base.py:236, in DataSourceBase._load_metadata(self)
234 """load metadata only if needed"""
235 if self._schema is None:
--> 236 self._schema = self._get_schema()
237 self.dtype = self._schema.dtype
238 self.shape = self._schema.shape
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:57, in ESMDataSource._get_schema(self)
54 def _get_schema(self):
56 if self._ds is None:
---> 57 self._open_dataset()
59 metadata = {
60 'dims': dict(self._ds.dims),
61 'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()},
62 'coords': tuple(self._ds.coords.keys()),
63 }
64 self._schema = Schema(
65 datashape=None,
66 dtype=None,
(...)
69 extra_metadata=metadata,
70 )
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:75, in ESMDataSource._open_dataset(self)
73 def _open_dataset(self):
74 mapper = _path_to_mapper(self.row[self.path_column], self.storage_options, self.data_format)
---> 75 ds = _open_asset(
76 mapper,
77 data_format=self.data_format,
78 zarr_kwargs=self.zarr_kwargs,
79 cdf_kwargs=self.cdf_kwargs,
80 preprocess=self.preprocess,
81 requested_variables=self.requested_variables,
82 )
83 ds.attrs['intake_esm_dataset_key'] = self.key
84 self._ds = ds
File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/merge_util.py:266, in _open_asset(path, data_format, zarr_kwargs, cdf_kwargs, preprocess, varname, requested_variables)
264 protocol = normalize_protocol(path.fs.protocol)
265 root = path.path
--> 266 path = path.open()
268 if data_format == 'zarr':
269 try:
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/core.py:140, in OpenFile.open(self)
132 def open(self):
133 """Materialise this as a real open file without context
134
135 The file should be explicitly closed to avoid enclosed file
(...)
138 been deleted; but a with-context is better style.
139 """
--> 140 out = self.__enter__()
141 closer = out.close
142 fobjects = self.fobjects.copy()[:-1]
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/core.py:103, in OpenFile.__enter__(self)
100 def __enter__(self):
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 103 f = self.fs.open(self.path, mode=mode)
105 self.fobjects = [f]
107 if self.compression is not None:
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/spec.py:1009, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
1007 else:
1008 ac = kwargs.pop("autocommit", not self._intrans)
-> 1009 f = self._open(
1010 path,
1011 mode=mode,
1012 block_size=block_size,
1013 autocommit=ac,
1014 cache_options=cache_options,
1015 **kwargs,
1016 )
1017 if compression is not None:
1018 from fsspec.compression import compr
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:343, in HTTPFileSystem._open(self, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
341 kw["asynchronous"] = self.asynchronous
342 kw.update(kwargs)
--> 343 size = size or self.info(path, **kwargs)["size"]
344 session = sync(self.loop, self.set_session)
345 if block_size and size:
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:85, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
82 @functools.wraps(func)
83 def wrapper(*args, **kwargs):
84 self = obj or args[0]
---> 85 return sync(self.loop, func, *args, **kwargs)
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:65, in sync(loop, func, timeout, *args, **kwargs)
63 raise FSTimeoutError from return_result
64 elif isinstance(return_result, BaseException):
---> 65 raise return_result
66 else:
67 return return_result
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:25, in _runner(event, coro, result, timeout)
23 coro = asyncio.wait_for(coro, timeout=timeout)
24 try:
---> 25 result[0] = await coro
26 except Exception as ex:
27 result[0] = ex
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:396, in HTTPFileSystem._info(self, url, **kwargs)
393 except Exception as exc:
394 if policy == "get":
395 # If get failed, then raise a FileNotFoundError
--> 396 raise FileNotFoundError(url) from exc
397 logger.debug(str(exc))
399 return {"name": url, "size": None, **info, "type": "file"}
FileNotFoundError: https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/datasets/simulations/climex/day_climex-crcm5_historical+rcp85.ncml
tlogan2000 commented
Note I have also tried forcing engine='pydap'
via dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365), engine='pydap'))
but end up with the same error
mgrover1 commented
@tlogan2000 , I tried reading in the catalog and received the following error:
ValidationError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 catalog = intake.open_esm_datastore("https://pavics.ouranos.ca/catalog/climex.json")
File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/intake_esm/core.py:94, in esm_datastore.__init__(self, obj, progressbar, sep, registry, read_csv_kwargs, storage_options, intake_kwargs)
92 self.esmcat = ESMCatalogModel.from_dict(obj)
93 else:
---> 94 self.esmcat = ESMCatalogModel.load(
95 obj, storage_options=self.storage_options, read_csv_kwargs=read_csv_kwargs
96 )
98 self.derivedcat = registry or default_registry
99 self._entries = {}
File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/intake_esm/cat.py:226, in ESMCatalogModel.load(cls, json_file, storage_options, read_csv_kwargs)
224 if 'last_updated' not in data:
225 data['last_updated'] = None
--> 226 cat = cls.parse_obj(data)
227 if cat.catalog_file:
228 if _mapper.fs.exists(cat.catalog_file):
File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/pydantic/main.py:521, in pydantic.main.BaseModel.parse_obj()
File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/pydantic/main.py:341, in pydantic.main.BaseModel.__init__()
ValidationError: 1 validation error for ESMCatalogModel
aggregation_control
field required (type=value_error.missing)
Did you set the aggregation control in the catalog?
huard commented
No, the specs mention it is optional.