microsoft/Qcodes

Dataset I/O performance

thangleiter opened this issue

I frequently deal with fairly large datasets (some high-dimensional loops and a buffered get-parameter). Loading these datasets from the database into a usable format (i.e., xarray) takes an extremely long time.

A representative example:

>>> xds 
<xarray.Dataset> Size: 792MB
Dimensions: (... : 19,
             ... : 51,
             ... : 51,
             ... : 2000)
	...
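For anyone wanting to reproduce the HDF5 comparison below without the original data, a synthetic dataset of roughly the same size can stand in for it. All names here are placeholders; the only thing taken from the repr above is the 19 × 51 × 51 × 2000 grid of float64 values (~790MB):

import numpy as np
import xarray as xr

# placeholder dimension/variable names; only the shape matches the dataset above
dims = {"a": 19, "b": 51, "c": 51, "d": 2000}
xds = xr.Dataset(
    {"signal": (tuple(dims), np.random.rand(*dims.values()))},
    coords={name: np.arange(size) for name, size in dims.items()},
)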

For reference, the HDF5 I/O performance for this dataset:

>>> %timeit xds.to_netcdf(file := tempfile.mktemp(), engine='h5netcdf')
6.72 s ± 832 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
>>> %timeit xr.load_dataset(file)
500 ms ± 43.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
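The slow path I am comparing against is the standard qcodes export to xarray, schematically (database path and run id are placeholders):

from qcodes.dataset import initialise_or_create_database_at, load_by_id

# placeholder database path and run id
initialise_or_create_database_at("experiments.db")
qcds = load_by_id(1234)

xds = qcds.to_xarray_dataset()   # this is the call profiled below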

By contrast, loading this dataset from the database (~4GB on disk) takes 60 seconds! During that minute, memory usage temporarily spikes by about 8GB (Windows 10). The runtime breaks down as follows (some functions cherry-picked for detail; profiled with %lprun -u 1e-3 -f ... xds = qcds.to_xarray_dataset(), timings in ms):

to_xarray_dataset()

Total time: 57.4706 s
File: ...\qcodes\dataset\data_set.py
Function: to_xarray_dataset at line 986

Line # Hits Time Per Hit % Time Line Contents
1042 2 28081.6 14040.8 48.9 data = self.get_parameter_data(*params,
1043 1 0.0 0.0 0.0 start=start,
1044 1 0.0 0.0 0.0 end=end)
1045
1046 1 29389.0 29389.0 51.1 return load_to_xarray_dataset(self, data, use_multi_index=use_multi_index)

_load_to_xarray_dataarray_dict_no_metadata()

Total time: 29.3664 s
File: ...\qcodes\dataset\exporters\export_to_xarray.py
Function: _load_to_xarray_dataarray_dict_no_metadata at line 62

Line # Hits Time Per Hit % Time Line Contents
68 1 0.0 0.0 0.0 import pandas as pd
69 1 0.0 0.0 0.0 import xarray as xr
70
71 1 0.0 0.0 0.0 if use_multi_index not in ("auto", "always", "never"):
72 raise ValueError(
73 f"Invalid value for use_multi_index. Expected one of 'auto', 'always', 'never' but got {use_multi_index}"
74 )
75
76 1 0.0 0.0 0.0 data_xrdarray_dict: dict[str, xr.DataArray] = {}
77
78 6 0.0 0.0 0.0 for name, subdict in datadict.items():
79 5 8157.3 1631.5 27.8 index = _generate_pandas_index(subdict)
80
81 5 0.0 0.0 0.0 if index is None:
82 xrdarray: xr.DataArray = (
83 _data_to_dataframe(subdict, index=index)
84 .to_xarray()
85 .get(name, xr.DataArray())
86 )
87 data_xrdarray_dict[name] = xrdarray
88 else:
89 5 15452.9 3090.6 52.6 index_unique = len(index.unique()) == len(index)
90
91 5 248.4 49.7 0.8 df = _data_to_dataframe(subdict, index)
92
93 5 0.0 0.0 0.0 if not index_unique:
94 # index is not unique so we fallback to using a counter as index
95 # and store the index as a variable
96 xrdata_temp = df.reset_index().to_xarray()
97 for _name in subdict:
98 data_xrdarray_dict[_name] = xrdata_temp[_name]
99 else:
100 5 1982.9 396.6 6.8 calc_index = _calculate_index_shape(index)
101 5 0.0 0.0 0.0 index_prod = prod(calc_index.values())
102 # if the product of the len of individual index dims == len(total_index)
103 # we are on a grid
104
105 5 0.0 0.0 0.0 on_grid = index_prod == len(index)
106
107 10 0.0 0.0 0.0 export_with_multi_index = (
108 10 0.0 0.0 0.0 not on_grid
109 5 0.0 0.0 0.0 and dataset.description.shapes is None
110 and use_multi_index == "auto"
111 5 0.0 0.0 0.0 ) or use_multi_index == "always"
112
113 5 0.0 0.0 0.0 if export_with_multi_index:
114 assert isinstance(df.index, pd.MultiIndex)
115
116 if hasattr(xr, "Coordinates"):
117 coords = xr.Coordinates.from_pandas_multiindex(
118 df.index, "multi_index"
119 )
120 xrdarray = xr.DataArray(df[name], coords=coords)
121 else:
122 # support xarray < 2023.8.0, can be removed when we drop support for that
123 xrdarray = xr.DataArray(df[name], [("multi_index", df.index)])
124 else:
125 5 3524.9 705.0 12.0 xrdarray = df.to_xarray().get(name, xr.DataArray())
126
127 5 0.0 0.0 0.0 data_xrdarray_dict[name] = xrdarray
128
129 1 0.0 0.0 0.0 return data_xrdarray_dict
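To get a feeling for where this path loses time independently of SQLite, here is a rough, self-contained microbenchmark on a synthetic grid (sizes, names and the reshape alternative are my own illustration, not qcodes code). It mimics the uniqueness check and the pandas pivot that dominate the profile above and compares them to a plain reshape, which is possible whenever the data is known to lie on a grid:

import numpy as np
import pandas as pd
import xarray as xr
from timeit import timeit

# synthetic stand-in: a 50 x 50 x 2000 grid (~5 million rows), smaller than the real dataset
nx, ny, nz = 50, 50, 2000
x, y, z = np.arange(nx), np.arange(ny), np.arange(nz)
values = np.random.rand(nx * ny * nz)

index = pd.MultiIndex.from_product([x, y, z], names=["x", "y", "z"])
series = pd.Series(values, index=index, name="signal")

def via_pandas():
    # mirrors the hot lines above: uniqueness check on the full index, then a pandas pivot
    assert len(series.index.unique()) == len(series.index)
    return series.to_xarray()

def via_reshape():
    # possible whenever the shape is known up front (e.g. from the run description)
    return xr.DataArray(values.reshape(nx, ny, nz),
                        coords={"x": x, "y": y, "z": z}, dims=("x", "y", "z"))

print("pandas pivot:  ", timeit(via_pandas, number=1), "s")
print("numpy reshape: ", timeit(via_reshape, number=1), "s")

The reshape variant avoids both the full-index uniqueness check and the pandas pivot; how much of that could carry over to the real code path I cannot say.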

_expand_data_to_arrays()

Total time: 4.06941 s
File: ...\qcodes\dataset\sqlite\queries.py
Function: _expand_data_to_arrays at line 266

Line # Hits Time Per Hit % Time Line Contents
266 def _expand_data_to_arrays(
267 data: list[tuple[Any, ...]], paramspecs: Sequence[ParamSpecBase]
268 ) -> None:
269 26 0.0 0.0 0.0 types = [param.type for param in paramspecs]
270 # if we have array type parameters expand all other parameters
271 # to arrays
272 5 0.0 0.0 0.0 if 'array' in types:
273
274 1 0.0 0.0 0.0 if ('numeric' in types or 'text' in types
275 or 'complex' in types):
276 1 0.0 0.0 0.0 first_array_element = types.index('array')
277 1 0.0 0.0 0.0 types_mapping: dict[int, Callable[[str], np.dtype[Any]]] = {}
278 6 0.0 0.0 0.0 for i, x in enumerate(types):
279 5 0.0 0.0 0.0 if x == "numeric":
280 3 0.0 0.0 0.0 types_mapping[i] = lambda _: np.dtype(np.float64)
281 2 0.0 0.0 0.0 elif x == "complex":
282 types_mapping[i] = lambda _: np.dtype(np.complex128)
283 2 0.0 0.0 0.0 elif x == "text":
284 types_mapping[i] = lambda array: np.dtype(f"U{len(array)}")
285
286 46253 30.0 0.0 0.7 for i_row, row in enumerate(data):
287 # todo should we handle int/float types here
288 # we would in practice have to perform another
289 # loop to check that all elements of a given can be cast to
290 # int without loosing precision before choosing an integer
291 # representation of the array
292 92504 3070.6 0.0 75.5 data[i_row] = tuple(
293 np.full_like(
294 row[first_array_element], array, dtype=types_mapping[i](array)
295 )
296 if i in types_mapping
297 else array
298 46252 25.8 0.0 0.6 for i, array in enumerate(row)
299 )
300
301 1 0.0 0.0 0.0 row_shape = None
302 46253 25.8 0.0 0.6 for i_row, row in enumerate(data):
303 # now expand all one element arrays to match the expected size
304 # one element arrays are introduced if scalar values are stored
305 # with an explicit array storage type
306 46252 16.1 0.0 0.4 max_size = 0
307 277512 146.8 0.0 3.6 for i, array in enumerate(row):
308 231260 110.2 0.0 2.7 if array.size > max_size:
309 46252 18.8 0.0 0.5 if max_size > 1:
310 log.warning(
311 f"Cannot expand array of size {max_size} "
312 f"to size {array.size}"
313 )
314 46252 23.4 0.0 0.6 max_size, row_shape = array.size, array.shape
315
316 46252 19.4 0.0 0.5 if max_size > 1:
317 46252 18.2 0.0 0.4 assert row_shape is not None
318 92504 549.2 0.0 13.5 data[i_row] = tuple(
319 np.full(row_shape, array, dtype=array.dtype)
320 if array.size == 1
321 else array
322 46252 15.1 0.0 0.4 for array in row
323 )
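For context, what this function does per row: any scalar column stored next to an array column is broadcast to the array's shape so that all columns end up with the same shape. A toy illustration of that per-row broadcast with made-up values (a paraphrase of the logic above, not the actual qcodes code):

import numpy as np

# one fetched row: a scalar setpoint ('numeric') next to a 2000-sample buffered trace ('array')
setpoint, trace = 0.5, np.linspace(0.0, 1.0, 2000)
row = (setpoint, trace)

expanded = tuple(
    np.full_like(trace, value, dtype=np.float64) if np.size(value) == 1 else value
    for value in row
)
print(expanded[0].shape, expanded[1].shape)   # (2000,) (2000,)

Doing this row by row in pure Python for ~46k rows is where most of the ~4 s in this function goes.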

many_many()

Total time: 21.7225 s
File: ...\qcodes\dataset\sqlite\query_helpers.py
Function: many_many at line 117

Line # Hits Time Per Hit % Time Line Contents
117 def many_many(curr: sqlite3.Cursor, *columns: str) -> list[tuple[Any, ...]]:
118 """Get all values of many columns
119 Args:
120 curr: cursor to operate on
121 columns: names of the columns
122
123 Returns:
124 list of lists of values
125 """
126 5 21722.4 4344.5 100.0 res = curr.fetchall()
127
128 5 0.1 0.0 0.0 if _need_to_select(curr, *columns):
129 raise RuntimeError(
130 "Expected consistent selection: cursor has columns "
131 f"{tuple(c[0] for c in curr.description)} but expected {columns}"
132 )
133
134 5 0.0 0.0 0.0 return res
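If I understand the storage layer correctly, array-valued columns are stored as binary blobs of npy-serialized numpy arrays and decoded by a registered sqlite3 converter, so part of the time inside fetchall() is spent deserializing one small array per cell rather than in SQLite proper. A rough, standalone estimate of just that deserialization cost (row and sample counts taken from the profile above; the blob-per-cell format is my assumption):

import io
import numpy as np
from timeit import timeit

# ~46k rows, each cell of the buffered parameter holding a 2000-sample float64 trace
n_rows, n_samples = 46_252, 2000

buf = io.BytesIO()
np.save(buf, np.random.rand(n_samples))
blob = buf.getvalue()

def decode_all():
    for _ in range(n_rows):
        np.load(io.BytesIO(blob))

print("decoding", n_rows, "blobs:", timeit(decode_all, number=1), "s")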

Unfortunately, I am not familiar with SQLite, so I cannot say whether there is much to be gained in the actual I/O from the database, but the discrepancy with HDF5 makes me think that a lot could be gained at least in the pure-Python parts. Loading roughly 120 times slower than xarray's HDF5 path seems absurd.