Error in creating meta for isin with PyArrow strings
zmbc opened this issue · 1 comment
zmbc commented
Describe the issue: Calling .isin() on a Dask Series with a PyArrow string dtype fails with the error message
ArrowTypeError: Array type doesn't match type of values set: string vs int64
Full traceback:
---------------------------------------------------------------------------
ArrowTypeError Traceback (most recent call last)
Cell In[18], line 11
7 import pyarrow as pa
9 sample_df = dd.from_pandas(pd.DataFrame({'foo': ["1", "2", "3"]}), npartitions=1).astype(pd.ArrowDtype(pa.string()))
---> 11 sample_df.foo.isin(["1", "2"])
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/dask_expr/_collection.py:771, in FrameBase.isin(self, values)
768 pass
769 from dask_expr.io._delayed import _DelayedExpr
--> 771 return new_collection(
772 expr.Isin(
773 self,
774 values=_DelayedExpr(
775 delayed(values, name="delayed-" + _tokenize_deterministic(values))
776 ),
777 )
778 )
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/dask_expr/_collection.py:4730, in new_collection(expr)
4728 def new_collection(expr):
4729 """Create new collection from an expr"""
-> 4730 meta = expr._meta
4731 expr._name # Ensure backend is imported
4732 return get_collection_type(meta)(expr)
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/functools.py:981, in cached_property.__get__(self, instance, owner)
979 val = cache.get(self.attrname, _NOT_FOUND)
980 if val is _NOT_FOUND:
--> 981 val = self.func(instance)
982 try:
983 cache[self.attrname] = val
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/dask_expr/_expr.py:1363, in Isin._meta(self)
1361 @functools.cached_property
1362 def _meta(self):
-> 1363 return make_meta(meta_nonempty(self.frame._meta).isin([1]))
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pandas/core/series.py:5559, in Series.isin(self, values)
5486 def isin(self, values) -> Series:
5487 """
5488 Whether elements in Series are contained in `values`.
5489
(...)
5557 dtype: bool
5558 """
-> 5559 result = algorithms.isin(self._values, values)
5560 return self._constructor(result, index=self.index, copy=False).__finalize__(
5561 self, method="isin"
5562 )
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pandas/core/algorithms.py:505, in isin(comps, values)
502 comps_array = extract_array(comps_array, extract_numpy=True)
503 if not isinstance(comps_array, np.ndarray):
504 # i.e. Extension Array
--> 505 return comps_array.isin(values)
507 elif needs_i8_conversion(comps_array.dtype):
508 # Dispatch to DatetimeLikeArrayMixin.isin
509 return pd_array(comps_array).isin(values)
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pandas/core/arrays/arrow/array.py:1113, in ArrowExtensionArray.isin(self, values)
1110 if not len(values):
1111 return np.zeros(len(self), dtype=bool)
-> 1113 result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
1114 # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
1115 # to False
1116 return np.array(result, dtype=np.bool_)
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pyarrow/compute.py:263, in _make_generic_wrapper.<locals>.wrapper(memory_pool, options, *args, **kwargs)
261 if args and isinstance(args[0], Expression):
262 return Expression._call(func_name, list(args), options)
--> 263 return func.call(args, options, memory_pool)
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pyarrow/_compute.pyx:385, in pyarrow._compute.Function.call()
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pyarrow/error.pxi:154, in pyarrow.lib.pyarrow_internal_check_status()
File ~/mambaforge/envs/person_linkage_case_study_20240423/lib/python3.10/site-packages/pyarrow/error.pxi:91, in pyarrow.lib.check_status()
ArrowTypeError: Array type doesn't match type of values set: string vs int64
Minimal Complete Verifiable Example:
import dask
# If you uncomment this, it works
# dask.config.set({"dataframe.query-planning": False})
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa
sample_df = dd.from_pandas(pd.DataFrame({'foo': ["1", "2", "3"]}), npartitions=1).astype(pd.ArrowDtype(pa.string()))
sample_df.foo.isin(["1", "2"])
Anything else we need to know?:
Environment:
- Dask version: 2024.5.0
- Python version: 3.10.14
- Operating System: Linux
- Install method (conda, pip, source): pip
phofl commented
Thanks for the report, and sorry about the bug. I've put up a fix, and it will be released tomorrow with the new Dask release.