Null categorical codes written incorrectly
ivirshup opened this issue · 0 comments
ivirshup commented
Writing
suppressWarnings(SeuratData::InstallData("pbmc3k", force.reinstall = F))
suppressWarnings(data("pbmc3k"))
seuratObj <- suppressWarnings(pbmc3k)
WriteH5AD(seuratObj, "mudata_seurat.h5ad")
Reading
import anndata as ad
ad.read_h5ad("./mudata_seurat.h5ad")
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:709, in Categorical.from_codes(cls, codes, categories, ordered, dtype)
706 raise ValueError("codes need to be array-like integers")
708 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
--> 709 raise ValueError("codes need to be between -1 and len(categories)-1")
711 return cls(codes, dtype=dtype, fastpath=True)
ValueError: codes need to be between -1 and len(categories)-1
Full Traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [17], in <cell line: 1>()
----> 1 ad.read_h5ad("./mudata_seurat.h5ad")
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/h5ad.py:236, in read_h5ad(filename, backed, as_sparse, as_sparse_fmt, chunk_size)
233 assert False, "unexpected raw format"
234 elif k in {"obs", "var"}:
235 # Backwards compat
--> 236 d[k] = read_dataframe(f[k])
237 else: # Base case
238 d[k] = read_elem(f[k])
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/h5ad.py:301, in read_dataframe(group)
299 return read_dataframe_legacy(group)
300 else:
--> 301 return read_elem(group)
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/specs/registry.py:183, in read_elem(elem, modifiers)
178 def read_elem(
179 elem: Union[H5Array, H5Group, ZarrGroup, ZarrArray],
180 modifiers: frozenset(str) = frozenset(),
181 ) -> Any:
182 """Read an element from an on disk store."""
--> 183 return _REGISTRY.get_reader(type(elem), get_spec(elem), frozenset(modifiers))(elem)
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/specs/methods.py:564, in read_dataframe_0_1_0(elem)
561 columns = _read_attr(elem.attrs, "column-order")
562 idx_key = _read_attr(elem.attrs, "_index")
563 df = pd.DataFrame(
--> 564 {k: read_series(elem[k]) for k in columns},
565 index=read_series(elem[idx_key]),
566 columns=list(columns),
567 )
568 if idx_key != "_index":
569 df.index.name = idx_key
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/specs/methods.py:564, in <dictcomp>(.0)
561 columns = _read_attr(elem.attrs, "column-order")
562 idx_key = _read_attr(elem.attrs, "_index")
563 df = pd.DataFrame(
--> 564 {k: read_series(elem[k]) for k in columns},
565 index=read_series(elem[idx_key]),
566 columns=list(columns),
567 )
568 if idx_key != "_index":
569 df.index.name = idx_key
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/anndata/_io/specs/methods.py:586, in read_series(dataset)
584 categories = read_elem(categories_dset)
585 ordered = bool(_read_attr(categories_dset.attrs, "ordered", False))
--> 586 return pd.Categorical.from_codes(
587 read_elem(dataset), categories, ordered=ordered
588 )
589 else:
590 return read_elem(dataset)
File ~/miniconda3/envs/seurat-conversion/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:709, in Categorical.from_codes(cls, codes, categories, ordered, dtype)
706 raise ValueError("codes need to be array-like integers")
708 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
--> 709 raise ValueError("codes need to be between -1 and len(categories)-1")
711 return cls(codes, dtype=dtype, fastpath=True)
ValueError: codes need to be between -1 and len(categories)-1
Checking out the file:
import h5py
import pandas as pd
f = h5py.File("./mudata_seurat.h5ad")
pd.value_counts(f["obs"]["seurat_annotations"][:])
0 697
1 483
2 480
3 344
4 271
5 162
6 155
-2147483648 62
7 32
8 14
dtype: int64
It looks like what's happening is that R and pandas encode categorical missing values quite differently. pandas
(and anndata) are expecting null values to have codes of -1
.