Merge pull request #369 from scverse/fix/assert-xenium-shapes-id

LucaMarconato · web-flow · commit 7eb2e61a783f · 2026-02-11T16:52:53.000+01:00
Xenium avoid calling `get_element_instances()`; fix deprecation warning
diff --git a/docs/conf.py b/docs/conf.py
@@ -127,6 +127,7 @@
     # If building the documentation fails because of a missing link that is outside your control,
     # you can add an exception to this list.
     ("py:class", "Path"),
+    ("py:class", "pathlib._local.Path"),
     ("py:class", "AnnData"),
     ("py:class", "SpatialData"),
     ("py:func", "imageio.imread"),  # maybe this can be fixed
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,18 +5,18 @@ requires = ["hatchling", "hatch-vcs"]
 
 [project]
 name = "spatialdata-io"
-dynamic= [
-  "version" # allow version to be set by git tags
+dynamic = [
+    "version" # allow version to be set by git tags
 ]
 description = "SpatialData IO for common techs"
 readme = "README.md"
 requires-python = ">=3.11"
-license = {file = "LICENSE"}
+license = { file = "LICENSE" }
 authors = [
-    {name = "scverse"},
+    { name = "scverse" },
 ]
 maintainers = [
-    {name = "scverse", email = "scverse@scverse.scverse"},
+    { name = "scverse", email = "scverse@scverse.scverse" },
 ]
 urls.Documentation = "https://spatialdata-io.readthedocs.io/"
 urls.Source = "https://github.com/scverse/spatialdata-io"
@@ -26,7 +26,7 @@ dependencies = [
     "click",
     "numpy",
     "scanpy",
-    "spatialdata>=0.2.6",
+    "spatialdata>=0.7.3a0",
     "scikit-image",
     "h5py",
     "joblib",
@@ -46,7 +46,7 @@ dev = [
     "pre-commit"
 ]
 doc = [
-    "sphinx>=4.5",
+    "sphinx>=4.5,<9",
     "sphinx-book-theme>=1.0.0",
     "myst-nb",
     "sphinxcontrib-bibtex>=1.0.0",
@@ -67,7 +67,7 @@ test = [
 # update: readthedocs doens't seem to try to install pre-releases even if when trying to install the pre optional-dependency. For
 # the moment, if needed, let's add the latest pre-release explicitly here.
 pre = [
-    "spatialdata>=0.4.0rc0"
+    "spatialdata>=0.7.3a0"
 ]
 
 [tool.coverage.run]
@@ -80,7 +80,7 @@ omit = [
 testpaths = ["tests"]
 xfail_strict = true
 addopts = [
-    "--import-mode=importlib",  # allow using test files with same name
+    "--import-mode=importlib", # allow using test files with same name
 ]
 
 [tool.ruff]
@@ -95,19 +95,19 @@ exclude = [
     "setup.py",
 ]
 lint.select = [
-    "F",  # Errors detected by Pyflakes
-    "E",  # Error detected by Pycodestyle
-    "W",  # Warning detected by Pycodestyle
-    "I",  # isort
-    "D",  # pydocstyle
-    "B",  # flake8-bugbear
-    "TID",  # flake8-tidy-imports
-    "C4",  # flake8-comprehensions
-    "BLE",  # flake8-blind-except
-    "UP",  # pyupgrade
-    "RUF100",  # Report unused noqa directives
-    "TCH",  # Typing imports
-    "NPY",  # Numpy specific rules
+    "F", # Errors detected by Pyflakes
+    "E", # Error detected by Pycodestyle
+    "W", # Warning detected by Pycodestyle
+    "I", # isort
+    "D", # pydocstyle
+    "B", # flake8-bugbear
+    "TID", # flake8-tidy-imports
+    "C4", # flake8-comprehensions
+    "BLE", # flake8-blind-except
+    "UP", # pyupgrade
+    "RUF100", # Report unused noqa directives
+    "TCH", # Typing imports
+    "NPY", # Numpy specific rules
     # "PTH",  # Use pathlib
     # "S"  # Security
 ]
diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py
@@ -7,7 +7,7 @@
 from numpy.typing import NDArray
 from spatialdata.models.models import Chunks_t
 
-__all__ = ["Chunks_t", "_compute_chunks", "_read_chunks", "normalize_chunks"]
+__all__ = ["Chunks_t", "_compute_chunks", "_read_chunks"]
 
 _Y_IDX = 0
 """Index of y coordinate in in chunk coordinate array format: (y, x, height, width)"""
@@ -143,60 +143,3 @@ def _read_chunks(
         for chunk_y in range(coords.shape[0])
     ]
     return chunks
-
-
-def normalize_chunks(
-    chunks: Chunks_t | None,
-    axes: Sequence[str],
-) -> dict[str, int]:
-    """Normalize chunk specification to dict format.
-
-    This function converts various chunk formats to a dict mapping dimension names
-    to chunk sizes. The dict format is preferred because it's explicit about which
-    dimension gets which chunk size and is compatible with spatialdata.
-
-    Parameters
-    ----------
-    chunks
-        Chunk specification. Can be:
-        - None: Uses DEFAULT_CHUNK_SIZE for all axes
-        - int: Applied to all axes
-        - tuple[int, ...]: Chunk sizes in order corresponding to axes
-        - dict: Mapping of axis names to chunk sizes (validated against axes)
-    axes
-        Tuple of axis names that defines the expected dimensions (e.g., ('c', 'y', 'x')).
-
-    Returns
-    -------
-    dict[str, int]
-        Dict mapping axis names to chunk sizes.
-
-    Raises
-    ------
-    ValueError
-        If chunks format is not supported or incompatible with axes.
-    """
-    if chunks is None:
-        return dict.fromkeys(axes, DEFAULT_CHUNK_SIZE)
-
-    if isinstance(chunks, int):
-        return dict.fromkeys(axes, chunks)
-
-    if isinstance(chunks, Mapping):
-        chunks_dict = dict(chunks)
-        missing = set(axes) - set(chunks_dict.keys())
-        if missing:
-            raise ValueError(f"chunks dict missing keys for axes {missing}, got: {list(chunks_dict.keys())}")
-        return {ax: chunks_dict[ax] for ax in axes}
-
-    if isinstance(chunks, tuple):
-        if len(chunks) != len(axes):
-            raise ValueError(f"chunks tuple length {len(chunks)} doesn't match axes {axes} (length {len(axes)})")
-        if not all(isinstance(c, int) for c in chunks):
-            raise ValueError(f"All elements in chunks tuple must be int, got: {chunks}")
-        return dict(zip(axes, chunks, strict=True))
-
-    raise ValueError(f"Unsupported chunks type: {type(chunks)}. Expected int, tuple, dict, or None.")
-
-
-##
diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py
@@ -7,7 +7,6 @@
 import numpy as np
 import tifffile
 from dask_image.imread import imread
-from geopandas import GeoDataFrame
 from spatialdata._docs import docstring_parameter
 from spatialdata._logging import logger
 from spatialdata.models import Image2DModel, ShapesModel
@@ -23,10 +22,12 @@
     from xarray import DataArray
 
 
+from spatialdata.models.chunks_utils import normalize_chunks
+
 from spatialdata_io.readers._utils._image import (
+    DEFAULT_CHUNK_SIZE,
     _compute_chunks,
     _read_chunks,
-    normalize_chunks,
 )
 
 VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"]
@@ -179,7 +180,7 @@ def image(
     chunks: Chunks_t | None = None,
     scale_factors: Sequence[int] | None = None,
 ) -> DataArray:
-    """Reads an image file and returns a parsed Image2D spatial element.
+    """Read an image file and returns a parsed Image2D spatial element.
 
     Parameters
     ----------
@@ -207,6 +208,8 @@ def image(
     # Map passed data axes to position of dimension
     axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)}
 
+    if chunks is None:
+        chunks = DEFAULT_CHUNK_SIZE
     chunks_dict = normalize_chunks(chunks, axes=data_axes)
 
     im = None
diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
@@ -20,11 +20,11 @@
 from dask.dataframe import read_parquet
 from dask_image.imread import imread
 from geopandas import GeoDataFrame
-from joblib import Parallel, delayed
 from pyarrow import Table
 from shapely import GeometryType, Polygon, from_ragged_array
 from spatialdata import SpatialData
 from spatialdata._core.query.relational_query import get_element_instances
+from spatialdata._logging import logger
 from spatialdata.models import (
     Image2DModel,
     Labels2DModel,
@@ -61,7 +61,7 @@ def xenium(
     *,
     cells_boundaries: bool = True,
     nucleus_boundaries: bool = True,
-    cells_as_circles: bool | None = None,
+    cells_as_circles: bool = False,
     cells_labels: bool = True,
     nucleus_labels: bool = True,
     transcripts: bool = True,
@@ -136,7 +136,7 @@ def xenium(
 
     Notes
     -----
-    Old versions. Until spatialdata-io v0.1.3post0: previously, `cells_as_circles` was `True` by default; the table was associated to the
+    Old versions. Until spatialdata-io v0.6.0: `cells_as_circles` was `True` by default; the table was associated to the
     circles when `cells_as_circles` was `True`, and the table was associated to the polygons when `cells_as_circles`
     was `False`; the radii of the circles were computed form the nuclei instead of the cells.
 
@@ -153,14 +153,6 @@ def xenium(
     ... )
     >>> sdata.write("path/to/data.zarr")
     """
-    if cells_as_circles is None:
-        cells_as_circles = True
-        warnings.warn(
-            "The default value of `cells_as_circles` will change to `False` in the next release. "
-            "Please pass `True` explicitly to maintain the current behavior.",
-            DeprecationWarning,
-            stacklevel=3,
-        )
     image_models_kwargs, labels_models_kwargs = _initialize_raster_models_kwargs(
         image_models_kwargs, labels_models_kwargs
     )
@@ -223,18 +215,16 @@ def xenium(
     # labels.
     if nucleus_labels:
         labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
-            path,
-            XeniumKeys.CELLS_ZARR,
-            specs,
+            path=path,
+            specs=specs,
             mask_index=0,
             labels_name="nucleus_labels",
             labels_models_kwargs=labels_models_kwargs,
         )
     if cells_labels:
         labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
-            path,
-            XeniumKeys.CELLS_ZARR,
-            specs,
+            path=path,
+            specs=specs,
             mask_index=1,
             labels_name="cell_labels",
             labels_models_kwargs=labels_models_kwargs,
@@ -360,8 +350,8 @@ def filter(self, record: logging.LogRecord) -> bool:
                         return False
                     return True
 
-            logger = tifffile.logger()
-            logger.addFilter(IgnoreSpecificMessage())
+            tf_logger = tifffile.logger()
+            tf_logger.addFilter(IgnoreSpecificMessage())
             image_models_kwargs = dict(image_models_kwargs)
             assert "c_coords" not in image_models_kwargs, (
                 "The channel names for the morphology focus images are handled internally"
@@ -374,7 +364,7 @@ def filter(self, record: logging.LogRecord) -> bool:
                 image_models_kwargs,
             )
             del image_models_kwargs["c_coords"]
-            logger.removeFilter(IgnoreSpecificMessage())
+            tf_logger.removeFilter(IgnoreSpecificMessage())
 
     if table is not None:
         tables["table"] = table
@@ -402,14 +392,16 @@ def filter(self, record: logging.LogRecord) -> bool:
 def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
     if isinstance(cell_id_column.iloc[0], bytes):
         return cell_id_column.str.decode("utf-8")
+    if not isinstance(cell_id_column.iloc[0], str):
+        cell_id_column.index = cell_id_column.index.astype(str)
     return cell_id_column
 
 
 def _get_polygons(
     path: Path,
     file: str,
     specs: dict[str, Any],
-    idx: ArrayLike | None = None,
+    idx: pd.Series | None = None,
 ) -> GeoDataFrame:
     # seems to be faster than pd.read_parquet
     df = pq.read_table(path / file).to_pandas()
@@ -448,7 +440,7 @@ def _get_polygons(
     if version is not None and version < packaging.version.parse("2.0.0"):
         assert idx is not None
         assert len(idx) == len(geo_df)
-        assert index.equals(idx)
+        assert np.array_equal(index.values, idx.values)
     else:
         if np.unique(geo_df.index).size != len(geo_df):
             warnings.warn(
@@ -464,7 +456,6 @@ def _get_polygons(
 
 def _get_labels_and_indices_mapping(
     path: Path,
-    file: str,
     specs: dict[str, Any],
     mask_index: int,
     labels_name: str,
@@ -493,36 +484,35 @@ def _get_labels_and_indices_mapping(
     cell_id, dataset_suffix = z["cell_id"][...].T
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
 
-    # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
-    # release notes mentioned above)
-    real_label_index = get_element_instances(labels).values
-
-    # background removal
-    if real_label_index[0] == 0:
-        real_label_index = real_label_index[1:]
-
     if version < packaging.version.parse("2.0.0"):
-        expected_label_index = z["seg_mask_value"][...]
-
-        if not np.array_equal(expected_label_index, real_label_index):
-            raise ValueError(
-                "The label indices from the labels differ from the ones from the input data. Please report "
-                f"this issue. Real label indices: {real_label_index}, expected label indices: "
-                f"{expected_label_index}."
-            )
+        label_index = z["seg_mask_value"][...]
     else:
-        labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...]
-        if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
-            raise ValueError(
-                "The positional indices of the labels do not match the expected range. Please report this issue."
+        # For v >= 2.0.0, seg_mask_value is no longer available in the zarr;
+        # read label_id from the corresponding parquet boundary file instead
+        boundaries_file = XeniumKeys.NUCLEUS_BOUNDARIES_FILE if mask_index == 0 else XeniumKeys.CELL_BOUNDARIES_FILE
+        boundary_columns = pq.read_schema(path / boundaries_file).names
+        if "label_id" in boundary_columns:
+            boundary_df = pq.read_table(path / boundaries_file, columns=[XeniumKeys.CELL_ID, "label_id"]).to_pandas()
+            unique_pairs = boundary_df.drop_duplicates(subset=[XeniumKeys.CELL_ID, "label_id"]).copy()
+            unique_pairs[XeniumKeys.CELL_ID] = _decode_cell_id_column(unique_pairs[XeniumKeys.CELL_ID])
+            cell_id_to_label_id = unique_pairs.set_index(XeniumKeys.CELL_ID)["label_id"]
+            label_index = cell_id_to_label_id.loc[cell_id_str].values
+        else:
+            # fallback for dev versions around 2.0.0 that lack both seg_mask_value and label_id
+            logger.warn(
+                f"Could not find the labels ids from the metadata for version {version}. Using a fallback (slower) implementation."
             )
+            label_index = get_element_instances(labels).values
+
+            if label_index[0] == 0:
+                label_index = label_index[1:]
 
     # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
     indices_mapping = pd.DataFrame(
         {
             "region": labels_name,
             "cell_id": cell_id_str,
-            "label_index": real_label_index.astype(np.int64),
+            "label_index": label_index.astype(np.int64),
         }
     )
     # because AnnData converts the indices to str
diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py