2020from dask .dataframe import read_parquet
2121from dask_image .imread import imread
2222from geopandas import GeoDataFrame
23- from joblib import Parallel , delayed
2423from pyarrow import Table
2524from shapely import GeometryType , Polygon , from_ragged_array
2625from spatialdata import SpatialData
2726from spatialdata ._core .query .relational_query import get_element_instances
27+ from spatialdata ._logging import logger
2828from spatialdata .models import (
2929 Image2DModel ,
3030 Labels2DModel ,
@@ -61,7 +61,7 @@ def xenium(
6161 * ,
6262 cells_boundaries : bool = True ,
6363 nucleus_boundaries : bool = True ,
64- cells_as_circles : bool | None = None ,
64+ cells_as_circles : bool = False ,
6565 cells_labels : bool = True ,
6666 nucleus_labels : bool = True ,
6767 transcripts : bool = True ,
@@ -136,7 +136,7 @@ def xenium(
136136
137137 Notes
138138 -----
139- Old versions. Until spatialdata-io v0.1.3post0: previously, `cells_as_circles` was `True` by default; the table was associated to the
139+ Old versions. Until spatialdata-io v0.6.0: `cells_as_circles` was `True` by default; the table was associated to the
140140 circles when `cells_as_circles` was `True`, and the table was associated to the polygons when `cells_as_circles`
141141 was `False`; the radii of the circles were computed from the nuclei instead of the cells.
142142
@@ -153,14 +153,6 @@ def xenium(
153153 ... )
154154 >>> sdata.write("path/to/data.zarr")
155155 """
156- if cells_as_circles is None :
157- cells_as_circles = True
158- warnings .warn (
159- "The default value of `cells_as_circles` will change to `False` in the next release. "
160- "Please pass `True` explicitly to maintain the current behavior." ,
161- DeprecationWarning ,
162- stacklevel = 3 ,
163- )
164156 image_models_kwargs , labels_models_kwargs = _initialize_raster_models_kwargs (
165157 image_models_kwargs , labels_models_kwargs
166158 )
@@ -223,18 +215,16 @@ def xenium(
223215 # labels.
224216 if nucleus_labels :
225217 labels ["nucleus_labels" ], _ = _get_labels_and_indices_mapping (
226- path ,
227- XeniumKeys .CELLS_ZARR ,
228- specs ,
218+ path = path ,
219+ specs = specs ,
229220 mask_index = 0 ,
230221 labels_name = "nucleus_labels" ,
231222 labels_models_kwargs = labels_models_kwargs ,
232223 )
233224 if cells_labels :
234225 labels ["cell_labels" ], cell_labels_indices_mapping = _get_labels_and_indices_mapping (
235- path ,
236- XeniumKeys .CELLS_ZARR ,
237- specs ,
226+ path = path ,
227+ specs = specs ,
238228 mask_index = 1 ,
239229 labels_name = "cell_labels" ,
240230 labels_models_kwargs = labels_models_kwargs ,
@@ -360,8 +350,8 @@ def filter(self, record: logging.LogRecord) -> bool:
360350 return False
361351 return True
362352
363- logger = tifffile .logger ()
364- logger .addFilter (IgnoreSpecificMessage ())
353+ tf_logger = tifffile .logger ()
354+ tf_logger .addFilter (IgnoreSpecificMessage ())
365355 image_models_kwargs = dict (image_models_kwargs )
366356 assert "c_coords" not in image_models_kwargs , (
367357 "The channel names for the morphology focus images are handled internally"
@@ -374,7 +364,7 @@ def filter(self, record: logging.LogRecord) -> bool:
374364 image_models_kwargs ,
375365 )
376366 del image_models_kwargs ["c_coords" ]
377- logger .removeFilter (IgnoreSpecificMessage ())
367+ tf_logger .removeFilter (IgnoreSpecificMessage ())
378368
379369 if table is not None :
380370 tables ["table" ] = table
@@ -402,14 +392,16 @@ def filter(self, record: logging.LogRecord) -> bool:
402392def _decode_cell_id_column (cell_id_column : pd .Series ) -> pd .Series :
403393 if isinstance (cell_id_column .iloc [0 ], bytes ):
404394 return cell_id_column .str .decode ("utf-8" )
395+ if not isinstance (cell_id_column .iloc [0 ], str ):
396+ cell_id_column .index = cell_id_column .index .astype (str )
405397 return cell_id_column
406398
407399
408400def _get_polygons (
409401 path : Path ,
410402 file : str ,
411403 specs : dict [str , Any ],
412- idx : ArrayLike | None = None ,
404+ idx : pd . Series | None = None ,
413405) -> GeoDataFrame :
414406 # seems to be faster than pd.read_parquet
415407 df = pq .read_table (path / file ).to_pandas ()
@@ -448,7 +440,7 @@ def _get_polygons(
448440 if version is not None and version < packaging .version .parse ("2.0.0" ):
449441 assert idx is not None
450442 assert len (idx ) == len (geo_df )
451- assert index . equals ( idx )
443+ assert np . array_equal ( index . values , idx . values )
452444 else :
453445 if np .unique (geo_df .index ).size != len (geo_df ):
454446 warnings .warn (
@@ -464,7 +456,6 @@ def _get_polygons(
464456
465457def _get_labels_and_indices_mapping (
466458 path : Path ,
467- file : str ,
468459 specs : dict [str , Any ],
469460 mask_index : int ,
470461 labels_name : str ,
@@ -493,36 +484,35 @@ def _get_labels_and_indices_mapping(
493484 cell_id , dataset_suffix = z ["cell_id" ][...].T
494485 cell_id_str = cell_id_str_from_prefix_suffix_uint32 (cell_id , dataset_suffix )
495486
496- # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
497- # release notes mentioned above)
498- real_label_index = get_element_instances (labels ).values
499-
500- # background removal
501- if real_label_index [0 ] == 0 :
502- real_label_index = real_label_index [1 :]
503-
504487 if version < packaging .version .parse ("2.0.0" ):
505- expected_label_index = z ["seg_mask_value" ][...]
506-
507- if not np .array_equal (expected_label_index , real_label_index ):
508- raise ValueError (
509- "The label indices from the labels differ from the ones from the input data. Please report "
510- f"this issue. Real label indices: { real_label_index } , expected label indices: "
511- f"{ expected_label_index } ."
512- )
488+ label_index = z ["seg_mask_value" ][...]
513489 else :
514- labels_positional_indices = z ["polygon_sets" ][f"{ mask_index } " ]["cell_index" ][...]
515- if not np .array_equal (labels_positional_indices , np .arange (len (labels_positional_indices ))):
516- raise ValueError (
517- "The positional indices of the labels do not match the expected range. Please report this issue."
490+ # For v >= 2.0.0, seg_mask_value is no longer available in the zarr;
491+ # read label_id from the corresponding parquet boundary file instead
492+ boundaries_file = XeniumKeys .NUCLEUS_BOUNDARIES_FILE if mask_index == 0 else XeniumKeys .CELL_BOUNDARIES_FILE
493+ boundary_columns = pq .read_schema (path / boundaries_file ).names
494+ if "label_id" in boundary_columns :
495+ boundary_df = pq .read_table (path / boundaries_file , columns = [XeniumKeys .CELL_ID , "label_id" ]).to_pandas ()
496+ unique_pairs = boundary_df .drop_duplicates (subset = [XeniumKeys .CELL_ID , "label_id" ]).copy ()
497+ unique_pairs [XeniumKeys .CELL_ID ] = _decode_cell_id_column (unique_pairs [XeniumKeys .CELL_ID ])
498+ cell_id_to_label_id = unique_pairs .set_index (XeniumKeys .CELL_ID )["label_id" ]
499+ label_index = cell_id_to_label_id .loc [cell_id_str ].values
500+ else :
501+ # fallback for dev versions around 2.0.0 that lack both seg_mask_value and label_id
502+ logger .warn (
503+ f"Could not find the labels ids from the metadata for version { version } . Using a fallback (slower) implementation."
518504 )
505+ label_index = get_element_instances (labels ).values
506+
507+ if label_index [0 ] == 0 :
508+ label_index = label_index [1 :]
519509
520510 # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
521511 indices_mapping = pd .DataFrame (
522512 {
523513 "region" : labels_name ,
524514 "cell_id" : cell_id_str ,
525- "label_index" : real_label_index .astype (np .int64 ),
515+ "label_index" : label_index .astype (np .int64 ),
526516 }
527517 )
528518 # because AnnData converts the indices to str
0 commit comments