From 0c8021822479438ed855ebe3ebf769a7cb2f5e22 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Tue, 28 Oct 2025 12:16:07 +0100 Subject: [PATCH 1/5] Load Xenium mask labels using Dask --- src/spatialdata_io/readers/xenium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 87100c08..1244ecd0 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -458,7 +458,7 @@ def _get_labels_and_indices_mapping( with zarr_open(str(tmpdir), mode="r") as z: # get the labels - masks = z["masks"][f"{mask_index}"][...] + masks = da.from_array(z["masks"][f"{mask_index}"]) labels = Labels2DModel.parse( masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs ) From d3a247c161f3f0b19b814c84ffd4d52e6da0fd67 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 6 Jan 2026 20:49:37 +0100 Subject: [PATCH 2/5] persistent extraction of .zarr.zip --- src/spatialdata_io/readers/xenium.py | 330 +++++++++++++++------------ 1 file changed, 190 insertions(+), 140 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 1244ecd0..d9e32bd5 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -52,7 +52,11 @@ __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"] -@deprecation_alias(cells_as_shapes="cells_as_circles", cell_boundaries="cells_boundaries", cell_labels="cells_labels") +@deprecation_alias( + cells_as_shapes="cells_as_circles", + cell_boundaries="cells_boundaries", + cell_labels="cells_labels", +) @inject_docs(xx=XeniumKeys) def xenium( path: str | Path, @@ -69,6 +73,7 @@ def xenium( cells_table: bool = True, n_jobs: int = 1, gex_only: bool = True, + cleanup_labels_zarr_tmpdir: bool = True, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -197,62 +202,86 @@ def xenium( else: table = None - if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: - cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs) - if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): - warnings.warn( - 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' - " table. This could be due to trying to read a new version that is not supported yet. Please " - "report this issue.", - UserWarning, - stacklevel=2, - ) - table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] - table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] - - polygons = {} - labels = {} - tables = {} - points = {} - images = {} - - # From the public release notes here: - # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa - # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. - # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match - # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus - # labels. - if nucleus_labels: - labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( - path, - XeniumKeys.CELLS_ZARR, - specs, - mask_index=0, - labels_name="nucleus_labels", - labels_models_kwargs=labels_models_kwargs, - ) - if cells_labels: - labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( - path, - XeniumKeys.CELLS_ZARR, - specs, - mask_index=1, - labels_name="cell_labels", - labels_models_kwargs=labels_models_kwargs, + tmpdir = tempfile.TemporaryDirectory() + if not cleanup_labels_zarr_tmpdir: + logging.info( + f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` " + f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). " + f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up " + f"temporary directories." ) - if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): + zip_file = path / XeniumKeys.CELLS_ZARR + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(tmpdir.name) + try: + cells_zarr = zarr.open(str(tmpdir.name), mode="r") + if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: + cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr) + if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): warnings.warn( - "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " - "cell labels data. This could be due to trying to read a new version that is not supported yet. " - "Please report this issue.", + 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' + " table. This could be due to trying to read a new version that is not supported yet. Please " + "report this issue.", UserWarning, stacklevel=2, ) - else: - table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] - if not cells_as_circles: - table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" + table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] + table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] + + polygons = {} + labels = {} + tables = {} + points = {} + images = {} + + # From the public release notes here: + # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa + # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. + # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match + # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus + # labels. + + if nucleus_labels: + labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( + cells_zarr, + cleanup_labels_zarr_tmpdir, + specs, + mask_index=0, + labels_name="nucleus_labels", + labels_models_kwargs=labels_models_kwargs, + ) + if cells_labels: + labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( + cells_zarr, + cleanup_labels_zarr_tmpdir, + specs, + mask_index=1, + labels_name="cell_labels", + labels_models_kwargs=labels_models_kwargs, + ) + if cell_labels_indices_mapping is not None and table is not None: + if not pd.DataFrame.equals( + cell_labels_indices_mapping["cell_id"], + table.obs[str(XeniumKeys.CELL_ID)], + ): + warnings.warn( + "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " + "cell labels data. This could be due to trying to read a new version that is not supported yet. " + "Please report this issue.", + UserWarning, + stacklevel=2, + ) + else: + table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] + if not cells_as_circles: + table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" + except Exception as e: + tmpdir.cleanup() + raise e + + # we cleanup now if we don't have lazy data + if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir: + tmpdir.cleanup() if nucleus_boundaries: polygons["nucleus_boundaries"] = _get_polygons( @@ -381,7 +410,13 @@ def filter(self, record: logging.LogRecord) -> bool: if table is not None: tables["table"] = table - elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons} + elements_dict = { + "images": images, + "labels": labels, + "points": points, + "tables": tables, + "shapes": polygons, + } if cells_as_circles: elements_dict["shapes"][specs["region"]] = circles sdata = SpatialData(**elements_dict) @@ -402,7 +437,11 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: def _get_polygons( - path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None + path: Path, + file: str, + specs: dict[str, Any], + n_jobs: int, + idx: ArrayLike | None = None, ) -> GeoDataFrame: def _poly(arr: ArrayLike) -> Polygon: return Polygon(arr[:-1]) @@ -441,8 +480,8 @@ def _poly(arr: ArrayLike) -> Polygon: def _get_labels_and_indices_mapping( - path: Path, - file: str, + cells_zarr: zarr.Group, + cleanup_labels_zarr_tmpdir: bool, specs: dict[str, Any], mask_index: int, labels_name: str, @@ -451,74 +490,71 @@ def _get_labels_and_indices_mapping( if mask_index not in [0, 1]: raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.") - with tempfile.TemporaryDirectory() as tmpdir: - zip_file = path / XeniumKeys.CELLS_ZARR - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmpdir) + # get the labels + if cleanup_labels_zarr_tmpdir: + masks = cells_zarr["masks"][f"{mask_index}"][...] + else: + masks = da.from_array(cells_zarr["masks"][f"{mask_index}"]) + labels = Labels2DModel.parse( + masks, + dims=("y", "x"), + transformations={"global": Identity()}, + **labels_models_kwargs, + ) - with zarr_open(str(tmpdir), mode="r") as z: - # get the labels - masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = Labels2DModel.parse( - masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs + # build the matching table + version = _parse_version_of_xenium_analyzer(specs) + if mask_index == 0: + # nuclei currently not supported + return labels, None + if version is None or version is not None and version < packaging.version.parse("1.3.0"): + # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not + # supported in versions < 1.3.0 + return labels, None + + cell_id, dataset_suffix = cells_zarr["cell_id"][...].T + cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) + + # this information will probably be available in the `label_id` column for version > 2.0.0 (see public + # release notes mentioned above) + real_label_index = get_element_instances(labels).values + + # background removal + if real_label_index[0] == 0: + real_label_index = real_label_index[1:] + + if version < packaging.version.parse("2.0.0"): + expected_label_index = cells_zarr["seg_mask_value"][...] + + if not np.array_equal(expected_label_index, real_label_index): + raise ValueError( + "The label indices from the labels differ from the ones from the input data. Please report " + f"this issue. Real label indices: {real_label_index}, expected label indices: " + f"{expected_label_index}." ) - - # build the matching table - version = _parse_version_of_xenium_analyzer(specs) - if mask_index == 0: - # nuclei currently not supported - return labels, None - if version is None or version is not None and version < packaging.version.parse("1.3.0"): - # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not - # supported in versions < 1.3.0 - return labels, None - - cell_id, dataset_suffix = z["cell_id"][...].T - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) - - # this information will probably be available in the `label_id` column for version > 2.0.0 (see public - # release notes mentioned above) - real_label_index = get_element_instances(labels).values - - # background removal - if real_label_index[0] == 0: - real_label_index = real_label_index[1:] - - if version < packaging.version.parse("2.0.0"): - expected_label_index = z["seg_mask_value"][...] - - if not np.array_equal(expected_label_index, real_label_index): - raise ValueError( - "The label indices from the labels differ from the ones from the input data. Please report " - f"this issue. Real label indices: {real_label_index}, expected label indices: " - f"{expected_label_index}." - ) - else: - labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...] - if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))): - raise ValueError( - "The positional indices of the labels do not match the expected range. Please report this " - "issue." - ) - - # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems - indices_mapping = pd.DataFrame( - { - "region": labels_name, - "cell_id": cell_id_str, - "label_index": real_label_index.astype(np.int64), - } + else: + labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...] + if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))): + raise ValueError( + "The positional indices of the labels do not match the expected range. Please report this issue." ) - # because AnnData converts the indices to str - indices_mapping.index = indices_mapping.index.astype(str) - return labels, indices_mapping + + # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems + indices_mapping = pd.DataFrame( + { + "region": labels_name, + "cell_id": cell_id_str, + "label_index": real_label_index.astype(np.int64), + } + ) + # because AnnData converts the indices to str + indices_mapping.index = indices_mapping.index.astype(str) + return labels, indices_mapping @inject_docs(xx=XeniumKeys) def _get_cells_metadata_table_from_zarr( - path: Path, - file: str, - specs: dict[str, Any], + cells_zarr: zarr.Group, ) -> AnnData: """Read cells metadata from ``{xx.CELLS_ZARR}``. @@ -526,35 +562,34 @@ def _get_cells_metadata_table_from_zarr( nucleus_count for versions >= 2.0.0. """ # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks - with tempfile.TemporaryDirectory() as tmpdir: - zip_file = path / XeniumKeys.CELLS_ZARR - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmpdir) - - with zarr_open(str(tmpdir), mode="r") as z: - x = z["cell_summary"][...] - column_names = z["cell_summary"].attrs["column_names"] - df = pd.DataFrame(x, columns=column_names) - cell_id_prefix = z["cell_id"][:, 0] - dataset_suffix = z["cell_id"][:, 1] + x = cells_zarr["cell_summary"][...] + column_names = cells_zarr["cell_summary"].attrs["column_names"] + df = pd.DataFrame(x, columns=column_names) + cell_id_prefix = cells_zarr["cell_id"][:, 0] + dataset_suffix = cells_zarr["cell_id"][:, 1] - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) - df[XeniumKeys.CELL_ID] = cell_id_str - # because AnnData converts the indices to str - df.index = df.index.astype(str) - return df + cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) + df[XeniumKeys.CELL_ID] = cell_id_str + # because AnnData converts the indices to str + df.index = df.index.astype(str) + return df def _get_points(path: Path, specs: dict[str, Any]) -> Table: table = read_parquet(path / XeniumKeys.TRANSCRIPTS_FILE) table["feature_name"] = table["feature_name"].apply( - lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), meta=("feature_name", "object") + lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), + meta=("feature_name", "object"), ) transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y")) points = PointsModel.parse( table, - coordinates={"x": XeniumKeys.TRANSCRIPTS_X, "y": XeniumKeys.TRANSCRIPTS_Y, "z": XeniumKeys.TRANSCRIPTS_Z}, + coordinates={ + "x": XeniumKeys.TRANSCRIPTS_X, + "y": XeniumKeys.TRANSCRIPTS_Y, + "z": XeniumKeys.TRANSCRIPTS_Z, + }, feature_key=XeniumKeys.FEATURE_NAME, instance_key=XeniumKeys.CELL_ID, transformations={"global": transform}, @@ -576,7 +611,12 @@ def _get_tables_and_circles( adata.obs["region"] = specs["region"] adata.obs["region"] = adata.obs["region"].astype("category") adata.obs[XeniumKeys.CELL_ID] = _decode_cell_id_column(adata.obs[XeniumKeys.CELL_ID]) - table = TableModel.parse(adata, region=specs["region"], region_key="region", instance_key=str(XeniumKeys.CELL_ID)) + table = TableModel.parse( + adata, + region=specs["region"], + region_key="region", + instance_key=str(XeniumKeys.CELL_ID), + ) if cells_as_circles: transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y")) radii = np.sqrt(adata.obs[XeniumKeys.CELL_AREA].to_numpy() / np.pi) @@ -605,7 +645,11 @@ def _get_images( # let's add a dummy channel as a temporary workaround. image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0) return Image2DModel.parse( - image, transformations={"global": Identity()}, dims=("c", "y", "x"), rgb=None, **image_models_kwargs + image, + transformations={"global": Identity()}, + dims=("c", "y", "x"), + rgb=None, + **image_models_kwargs, ) @@ -620,14 +664,18 @@ def _add_aligned_images( csv_files = list(path.glob("*.csv")) for file in ome_tif_files: element_name = None - for suffix in [XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX, XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX]: + for suffix in [ + XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX, + XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX, + ]: if file.name.endswith(suffix): element_name = suffix.replace(XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, "") break if element_name is not None: # check if an alignment file exists expected_filename = file.name.replace( - XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD + XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, + XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD, ) alignment_files = [f for f in csv_files if f.name == expected_filename] assert len(alignment_files) <= 1, f"Found more than one alignment file for {file.name}." @@ -833,7 +881,9 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf return np.array(cell_id_str) -def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]: +def prefix_suffix_uint32_from_cell_id_str( + cell_id_str: ArrayLike, +) -> tuple[ArrayLike, ArrayLike]: # parse the string into the prefix and suffix cell_id_prefix_str, dataset_suffix = zip(*[x.split("-") for x in cell_id_str], strict=False) dataset_suffix_int = [int(x) for x in dataset_suffix] From 92a09ae4eb76681af3252e202ef8885f44710c74 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 7 Jan 2026 22:03:36 +0100 Subject: [PATCH 3/5] add asv benchmarks --- asv.conf.json | 23 ++++++++ benchmarks/__init__.py | 1 + benchmarks/bench_xenium.py | 109 +++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/bench_xenium.py diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 00000000..d50ba6a5 --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,23 @@ +{ + "version": 1, + "project": "spatialdata-io", + "project_url": "https://github.com/scverse/spatialdata-io", + "repo": ".", + "branches": ["main", "xenium-labels-dask"], + "dvcs": "git", + "environment_type": "virtualenv", + "pythons": ["3.12"], + "build_command": [], + "install_command": ["python -m pip install {build_dir}[test]"], + "uninstall_command": ["python -m pip uninstall -y {project}"], + "env_dir": ".asv/env", + "results_dir": ".asv/results", + "html_dir": ".asv/html", + "benchmark_dir": "benchmarks", + "hash_length": 8, + "build_cache_size": 2, + "install_timeout": 600, + "repeat": 3, + "processes": 1, + "attribute_selection": ["time_*", "peakmem_*"] +} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..dd4ef30e --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +# ASV benchmarks for spatialdata-io diff --git a/benchmarks/bench_xenium.py b/benchmarks/bench_xenium.py new file mode 100644 index 00000000..d8319b04 --- /dev/null +++ b/benchmarks/bench_xenium.py @@ -0,0 +1,109 @@ +"""Benchmarks for SpatialData IO operations. + +Configuration: + Edit SANDBOX_DIR and DATASET below to point to your data. + +Setup: + cd / + python download.py # use the same env where spatialdata is installed + +Running: + cd /path/to/spatialdata-io + + # Quick benchmark (single run, for testing): + asv run --python=same -b IOBenchmark --quick --show-stderr -v + + # Full benchmark (multiple runs, for accurate results): + asv run --python=same -b IOBenchmark --show-stderr -v + +Comparing branches: + # Run on specific commits: + asv run main^! -b IOBenchmark --show-stderr -v + asv run xenium-labels-dask^! -b IOBenchmark --show-stderr -v + + # Or compare two branches directly: + asv continuous main xenium-labels-dask -b IOBenchmark --show-stderr -v + + # View comparison: + asv compare main xenium-labels-dask + +Results: + - Console output shows timing and memory after each run + - JSON results saved to: .asv/results/ + - Generate HTML report: asv publish && asv preview +""" + +import inspect +import shutil +from pathlib import Path +from typing import TYPE_CHECKING + +from spatialdata import SpatialData + +from spatialdata_io import xenium + +# ============================================================================= +# CONFIGURATION - Edit these paths to match your setup +# ============================================================================= +SANDBOX_DIR = Path(__file__).parent.parent.parent / "spatialdata-sandbox" +DATASET = "xenium_2.0.0_io" +# ============================================================================= + + +def get_paths() -> tuple[Path, Path]: + """Get paths for benchmark data.""" + path = SANDBOX_DIR / DATASET + path_read = path / "data" + path_write = path / "data_benchmark.zarr" + + if not path_read.exists(): + raise ValueError(f"Data directory not found: {path_read}") + + return path_read, path_write + + +class IOBenchmark: + """Benchmark IO read operations.""" + + timeout = 3600 + repeat = 3 + number = 1 + warmup_time = 0 + processes = 1 + + def setup(self) -> None: + """Set up paths for benchmarking.""" + self.path_read, self.path_write = get_paths() + if self.path_write.exists(): + shutil.rmtree(self.path_write) + + def _read_xenium(self) -> SpatialData: + """Read xenium data with version-compatible kwargs.""" + signature = inspect.signature(xenium) + kwargs = {} + if "cleanup_labels_zarr_tmpdir" in signature.parameters: + kwargs["cleanup_labels_zarr_tmpdir"] = False + + return xenium( + path=str(self.path_read), + n_jobs=8, + cell_boundaries=True, + nucleus_boundaries=True, + morphology_focus=True, + cells_as_circles=True, + **kwargs, + ) + + def time_io(self) -> None: + """Walltime for data parsing.""" + sdata = self._read_xenium() + sdata.write(self.path_write) + + def peakmem_io(self) -> None: + """Peak memory for data parsing.""" + sdata = self._read_xenium() + sdata.write(self.path_write) + + +if __name__ == "__main__": + IOBenchmark().time_io() From 54a24a07dae46ffb844c4b8415ab65b8aee158ab Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 10:20:55 +0100 Subject: [PATCH 4/5] use zarr zipstores in xenium --- src/spatialdata_io/readers/xenium.py | 169 ++++++++++++--------------- 1 file changed, 73 insertions(+), 96 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index d9e32bd5..32455175 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -73,7 +73,6 @@ def xenium( cells_table: bool = True, n_jobs: int = 1, gex_only: bool = True, - cleanup_labels_zarr_tmpdir: bool = True, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -202,86 +201,62 @@ def xenium( else: table = None - tmpdir = tempfile.TemporaryDirectory() - if not cleanup_labels_zarr_tmpdir: - logging.info( - f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` " - f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). " - f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up " - f"temporary directories." + if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: + cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs) + if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): + warnings.warn( + 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' + " table. This could be due to trying to read a new version that is not supported yet. Please " + "report this issue.", + UserWarning, + stacklevel=2, + ) + table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] + table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] + + polygons = {} + labels = {} + tables = {} + points = {} + images = {} + + # From the public release notes here: + # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa + # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. + # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match + # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus + # labels. + if nucleus_labels: + labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( + path, + XeniumKeys.CELLS_ZARR, + specs, + mask_index=0, + labels_name="nucleus_labels", + labels_models_kwargs=labels_models_kwargs, ) - zip_file = path / XeniumKeys.CELLS_ZARR - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmpdir.name) - try: - cells_zarr = zarr.open(str(tmpdir.name), mode="r") - if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: - cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr) - if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): + if cells_labels: + labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( + path, + XeniumKeys.CELLS_ZARR, + specs, + mask_index=1, + labels_name="cell_labels", + labels_models_kwargs=labels_models_kwargs, + ) + if cell_labels_indices_mapping is not None and table is not None: + if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): warnings.warn( - 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' - " table. This could be due to trying to read a new version that is not supported yet. Please " - "report this issue.", + "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " + "cell labels data. This could be due to trying to read a new version that is not supported yet. " + "Please report this issue.", UserWarning, stacklevel=2, ) - table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] - table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] - - polygons = {} - labels = {} - tables = {} - points = {} - images = {} - - # From the public release notes here: - # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa - # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. - # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match - # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus - # labels. - - if nucleus_labels: - labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( - cells_zarr, - cleanup_labels_zarr_tmpdir, - specs, - mask_index=0, - labels_name="nucleus_labels", - labels_models_kwargs=labels_models_kwargs, - ) - if cells_labels: - labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( - cells_zarr, - cleanup_labels_zarr_tmpdir, - specs, - mask_index=1, - labels_name="cell_labels", - labels_models_kwargs=labels_models_kwargs, - ) - if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals( - cell_labels_indices_mapping["cell_id"], - table.obs[str(XeniumKeys.CELL_ID)], - ): - warnings.warn( - "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " - "cell labels data. This could be due to trying to read a new version that is not supported yet. " - "Please report this issue.", - UserWarning, - stacklevel=2, - ) - else: - table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] - if not cells_as_circles: - table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" - except Exception as e: - tmpdir.cleanup() - raise e - - # we cleanup now if we don't have lazy data - if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir: - tmpdir.cleanup() + else: + table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] + if not cells_as_circles: + table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: polygons["nucleus_boundaries"] = _get_polygons( @@ -480,8 +455,8 @@ def _poly(arr: ArrayLike) -> Polygon: def _get_labels_and_indices_mapping( - cells_zarr: zarr.Group, - cleanup_labels_zarr_tmpdir: bool, + path: Path, + file: str, specs: dict[str, Any], mask_index: int, labels_name: str, @@ -490,17 +465,12 @@ def _get_labels_and_indices_mapping( if mask_index not in [0, 1]: raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.") + zip_file = path / XeniumKeys.CELLS_ZARR + store = zarr.storage.ZipStore(zip_file, read_only=True) + z = zarr.open(store, mode="r") # get the labels - if cleanup_labels_zarr_tmpdir: - masks = cells_zarr["masks"][f"{mask_index}"][...] - else: - masks = da.from_array(cells_zarr["masks"][f"{mask_index}"]) - labels = Labels2DModel.parse( - masks, - dims=("y", "x"), - transformations={"global": Identity()}, - **labels_models_kwargs, - ) + masks = da.from_array(z["masks"][f"{mask_index}"]) + labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) # build the matching table version = _parse_version_of_xenium_analyzer(specs) @@ -512,7 +482,7 @@ def _get_labels_and_indices_mapping( # supported in versions < 1.3.0 return labels, None - cell_id, dataset_suffix = cells_zarr["cell_id"][...].T + cell_id, dataset_suffix = z["cell_id"][...].T cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) # this information will probably be available in the `label_id` column for version > 2.0.0 (see public @@ -524,7 +494,7 @@ def _get_labels_and_indices_mapping( real_label_index = real_label_index[1:] if version < packaging.version.parse("2.0.0"): - expected_label_index = cells_zarr["seg_mask_value"][...] + expected_label_index = z["seg_mask_value"][...] if not np.array_equal(expected_label_index, real_label_index): raise ValueError( @@ -533,7 +503,7 @@ def _get_labels_and_indices_mapping( f"{expected_label_index}." ) else: - labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...] + labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...] if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))): raise ValueError( "The positional indices of the labels do not match the expected range. Please report this issue." @@ -554,7 +524,9 @@ def _get_labels_and_indices_mapping( @inject_docs(xx=XeniumKeys) def _get_cells_metadata_table_from_zarr( - cells_zarr: zarr.Group, + path: Path, + file: str, + specs: dict[str, Any], ) -> AnnData: """Read cells metadata from ``{xx.CELLS_ZARR}``. @@ -562,11 +534,16 @@ def _get_cells_metadata_table_from_zarr( nucleus_count for versions >= 2.0.0. """ # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks - x = cells_zarr["cell_summary"][...] - column_names = cells_zarr["cell_summary"].attrs["column_names"] + zip_file = path / XeniumKeys.CELLS_ZARR + store = zarr.storage.ZipStore(zip_file, read_only=True) + + z = zarr.open(store, mode="r") + x = z["cell_summary"][...] + column_names = z["cell_summary"].attrs["column_names"] df = pd.DataFrame(x, columns=column_names) - cell_id_prefix = cells_zarr["cell_id"][:, 0] - dataset_suffix = cells_zarr["cell_id"][:, 1] + cell_id_prefix = z["cell_id"][:, 0] + dataset_suffix = z["cell_id"][:, 1] + store.close() cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) df[XeniumKeys.CELL_ID] = cell_id_str From ada3cd368327bf5d0d71dc9ad7e931010ff2de12 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 14:59:22 +0100 Subject: [PATCH 5/5] fix pre-commit --- asv.conf.json | 2 +- benchmarks/bench_xenium.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv.conf.json b/asv.conf.json index d50ba6a5..516f0f1f 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "spatialdata-io", "project_url": "https://github.com/scverse/spatialdata-io", "repo": ".", - "branches": ["main", "xenium-labels-dask"], + "branches": ["main", "xenium-labels-dask", "xenium-labels-dask-zipstore"], "dvcs": "git", "environment_type": "virtualenv", "pythons": ["3.12"], diff --git a/benchmarks/bench_xenium.py b/benchmarks/bench_xenium.py index d8319b04..685e5094 100644 --- a/benchmarks/bench_xenium.py +++ b/benchmarks/bench_xenium.py @@ -40,7 +40,7 @@ from spatialdata import SpatialData -from spatialdata_io import xenium +from spatialdata_io import xenium # type: ignore[attr-defined] # ============================================================================= # CONFIGURATION - Edit these paths to match your setup