From 0c8021822479438ed855ebe3ebf769a7cb2f5e22 Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Tue, 28 Oct 2025 12:16:07 +0100
Subject: [PATCH 1/5] Load Xenium mask labels using Dask

---
 src/spatialdata_io/readers/xenium.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 87100c08..1244ecd0 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -458,7 +458,7 @@ def _get_labels_and_indices_mapping(
 
         with zarr_open(str(tmpdir), mode="r") as z:
             # get the labels
-            masks = z["masks"][f"{mask_index}"][...]
+            masks = da.from_array(z["masks"][f"{mask_index}"])
             labels = Labels2DModel.parse(
                 masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
             )

From d3a247c161f3f0b19b814c84ffd4d52e6da0fd67 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Tue, 6 Jan 2026 20:49:37 +0100
Subject: [PATCH 2/5] persistent extraction of .zarr.zip

---
 src/spatialdata_io/readers/xenium.py | 330 +++++++++++++++------------
 1 file changed, 190 insertions(+), 140 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 1244ecd0..d9e32bd5 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -52,7 +52,11 @@
 __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"]
 
 
-@deprecation_alias(cells_as_shapes="cells_as_circles", cell_boundaries="cells_boundaries", cell_labels="cells_labels")
+@deprecation_alias(
+    cells_as_shapes="cells_as_circles",
+    cell_boundaries="cells_boundaries",
+    cell_labels="cells_labels",
+)
 @inject_docs(xx=XeniumKeys)
 def xenium(
     path: str | Path,
@@ -69,6 +73,7 @@ def xenium(
     cells_table: bool = True,
     n_jobs: int = 1,
     gex_only: bool = True,
+    cleanup_labels_zarr_tmpdir: bool = True,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
     image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
     labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -197,62 +202,86 @@ def xenium(
     else:
         table = None
 
-    if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
-        cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
-        if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
-            warnings.warn(
-                'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
-                " table. This could be due to trying to read a new version that is not supported yet. Please "
-                "report this issue.",
-                UserWarning,
-                stacklevel=2,
-            )
-        table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
-        table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
-
-    polygons = {}
-    labels = {}
-    tables = {}
-    points = {}
-    images = {}
-
-    # From the public release notes here:
-    # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
-    # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
-    # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
-    # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
-    # labels.
-    if nucleus_labels:
-        labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
-            path,
-            XeniumKeys.CELLS_ZARR,
-            specs,
-            mask_index=0,
-            labels_name="nucleus_labels",
-            labels_models_kwargs=labels_models_kwargs,
-        )
-    if cells_labels:
-        labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
-            path,
-            XeniumKeys.CELLS_ZARR,
-            specs,
-            mask_index=1,
-            labels_name="cell_labels",
-            labels_models_kwargs=labels_models_kwargs,
+    tmpdir = tempfile.TemporaryDirectory()
+    if not cleanup_labels_zarr_tmpdir:
+        logging.info(
+            f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` "
+            f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). "
+            f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up "
+            f"temporary directories."
         )
-        if cell_labels_indices_mapping is not None and table is not None:
-            if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
+    zip_file = path / XeniumKeys.CELLS_ZARR
+    with zipfile.ZipFile(zip_file, "r") as zip_ref:
+        zip_ref.extractall(tmpdir.name)
+    try:
+        cells_zarr = zarr.open(str(tmpdir.name), mode="r")
+        if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
+            cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr)
+            if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
                 warnings.warn(
-                    "The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
-                    "cell labels data. This could be due to trying to read a new version that is not supported yet. "
-                    "Please report this issue.",
+                    'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
+                    " table. This could be due to trying to read a new version that is not supported yet. Please "
+                    "report this issue.",
                     UserWarning,
                     stacklevel=2,
                 )
-            else:
-                table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
-                if not cells_as_circles:
-                    table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
+            table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
+            table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
+
+        polygons = {}
+        labels = {}
+        tables = {}
+        points = {}
+        images = {}
+
+        # From the public release notes here:
+        # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
+        # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
+        # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
+        # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
+        # labels.
+
+        if nucleus_labels:
+            labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
+                cells_zarr,
+                cleanup_labels_zarr_tmpdir,
+                specs,
+                mask_index=0,
+                labels_name="nucleus_labels",
+                labels_models_kwargs=labels_models_kwargs,
+            )
+        if cells_labels:
+            labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
+                cells_zarr,
+                cleanup_labels_zarr_tmpdir,
+                specs,
+                mask_index=1,
+                labels_name="cell_labels",
+                labels_models_kwargs=labels_models_kwargs,
+            )
+            if cell_labels_indices_mapping is not None and table is not None:
+                if not pd.DataFrame.equals(
+                    cell_labels_indices_mapping["cell_id"],
+                    table.obs[str(XeniumKeys.CELL_ID)],
+                ):
+                    warnings.warn(
+                        "The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
+                        "cell labels data. This could be due to trying to read a new version that is not supported yet. "
+                        "Please report this issue.",
+                        UserWarning,
+                        stacklevel=2,
+                    )
+                else:
+                    table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
+                    if not cells_as_circles:
+                        table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
+    except Exception as e:
+        tmpdir.cleanup()
+        raise e
+
+    # we cleanup now if we don't have lazy data
+    if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir:
+        tmpdir.cleanup()
 
     if nucleus_boundaries:
         polygons["nucleus_boundaries"] = _get_polygons(
@@ -381,7 +410,13 @@ def filter(self, record: logging.LogRecord) -> bool:
     if table is not None:
         tables["table"] = table
 
-    elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
+    elements_dict = {
+        "images": images,
+        "labels": labels,
+        "points": points,
+        "tables": tables,
+        "shapes": polygons,
+    }
     if cells_as_circles:
         elements_dict["shapes"][specs["region"]] = circles
     sdata = SpatialData(**elements_dict)
@@ -402,7 +437,11 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
 
 
 def _get_polygons(
-    path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None
+    path: Path,
+    file: str,
+    specs: dict[str, Any],
+    n_jobs: int,
+    idx: ArrayLike | None = None,
 ) -> GeoDataFrame:
     def _poly(arr: ArrayLike) -> Polygon:
         return Polygon(arr[:-1])
@@ -441,8 +480,8 @@ def _poly(arr: ArrayLike) -> Polygon:
 
 
 def _get_labels_and_indices_mapping(
-    path: Path,
-    file: str,
+    cells_zarr: zarr.Group,
+    cleanup_labels_zarr_tmpdir: bool,
     specs: dict[str, Any],
     mask_index: int,
     labels_name: str,
@@ -451,74 +490,71 @@ def _get_labels_and_indices_mapping(
     if mask_index not in [0, 1]:
         raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
 
-    with tempfile.TemporaryDirectory() as tmpdir:
-        zip_file = path / XeniumKeys.CELLS_ZARR
-        with zipfile.ZipFile(zip_file, "r") as zip_ref:
-            zip_ref.extractall(tmpdir)
+    # get the labels
+    if cleanup_labels_zarr_tmpdir:
+        masks = cells_zarr["masks"][f"{mask_index}"][...]
+    else:
+        masks = da.from_array(cells_zarr["masks"][f"{mask_index}"])
+    labels = Labels2DModel.parse(
+        masks,
+        dims=("y", "x"),
+        transformations={"global": Identity()},
+        **labels_models_kwargs,
+    )
 
-        with zarr_open(str(tmpdir), mode="r") as z:
-            # get the labels
-            masks = da.from_array(z["masks"][f"{mask_index}"])
-            labels = Labels2DModel.parse(
-                masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
+    # build the matching table
+    version = _parse_version_of_xenium_analyzer(specs)
+    if mask_index == 0:
+        # nuclei currently not supported
+        return labels, None
+    if version is None or version is not None and version < packaging.version.parse("1.3.0"):
+        # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not
+        # supported in versions < 1.3.0
+        return labels, None
+
+    cell_id, dataset_suffix = cells_zarr["cell_id"][...].T
+    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
+
+    # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
+    # release notes mentioned above)
+    real_label_index = get_element_instances(labels).values
+
+    # background removal
+    if real_label_index[0] == 0:
+        real_label_index = real_label_index[1:]
+
+    if version < packaging.version.parse("2.0.0"):
+        expected_label_index = cells_zarr["seg_mask_value"][...]
+
+        if not np.array_equal(expected_label_index, real_label_index):
+            raise ValueError(
+                "The label indices from the labels differ from the ones from the input data. Please report "
+                f"this issue. Real label indices: {real_label_index}, expected label indices: "
+                f"{expected_label_index}."
             )
-
-            # build the matching table
-            version = _parse_version_of_xenium_analyzer(specs)
-            if mask_index == 0:
-                # nuclei currently not supported
-                return labels, None
-            if version is None or version is not None and version < packaging.version.parse("1.3.0"):
-                # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not
-                # supported in versions < 1.3.0
-                return labels, None
-
-            cell_id, dataset_suffix = z["cell_id"][...].T
-            cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
-
-            # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
-            # release notes mentioned above)
-            real_label_index = get_element_instances(labels).values
-
-            # background removal
-            if real_label_index[0] == 0:
-                real_label_index = real_label_index[1:]
-
-            if version < packaging.version.parse("2.0.0"):
-                expected_label_index = z["seg_mask_value"][...]
-
-                if not np.array_equal(expected_label_index, real_label_index):
-                    raise ValueError(
-                        "The label indices from the labels differ from the ones from the input data. Please report "
-                        f"this issue. Real label indices: {real_label_index}, expected label indices: "
-                        f"{expected_label_index}."
-                    )
-            else:
-                labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...]
-                if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
-                    raise ValueError(
-                        "The positional indices of the labels do not match the expected range. Please report this "
-                        "issue."
-                    )
-
-            # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
-            indices_mapping = pd.DataFrame(
-                {
-                    "region": labels_name,
-                    "cell_id": cell_id_str,
-                    "label_index": real_label_index.astype(np.int64),
-                }
+    else:
+        labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...]
+        if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
+            raise ValueError(
+                "The positional indices of the labels do not match the expected range. Please report this issue."
             )
-            # because AnnData converts the indices to str
-            indices_mapping.index = indices_mapping.index.astype(str)
-            return labels, indices_mapping
+
+    # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
+    indices_mapping = pd.DataFrame(
+        {
+            "region": labels_name,
+            "cell_id": cell_id_str,
+            "label_index": real_label_index.astype(np.int64),
+        }
+    )
+    # because AnnData converts the indices to str
+    indices_mapping.index = indices_mapping.index.astype(str)
+    return labels, indices_mapping
 
 
 @inject_docs(xx=XeniumKeys)
 def _get_cells_metadata_table_from_zarr(
-    path: Path,
-    file: str,
-    specs: dict[str, Any],
+    cells_zarr: zarr.Group,
 ) -> AnnData:
     """Read cells metadata from ``{xx.CELLS_ZARR}``.
 
@@ -526,35 +562,34 @@ def _get_cells_metadata_table_from_zarr(
     nucleus_count for versions >= 2.0.0.
     """
     # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
-    with tempfile.TemporaryDirectory() as tmpdir:
-        zip_file = path / XeniumKeys.CELLS_ZARR
-        with zipfile.ZipFile(zip_file, "r") as zip_ref:
-            zip_ref.extractall(tmpdir)
-
-        with zarr_open(str(tmpdir), mode="r") as z:
-            x = z["cell_summary"][...]
-            column_names = z["cell_summary"].attrs["column_names"]
-            df = pd.DataFrame(x, columns=column_names)
-            cell_id_prefix = z["cell_id"][:, 0]
-            dataset_suffix = z["cell_id"][:, 1]
+    x = cells_zarr["cell_summary"][...]
+    column_names = cells_zarr["cell_summary"].attrs["column_names"]
+    df = pd.DataFrame(x, columns=column_names)
+    cell_id_prefix = cells_zarr["cell_id"][:, 0]
+    dataset_suffix = cells_zarr["cell_id"][:, 1]
 
-            cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
-            df[XeniumKeys.CELL_ID] = cell_id_str
-            # because AnnData converts the indices to str
-            df.index = df.index.astype(str)
-            return df
+    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
+    df[XeniumKeys.CELL_ID] = cell_id_str
+    # because AnnData converts the indices to str
+    df.index = df.index.astype(str)
+    return df
 
 
 def _get_points(path: Path, specs: dict[str, Any]) -> Table:
     table = read_parquet(path / XeniumKeys.TRANSCRIPTS_FILE)
     table["feature_name"] = table["feature_name"].apply(
-        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), meta=("feature_name", "object")
+        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+        meta=("feature_name", "object"),
     )
 
     transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
     points = PointsModel.parse(
         table,
-        coordinates={"x": XeniumKeys.TRANSCRIPTS_X, "y": XeniumKeys.TRANSCRIPTS_Y, "z": XeniumKeys.TRANSCRIPTS_Z},
+        coordinates={
+            "x": XeniumKeys.TRANSCRIPTS_X,
+            "y": XeniumKeys.TRANSCRIPTS_Y,
+            "z": XeniumKeys.TRANSCRIPTS_Z,
+        },
         feature_key=XeniumKeys.FEATURE_NAME,
         instance_key=XeniumKeys.CELL_ID,
         transformations={"global": transform},
@@ -576,7 +611,12 @@ def _get_tables_and_circles(
     adata.obs["region"] = specs["region"]
     adata.obs["region"] = adata.obs["region"].astype("category")
     adata.obs[XeniumKeys.CELL_ID] = _decode_cell_id_column(adata.obs[XeniumKeys.CELL_ID])
-    table = TableModel.parse(adata, region=specs["region"], region_key="region", instance_key=str(XeniumKeys.CELL_ID))
+    table = TableModel.parse(
+        adata,
+        region=specs["region"],
+        region_key="region",
+        instance_key=str(XeniumKeys.CELL_ID),
+    )
     if cells_as_circles:
         transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
         radii = np.sqrt(adata.obs[XeniumKeys.CELL_AREA].to_numpy() / np.pi)
@@ -605,7 +645,11 @@ def _get_images(
         # let's add a dummy channel as a temporary workaround.
         image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
     return Image2DModel.parse(
-        image, transformations={"global": Identity()}, dims=("c", "y", "x"), rgb=None, **image_models_kwargs
+        image,
+        transformations={"global": Identity()},
+        dims=("c", "y", "x"),
+        rgb=None,
+        **image_models_kwargs,
     )
 
 
@@ -620,14 +664,18 @@ def _add_aligned_images(
     csv_files = list(path.glob("*.csv"))
     for file in ome_tif_files:
         element_name = None
-        for suffix in [XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX, XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX]:
+        for suffix in [
+            XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX,
+            XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX,
+        ]:
             if file.name.endswith(suffix):
                 element_name = suffix.replace(XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, "")
                 break
         if element_name is not None:
             # check if an alignment file exists
             expected_filename = file.name.replace(
-                XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD
+                XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE,
+                XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD,
             )
             alignment_files = [f for f in csv_files if f.name == expected_filename]
             assert len(alignment_files) <= 1, f"Found more than one alignment file for {file.name}."
@@ -833,7 +881,9 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf
     return np.array(cell_id_str)
 
 
-def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
+def prefix_suffix_uint32_from_cell_id_str(
+    cell_id_str: ArrayLike,
+) -> tuple[ArrayLike, ArrayLike]:
     # parse the string into the prefix and suffix
     cell_id_prefix_str, dataset_suffix = zip(*[x.split("-") for x in cell_id_str], strict=False)
     dataset_suffix_int = [int(x) for x in dataset_suffix]

From 92a09ae4eb76681af3252e202ef8885f44710c74 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Wed, 7 Jan 2026 22:03:36 +0100
Subject: [PATCH 3/5] add asv benchmarks

---
 asv.conf.json              |  23 ++++++++
 benchmarks/__init__.py     |   1 +
 benchmarks/bench_xenium.py | 109 +++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 asv.conf.json
 create mode 100644 benchmarks/__init__.py
 create mode 100644 benchmarks/bench_xenium.py

diff --git a/asv.conf.json b/asv.conf.json
new file mode 100644
index 00000000..d50ba6a5
--- /dev/null
+++ b/asv.conf.json
@@ -0,0 +1,23 @@
+{
+    "version": 1,
+    "project": "spatialdata-io",
+    "project_url": "https://github.com/scverse/spatialdata-io",
+    "repo": ".",
+    "branches": ["main", "xenium-labels-dask"],
+    "dvcs": "git",
+    "environment_type": "virtualenv",
+    "pythons": ["3.12"],
+    "build_command": [],
+    "install_command": ["python -m pip install {build_dir}[test]"],
+    "uninstall_command": ["python -m pip uninstall -y {project}"],
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "benchmark_dir": "benchmarks",
+    "hash_length": 8,
+    "build_cache_size": 2,
+    "install_timeout": 600,
+    "repeat": 3,
+    "processes": 1,
+    "attribute_selection": ["time_*", "peakmem_*"]
+}
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..dd4ef30e
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+# ASV benchmarks for spatialdata-io
diff --git a/benchmarks/bench_xenium.py b/benchmarks/bench_xenium.py
new file mode 100644
index 00000000..d8319b04
--- /dev/null
+++ b/benchmarks/bench_xenium.py
@@ -0,0 +1,109 @@
+"""Benchmarks for SpatialData IO operations.
+
+Configuration:
+    Edit SANDBOX_DIR and DATASET below to point to your data.
+
+Setup:
+    cd <SANDBOX_DIR>/<DATASET>
+    python download.py  # use the same env where spatialdata is installed
+
+Running:
+    cd /path/to/spatialdata-io
+
+    # Quick benchmark (single run, for testing):
+    asv run --python=same -b IOBenchmark --quick --show-stderr -v
+
+    # Full benchmark (multiple runs, for accurate results):
+    asv run --python=same -b IOBenchmark --show-stderr -v
+
+Comparing branches:
+    # Run on specific commits:
+    asv run main^! -b IOBenchmark --show-stderr -v
+    asv run xenium-labels-dask^! -b IOBenchmark --show-stderr -v
+
+    # Or compare two branches directly:
+    asv continuous main xenium-labels-dask -b IOBenchmark --show-stderr -v
+
+    # View comparison:
+    asv compare main xenium-labels-dask
+
+Results:
+    - Console output shows timing and memory after each run
+    - JSON results saved to: .asv/results/
+    - Generate HTML report: asv publish && asv preview
+"""
+
+import inspect
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from spatialdata import SpatialData
+
+from spatialdata_io import xenium
+
+# =============================================================================
+# CONFIGURATION - Edit these paths to match your setup
+# =============================================================================
+SANDBOX_DIR = Path(__file__).parent.parent.parent / "spatialdata-sandbox"
+DATASET = "xenium_2.0.0_io"
+# =============================================================================
+
+
+def get_paths() -> tuple[Path, Path]:
+    """Get paths for benchmark data."""
+    path = SANDBOX_DIR / DATASET
+    path_read = path / "data"
+    path_write = path / "data_benchmark.zarr"
+
+    if not path_read.exists():
+        raise ValueError(f"Data directory not found: {path_read}")
+
+    return path_read, path_write
+
+
+class IOBenchmark:
+    """Benchmark IO read operations."""
+
+    timeout = 3600
+    repeat = 3
+    number = 1
+    warmup_time = 0
+    processes = 1
+
+    def setup(self) -> None:
+        """Set up paths for benchmarking."""
+        self.path_read, self.path_write = get_paths()
+        if self.path_write.exists():
+            shutil.rmtree(self.path_write)
+
+    def _read_xenium(self) -> SpatialData:
+        """Read xenium data with version-compatible kwargs."""
+        signature = inspect.signature(xenium)
+        kwargs = {}
+        if "cleanup_labels_zarr_tmpdir" in signature.parameters:
+            kwargs["cleanup_labels_zarr_tmpdir"] = False
+
+        return xenium(
+            path=str(self.path_read),
+            n_jobs=8,
+            cell_boundaries=True,
+            nucleus_boundaries=True,
+            morphology_focus=True,
+            cells_as_circles=True,
+            **kwargs,
+        )
+
+    def time_io(self) -> None:
+        """Walltime for data parsing."""
+        sdata = self._read_xenium()
+        sdata.write(self.path_write)
+
+    def peakmem_io(self) -> None:
+        """Peak memory for data parsing."""
+        sdata = self._read_xenium()
+        sdata.write(self.path_write)
+
+
+if __name__ == "__main__":
+    IOBenchmark().time_io()

From 54a24a07dae46ffb844c4b8415ab65b8aee158ab Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Thu, 8 Jan 2026 10:20:55 +0100
Subject: [PATCH 4/5] use zarr zipstores in xenium

---
 src/spatialdata_io/readers/xenium.py | 169 ++++++++++++---------------
 1 file changed, 73 insertions(+), 96 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index d9e32bd5..32455175 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -73,7 +73,6 @@ def xenium(
     cells_table: bool = True,
     n_jobs: int = 1,
     gex_only: bool = True,
-    cleanup_labels_zarr_tmpdir: bool = True,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
     image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
     labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -202,86 +201,62 @@ def xenium(
     else:
         table = None
 
-    tmpdir = tempfile.TemporaryDirectory()
-    if not cleanup_labels_zarr_tmpdir:
-        logging.info(
-            f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` "
-            f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). "
-            f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up "
-            f"temporary directories."
+    if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
+        cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
+        if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
+            warnings.warn(
+                'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
+                " table. This could be due to trying to read a new version that is not supported yet. Please "
+                "report this issue.",
+                UserWarning,
+                stacklevel=2,
+            )
+        table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
+        table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
+
+    polygons = {}
+    labels = {}
+    tables = {}
+    points = {}
+    images = {}
+
+    # From the public release notes here:
+    # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
+    # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
+    # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
+    # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
+    # labels.
+    if nucleus_labels:
+        labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
+            path,
+            XeniumKeys.CELLS_ZARR,
+            specs,
+            mask_index=0,
+            labels_name="nucleus_labels",
+            labels_models_kwargs=labels_models_kwargs,
         )
-    zip_file = path / XeniumKeys.CELLS_ZARR
-    with zipfile.ZipFile(zip_file, "r") as zip_ref:
-        zip_ref.extractall(tmpdir.name)
-    try:
-        cells_zarr = zarr.open(str(tmpdir.name), mode="r")
-        if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
-            cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr)
-            if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
+    if cells_labels:
+        labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
+            path,
+            XeniumKeys.CELLS_ZARR,
+            specs,
+            mask_index=1,
+            labels_name="cell_labels",
+            labels_models_kwargs=labels_models_kwargs,
+        )
+        if cell_labels_indices_mapping is not None and table is not None:
+            if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
                 warnings.warn(
-                    'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
-                    " table. This could be due to trying to read a new version that is not supported yet. Please "
-                    "report this issue.",
+                    "The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
+                    "cell labels data. This could be due to trying to read a new version that is not supported yet. "
+                    "Please report this issue.",
                     UserWarning,
                     stacklevel=2,
                 )
-            table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
-            table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
-
-        polygons = {}
-        labels = {}
-        tables = {}
-        points = {}
-        images = {}
-
-        # From the public release notes here:
-        # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
-        # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
-        # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
-        # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
-        # labels.
-
-        if nucleus_labels:
-            labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
-                cells_zarr,
-                cleanup_labels_zarr_tmpdir,
-                specs,
-                mask_index=0,
-                labels_name="nucleus_labels",
-                labels_models_kwargs=labels_models_kwargs,
-            )
-        if cells_labels:
-            labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
-                cells_zarr,
-                cleanup_labels_zarr_tmpdir,
-                specs,
-                mask_index=1,
-                labels_name="cell_labels",
-                labels_models_kwargs=labels_models_kwargs,
-            )
-            if cell_labels_indices_mapping is not None and table is not None:
-                if not pd.DataFrame.equals(
-                    cell_labels_indices_mapping["cell_id"],
-                    table.obs[str(XeniumKeys.CELL_ID)],
-                ):
-                    warnings.warn(
-                        "The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
-                        "cell labels data. This could be due to trying to read a new version that is not supported yet. "
-                        "Please report this issue.",
-                        UserWarning,
-                        stacklevel=2,
-                    )
-                else:
-                    table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
-                    if not cells_as_circles:
-                        table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
-    except Exception as e:
-        tmpdir.cleanup()
-        raise e
-
-    # we cleanup now if we don't have lazy data
-    if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir:
-        tmpdir.cleanup()
+            else:
+                table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
+                if not cells_as_circles:
+                    table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
 
     if nucleus_boundaries:
         polygons["nucleus_boundaries"] = _get_polygons(
@@ -480,8 +455,8 @@ def _poly(arr: ArrayLike) -> Polygon:
 
 
 def _get_labels_and_indices_mapping(
-    cells_zarr: zarr.Group,
-    cleanup_labels_zarr_tmpdir: bool,
+    path: Path,
+    file: str,
     specs: dict[str, Any],
     mask_index: int,
     labels_name: str,
@@ -490,17 +465,12 @@ def _get_labels_and_indices_mapping(
     if mask_index not in [0, 1]:
         raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
 
+    zip_file = path / XeniumKeys.CELLS_ZARR
+    store = zarr.storage.ZipStore(zip_file, read_only=True)
+    z = zarr.open(store, mode="r")
     # get the labels
-    if cleanup_labels_zarr_tmpdir:
-        masks = cells_zarr["masks"][f"{mask_index}"][...]
-    else:
-        masks = da.from_array(cells_zarr["masks"][f"{mask_index}"])
-    labels = Labels2DModel.parse(
-        masks,
-        dims=("y", "x"),
-        transformations={"global": Identity()},
-        **labels_models_kwargs,
-    )
+    masks = da.from_array(z["masks"][f"{mask_index}"])
+    labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs)
 
     # build the matching table
     version = _parse_version_of_xenium_analyzer(specs)
@@ -512,7 +482,7 @@ def _get_labels_and_indices_mapping(
         # supported in versions < 1.3.0
         return labels, None
 
-    cell_id, dataset_suffix = cells_zarr["cell_id"][...].T
+    cell_id, dataset_suffix = z["cell_id"][...].T
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
 
     # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
@@ -524,7 +494,7 @@ def _get_labels_and_indices_mapping(
         real_label_index = real_label_index[1:]
 
     if version < packaging.version.parse("2.0.0"):
-        expected_label_index = cells_zarr["seg_mask_value"][...]
+        expected_label_index = z["seg_mask_value"][...]
 
         if not np.array_equal(expected_label_index, real_label_index):
             raise ValueError(
@@ -533,7 +503,7 @@ def _get_labels_and_indices_mapping(
                 f"{expected_label_index}."
             )
     else:
-        labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...]
+        labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...]
         if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
             raise ValueError(
                 "The positional indices of the labels do not match the expected range. Please report this issue."
@@ -554,7 +524,9 @@ def _get_labels_and_indices_mapping(
 
 @inject_docs(xx=XeniumKeys)
 def _get_cells_metadata_table_from_zarr(
-    cells_zarr: zarr.Group,
+    path: Path,
+    file: str,
+    specs: dict[str, Any],
 ) -> AnnData:
     """Read cells metadata from ``{xx.CELLS_ZARR}``.
 
@@ -562,11 +534,16 @@ def _get_cells_metadata_table_from_zarr(
     nucleus_count for versions >= 2.0.0.
     """
     # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
-    x = cells_zarr["cell_summary"][...]
-    column_names = cells_zarr["cell_summary"].attrs["column_names"]
+    zip_file = path / XeniumKeys.CELLS_ZARR
+    store = zarr.storage.ZipStore(zip_file, read_only=True)
+
+    z = zarr.open(store, mode="r")
+    x = z["cell_summary"][...]
+    column_names = z["cell_summary"].attrs["column_names"]
     df = pd.DataFrame(x, columns=column_names)
-    cell_id_prefix = cells_zarr["cell_id"][:, 0]
-    dataset_suffix = cells_zarr["cell_id"][:, 1]
+    cell_id_prefix = z["cell_id"][:, 0]
+    dataset_suffix = z["cell_id"][:, 1]
+    store.close()
 
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
     df[XeniumKeys.CELL_ID] = cell_id_str

From ada3cd368327bf5d0d71dc9ad7e931010ff2de12 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Thu, 8 Jan 2026 14:59:22 +0100
Subject: [PATCH 5/5] fix pre-commit

---
 asv.conf.json              | 2 +-
 benchmarks/bench_xenium.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv.conf.json b/asv.conf.json
index d50ba6a5..516f0f1f 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -3,7 +3,7 @@
     "project": "spatialdata-io",
     "project_url": "https://github.com/scverse/spatialdata-io",
     "repo": ".",
-    "branches": ["main", "xenium-labels-dask"],
+    "branches": ["main", "xenium-labels-dask", "xenium-labels-dask-zipstore"],
     "dvcs": "git",
     "environment_type": "virtualenv",
     "pythons": ["3.12"],
diff --git a/benchmarks/bench_xenium.py b/benchmarks/bench_xenium.py
index d8319b04..685e5094 100644
--- a/benchmarks/bench_xenium.py
+++ b/benchmarks/bench_xenium.py
@@ -40,7 +40,7 @@
 
 from spatialdata import SpatialData
 
-from spatialdata_io import xenium
+from spatialdata_io import xenium  # type: ignore[attr-defined]
 
 # =============================================================================
 # CONFIGURATION - Edit these paths to match your setup