From 55ef92cc553816628a15ee9a46754fe81ace7fa8 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Mon, 21 Oct 2024 17:54:27 +0200 Subject: [PATCH 01/33] add cleaned macsima reader --- src/spatialdata_io/readers/_utils/_utils.py | 53 ++- src/spatialdata_io/readers/macsima.py | 384 ++++++++++++++++++++ tests/test_macsima.py | 111 ++++++ 3 files changed, 546 insertions(+), 2 deletions(-) create mode 100644 src/spatialdata_io/readers/macsima.py create mode 100644 tests/test_macsima.py diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 63f5bdf2..84cce922 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -8,6 +8,9 @@ import numpy as np from anndata import AnnData, read_text from h5py import File +from ome_types import from_tiff +from ome_types.model import Pixels, UnitsLength +from spatialdata._logging import logger from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 @@ -18,8 +21,8 @@ NDArrayA = NDArray[Any] except (ImportError, TypeError): - NDArray = np.ndarray # type: ignore[misc] - NDArrayA = np.ndarray # type: ignore[misc] + NDArray = np.ndarray # type: ignore[unused-ignore] + NDArrayA = np.ndarray # type: ignore[unused-ignore] def _read_counts( @@ -84,3 +87,49 @@ def _initialize_raster_models_kwargs( if "scale_factors" not in labels_models_kwargs: labels_models_kwargs["scale_factors"] = [2, 2, 2, 2] return image_models_kwargs, labels_models_kwargs + + +def calc_scale_factors(stack: Any, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: + """Calculate scale factors based on image size to get lowest resolution under min_size pixels.""" + # get lowest dimension, ignoring channels + lower_scale_limit = min(stack.shape[1:]) + scale_factor = default_scale_factor + scale_factors = [scale_factor] + lower_scale_limit /= scale_factor + while lower_scale_limit >= min_size: + # scale_factors are cumulative, so we don't need to do e.g. scale_factor *= 2 + scale_factors.append(scale_factor) + lower_scale_limit /= scale_factor + return scale_factors + + +def parse_channels(path: Path) -> list[str]: + """Parse channel names from an OME-TIFF file.""" + images = from_tiff(path).images + if len(images) > 1: + logger.warning("Found multiple images in OME-TIFF file. 
Only the first one will be used.") + channels = images[0].pixels.channels + logger.debug(channels) + names = [c.name for c in channels if c.name is not None] + return names + + +def parse_physical_size(path: Path | None = None, ome_pixels: Pixels | None = None) -> float: + """Parse physical size from OME-TIFF to micrometer.""" + pixels = ome_pixels or from_tiff(path).images[0].pixels + logger.debug(pixels) + if pixels.physical_size_x_unit != pixels.physical_size_y_unit: + logger.error("Physical units for x and y dimensions are not the same.") + raise NotImplementedError + if pixels.physical_size_x != pixels.physical_size_y: + logger.error("Physical sizes for x and y dimensions are the same.") + raise NotImplementedError + # convert to micrometer if needed + if pixels.physical_size_x_unit == UnitsLength.NANOMETER: + physical_size = pixels.physical_size_x / 1000 + elif pixels.physical_size_x_unit == UnitsLength.MICROMETER: + physical_size = pixels.physical_size_x + else: + logger.error(f"Physical unit not recognized: '{pixels.physical_size_x_unit}'.") + raise NotImplementedError + return float(physical_size) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py new file mode 100644 index 00000000..fd497f19 --- /dev/null +++ b/src/spatialdata_io/readers/macsima.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +import warnings +from collections.abc import Mapping +from pathlib import Path +from types import MappingProxyType +from typing import Any + +import anndata as ad +import dask.array as da +import pandas as pd +import spatialdata as sd +from dask_image.imread import imread +from spatialdata import SpatialData +from spatialdata._logging import logger + +from spatialdata_io._constants._enum import ModeEnum +from spatialdata_io.readers._utils._utils import ( + calc_scale_factors, + parse_channels, + parse_physical_size, +) + +__all__ = ["macsima"] + + +class MACSimaParsingStyle(ModeEnum): + """Different parsing styles for MACSima data.""" + + PROCESSED_SINGLE_FOLDER = "processed_single_folder" + PROCESSED_MULTIPLE_FOLDERS = "processed_multiple_folders" + RAW = "raw" + AUTO = "auto" + + +def macsima( + path: str | Path, + parsing_style: MACSimaParsingStyle | str = MACSimaParsingStyle.AUTO, + filter_folder_names: list[str] | None = None, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + subset: int | None = None, + c_subset: int | None = None, + max_chunk_size: int = 1024, + c_chunks_size: int = 1, + multiscale: bool = True, + transformations: bool = True, + scale_factors: list[int] | None = None, + default_scale_factor: int = 2, + nuclei_channel_name: str = "DAPI", + skip_rounds: list[int] | None = None, +) -> SpatialData: + """ + Read *MACSima* formatted dataset. + + This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from the image names. The channel names are parsed from the OME metadata. + + .. seealso:: + + - `MACSima output `_. + + Parameters + ---------- + path + Path to the directory containing the data. + parsing_style + Parsing style to use. If ``auto``, the parsing style is determined based on the contents of the path. + filter_folder_names + List of folder names to filter out when parsing multiple folders. + imread_kwargs + Keyword arguments passed to :func:`dask_image.imread.imread`. + subset + Subset the image to the first ``subset`` pixels in x and y dimensions. + c_subset + Subset the image to the first ``c_subset`` channels. 
+ max_chunk_size + Maximum chunk size for x and y dimensions. + c_chunks_size + Chunk size for c dimension. + multiscale + Whether to create a multiscale image. + transformations + Whether to add a transformation from pixels to microns to the image. + scale_factors + Scale factors to use for downsampling. If None, scale factors are calculated based on image size. + default_scale_factor + Default scale factor to use for downsampling. + nuclei_channel_name + Common string of the nuclei channel to separate nuclei from other channels. + skip_rounds + List of round numbers to skip when parsing the data. + + Returns + ------- + :class:`spatialdata.SpatialData` + """ + path = Path(path) + if not isinstance(parsing_style, MACSimaParsingStyle): + parsing_style = MACSimaParsingStyle(parsing_style) + + if parsing_style == MACSimaParsingStyle.AUTO: + assert path.is_dir(), f"Path {path} is not a directory." + + if any(p.suffix in [".tif", ".tiff"] for p in path.iterdir()): + # if path contains tifs, do parse_processed_folder on path + parsing_style = MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER + elif all(p.is_dir() for p in path.iterdir() if not p.name.startswith(".")): + # if path contains only folders or hidden files, do parse_processed_folder on each folder + parsing_style = MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS + else: + raise ValueError(f"Cannot determine parsing style for path {path}. Please specify the parsing style.") + + if parsing_style == MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER: + return parse_processed_folder( + path, + imread_kwargs, + subset, + c_subset, + max_chunk_size, + c_chunks_size, + multiscale, + transformations, + scale_factors, + default_scale_factor, + nuclei_channel_name, + skip_rounds, + ) + if parsing_style == MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS: + sdatas = {} + # iterate over all non-filtered folders in path and parse each folder + for p in [ + p + for p in path.iterdir() + if p.is_dir() and (not filter_folder_names or not any(f in p.name for f in filter_folder_names)) + ]: + sdatas[p.stem] = parse_processed_folder( + p, + imread_kwargs, + subset, + c_subset, + max_chunk_size, + c_chunks_size, + multiscale, + transformations, + scale_factors, + default_scale_factor, + nuclei_channel_name, + skip_rounds, + ) + return sd.concatenate(list(sdatas.values())) + if parsing_style == MACSimaParsingStyle.RAW: + # TODO: see https://github.com/scverse/spatialdata-io/issues/155 + raise NotImplementedError("Parsing raw MACSima data is not yet implemented.") + + +def parse_name_to_cycle(name: str) -> int: + """Parse the cycle number from the name of the image.""" + cycle = name.split("_")[0] + if "-" in cycle: + cycle = cycle.split("-")[1] + return int(cycle) + + +def parse_processed_folder( + path: Path, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + subset: int | None = None, + c_subset: int | None = None, + max_chunk_size: int = 1024, + c_chunks_size: int = 1, + multiscale: bool = True, + transformations: bool = True, + scale_factors: list[int] | None = None, + default_scale_factor: int = 2, + nuclei_channel_name: str = "DAPI", + skip_rounds: list[int] | None = None, + file_pattern: str = "*.tif*", +) -> SpatialData: + """Parse a single folder containing images from a cyclical imaging platform.""" + # get list of image paths, get channel name from OME data and cycle round number from filename + # look for OME-TIFF files + path_files = list(path.glob(file_pattern)) + logger.debug(path_files[0]) + # make sure not to remove round 0 when parsing! 
+ cycles = [] + channels = [] + for p in path_files: + cycle = parse_name_to_cycle(p.stem) + cycles.append(cycle) + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + channel_names = parse_channels(p) + if len(channel_names) > 1: + logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") + channels.append(channel_names[0]) + except ValueError as e: + logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + stack, sorted_cycles, sorted_channels = get_stack( + path_files, + cycles, + channels, + imread_kwargs, + skip_rounds=skip_rounds, + ) + + sorted_cycle_channels: list[str] = [] + for c, ch in zip(sorted_cycles, sorted_channels): + sorted_cycle_channels.append(f"R{str(c)} {ch}") + + # do subsetting if needed + if subset: + stack = stack[:, :subset, :subset] + if c_subset: + stack = stack[:c_subset, :, :] + sorted_cycle_channels = sorted_cycle_channels[:c_subset] + if multiscale and not scale_factors: + scale_factors = calc_scale_factors(stack, default_scale_factor=default_scale_factor) + if not multiscale: + scale_factors = None + logger.debug(f"Scale factors: {scale_factors}") + + filtered_name = path.stem.replace(" ", "_") + + return create_sdata( + stack, + sorted_cycle_channels, + path_files, + max_chunk_size, + c_chunks_size, + transformations, + scale_factors, + nuclei_channel_name, + filtered_name, + ) + + +def create_sdata( + stack: da.Array, + sorted_cycle_channels: list[str], + path_files: list[Path], + max_chunk_size: int, + c_chunks_size: int, + transformations: bool, + scale_factors: list[int] | None, + nuclei_channel_name: str, + filtered_name: str, +) -> SpatialData: + # for stack and sorted_cycle_channels, if channel name is nuclei_channel_name, add to seperate nuclei stack + # keep the first nuclei channel in both the stack and the nuclei stack + nuclei_sorted_cycle_channels = [] + nuclei_idx = [] + for i, c in enumerate(sorted_cycle_channels): + if nuclei_channel_name in c: + nuclei_sorted_cycle_channels.append(c) + nuclei_idx.append(i) + if len(nuclei_idx) > 2: + has_nuclei_stack = True + # More than two nuclei channels found, keep only the first and last one in the stack + nuclei_stack = stack[nuclei_idx] + nuclei_idx_without_first_and_last = nuclei_idx[1:-1] + stack = stack[[i for i in range(len(sorted_cycle_channels)) if i not in nuclei_idx_without_first_and_last]] + sorted_cycle_channels = [ + c for i, c in enumerate(sorted_cycle_channels) if i not in nuclei_idx_without_first_and_last + ] + else: + has_nuclei_stack = False + # Only one or two nuclei channels found, keep all in the stack + pixels_to_microns = parse_physical_size(path_files[0]) + image_element = create_image_element( + stack, + sorted_cycle_channels, + max_chunk_size, + c_chunks_size, + transformations, + pixels_to_microns, + scale_factors, + name=filtered_name, + ) + if has_nuclei_stack: + nuclei_image_element = create_image_element( + nuclei_stack, + nuclei_sorted_cycle_channels, + max_chunk_size, + c_chunks_size, + transformations, + pixels_to_microns, + scale_factors, + name=filtered_name, + ) + table_nuclei = create_table(nuclei_sorted_cycle_channels) + table_channels = create_table(sorted_cycle_channels) + + sdata = sd.SpatialData( + images={ + f"{filtered_name}_image": image_element, + }, + tables={ + f"{filtered_name}_table": table_channels, + }, + ) + if has_nuclei_stack: + sdata.images[f"{filtered_name}_nuclei_image"] = nuclei_image_element + sdata.tables[f"{filtered_name}_nuclei_table"] = 
table_nuclei + + return sdata + + +def create_table(sorted_cycle_channels: list[str]) -> ad.AnnData: + df = pd.DataFrame( + { + "name": sorted_cycle_channels, + "cycle": [int(c.split(" ")[0][1:]) for c in sorted_cycle_channels], + } + ) + table = ad.AnnData(var=df) + table.var_names = sorted_cycle_channels + return sd.models.TableModel.parse(table) + + +def create_image_element( + stack: da.Array, + sorted_channels: list[str], + max_chunk_size: int, + c_chunks_size: int, + transformations: bool, + pixels_to_microns: float, + scale_factors: list[int] | None, + name: str | None = None, +) -> sd.models.Image2DModel: + t_dict = None + if transformations: + t_pixels_to_microns = sd.transformations.Scale([pixels_to_microns, pixels_to_microns], axes=("x", "y")) + # 'microns' is also used in merscope example + # no inverse needed as the transformation is already from pixels to microns + t_dict = {name: t_pixels_to_microns} + # # chunk_size can be 1 for channels + chunks = { + "x": max_chunk_size, + "y": max_chunk_size, + "c": c_chunks_size, + } + if t_dict: + logger.debug("Adding transformation: %s", t_dict) + el = sd.models.Image2DModel.parse( + stack, + # TODO: make sure y and x locations are correct + dims=["c", "y", "x"], + scale_factors=scale_factors, + chunks=chunks, + c_coords=sorted_channels, + transformations=t_dict, + ) + return el + + +def get_stack( + path_files: list[Path], + cycles: list[int], + channels: list[str], + imread_kwargs: Mapping[str, Any], + skip_rounds: list[int] | None = None, +) -> tuple[da.Array, list[int], list[str]]: + if len(path_files) != len(cycles) or len(path_files) != len(channels): + raise ValueError("Length of path_files, cycles and channels must be the same.") + # if any of round_channels is in skip_rounds, remove that round from the list and from path_files + if skip_rounds: + logger.info("Skipping cycles: %d", skip_rounds) + path_files, cycles, channels = map( + list, + zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + ) + imgs = [imread(img, **imread_kwargs) for img in path_files] + for img, path in zip(imgs, path_files): + if img.shape[1:] != imgs[0].shape[1:]: + raise ValueError( + f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + ) + # sort imgs, cycles and channels based on cycles + imgs, cycles, channels = map( + list, + zip(*sorted(zip(imgs, cycles, channels), key=lambda x: (x[1]))), + ) + stack = da.stack(imgs).squeeze() + return stack, cycles, channels diff --git a/tests/test_macsima.py b/tests/test_macsima.py new file mode 100644 index 00000000..759867b0 --- /dev/null +++ b/tests/test_macsima.py @@ -0,0 +1,111 @@ +import math +import sys +from pathlib import Path +from typing import Any + +import pytest +from spatialdata.models import get_channels + +from spatialdata_io.readers.macsima import macsima + +if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): + # The datasets should be downloaded and placed unzipped in the "data" directory; + # MACSimaData_HCA/ with e.g. 
unzipped HumanLiverH35/ inside: https://livercellatlas.org/download.php + # Lung_adc_demo/: Ask Miltenyi Biotec for the demo dataset + pytest.skip( + "requires the Lung_adc_demo or MACSimaData_HCA datasets", + allow_module_level=True, + ) + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", {"y": (0, 15460), "x": (0, 13864)}), + ("MACSimaData_HCA/HumanLiverH35", {"y": (0, 1154), "x": (0, 1396)}), + ], +) +def test_image_size(dataset: str, expected: dict[str, Any]) -> None: + from spatialdata import get_extent + + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + el = sdata[list(sdata.images.keys())[0]] + cs = sdata.coordinate_systems[0] + + extent: dict[str, tuple[float, float]] = get_extent(el, coordinate_system=cs) + extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} + assert extent == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [("Lung_adc_demo", 116), ("MACSimaData_HCA/HumanLiverH35", 102)], +) +def test_total_channels(dataset: str, expected: int) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + el = sdata[list(sdata.images.keys())[0]] + + # get the number of channels + channels: int = len(get_channels(el)) + assert channels == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", ["R0 DAPI", "R1 CD68", "R1 CD163"]), + ("MACSimaData_HCA/HumanLiverH35", ["R0 DAPI", "R1 PE", "R1 DAPI"]), + ], +) +def test_channel_names(dataset: str, expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, c_subset=3) + el = sdata[list(sdata.images.keys())[0]] + + # get the channel names + channels = get_channels(el) + assert list(channels) == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", [0, 1, 1]), + ("MACSimaData_HCA/HumanLiverH35", [0, 1, 1]), + ], +) +def test_cycle_metadata(dataset: str, expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, c_subset=3) + table = sdata[list(sdata.tables.keys())[0]] + + # get the channel names + cycles = table.var["cycle"] + assert list(cycles) == expected + + +def test_parsing_style() -> None: + with pytest.raises(ValueError): + macsima(Path("."), parsing_style="not_a_parsing_style") + + +# @pytest.mark.parametrize( +# "name,expected", +# [ +# ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), +# ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), +# ], +# ) +# def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: +# result = parse_name_to_cycle(name) +# assert result == expected From e001548699c38435dd9f52b61f0ce982bc1dc935 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 22 Oct 2024 09:38:18 +0200 Subject: [PATCH 02/33] add ome-types --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6b12efe6..e522bd8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "pyarrow", "readfcs", "tifffile>=2023.8.12", + "ome-types", ] [project.optional-dependencies] From 
47d651df62c65c6540e16d85c05c60068fcec81a Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 22 Oct 2024 11:41:05 +0200 Subject: [PATCH 03/33] refactor to MultiChannelImage, add parameter include_cycle_in_channel_name --- src/spatialdata_io/readers/_utils/_utils.py | 5 +- src/spatialdata_io/readers/macsima.py | 317 +++++++++++++------- tests/test_macsima.py | 24 +- 3 files changed, 220 insertions(+), 126 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 752d68c4..5aedf5d4 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -89,11 +89,10 @@ def _initialize_raster_models_kwargs( return image_models_kwargs, labels_models_kwargs -def calc_scale_factors(stack: Any, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: +def calc_scale_factors(lower_scale_limit: float, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: """Calculate scale factors based on image size to get lowest resolution under min_size pixels.""" # get lowest dimension, ignoring channels - lower_scale_limit = min(stack.shape[1:]) - scale_factor = default_scale_factor + scale_factor: int = default_scale_factor scale_factors = [scale_factor] lower_scale_limit /= scale_factor while lower_scale_limit >= min_size: diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index fd497f19..2916d79b 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -1,7 +1,9 @@ from __future__ import annotations import warnings +from collections import defaultdict from collections.abc import Mapping +from dataclasses import dataclass from pathlib import Path from types import MappingProxyType from typing import Any @@ -33,6 +35,139 @@ class MACSimaParsingStyle(ModeEnum): AUTO = "auto" +@dataclass +class ChannelMetadata: + """Metadata for a channel in a multichannel dataset.""" + + name: str + cycle: int + + +@dataclass +class MultiChannelImage: + """Multichannel image with metadata.""" + + data: list[da.Array] + metadata: list[ChannelMetadata] + include_cycle_in_channel_name: bool = False + + @classmethod + def from_paths( + cls, + path_files: list[Path], + imread_kwargs: Mapping[str, Any], + skip_rounds: list[int] | None = None, + ) -> MultiChannelImage: + cycles = [] + channels = [] + for p in path_files: + cycle = parse_name_to_cycle(p.stem) + cycles.append(cycle) + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + channel_names = parse_channels(p) + if len(channel_names) > 1: + logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") + channels.append(channel_names[0]) + except ValueError as e: + logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + + if len(path_files) != len(cycles) or len(path_files) != len(channels): + raise ValueError("Length of path_files, cycles and channels must be the same.") + # if any of round_channels is in skip_rounds, remove that round from the list and from path_files + if skip_rounds: + logger.info("Skipping cycles: %d", skip_rounds) + path_files, cycles, channels = map( + list, + zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + ) + imgs = [imread(img, **imread_kwargs) for img in path_files] + for img, path in zip(imgs, path_files): + if img.shape[1:] != imgs[0].shape[1:]: + raise ValueError( + f"Images are not all the same size. 
Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + ) + # sort imgs, cycles and channels based on cycles + output = cls( + data=imgs, + metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], + ) + return output + + @classmethod + def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: + """Create new MultiChannelImage with only the channels that contain c_name.""" + indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] + return MultiChannelImage.filter_by_index(mci, indices) + + @classmethod + def filter_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: + """Filter the image by index.""" + metadata = [c for i, c in enumerate(mci.metadata) if i in indices] + data = [d for i, d in enumerate(mci.data) if i in indices] + return cls( + data=data, + metadata=metadata, + include_cycle_in_channel_name=mci.include_cycle_in_channel_name, + ) + + def get_channel_names(self) -> list[str]: + """Get the channel names.""" + if self.include_cycle_in_channel_name: + return [f"R{c.cycle} {c.name}" for c in self.metadata] + else: + # if name is duplicated, add (i) to the name + names = [c.name for c in self.metadata] + name_dict: dict[str, int] = defaultdict(int) + name_counter: dict[str, int] = defaultdict(int) + for name in names: + name_dict[name] += 1 + output = [] + for name in names: + name_counter[name] += 1 + output.append(f"{name} ({name_counter[name]})" if name_dict[name] > 1 else name) + return output + + def get_cycles(self) -> list[int]: + """Get the cycle numbers.""" + return [c.cycle for c in self.metadata] + + def sort_by_channel(self) -> None: + """Sort the channels by cycle number.""" + self.metadata = sorted(self.metadata, key=lambda x: x.cycle) + self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] + + def subset(self, subset: int | None = None) -> MultiChannelImage: + """Subset the image.""" + if subset: + self.data = [d[:, :subset, :subset] for d in self.data] + return self + + def subset_channels(self, c_subset: int) -> MultiChannelImage: + """Subset the channels.""" + self.data = self.data[:c_subset] + self.metadata = self.metadata[:c_subset] + return self + + def subset_by_idx(self, indices: list[int]) -> MultiChannelImage: + """Subset the image by index.""" + data = [d for i, d in enumerate(self.data) if i in indices] + metadata = [c for i, c in enumerate(self.metadata) if i in indices] + return MultiChannelImage( + data=data, + metadata=metadata, + include_cycle_in_channel_name=self.include_cycle_in_channel_name, + ) + + def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: + lower_scale_limit = min(self.data[0].shape[1:]) + return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) + + def get_stack(self) -> da.Array: + return da.stack(self.data).squeeze() + + def macsima( path: str | Path, parsing_style: MACSimaParsingStyle | str = MACSimaParsingStyle.AUTO, @@ -47,7 +182,9 @@ def macsima( scale_factors: list[int] | None = None, default_scale_factor: int = 2, nuclei_channel_name: str = "DAPI", + split_threshold_nuclei_channel: int | None = 2, skip_rounds: list[int] | None = None, + include_cycle_in_channel_name: bool = False, ) -> SpatialData: """ Read *MACSima* formatted dataset. @@ -86,8 +223,12 @@ def macsima( Default scale factor to use for downsampling. 
nuclei_channel_name Common string of the nuclei channel to separate nuclei from other channels. + split_threshold_nuclei_channel + Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds List of round numbers to skip when parsing the data. + include_cycle_in_channel_name + Whether to include the cycle number in the channel name. Returns ------- @@ -121,8 +262,10 @@ def macsima( transformations, scale_factors, default_scale_factor, - nuclei_channel_name, - skip_rounds, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + skip_rounds=skip_rounds, + include_cycle_in_channel_name=include_cycle_in_channel_name, ) if parsing_style == MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS: sdatas = {} @@ -143,8 +286,10 @@ def macsima( transformations, scale_factors, default_scale_factor, - nuclei_channel_name, - skip_rounds, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + skip_rounds=skip_rounds, + include_cycle_in_channel_name=include_cycle_in_channel_name, ) return sd.concatenate(list(sdatas.values())) if parsing_style == MACSimaParsingStyle.RAW: @@ -172,49 +317,33 @@ def parse_processed_folder( scale_factors: list[int] | None = None, default_scale_factor: int = 2, nuclei_channel_name: str = "DAPI", + split_threshold_nuclei_channel: int | None = 2, skip_rounds: list[int] | None = None, file_pattern: str = "*.tif*", + include_cycle_in_channel_name: bool = False, ) -> SpatialData: """Parse a single folder containing images from a cyclical imaging platform.""" # get list of image paths, get channel name from OME data and cycle round number from filename # look for OME-TIFF files path_files = list(path.glob(file_pattern)) logger.debug(path_files[0]) - # make sure not to remove round 0 when parsing! - cycles = [] - channels = [] - for p in path_files: - cycle = parse_name_to_cycle(p.stem) - cycles.append(cycle) - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - channel_names = parse_channels(p) - if len(channel_names) > 1: - logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") - channels.append(channel_names[0]) - except ValueError as e: - logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. 
Skipping this file.") - stack, sorted_cycles, sorted_channels = get_stack( + + mci = MultiChannelImage.from_paths( path_files, - cycles, - channels, imread_kwargs, - skip_rounds=skip_rounds, + skip_rounds, ) + mci.include_cycle_in_channel_name = include_cycle_in_channel_name - sorted_cycle_channels: list[str] = [] - for c, ch in zip(sorted_cycles, sorted_channels): - sorted_cycle_channels.append(f"R{str(c)} {ch}") + mci.sort_by_channel() # do subsetting if needed if subset: - stack = stack[:, :subset, :subset] + mci = mci.subset(subset) if c_subset: - stack = stack[:c_subset, :, :] - sorted_cycle_channels = sorted_cycle_channels[:c_subset] + mci = mci.subset_channels(c_subset) if multiscale and not scale_factors: - scale_factors = calc_scale_factors(stack, default_scale_factor=default_scale_factor) + scale_factors = mci.calc_scale_factors(default_scale_factor=default_scale_factor) if not multiscale: scale_factors = None logger.debug(f"Scale factors: {scale_factors}") @@ -222,73 +351,68 @@ def parse_processed_folder( filtered_name = path.stem.replace(" ", "_") return create_sdata( - stack, - sorted_cycle_channels, - path_files, - max_chunk_size, - c_chunks_size, - transformations, - scale_factors, - nuclei_channel_name, - filtered_name, + mci=mci, + path_files=path_files, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + transformations=transformations, + scale_factors=scale_factors, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + filtered_name=filtered_name, ) def create_sdata( - stack: da.Array, - sorted_cycle_channels: list[str], + mci: MultiChannelImage, path_files: list[Path], max_chunk_size: int, c_chunks_size: int, transformations: bool, scale_factors: list[int] | None, nuclei_channel_name: str, + split_threshold_nuclei_channel: int | None, filtered_name: str, ) -> SpatialData: - # for stack and sorted_cycle_channels, if channel name is nuclei_channel_name, add to seperate nuclei stack - # keep the first nuclei channel in both the stack and the nuclei stack - nuclei_sorted_cycle_channels = [] - nuclei_idx = [] - for i, c in enumerate(sorted_cycle_channels): - if nuclei_channel_name in c: - nuclei_sorted_cycle_channels.append(c) - nuclei_idx.append(i) - if len(nuclei_idx) > 2: - has_nuclei_stack = True - # More than two nuclei channels found, keep only the first and last one in the stack - nuclei_stack = stack[nuclei_idx] - nuclei_idx_without_first_and_last = nuclei_idx[1:-1] - stack = stack[[i for i in range(len(sorted_cycle_channels)) if i not in nuclei_idx_without_first_and_last]] - sorted_cycle_channels = [ - c for i, c in enumerate(sorted_cycle_channels) if i not in nuclei_idx_without_first_and_last - ] + nuclei_idx = [i for i, c in enumerate(mci.get_channel_names()) if nuclei_channel_name in c] + n_nuclei_channels = len(nuclei_idx) + if not split_threshold_nuclei_channel: + # if split_threshold_nuclei_channel is None, do not split nuclei channels + split_nuclei = False else: - has_nuclei_stack = False - # Only one or two nuclei channels found, keep all in the stack + split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel + if split_nuclei: + # if channel name is nuclei_channel_name, add to seperate nuclei stack + nuclei_mci = mci.subset_by_idx(nuclei_idx) + # keep the first nuclei channel in both the stack and the nuclei stack + nuclei_idx_without_first_and_last = nuclei_idx[1:-1] + mci = MultiChannelImage.filter_by_index( + mci, + [i for i in range(len(mci.metadata)) if i not in 
nuclei_idx_without_first_and_last], + ) + pixels_to_microns = parse_physical_size(path_files[0]) image_element = create_image_element( - stack, - sorted_cycle_channels, + mci, max_chunk_size, c_chunks_size, transformations, pixels_to_microns, scale_factors, - name=filtered_name, + coordinate_system=filtered_name, ) - if has_nuclei_stack: + if split_nuclei: nuclei_image_element = create_image_element( - nuclei_stack, - nuclei_sorted_cycle_channels, + nuclei_mci, max_chunk_size, c_chunks_size, transformations, pixels_to_microns, scale_factors, - name=filtered_name, + coordinate_system=filtered_name, ) - table_nuclei = create_table(nuclei_sorted_cycle_channels) - table_channels = create_table(sorted_cycle_channels) + table_nuclei = create_table(nuclei_mci) + table_channels = create_table(mci) sdata = sd.SpatialData( images={ @@ -298,41 +422,43 @@ def create_sdata( f"{filtered_name}_table": table_channels, }, ) - if has_nuclei_stack: + + if split_nuclei: sdata.images[f"{filtered_name}_nuclei_image"] = nuclei_image_element sdata.tables[f"{filtered_name}_nuclei_table"] = table_nuclei return sdata -def create_table(sorted_cycle_channels: list[str]) -> ad.AnnData: +def create_table(mci: MultiChannelImage) -> ad.AnnData: + cycles = mci.get_cycles() + names = mci.get_channel_names() df = pd.DataFrame( { - "name": sorted_cycle_channels, - "cycle": [int(c.split(" ")[0][1:]) for c in sorted_cycle_channels], + "name": names, + "cycle": cycles, } ) table = ad.AnnData(var=df) - table.var_names = sorted_cycle_channels + table.var_names = names return sd.models.TableModel.parse(table) def create_image_element( - stack: da.Array, - sorted_channels: list[str], + mci: MultiChannelImage, max_chunk_size: int, c_chunks_size: int, transformations: bool, pixels_to_microns: float, scale_factors: list[int] | None, - name: str | None = None, + coordinate_system: str | None = None, ) -> sd.models.Image2DModel: t_dict = None if transformations: t_pixels_to_microns = sd.transformations.Scale([pixels_to_microns, pixels_to_microns], axes=("x", "y")) # 'microns' is also used in merscope example # no inverse needed as the transformation is already from pixels to microns - t_dict = {name: t_pixels_to_microns} + t_dict = {coordinate_system: t_pixels_to_microns} # # chunk_size can be 1 for channels chunks = { "x": max_chunk_size, @@ -342,43 +468,12 @@ def create_image_element( if t_dict: logger.debug("Adding transformation: %s", t_dict) el = sd.models.Image2DModel.parse( - stack, + mci.get_stack(), # TODO: make sure y and x locations are correct dims=["c", "y", "x"], scale_factors=scale_factors, chunks=chunks, - c_coords=sorted_channels, + c_coords=mci.get_channel_names(), transformations=t_dict, ) return el - - -def get_stack( - path_files: list[Path], - cycles: list[int], - channels: list[str], - imread_kwargs: Mapping[str, Any], - skip_rounds: list[int] | None = None, -) -> tuple[da.Array, list[int], list[str]]: - if len(path_files) != len(cycles) or len(path_files) != len(channels): - raise ValueError("Length of path_files, cycles and channels must be the same.") - # if any of round_channels is in skip_rounds, remove that round from the list and from path_files - if skip_rounds: - logger.info("Skipping cycles: %d", skip_rounds) - path_files, cycles, channels = map( - list, - zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), - ) - imgs = [imread(img, **imread_kwargs) for img in path_files] - for img, path in zip(imgs, path_files): - if img.shape[1:] != imgs[0].shape[1:]: - raise 
ValueError( - f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" - ) - # sort imgs, cycles and channels based on cycles - imgs, cycles, channels = map( - list, - zip(*sorted(zip(imgs, cycles, channels), key=lambda x: (x[1]))), - ) - stack = da.stack(imgs).squeeze() - return stack, cycles, channels diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 759867b0..886003b5 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -6,7 +6,7 @@ import pytest from spatialdata.models import get_channels -from spatialdata_io.readers.macsima import macsima +from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): # The datasets should be downloaded and placed unzipped in the "data" directory; @@ -67,7 +67,7 @@ def test_total_channels(dataset: str, expected: int) -> None: def test_channel_names(dataset: str, expected: list[str]) -> None: f = Path("./data") / dataset assert f.is_dir() - sdata = macsima(f, c_subset=3) + sdata = macsima(f, c_subset=3, include_cycle_in_channel_name=True) el = sdata[list(sdata.images.keys())[0]] # get the channel names @@ -99,13 +99,13 @@ def test_parsing_style() -> None: macsima(Path("."), parsing_style="not_a_parsing_style") -# @pytest.mark.parametrize( -# "name,expected", -# [ -# ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), -# ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), -# ], -# ) -# def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: -# result = parse_name_to_cycle(name) -# assert result == expected +@pytest.mark.parametrize( + "name,expected", + [ + ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), + ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), + ], +) +def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: + result = parse_name_to_cycle(name) + assert result == expected From 601a1223d7255e754b1f90046b08d2d1d8d0df75 Mon Sep 17 00:00:00 2001 From: LLehner Date: Thu, 24 Oct 2024 16:10:13 +0200 Subject: [PATCH 04/33] update seqfish reader --- src/spatialdata_io/_constants/_constants.py | 11 +- src/spatialdata_io/readers/_utils/_utils.py | 9 +- src/spatialdata_io/readers/seqfish.py | 136 +++++++++++++------- 3 files changed, 99 insertions(+), 57 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 34848137..57f50db0 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -66,14 +66,15 @@ class SeqfishKeys(ModeEnum): # file extensions CSV_FILE = ".csv" TIFF_FILE = ".tiff" - OME_TIFF_FILE = ".ome.tiff" + GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "section" - TRANSCRIPT_COORDINATES = "TranscriptCoordinates" + SECTION = "_section" + TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" - COUNTS_FILE = "CxG" - CELL_MASK_FILE = "CellMask" + COUNTS_FILE = "CellxGene" + SEGMENTATION = "Segmentation" CELL_COORDINATES = "CellCoordinates" + BOUNDARIES = "Boundaries" # transcripts TRANSCRIPTS_X = "x" TRANSCRIPTS_Y = "y" diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 4623fced..8c9a412a 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -3,7 +3,7 @@ import os from collections.abc import Mapping from pathlib 
import Path -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np from anndata import AnnData, read_text @@ -13,15 +13,14 @@ PathLike = Union[os.PathLike, str] # type:ignore[type-arg] -try: +if TYPE_CHECKING: from numpy.typing import NDArray - NDArrayA = NDArray[Any] -except (ImportError, TypeError): - NDArray = np.ndarray +else: NDArrayA = np.ndarray + def _read_counts( path: str | Path, counts_file: str, diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index b82b2eef..e1f97f42 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any +from typing import Any, Literal, Optional, Union import anndata as ad import numpy as np @@ -19,7 +19,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations import Identity +from spatialdata.transformations.transformations import Identity from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -33,7 +33,9 @@ def seqfish( load_images: bool = True, load_labels: bool = True, load_points: bool = True, - sections: list[int] | None = None, + load_shapes: bool = True, + load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, + sections: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,8 +45,8 @@ def seqfish( - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.OME_TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.CELL_MASK_FILE!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -58,11 +60,16 @@ def seqfish( load_images Whether to load the images. load_labels - Whether to load the labels. + Whether to load cell segmentation. load_points - Whether to load the points. + Whether to load the transcript locations. + load_shapes + Whether to load cells as shape. + load_additional_shapes + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. sections - Which sections (specified as integers) to load. By default, all sections are loaded. + Which sections (specified as integers) to load. Only necessary if multiple sections present. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. 
@@ -71,7 +78,7 @@ def seqfish( :class:`spatialdata.SpatialData` """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?)_{SK.CELL_COORDINATES}_{SK.SECTION}[0-9]+" + re.escape(SK.CSV_FILE)) + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] if not count_files: # no file matching tbe pattern found @@ -85,52 +92,56 @@ def seqfish( n = len(count_files) all_sections = list(range(1, n + 1)) - if sections is None: - sections = all_sections - else: + sections_str: list[Optional[str]] = [] + if sections is not None: for section in sections: if section not in all_sections: raise ValueError(f"Section {section} not found in the data.") - sections_str = [f"{SK.SECTION}{x}" for x in sections] + else: + sections_str.append(f"{SK.SECTION}{section}") + elif sections is None: + sections_str.append(None) + else: + raise ValueError("Invalid type for 'section'. Must be list[int] or None.") - def get_cell_file(section: str) -> str: - return f"{prefix}_{SK.CELL_COORDINATES}_{section}{SK.CSV_FILE}" + def get_cell_file(section: str | None) -> str: + return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" - def get_count_file(section: str) -> str: - return f"{prefix}_{SK.COUNTS_FILE}_{section}{SK.CSV_FILE}" + def get_count_file(section: str | None) -> str: + return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" - def get_dapi_file(section: str) -> str: - return f"{prefix}_{SK.DAPI}_{section}{SK.OME_TIFF_FILE}" + def get_dapi_file(section: str | None) -> str: + return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str) -> str: - return f"{prefix}_{SK.CELL_MASK_FILE}_{section}{SK.TIFF_FILE}" + def get_cell_mask_file(section: str | None) -> str: + return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" - def get_transcript_file(section: str) -> str: - return f"{prefix}_{SK.TRANSCRIPT_COORDINATES}_{section}{SK.CSV_FILE}" + def get_transcript_file(section: str | None) -> str: + return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" - adatas: dict[str, ad.AnnData] = {} - for section in sections_str: # type: ignore[assignment] - assert isinstance(section, str) - cell_file = get_cell_file(section) - count_matrix = get_count_file(section) + adatas: dict[Optional[str], ad.AnnData] = {} + for section_str in sections_str: + assert isinstance(section_str, str) or section_str is None + cell_file = get_cell_file(section_str) + count_matrix = get_count_file(section_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section}" + region = f"cells_{section_str}" adata.obs[SK.REGION_KEY] = region adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section] = adata + adatas[section_str] = adata scale_factors = [2, 2, 2, 2] if load_images: images = { - f"image_{x}": Image2DModel.parse( + f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -139,11 +150,11 @@ def get_transcript_file(section: str) -> str: if load_labels: labels = { - f"labels_{x}": 
Labels2DModel.parse( + f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -152,12 +163,12 @@ def get_transcript_file(section: str) -> str: if load_points: points = { - f"transcripts_{x}": PointsModel.parse( + f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=SK.INSTANCE_KEY_POINTS.value, - transformations={x: Identity()}, + # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + transformations={"global": Identity()}, ) for x in sections_str } @@ -174,16 +185,47 @@ def get_transcript_file(section: str) -> str: instance_key=SK.INSTANCE_KEY_TABLE.value, ) - shapes = { - f"cells_{x}": ShapesModel.parse( - adata.obsm[SK.SPATIAL_KEY], - geometry=0, - radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), - index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={x: Identity()}, - ) - for x, adata in adatas.items() - } + if load_shapes: + shapes = { + f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + adata.obsm[SK.SPATIAL_KEY], + geometry=0, + radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) + for x, adata in adatas.items() + } + else: + shapes = {} + + if load_additional_shapes is not None: + shape_file_names = [] + for filename in os.listdir(path): + if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): + if load_additional_shapes == "all": + if not any(key in filename for key in images.keys()) and not any( + key in filename for key in labels.keys() + ): + shape_file_names.append(filename) + elif load_additional_shapes == "segmentation": + if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): + shape_file_names.append(filename) + elif load_additional_shapes == "boundaries": + if SK.BOUNDARIES in filename: + shape_file_names.append(filename) + elif isinstance(load_additional_shapes, str): + if load_additional_shapes in filename: + shape_file_names.append(filename) + else: + raise ValueError(f"No file found with identifier {load_additional_shapes}") + + for x in range(len(shape_file_names)): + shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( + path / shape_file_names[x], + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) From 044ab9d168c2d755e44f4480e9f1f77b8d75a4d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 14:19:08 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/_utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 8c9a412a..f7f3abe5 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -15,12 +15,12 @@ if TYPE_CHECKING: from 
numpy.typing import NDArray + NDArrayA = NDArray[Any] else: NDArrayA = np.ndarray - def _read_counts( path: str | Path, counts_file: str, From 35232d07112b9baf75a5328dc7e62091c8cdd0a5 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 31 Oct 2024 16:49:57 +0100 Subject: [PATCH 06/33] detect full res image when path is relative --- src/spatialdata_io/readers/visium_hd.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py index aa443a37..c8215269 100644 --- a/src/spatialdata_io/readers/visium_hd.py +++ b/src/spatialdata_io/readers/visium_hd.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os import re import warnings from collections.abc import Mapping @@ -116,12 +115,12 @@ def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None) stacklevel=2, ) - def _get_bins(path: Path) -> list[str]: + def _get_bins(path_bins: Path) -> list[str]: return sorted( [ - bin_size - for bin_size in os.listdir(path) - if os.path.isdir(os.path.join(path, bin_size)) and bin_size.startswith(VisiumHDKeys.BIN_PREFIX) + bin_size.name + for bin_size in path_bins.iterdir() + if bin_size.is_dir() and bin_size.name.startswith(VisiumHDKeys.BIN_PREFIX) ] ) @@ -265,10 +264,7 @@ def _get_bins(path: Path) -> list[str]: else: path_fullres = path / VisiumHDKeys.MICROSCOPE_IMAGE if path_fullres.exists(): - fullres_image_filenames = [ - f for f in os.listdir(path_fullres) if os.path.isfile(os.path.join(path_fullres, f)) - ] - fullres_image_paths = [path_fullres / image_filename for image_filename in fullres_image_filenames] + fullres_image_paths = [file for file in path_fullres.iterdir() if file.is_file()] elif list((path_fullres := (path / f"{filename_prefix}tissue_image")).parent.glob(f"{path_fullres.name}.*")): fullres_image_paths = list(path_fullres.parent.glob(f"{path_fullres.name}.*")) else: @@ -291,7 +287,7 @@ def _get_bins(path: Path) -> list[str]: if fullres_image_file is not None: load_image( - path=path / fullres_image_file, + path=fullres_image_file, suffix="_full_image", scale_factors=[2, 2, 2, 2], ) @@ -356,7 +352,7 @@ def _get_bins(path: Path) -> list[str]: def _infer_dataset_id(path: Path) -> str: suffix = f"_{VisiumHDKeys.FEATURE_SLICE_FILE.value}" - files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith(suffix)] + files = [file.name for file in path.iterdir() if file.is_file() and file.name.endswith(suffix)] if len(files) == 0 or len(files) > 1: raise ValueError( f"Cannot infer `dataset_id` from the feature slice file in {path}, please pass `dataset_id` as an argument." 
From 233946ae30b97665274bf197544929aaab601252 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:32:59 +0100 Subject: [PATCH 07/33] Update prefixes; Add scalefactors --- src/spatialdata_io/_constants/_constants.py | 4 +- src/spatialdata_io/readers/seqfish.py | 129 ++++++++++++-------- 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 57f50db0..92d26db3 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -68,7 +68,7 @@ class SeqfishKeys(ModeEnum): TIFF_FILE = ".tiff" GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "_section" + ROI = "Roi" TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" COUNTS_FILE = "CellxGene" @@ -88,6 +88,8 @@ class SeqfishKeys(ModeEnum): SPATIAL_KEY = "spatial" REGION_KEY = "region" INSTANCE_KEY_TABLE = "instance_id" + SCALEFEFACTOR_X = "PhysicalSizeX" + SCALEFEFACTOR_Y = "PhysicalSizeY" @unique diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index e1f97f42..0ce6f144 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -2,6 +2,7 @@ import os import re +import xml.etree.ElementTree as ET from collections.abc import Mapping from pathlib import Path from types import MappingProxyType @@ -10,6 +11,7 @@ import anndata as ad import numpy as np import pandas as pd +import tifffile from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import ( @@ -19,7 +21,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations.transformations import Identity +from spatialdata.transformations.transformations import Identity, Scale from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -35,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, - sections: Optional[list[int]] = None, + ROIs: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,11 +45,11 @@ def seqfish( This function reads the following files: - - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. + - ```{vx.ROI!r}{vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file. + - ```{vx.ROI!r}{vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file. + - ```{vx.ROI!r}{vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.ROI!r}{vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.ROI!r}{vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -67,8 +69,8 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - sections - Which sections (specified as integers) to load. Only necessary if multiple sections present. + ROIs + Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. 
If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. @@ -80,6 +82,9 @@ def seqfish( path = Path(path) count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} + n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: # no file matching tbe pattern found raise ValueError( @@ -88,50 +93,53 @@ def seqfish( matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - prefix = matched.group(1) - - n = len(count_files) - all_sections = list(range(1, n + 1)) - sections_str: list[Optional[str]] = [] - if sections is not None: - for section in sections: - if section not in all_sections: - raise ValueError(f"Section {section} not found in the data.") - else: - sections_str.append(f"{SK.SECTION}{section}") - elif sections is None: - sections_str.append(None) + # prefix = matched.group(1) + + rois_str: list[str] = [] + if ROIs is None: + for roi in n_rois: + rois_str.append(f"{SK.ROI}{roi}") + elif isinstance(ROIs, list): + for roi in ROIs: + if str(roi) not in n_rois: + raise ValueError(f"ROI{roi} not found.") + rois_str.append(f"{SK.ROI}{roi}") else: - raise ValueError("Invalid type for 'section'. Must be list[int] or None.") + raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") - def get_cell_file(section: str | None) -> str: - return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_cell_file(roi: str) -> str: + return f"{roi}_{SK.CELL_COORDINATES}{SK.CSV_FILE}" - def get_count_file(section: str | None) -> str: - return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" + def get_count_file(roi: str) -> str: + return f"{roi}_{SK.COUNTS_FILE}{SK.CSV_FILE}" - def get_dapi_file(section: str | None) -> str: - return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" + def get_dapi_file(roi: str) -> str: + return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str | None) -> str: - return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" + def get_cell_mask_file(roi: str) -> str: + return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" - def get_transcript_file(section: str | None) -> str: - return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_transcript_file(roi: str) -> str: + return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" + scaled = {} adatas: dict[Optional[str], ad.AnnData] = {} - for section_str in sections_str: - assert isinstance(section_str, str) or section_str is None - cell_file = get_cell_file(section_str) - count_matrix = get_count_file(section_str) + for roi_str in rois_str: + assert isinstance(roi_str, str) or roi_str is None + cell_file = get_cell_file(roi_str) + count_matrix = get_count_file(roi_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section_str}" + region = f"cells_{roi_str}" adata.obs[SK.REGION_KEY] = region 
adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section_str] = adata + adatas[roi_str] = adata + scaled[roi_str] = Scale( + np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), + axes=("y", "x"), + ) scale_factors = [2, 2, 2, 2] @@ -141,9 +149,9 @@ def get_transcript_file(section: str | None) -> str: imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={"global": Identity()}, + transformations={"global": scaled[x]}, ) - for x in sections_str + for x in rois_str } else: images = {} @@ -156,7 +164,7 @@ def get_transcript_file(section: str | None) -> str: scale_factors=scale_factors, transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: labels = {} @@ -170,20 +178,22 @@ def get_transcript_file(section: str | None) -> str: # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: points = {} - adata = ad.concat(adatas.values()) - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - table = TableModel.parse( - adata, - region=[f"cells_{x}" for x in sections_str], - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) + # adata = ad.concat(adatas.values()) + tables = {} + for name, adata in adatas.items(): + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs = adata.obs.reset_index(drop=True) + tables[name] = TableModel.parse( + adata, + region=f"cells_{name}", + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) if load_shapes: shapes = { @@ -194,7 +204,7 @@ def get_transcript_file(section: str | None) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in adatas.items() + for x, adata in zip(rois_str, adatas.values()) } else: shapes = {} @@ -227,6 +237,17 @@ def get_transcript_file(section: str | None) -> str: transformations={"global": Identity()}, ) - sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) + sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) return sdata + + +def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_key: str) -> list[float]: + with tifffile.TiffFile(DAPI_path) as tif: + ome_metadata = tif.ome_metadata + root = ET.fromstring(ome_metadata) + for element in root.iter(): + if scalefactor_x_key in element.attrib.keys(): + scalefactor_x = element.attrib[scalefactor_x_key] + scalefactor_y = element.attrib[scalefactor_y_key] + return [float(scalefactor_x), float(scalefactor_y)] From d9ec1f51dcd889ab8fd6919b22e3e42df78467b6 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:42:56 +0100 Subject: [PATCH 08/33] Remove unused code --- src/spatialdata_io/readers/seqfish.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 0ce6f144..dac52f99 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -83,17 +83,14 @@ def seqfish( count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if 
count_file_pattern.match(i)] roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: - # no file matching tbe pattern found raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - # prefix = matched.group(1) rois_str: list[str] = [] if ROIs is None: @@ -183,7 +180,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - # adata = ad.concat(adatas.values()) tables = {} for name, adata in adatas.items(): adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") From 148466dde21fdf624d3e4604b92b9d7e3e94b5a5 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 28 Nov 2024 12:30:02 +0100 Subject: [PATCH 09/33] set transformation to global for both hires and lowres --- src/spatialdata_io/readers/visium_hd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py index c8215269..3037ebff 100644 --- a/src/spatialdata_io/readers/visium_hd.py +++ b/src/spatialdata_io/readers/visium_hd.py @@ -306,7 +306,7 @@ def _get_bins(path_bins: Path) -> list[str]: ) set_transformation( images[dataset_id + "_hires_image"], - {"downscaled_hires": Identity()}, + {"downscaled_hires": Identity(), "global": transform_hires.inverse()}, set_all=True, ) @@ -324,7 +324,7 @@ def _get_bins(path_bins: Path) -> list[str]: ) set_transformation( images[dataset_id + "_lowres_image"], - {"downscaled_lowres": Identity()}, + {"downscaled_lowres": Identity(), "global": transform_lowres.inverse()}, set_all=True, ) From b08eb5913d44ba7ca85d49f9bdbb710c49f4cc6d Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 29 Nov 2024 10:39:30 +0100 Subject: [PATCH 10/33] lowres and hires images mapped to global also for Visium --- CHANGELOG.md | 4 ++++ src/spatialdata_io/readers/visium.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47f2900a..c1c824a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][]. 
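The `set_transformation` calls changed above now register each downscaled Visium HD image in two coordinate systems at once. Below is a minimal, self-contained sketch of that idea; the toy image, the coordinate-system name and the scale values are placeholders and are not taken from the reader itself.

```python
import numpy as np
from spatialdata.models import Image2DModel
from spatialdata.transformations import Identity, Scale, set_transformation

# toy stand-in for a downscaled hires image with (c, y, x) dimensions
hires_image = Image2DModel.parse(np.zeros((3, 50, 50), dtype=np.uint8), dims=("c", "y", "x"))

# assumed scaling between full-resolution ("global") space and hires pixel space
transform_hires = Scale([0.1, 0.1], axes=("y", "x"))

# the image keeps its own coordinate system and is additionally mapped back to
# "global" via the inverse of the downscaling transformation
set_transformation(
    hires_image,
    {"downscaled_hires": Identity(), "global": transform_hires.inverse()},
    set_all=True,
)
```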
[keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## incoming release + +- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 + ## [0.1.6] - 2024-11-26 - (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 diff --git a/src/spatialdata_io/readers/visium.py b/src/spatialdata_io/readers/visium.py index 22a75855..19bc740a 100644 --- a/src/spatialdata_io/readers/visium.py +++ b/src/spatialdata_io/readers/visium.py @@ -231,14 +231,14 @@ def visium( image_hires = imread(path / VisiumKeys.IMAGE_HIRES_FILE, **imread_kwargs).squeeze().transpose(2, 0, 1) image_hires = DataArray(image_hires, dims=("c", "y", "x")) images[dataset_id + "_hires_image"] = Image2DModel.parse( - image_hires, transformations={"downscaled_hires": Identity()}, rgb=None + image_hires, transformations={"downscaled_hires": Identity(), "global": transform_hires.inverse()}, rgb=None ) if (path / VisiumKeys.IMAGE_LOWRES_FILE).exists(): image_lowres = imread(path / VisiumKeys.IMAGE_LOWRES_FILE, **imread_kwargs).squeeze().transpose(2, 0, 1) image_lowres = DataArray(image_lowres, dims=("c", "y", "x")) images[dataset_id + "_lowres_image"] = Image2DModel.parse( image_lowres, - transformations={"downscaled_lowres": Identity()}, + transformations={"downscaled_lowres": Identity(), "global": transform_lowres.inverse()}, rgb=None, ) From df804972e2372c831fec09f6dfc8fa7f809c0557 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:17:40 +0100 Subject: [PATCH 11/33] Update README.md Added known limitations and contributing section. --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index f4032a1b..49629e7c 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,21 @@ This package contains reader functions to load common spatial omics formats into Note: all mentioned technologies are registered trademarks of their respective companies. +## Known limitations + +Contributions for addressing the below limitations are very welcomed. + +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 + +### How to Contribute + +1. **Open a GitHub Issue**: Start by opening a new issue or commenting on an existing one in the repository. Clearly describe the problem and your proposed changes to avoid overlapping efforts with others. + +2. **Submit a Pull Request (PR)**: Once the issue is discussed, submit a PR to the `spatialdata-io` repository. Ensure your PR includes information about a suitable dataset for testing the reader, ideally no larger than 10 GB. Include clear instructions for accessing the data, preferably with a `curl` or `wget` command for easy downloading. + +3. **Optional Enhancements**: To facilitate reproducibility and ease of data access, consider adding a folder in the [spatialdata-sandbox](https://github.com/giovp/spatialdata-sandbox) repository. Include a `download.py` and `to_zarr.py` script (refer to examples in the repository) to enable others to reproduce your reader by simply running these scripts sequentially. + + ## Getting started Please refer to the [documentation][link-docs]. 
In particular, the From 995a2bd577ad5a137a91412a5c54f00952ffd104 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:20:47 +0000 Subject: [PATCH 12/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 49629e7c..a6957764 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,9 @@ Note: all mentioned technologies are registered trademarks of their respective c ## Known limitations -Contributions for addressing the below limitations are very welcomed. +Contributions for addressing the below limitations are very welcomed. -- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 ### How to Contribute @@ -41,7 +41,6 @@ Contributions for addressing the below limitations are very welcomed. 3. **Optional Enhancements**: To facilitate reproducibility and ease of data access, consider adding a folder in the [spatialdata-sandbox](https://github.com/giovp/spatialdata-sandbox) repository. Include a `download.py` and `to_zarr.py` script (refer to examples in the repository) to enable others to reproduce your reader by simply running these scripts sequentially. - ## Getting started Please refer to the [documentation][link-docs]. In particular, the From e2341724a9bd7ab19f75c3cc4b13947c0fce5813 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 11:57:49 +0100 Subject: [PATCH 13/33] add docs --- README.md | 1 + src/spatialdata_io/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index a6957764..a25b245c 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ This package contains reader functions to load common spatial omics formats into - Steinbock (output data) - STOmics Stereo-seq® - Vizgen MERSCOPE® (MERFISH) +- MACSima® (MACS® iQ View output) Note: all mentioned technologies are registered trademarks of their respective companies. 
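With MACSima now listed among the supported technologies, a minimal usage sketch of the new reader follows. It assumes the reader is exported at the package top level (as done in the accompanying `__init__.py` change below); the dataset path is a placeholder and the keyword values are illustrative rather than recommended defaults.

```python
from spatialdata_io import macsima

# parse a single processed MACSima acquisition folder into a SpatialData object;
# the path below is a placeholder for a local copy of the demo data
sdata = macsima(
    "./data/MACSimaData_HCA/HumanLiverH35",
    c_subset=10,       # optionally keep only the first 10 channels
    multiscale=True,   # build a multiscale image pyramid
)
print(sdata)
```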
diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 48f784bd..618855a5 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -4,6 +4,7 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.curio import curio from spatialdata_io.readers.dbit import dbit +from spatialdata_io.readers.macsima import macsima from spatialdata_io.readers.mcmicro import mcmicro from spatialdata_io.readers.merscope import merscope from spatialdata_io.readers.seqfish import seqfish @@ -32,6 +33,7 @@ "xenium_explorer_selection", "dbit", "visium_hd", + "macsima", ] __version__ = version("spatialdata-io") From 914d2b78b60c6a15a1f4409d225a7cdb037184b7 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:50:20 +0100 Subject: [PATCH 14/33] Update src/spatialdata_io/readers/_utils/_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/spatialdata_io/readers/_utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 5891fd68..82676098 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -112,7 +112,7 @@ def parse_physical_size(path: Path | None = None, ome_pixels: Pixels | None = No logger.error("Physical units for x and y dimensions are not the same.") raise NotImplementedError if pixels.physical_size_x != pixels.physical_size_y: - logger.error("Physical sizes for x and y dimensions are the same.") + logger.error("Physical sizes for x and y dimensions are not the same.") raise NotImplementedError # convert to micrometer if needed if pixels.physical_size_x_unit == UnitsLength.NANOMETER: From 0b631850ba4453cfda2dfcb6717c7601e4a397b5 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 16:05:34 +0100 Subject: [PATCH 15/33] fix sorting --- src/spatialdata_io/readers/macsima.py | 4 ++-- tests/test_macsima.py | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2916d79b..e15c3e83 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -88,7 +88,7 @@ def from_paths( raise ValueError( f"Images are not all the same size. 
Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" ) - # sort imgs, cycles and channels based on cycles + # create MultiChannelImage object with imgs and metadata output = cls( data=imgs, metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], @@ -135,8 +135,8 @@ def get_cycles(self) -> list[int]: def sort_by_channel(self) -> None: """Sort the channels by cycle number.""" - self.metadata = sorted(self.metadata, key=lambda x: x.cycle) self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] + self.metadata = sorted(self.metadata, key=lambda x: x.cycle) def subset(self, subset: int | None = None) -> MultiChannelImage: """Subset the image.""" diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 886003b5..f77cbd19 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -3,10 +3,16 @@ from pathlib import Path from typing import Any +import dask.array as da import pytest from spatialdata.models import get_channels -from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle +from spatialdata_io.readers.macsima import ( + ChannelMetadata, + MultiChannelImage, + macsima, + parse_name_to_cycle, +) if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): # The datasets should be downloaded and placed unzipped in the "data" directory; @@ -109,3 +115,19 @@ def test_parsing_style() -> None: def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: result = parse_name_to_cycle(name) assert result == expected + + +def test_mci_sort_by_channel(): + rng = da.random.default_rng() + sizes = [100, 200, 300] + c_names = ["test11", "test3", "test2"] + cycles = [2, 0, 1] + mci = MultiChannelImage( + data=[rng.random((size, size), chunks=(10, 10)) for size in sizes], + metadata=[ChannelMetadata(name=c_name, cycle=cycle) for c_name, cycle in zip(c_names, cycles)], + ) + assert mci.get_channel_names() == c_names + assert [x.shape[0] for x in mci.data] == sizes + mci.sort_by_channel() + assert mci.get_channel_names() == ["test3", "test2", "test11"] + assert [x.shape[0] for x in mci.data] == [200, 300, 100] From f609882bbca9b1388c59065b73a9ae12bfd541e7 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:05:57 +0100 Subject: [PATCH 16/33] first set of fixes during code review --- CHANGELOG.md | 1 + src/spatialdata_io/readers/macsima.py | 57 +++++++++++++++------------ tests/_utils.py | 31 +++++++++++++++ tests/test_macsima.py | 22 +++++------ tests/test_xenium.py | 4 +- 5 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 tests/_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c1c824a1..aa47ea7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning][]. ## incoming release - (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 +- (Macsima) added support @berombau #224 ## [0.1.6] - 2024-11-26 diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2916d79b..4e5b6a17 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -68,10 +68,16 @@ def from_paths( warnings.simplefilter("ignore") channel_names = parse_channels(p) if len(channel_names) > 1: - logger.warning(f"Found multiple channels in OME-TIFF file {p}. 
Only the first one will be used.") + warnings.warn( + f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.", + UserWarning, + stacklevel=2, + ) channels.append(channel_names[0]) except ValueError as e: - logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + warnings.warn( + f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.", UserWarning, stacklevel=2 + ) if len(path_files) != len(cycles) or len(path_files) != len(channels): raise ValueError("Length of path_files, cycles and channels must be the same.") @@ -80,29 +86,37 @@ def from_paths( logger.info("Skipping cycles: %d", skip_rounds) path_files, cycles, channels = map( list, - zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + zip( + *[ + (p, c, ch) + for p, c, ch in zip(path_files, cycles, channels, strict=True) + if c not in skip_rounds + ], + strict=True, + ), ) imgs = [imread(img, **imread_kwargs) for img in path_files] - for img, path in zip(imgs, path_files): + for img, path in zip(imgs, path_files, strict=True): if img.shape[1:] != imgs[0].shape[1:]: raise ValueError( - f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image " + f"{path_files[0]} has shape {imgs[0].shape[1:]}" ) # sort imgs, cycles and channels based on cycles output = cls( data=imgs, - metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], + metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels, strict=True)], ) return output @classmethod def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain c_name.""" + """Create new MultiChannelImage with only the channels that contain the string c_name.""" indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.filter_by_index(mci, indices) + return MultiChannelImage.subset_by_index(mci, indices) @classmethod - def filter_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: + def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: """Filter the image by index.""" metadata = [c for i, c in enumerate(mci.metadata) if i in indices] data = [d for i, d in enumerate(mci.data) if i in indices] @@ -135,37 +149,27 @@ def get_cycles(self) -> list[int]: def sort_by_channel(self) -> None: """Sort the channels by cycle number.""" + self.data = [d for _, d in sorted(zip(self.metadata, self.data, strict=True), key=lambda x: x[0].cycle)] self.metadata = sorted(self.metadata, key=lambda x: x.cycle) - self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] def subset(self, subset: int | None = None) -> MultiChannelImage: - """Subset the image.""" + """Subsets the images to keep only the first `subset` x `subset` pixels.""" if subset: self.data = [d[:, :subset, :subset] for d in self.data] return self def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subset the channels.""" + """Subsets the channels to keep only the first `c_subset` channels.""" self.data = self.data[:c_subset] self.metadata = self.metadata[:c_subset] return self - def subset_by_idx(self, indices: list[int]) -> MultiChannelImage: - """Subset the image by index.""" - 
data = [d for i, d in enumerate(self.data) if i in indices] - metadata = [c for i, c in enumerate(self.metadata) if i in indices] - return MultiChannelImage( - data=data, - metadata=metadata, - include_cycle_in_channel_name=self.include_cycle_in_channel_name, - ) - def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: lower_scale_limit = min(self.data[0].shape[1:]) return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) def get_stack(self) -> da.Array: - return da.stack(self.data).squeeze() + return da.stack(self.data).squeeze(axis=0) def macsima( @@ -224,7 +228,8 @@ def macsima( nuclei_channel_name Common string of the nuclei channel to separate nuclei from other channels. split_threshold_nuclei_channel - Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. + Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is + greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds List of round numbers to skip when parsing the data. include_cycle_in_channel_name @@ -383,10 +388,10 @@ def create_sdata( split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: # if channel name is nuclei_channel_name, add to seperate nuclei stack - nuclei_mci = mci.subset_by_idx(nuclei_idx) + nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] - mci = MultiChannelImage.filter_by_index( + mci = MultiChannelImage.subset_by_index( mci, [i for i in range(len(mci.metadata)) if i not in nuclei_idx_without_first_and_last], ) diff --git a/tests/_utils.py b/tests/_utils.py new file mode 100644 index 00000000..5573ba44 --- /dev/null +++ b/tests/_utils.py @@ -0,0 +1,31 @@ +import sys + +import pytest + + +def skip_if_below_python_version() -> pytest.mark.skipif: + """ + Decorator to skip tests if the Python version is below a specified version. + + This decorator prevents running tests on unsupported Python versions. Update the `MIN_VERSION` + constant to change the minimum Python version required for the tests. + + Returns + ------- + pytest.mark.skipif + A pytest marker that skips the test if the current Python version is below the specified `MIN_VERSION`. + + Notes + ----- + The current minimum version is set to Python 3.10. Adjust the `MIN_VERSION` constant as needed + to accommodate newer Python versions. 
+ + Examples + -------- + >>> @skip_if_below_python_version() + >>> def test_some_feature(): + >>> assert True + """ + MIN_VERSION = (3, 10) + reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" + return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 886003b5..c64f1761 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -1,24 +1,22 @@ import math -import sys from pathlib import Path from typing import Any import pytest -from spatialdata.models import get_channels +from spatialdata.models import get_channel_names from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle +from tests._utils import skip_if_below_python_version if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): - # The datasets should be downloaded and placed unzipped in the "data" directory; - # MACSimaData_HCA/ with e.g. unzipped HumanLiverH35/ inside: https://livercellatlas.org/download.php - # Lung_adc_demo/: Ask Miltenyi Biotec for the demo dataset pytest.skip( - "requires the Lung_adc_demo or MACSimaData_HCA datasets", + "Requires the Lung_adc_demo or MACSimaData_HCA datasets, please check " + "https://github.com/giovp/spatialdata-sandbox/macsima/Readme.md for instructions on how to get the data.", allow_module_level=True, ) -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ @@ -40,7 +38,7 @@ def test_image_size(dataset: str, expected: dict[str, Any]) -> None: assert extent == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [("Lung_adc_demo", 116), ("MACSimaData_HCA/HumanLiverH35", 102)], @@ -52,11 +50,11 @@ def test_total_channels(dataset: str, expected: int) -> None: el = sdata[list(sdata.images.keys())[0]] # get the number of channels - channels: int = len(get_channels(el)) + channels: int = len(get_channel_names(el)) assert channels == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ @@ -71,11 +69,11 @@ def test_channel_names(dataset: str, expected: list[str]) -> None: el = sdata[list(sdata.images.keys())[0]] # get the channel names - channels = get_channels(el) + channels = get_channel_names(el) assert list(channels) == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 1c64a912..fedb824b 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -1,5 +1,4 @@ import math -import sys from pathlib import Path import numpy as np @@ -10,6 +9,7 @@ prefix_suffix_uint32_from_cell_id_str, xenium, ) +from tests._utils import skip_if_below_python_version def test_cell_id_str_from_prefix_suffix_uint32() -> None: @@ -46,7 +46,7 @@ def test_roundtrip_with_data_limits() -> None: # pointing to "data". 
# The GitHub workflow "prepare_test_data.yaml" takes care of downloading the datasets and uploading an artifact for the # tests to use -@pytest.mark.skipif(sys.version_info < (3, 12), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ From c09f32947ed0becf41943b06833ea59c85b2d85b Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 16:29:29 +0100 Subject: [PATCH 17/33] add test for skip_rounds --- src/spatialdata_io/readers/macsima.py | 4 +-- tests/test_macsima.py | 40 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index e15c3e83..0a287671 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -77,7 +77,7 @@ def from_paths( raise ValueError("Length of path_files, cycles and channels must be the same.") # if any of round_channels is in skip_rounds, remove that round from the list and from path_files if skip_rounds: - logger.info("Skipping cycles: %d", skip_rounds) + logger.info(f"Skipping cycles: {skip_rounds}") path_files, cycles, channels = map( list, zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), @@ -226,7 +226,7 @@ def macsima( split_threshold_nuclei_channel Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds - List of round numbers to skip when parsing the data. + List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. skip_rounds=[1, 2] will parse only the first round 0 when there are only 3 cycles. include_cycle_in_channel_name Whether to include the cycle number in the channel name. 
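To make the `skip_rounds` semantics described above concrete, here is a small usage sketch. It mirrors the test added in the next hunk and assumes a local copy of the `Lung_adc_demo` dataset at the placeholder path shown.

```python
from spatialdata.models import get_channel_names

from spatialdata_io import macsima

# Cycles are counted from 0: with 68 rounds in total, skipping rounds 2..67
# keeps only the channels acquired in rounds 0 and 1.
sdata = macsima(
    "./data/Lung_adc_demo",             # placeholder path to the demo dataset
    skip_rounds=list(range(2, 68)),
)
image = sdata[list(sdata.images.keys())[0]]
print(get_channel_names(image))         # e.g. the two DAPI rounds plus their markers
```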
diff --git a/tests/test_macsima.py b/tests/test_macsima.py index f77cbd19..221b7d78 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -81,6 +81,46 @@ def test_channel_names(dataset: str, expected: list[str]) -> None: assert list(channels) == expected +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", 68), + ("MACSimaData_HCA/HumanLiverH35", 51), + ], +) +def test_total_rounds(dataset: str, expected: list[int]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + table = sdata[list(sdata.tables)[0]] + max_cycle = table.var["cycle"].max() + assert max_cycle == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,skip_rounds,expected", + [ + ("Lung_adc_demo", list(range(2, 68)), ["DAPI (1)", "CD68", "CD163", "DAPI (2)", "Control"]), + ( + "MACSimaData_HCA/HumanLiverH35", + list(range(2, 51)), + ["DAPI (1)", "PE", "CD14", "Vimentin", "DAPI (2)", "WT1"], + ), + ], +) +def test_skip_rounds(dataset: str, skip_rounds: list[int], expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, skip_rounds=skip_rounds) + el = sdata[list(sdata.images.keys())[0]] + + # get the channel names + channels = get_channels(el) + assert list(channels) == expected, f"Expected {expected}, got {list(channels)}" + + @pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") @pytest.mark.parametrize( "dataset,expected", From d4fffec1c7e22b4dd07e3b9ab8b917ff6bfdba18 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:31:15 +0100 Subject: [PATCH 18/33] finished code review --- src/spatialdata_io/readers/macsima.py | 51 ++++++++++++++------------- tests/_utils.py | 2 +- tests/test_macsima.py | 5 +-- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index bca799df..d215e84f 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -169,7 +169,7 @@ def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) def get_stack(self) -> da.Array: - return da.stack(self.data).squeeze(axis=0) + return da.stack(self.data, axis=0).squeeze(axis=1) def macsima( @@ -193,7 +193,8 @@ def macsima( """ Read *MACSima* formatted dataset. - This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from the image names. The channel names are parsed from the OME metadata. + This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from + the image names. The channel names are parsed from the OME metadata. .. 
seealso:: @@ -257,16 +258,16 @@ def macsima( if parsing_style == MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER: return parse_processed_folder( - path, - imread_kwargs, - subset, - c_subset, - max_chunk_size, - c_chunks_size, - multiscale, - transformations, - scale_factors, - default_scale_factor, + path=path, + imread_kwargs=imread_kwargs, + subset=subset, + c_subset=c_subset, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + multiscale=multiscale, + transformations=transformations, + scale_factors=scale_factors, + default_scale_factor=default_scale_factor, nuclei_channel_name=nuclei_channel_name, split_threshold_nuclei_channel=split_threshold_nuclei_channel, skip_rounds=skip_rounds, @@ -281,16 +282,16 @@ def macsima( if p.is_dir() and (not filter_folder_names or not any(f in p.name for f in filter_folder_names)) ]: sdatas[p.stem] = parse_processed_folder( - p, - imread_kwargs, - subset, - c_subset, - max_chunk_size, - c_chunks_size, - multiscale, - transformations, - scale_factors, - default_scale_factor, + path=p, + imread_kwargs=imread_kwargs, + subset=subset, + c_subset=c_subset, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + multiscale=multiscale, + transformations=transformations, + scale_factors=scale_factors, + default_scale_factor=default_scale_factor, nuclei_channel_name=nuclei_channel_name, split_threshold_nuclei_channel=split_threshold_nuclei_channel, skip_rounds=skip_rounds, @@ -387,7 +388,7 @@ def create_sdata( else: split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: - # if channel name is nuclei_channel_name, add to seperate nuclei stack + # if channel name is nuclei_channel_name, add to separate nuclei stack nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] @@ -397,6 +398,7 @@ def create_sdata( ) pixels_to_microns = parse_physical_size(path_files[0]) + image_element = create_image_element( mci, max_chunk_size, @@ -406,6 +408,8 @@ def create_sdata( scale_factors, coordinate_system=filtered_name, ) + table_channels = create_table(mci) + if split_nuclei: nuclei_image_element = create_image_element( nuclei_mci, @@ -417,7 +421,6 @@ def create_sdata( coordinate_system=filtered_name, ) table_nuclei = create_table(nuclei_mci) - table_channels = create_table(mci) sdata = sd.SpatialData( images={ diff --git a/tests/_utils.py b/tests/_utils.py index 5573ba44..9787d1bb 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -26,6 +26,6 @@ def skip_if_below_python_version() -> pytest.mark.skipif: >>> def test_some_feature(): >>> assert True """ - MIN_VERSION = (3, 10) + MIN_VERSION = (3, 12) reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 960d27f8..51323cc3 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -14,6 +14,8 @@ ) from tests._utils import skip_if_below_python_version +RNG = da.random.default_rng(seed=0) + if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): pytest.skip( "Requires the Lung_adc_demo or MACSimaData_HCA datasets, please check " @@ -116,12 +118,11 @@ def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: def test_mci_sort_by_channel() -> None: - rng = da.random.default_rng() sizes = [100, 200, 300] c_names = ["test11", "test3", 
"test2"] cycles = [2, 0, 1] mci = MultiChannelImage( - data=[rng.random((size, size), chunks=(10, 10)) for size in sizes], + data=[RNG.random((size, size), chunks=(10, 10)) for size in sizes], metadata=[ChannelMetadata(name=c_name, cycle=cycle) for c_name, cycle in zip(c_names, cycles)], ) assert mci.get_channel_names() == c_names From 57e784b32c72e69776dd2cbfce5b01011568a209 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:34:07 +0100 Subject: [PATCH 19/33] fixed pre-commit --- tests/test_macsima.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 2343f57d..c8b737e6 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -98,7 +98,7 @@ def test_total_rounds(dataset: str, expected: list[int]) -> None: assert max_cycle == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,skip_rounds,expected", [ @@ -117,11 +117,11 @@ def test_skip_rounds(dataset: str, skip_rounds: list[int], expected: list[str]) el = sdata[list(sdata.images.keys())[0]] # get the channel names - channels = get_channels(el) + channels = get_channel_names(el) assert list(channels) == expected, f"Expected {expected}, got {list(channels)}" -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ From 0bcd47f7a1b77c4e22968b8d7e2ce3844305e759 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 17:08:12 +0100 Subject: [PATCH 20/33] add test for array reference --- src/spatialdata_io/readers/macsima.py | 11 ++++++----- tests/test_macsima.py | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2ffeb5ca..75b8325e 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -3,6 +3,7 @@ import warnings from collections import defaultdict from collections.abc import Mapping +from copy import deepcopy from dataclasses import dataclass from pathlib import Path from types import MappingProxyType @@ -111,13 +112,13 @@ def from_paths( @classmethod def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain the string c_name.""" + """Create new MultiChannelImage with only the channels that contain the string c_name. The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.subset_by_index(mci, indices) + return cls.subset_by_index(mci, indices) @classmethod def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: - """Filter the image by index.""" + """Create new MultiChannelImage with only the channels selected by the indices. 
The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" metadata = [c for i, c in enumerate(mci.metadata) if i in indices] data = [d for i, d in enumerate(mci.data) if i in indices] return cls( @@ -159,7 +160,7 @@ def subset(self, subset: int | None = None) -> MultiChannelImage: return self def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subsets the channels to keep only the first `c_subset` channels.""" + """Subsets the channels to keep only the first `c_subset` channels. The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" self.data = self.data[:c_subset] self.metadata = self.metadata[:c_subset] return self @@ -389,7 +390,7 @@ def create_sdata( split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: # if channel name is nuclei_channel_name, add to separate nuclei stack - nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) + nuclei_mci = deepcopy(MultiChannelImage.subset_by_index(mci, indices=nuclei_idx)) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] mci = MultiChannelImage.subset_by_index( diff --git a/tests/test_macsima.py b/tests/test_macsima.py index c8b737e6..66038e0b 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -1,4 +1,5 @@ import math +from copy import deepcopy from pathlib import Path from typing import Any @@ -170,3 +171,25 @@ def test_mci_sort_by_channel() -> None: mci.sort_by_channel() assert mci.get_channel_names() == ["test3", "test2", "test11"] assert [x.shape[0] for x in mci.data] == [200, 300, 100] + + +def test_mci_array_reference() -> None: + arr1 = RNG.random((100, 100), chunks=(10, 10)) + arr2 = RNG.random((200, 200), chunks=(10, 10)) + mci = MultiChannelImage( + data=[arr1, arr2], + metadata=[ChannelMetadata(name="test1", cycle=0), ChannelMetadata(name="test2", cycle=1)], + ) + orig_arr1 = arr1.copy() + + subset_mci = MultiChannelImage.subset_by_index(mci, [0]) + # test that the subset is a view + assert subset_mci.data[0] is arr1 + assert da.all(subset_mci.data[0] == orig_arr1) + # test that a deepcopy is not a view + deepcopy_mci: MultiChannelImage = deepcopy(mci) + deepcopy_mci.data[0][0, 0] = deepcopy_mci.data[0][0, 0] + 1 + assert deepcopy_mci.data[0] is not arr1 + assert not da.all(deepcopy_mci.data[0] == orig_arr1) + # test that the original mci is not changed + assert da.all(mci.data[0] == orig_arr1) From da30404c7a47ecf8109aeef4a920c7422836a478 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 18:48:58 +0100 Subject: [PATCH 21/33] removed subset_channels --- src/spatialdata_io/readers/macsima.py | 26 ++++++++++++-------------- tests/_utils.py | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2ffeb5ca..1e376566 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -109,11 +109,12 @@ def from_paths( ) return output - @classmethod - def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain the string c_name.""" - indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.subset_by_index(mci, indices) + # unused + # @classmethod + # def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: + # 
"""Create new MultiChannelImage with only the channels that contain the string c_name.""" + # indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] + # return MultiChannelImage.subset_by_index(mci, indices) @classmethod def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: @@ -158,12 +159,6 @@ def subset(self, subset: int | None = None) -> MultiChannelImage: self.data = [d[:, :subset, :subset] for d in self.data] return self - def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subsets the channels to keep only the first `c_subset` channels.""" - self.data = self.data[:c_subset] - self.metadata = self.metadata[:c_subset] - return self - def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: lower_scale_limit = min(self.data[0].shape[1:]) return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) @@ -347,7 +342,7 @@ def parse_processed_folder( if subset: mci = mci.subset(subset) if c_subset: - mci = mci.subset_channels(c_subset) + mci = MultiChannelImage.subset_by_index(mci, indices=list(range(0, c_subset))) if multiscale and not scale_factors: scale_factors = mci.calc_scale_factors(default_scale_factor=default_scale_factor) if not multiscale: @@ -469,15 +464,18 @@ def create_image_element( t_dict = {coordinate_system: t_pixels_to_microns} # # chunk_size can be 1 for channels chunks = { - "x": max_chunk_size, "y": max_chunk_size, + "x": max_chunk_size, "c": c_chunks_size, } if t_dict: logger.debug("Adding transformation: %s", t_dict) el = sd.models.Image2DModel.parse( mci.get_stack(), - # TODO: make sure y and x locations are correct + # the data on disk is not always CYX, but imread takes care of parsing things correctly, so that we can assume + # mci to be CYX. Still, to make the code more robust, we could consider using a different backend, for instance + # bioio-ome-tiff, read both the data and its dimensions from disk, and let Image2DModel.parse() rearrange the + # dimensions into CYX. dims=["c", "y", "x"], scale_factors=scale_factors, chunks=chunks, diff --git a/tests/_utils.py b/tests/_utils.py index 9787d1bb..5573ba44 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -26,6 +26,6 @@ def skip_if_below_python_version() -> pytest.mark.skipif: >>> def test_some_feature(): >>> assert True """ - MIN_VERSION = (3, 12) + MIN_VERSION = (3, 10) reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) From c4f6cf7cdd36ffe7f4c91492189d5875eac0bd42 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 19:30:28 +0100 Subject: [PATCH 22/33] added todo for regexp approach --- src/spatialdata_io/readers/macsima.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 611d5216..58e962eb 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -193,7 +193,7 @@ def macsima( .. seealso:: - - `MACSima output `_. + - `MACSima output `_. Parameters ---------- @@ -227,7 +227,8 @@ def macsima( Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds - List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. 
skip_rounds=[1, 2] will parse only the first round 0 when there are only 3 cycles. + List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. skip_rounds=[1, 2] + will parse only the first round 0 when there are only 3 cycles. include_cycle_in_channel_name Whether to include the cycle number in the channel name. @@ -326,6 +327,8 @@ def parse_processed_folder( """Parse a single folder containing images from a cyclical imaging platform.""" # get list of image paths, get channel name from OME data and cycle round number from filename # look for OME-TIFF files + # TODO: replace this pattern and the p.suffix in [".tif", ".tiff"] with a single function based on a regexp, like + # this one re.compile(r".*\.tif{1,2}$", re.IGNORECASE) path_files = list(path.glob(file_pattern)) logger.debug(path_files[0]) From c8fc779ca51559e2c2593953de93b50114b57ea6 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 10 Dec 2024 16:13:33 +0100 Subject: [PATCH 23/33] rename variable --- src/spatialdata_io/readers/seqfish.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 028753fb..187c178b 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, - ROIs: list[int] | None = None, + rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -69,7 +69,7 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - ROIs + rois Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. If "all" is specified, reads all remaining .tiff and .geojson files in the directory. 
imread_kwargs @@ -93,11 +93,11 @@ def seqfish( raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") rois_str: list[str] = [] - if ROIs is None: + if rois is None: for roi in n_rois: rois_str.append(f"{SK.ROI}{roi}") - elif isinstance(ROIs, list): - for roi in ROIs: + elif isinstance(rois, list): + for roi in rois: if str(roi) not in n_rois: raise ValueError(f"ROI{roi} not found.") rois_str.append(f"{SK.ROI}{roi}") From 5491545abc4f914b778cdcf979ed6d77c7975531 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 10 Dec 2024 16:22:09 +0100 Subject: [PATCH 24/33] new test datasets --- .github/workflows/prepare_test_data.yaml | 22 ++++++++++++++++++---- tests/test_xenium.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/prepare_test_data.yaml b/.github/workflows/prepare_test_data.yaml index 10d58c44..6c468b3a 100644 --- a/.github/workflows/prepare_test_data.yaml +++ b/.github/workflows/prepare_test_data.yaml @@ -18,15 +18,29 @@ jobs: run: | mkdir -p ./data cd ./data + + # 10x Genomics Xenium 2.0.0 curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip + # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation + curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip + + # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion + curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip + + # Spatial Genomics seqFISH v2 + curl -O https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip + - name: Unzip files run: | - unzip ./data/Xenium_V1_human_Breast_2fov_outs.zip -d ./data/Xenium_V1_human_Breast_2fov_outs - unzip ./data/Xenium_V1_human_Lung_2fov_outs.zip -d ./data/Xenium_V1_human_Lung_2fov_outs - rm ./data/Xenium_V1_human_Breast_2fov_outs.zip - rm ./data/Xenium_V1_human_Lung_2fov_outs.zip + cd ./data + for file in *.zip; do + dir="${file%.zip}" + mkdir -p "$dir" + unzip "$file" -d "$dir" + rm "$file" + done - name: Upload artifacts uses: actions/upload-artifact@v3 diff --git a/tests/test_xenium.py b/tests/test_xenium.py index fedb824b..2fb35ce1 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -63,3 +63,6 @@ def test_example_data(dataset: str, expected: str) -> None: extent = get_extent(sdata, exact=False) extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} assert str(extent) == expected + + +# TODO: add tests for Xenium 3.0.0 From 64f54f92843509b64407dadd3d5a591c9aa1fbfc Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Dec 2024 11:59:43 +0100 Subject: [PATCH 25/33] update test datasets --- .github/workflows/prepare_test_data.yaml | 3 ++- src/spatialdata_io/readers/xenium.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prepare_test_data.yaml b/.github/workflows/prepare_test_data.yaml index 6c468b3a..77f984fc 100644 --- a/.github/workflows/prepare_test_data.yaml +++ b/.github/workflows/prepare_test_data.yaml @@ -24,7 +24,8 @@ jobs: curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation - curl -O 
https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip + # this file seems to be corrupted; skipping it for now + # curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 54f911d2..2a5d6165 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -84,7 +84,7 @@ def xenium( .. seealso:: - - `10X Genomics Xenium file format `_. + - `10X Genomics Xenium file format `_. Parameters ---------- From 4c92edd53fe849a570ffe64338f73fce609551f7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:00:54 +0000 Subject: [PATCH 26/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 98 +++++++++++++++++------------------ README.md | 38 +++++++------- docs/contributing.md | 16 +++--- docs/template_usage.md | 114 ++++++++++++++++++++--------------------- 4 files changed, 133 insertions(+), 133 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa47ea7b..ed3d6456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,138 +10,138 @@ and this project adheres to [Semantic Versioning][]. ## incoming release -- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 -- (Macsima) added support @berombau #224 +- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 +- (Macsima) added support @berombau #224 ## [0.1.6] - 2024-11-26 -- (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 -- (Visium HD) get transformation matrices even when only images are parsed #215 -- Support for `xarray.DataTree` (which was moved from `datatree.DataTree`) #232 +- (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 +- (Visium HD) get transformation matrices even when only images are parsed #215 +- Support for `xarray.DataTree` (which was moved from `datatree.DataTree`) #232 ## [0.1.5] - 2024-09-25 ### Added -- (Xenium) added `dims` parameter for more control in `xenium_aligned_image()` +- (Xenium) added `dims` parameter for more control in `xenium_aligned_image()` ### Fixed -- Passing `rgb=None` to image model parser for both visium and visiumhd, leading to 3-4 channel images being - interpreted as RGB(A) -- Fix header bug Visium data #200 -- (Visium HD) Fix path parsing when images are missing #204 #206 +- Passing `rgb=None` to image model parser for both visium and visiumhd, leading to 3-4 channel images being + interpreted as RGB(A) +- Fix header bug Visium data #200 +- (Visium HD) Fix path parsing when images are missing #204 #206 ## [0.1.4] - 2024-08-07 ### Changed -- (Xenium) changed default target of table to labels; radii of circles computed from cells, not nuclei #179 -- (Visium HD) changed default geometry to squares from circles for the bins; added parameter to choose #183 -- (CosMx) dropping points element with zero-length from the cosmx reader #191 +- (Xenium) changed default target of table to labels; radii of circles computed 
from cells, not nuclei #179 +- (Visium HD) changed default geometry to squares from circles for the bins; added parameter to choose #183 +- (CosMx) dropping points element with zero-length from the cosmx reader #191 ## [0.1.3] - 2024-07-03 ### Added -- (Xenium) support reading multi-polygon selection files from the Xenium Explorer -- (ISS) An experimental loader to load elemental ISS data objects, e.g. raw.tif, label.tif and anndata.h5ad -- (Stereo-seq) Added reader @LLehner @timtreis @florianingelfinger #70 -- (MERSCOPE) Optional rioxarray backend for MERSCOPE data (reads chunks) -- (MERSCOPE) Can choose which elements should be loaded +- (Xenium) support reading multi-polygon selection files from the Xenium Explorer +- (ISS) An experimental loader to load elemental ISS data objects, e.g. raw.tif, label.tif and anndata.h5ad +- (Stereo-seq) Added reader @LLehner @timtreis @florianingelfinger #70 +- (MERSCOPE) Optional rioxarray backend for MERSCOPE data (reads chunks) +- (MERSCOPE) Can choose which elements should be loaded ### Fixed -- (Visium) Fixed issue with joining a SpatialElement with a table due to index values not being unique. - obs_names_make_unique is now called internally to enforce unique index values allowing for join operations. +- (Visium) Fixed issue with joining a SpatialElement with a table due to index values not being unique. + obs_names_make_unique is now called internally to enforce unique index values allowing for join operations. ### Changed -- (MERSCOPE) "global" coordinate system is used as a default instead of "microns" +- (MERSCOPE) "global" coordinate system is used as a default instead of "microns" ## [0.1.2] - 2024-03-30 ### Added -- (Visium HD) added reader, coauthored by @LLehner +- (Visium HD) added reader, coauthored by @LLehner ### Fixed -- (Xenium) reader for 1.0.1 (paper data) and unknown versions -- (Xenium) fix in reading "minimalistic" Xenium datasets #132 +- (Xenium) reader for 1.0.1 (paper data) and unknown versions +- (Xenium) fix in reading "minimalistic" Xenium datasets #132 ## [0.1.1] - 2024-03-24 ### Added -- (Xenium) support for post-xenium aligned images (IF, HE) -- (Xenium) reader for the selection coordinates file from the Xenium Explorer -- (Xenium) support for the new Xenium 2.0.0 (multimodal segmentation) -- (Xenium) reading multiscale labels from cells.zarr.zip -- (MCMICRO) support for TMAs (such as the data of exemplar-002) -- (DBiT-seq) reader -- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()` -- (Visium) support for raw reads (capture locations not under tissue) +- (Xenium) support for post-xenium aligned images (IF, HE) +- (Xenium) reader for the selection coordinates file from the Xenium Explorer +- (Xenium) support for the new Xenium 2.0.0 (multimodal segmentation) +- (Xenium) reading multiscale labels from cells.zarr.zip +- (MCMICRO) support for TMAs (such as the data of exemplar-002) +- (DBiT-seq) reader +- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()` +- (Visium) support for raw reads (capture locations not under tissue) ### Fixed -- (Xenium) fixed index (fail on write) -- (Xenium) renamed cells_as_shapes to cells_as_circles; set default to True -- (MERSCOPE) don't try to load unexisting elements #87 -- (Visium) fixed axes ordering +- (Xenium) fixed index (fail on write) +- (Xenium) renamed cells_as_shapes to cells_as_circles; set default to True +- (MERSCOPE) don't try to load unexisting elements #87 +- (Visium) fixed axes 
ordering ## [0.0.9] - 2023-11-06 ### Fixed -- (Xenium) bug when converting feature_name #81, from @fbnrst -- (Visium) visium() supports file counts without dataset_id #91 +- (Xenium) bug when converting feature_name #81, from @fbnrst +- (Visium) visium() supports file counts without dataset_id #91 ## [0.0.8] - 2023-10-02 ### Fixed -- (Xenium) coerce cell id to str #64 -- (MERSCOPE) fix coordinate transformation #68 -- (MERSCOPE) Improvements/fixes: merscope reader #73 +- (Xenium) coerce cell id to str #64 +- (MERSCOPE) fix coordinate transformation #68 +- (MERSCOPE) Improvements/fixes: merscope reader #73 ## [0.0.7] - 2023-07-23 ### Fixed -- Bugs in Xenium and MERSCOPE +- Bugs in Xenium and MERSCOPE ## [0.0.5] - 2023-06-21 ### Added -- MERFISH reader (from @quentinblampey) -- CODEX reader (from @LLehner) +- MERFISH reader (from @quentinblampey) +- CODEX reader (from @LLehner) ### Fixed -- Issues on Visium reader (thanks @ilia-kats) and Xenium reader +- Issues on Visium reader (thanks @ilia-kats) and Xenium reader ## [0.0.4] - 2023-05-23 ### Added -- Curio reader +- Curio reader ## [0.0.3] - 2023-05-22 ### Merged -- Merge pull request #40 from scverse/fix/categories +- Merge pull request #40 from scverse/fix/categories ## [0.0.2] - 2023-05-04 ### Changed -- Revert version regex (#37) +- Revert version regex (#37) ## [0.0.1] - 2023-05-04 ### Tested -- Test installation from pypi +- Test installation from pypi diff --git a/README.md b/README.md index a25b245c..2eaa490c 100644 --- a/README.md +++ b/README.md @@ -12,19 +12,19 @@ This package contains reader functions to load common spatial omics formats into SpatialData. Currently, we provide support for: -- 10x Genomics Visium® -- 10x Genomics Visium HD® -- 10x Genomics Xenium® -- Akoya PhenoCycler® (formerly CODEX®) -- Curio Seeker® -- DBiT-seq -- MCMICRO (output data) -- NanoString CosMx® -- Spatial Genomics GenePS® (seqFISH) -- Steinbock (output data) -- STOmics Stereo-seq® -- Vizgen MERSCOPE® (MERFISH) -- MACSima® (MACS® iQ View output) +- 10x Genomics Visium® +- 10x Genomics Visium HD® +- 10x Genomics Xenium® +- Akoya PhenoCycler® (formerly CODEX®) +- Curio Seeker® +- DBiT-seq +- MCMICRO (output data) +- NanoString CosMx® +- Spatial Genomics GenePS® (seqFISH) +- Steinbock (output data) +- STOmics Stereo-seq® +- Vizgen MERSCOPE® (MERFISH) +- MACSima® (MACS® iQ View output) Note: all mentioned technologies are registered trademarks of their respective companies. @@ -32,7 +32,7 @@ Note: all mentioned technologies are registered trademarks of their respective c Contributions for addressing the below limitations are very welcomed. -- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 ### How to Contribute @@ -46,7 +46,7 @@ Contributions for addressing the below limitations are very welcomed. Please refer to the [documentation][link-docs]. In particular, the -- [API documentation][link-api]. +- [API documentation][link-api]. ## Installation @@ -76,10 +76,10 @@ If you found a bug, please use the [issue tracker][issue-tracker]. 
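A minimal usage sketch for the first-party readers listed above (shown here with the Xenium reader; the input and output paths are placeholders and must point to a local dataset):

```python
from spatialdata_io import xenium

# Parse a local Xenium output folder into a SpatialData object (placeholder path).
sdata = xenium("path/to/xenium_output")

# Persist the parsed object as a SpatialData Zarr store for downstream analysis.
sdata.write("xenium.zarr")
```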
Technologies that can be read into `SpatialData` objects using third-party libraries: -- METASPACE (MALDI, ...): [metaspace-converter](https://github.com/metaspace2020/metaspace-converter) -- PhenoCycler®: [SOPA](https://github.com/gustaveroussy/sopa) -- MACSima®: [SOPA](https://github.com/gustaveroussy/sopa) -- Hyperion® (Imaging Mass Cytometry): [SOPA](https://github.com/gustaveroussy/sopa) +- METASPACE (MALDI, ...): [metaspace-converter](https://github.com/metaspace2020/metaspace-converter) +- PhenoCycler®: [SOPA](https://github.com/gustaveroussy/sopa) +- MACSima®: [SOPA](https://github.com/gustaveroussy/sopa) +- Hyperion® (Imaging Mass Cytometry): [SOPA](https://github.com/gustaveroussy/sopa) ## Disclaimer diff --git a/docs/contributing.md b/docs/contributing.md index b678c149..7df41462 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -138,10 +138,10 @@ in the cookiecutter-scverse template. Please write documentation for new or changed features and use-cases. This project uses [sphinx][] with the following features: -- the [myst][] extension allows to write documentation in markdown/Markedly Structured Text -- [Numpy-style docstrings][numpydoc] (through the [napoloen][numpydoc-napoleon] extension). -- Jupyter notebooks as tutorials through [myst-nb][] (See [Tutorials with myst-nb](#tutorials-with-myst-nb-and-jupyter-notebooks)) -- [Sphinx autodoc typehints][], to automatically reference annotated input and output types +- the [myst][] extension allows to write documentation in markdown/Markedly Structured Text +- [Numpy-style docstrings][numpydoc] (through the [napoloen][numpydoc-napoleon] extension). +- Jupyter notebooks as tutorials through [myst-nb][] (See [Tutorials with myst-nb](#tutorials-with-myst-nb-and-jupyter-notebooks)) +- [Sphinx autodoc typehints][], to automatically reference annotated input and output types See the [scanpy developer docs](https://scanpy.readthedocs.io/en/latest/dev/documentation.html) for more information on how to write documentation. @@ -158,10 +158,10 @@ repository. #### Hints -- If you refer to objects from other packages, please add an entry to `intersphinx_mapping` in `docs/conf.py`. Only - if you do so can sphinx automatically create a link to the external documentation. -- If building the documentation fails because of a missing link that is outside your control, you can add an entry to - the `nitpick_ignore` list in `docs/conf.py` +- If you refer to objects from other packages, please add an entry to `intersphinx_mapping` in `docs/conf.py`. Only + if you do so can sphinx automatically create a link to the external documentation. +- If building the documentation fails because of a missing link that is outside your control, you can add an entry to + the `nitpick_ignore` list in `docs/conf.py` #### Building the docs locally diff --git a/docs/template_usage.md b/docs/template_usage.md index bf8152f6..f8efbc16 100644 --- a/docs/template_usage.md +++ b/docs/template_usage.md @@ -115,13 +115,13 @@ We recommend using [readthedocs.org][] (RTD) to build and host the documentation To enable readthedocs, head over to [their website][readthedocs.org] and sign in with your GitHub account. On the RTD dashboard choose "Import a Project" and follow the instructions to add your repository. -- Make sure to choose the correct name of the default branch. On GitHub, the name of the default branch should be `main` (it has - recently changed from `master` to `main`). -- We recommend to enable documentation builds for pull requests (PRs). 
This ensures that a PR doesn't introduce changes - that break the documentation. To do so, got to `Admin -> Advanced Settings`, check the - `Build pull requests for this projects` option, and click `Save`. For more information, please refer to - the [official RTD documentation](https://docs.readthedocs.io/en/stable/pull-requests.html). -- If you find the RTD builds are failing, you can disable the `fail_on_warning` option in `.readthedocs.yaml`. +- Make sure to choose the correct name of the default branch. On GitHub, the name of the default branch should be `main` (it has + recently changed from `master` to `main`). +- We recommend to enable documentation builds for pull requests (PRs). This ensures that a PR doesn't introduce changes + that break the documentation. To do so, got to `Admin -> Advanced Settings`, check the + `Build pull requests for this projects` option, and click `Save`. For more information, please refer to + the [official RTD documentation](https://docs.readthedocs.io/en/stable/pull-requests.html). +- If you find the RTD builds are failing, you can disable the `fail_on_warning` option in `.readthedocs.yaml`. If your project is private, there are ways to enable docs rendering on [readthedocs.org][] but it is more cumbersome and requires a different subscription for read the docs. See a guide [here](https://docs.readthedocs.io/en/stable/guides/importing-private-repositories.html). @@ -144,52 +144,52 @@ Once authorized, pre-commit.ci should automatically be activated. The following pre-commit checks are for code style and format: -- [black](https://black.readthedocs.io/en/stable/): standard code - formatter in Python. -- [isort](https://pycqa.github.io/isort/): sort module imports into - sections and types. -- [prettier](https://prettier.io/docs/en/index.html): standard code - formatter for non-Python files (e.g. YAML). -- [blacken-docs](https://github.com/asottile/blacken-docs): black on - python code in docs. +- [black](https://black.readthedocs.io/en/stable/): standard code + formatter in Python. +- [isort](https://pycqa.github.io/isort/): sort module imports into + sections and types. +- [prettier](https://prettier.io/docs/en/index.html): standard code + formatter for non-Python files (e.g. YAML). +- [blacken-docs](https://github.com/asottile/blacken-docs): black on + python code in docs. The following pre-commit checks are for errors and inconsistencies: -- [flake8](https://flake8.pycqa.org/en/latest/): standard check for errors in Python files. - - [flake8-tidy-imports](https://github.com/adamchainz/flake8-tidy-imports): - tidy module imports. - - [flake8-docstrings](https://github.com/PyCQA/flake8-docstrings): - pydocstyle extension of flake8. - - [flake8-rst-docstrings](https://github.com/peterjc/e8-rst-docstrings): - extension of `flake8-docstrings` for `rst` docs. - - [flake8-comprehensions](https://github.com/adamchainz/e8-comprehensions): - write better list/set/dict comprehensions. - - [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear): - find possible bugs and design issues in program. - - [flake8-blind-except](https://github.com/elijahandrews/flake8-blind-except): - checks for blind, catch-all `except` statements. -- [yesqa](https://github.com/asottile/yesqa): - remove unneccesary `# noqa` comments, follows additional dependencies listed above. -- [autoflake](https://github.com/PyCQA/autoflake): - remove unused imports and variables. -- [pre-commit-hooks](https://github.com/pre-commit/pre-commit-hooks): generic pre-commit hooks. 
- - **detect-private-key**: checks for the existence of private keys. - - **check-ast**: check whether files parse as valid python. - - **end-of-file-fixer**:check files end in a newline and only a newline. - - **mixed-line-ending**: checks mixed line ending. - - **trailing-whitespace**: trims trailing whitespace. - - **check-case-conflict**: check files that would conflict with case-insensitive file systems. -- [pyupgrade](https://github.com/asottile/pyupgrade): - upgrade syntax for newer versions of the language. -- **forbid-to-commit**: Make sure that `*.rej` files cannot be commited. These files are created by the - [automated template sync](#automated-template-sync) if there's a merge conflict and need to be addressed manually. +- [flake8](https://flake8.pycqa.org/en/latest/): standard check for errors in Python files. + - [flake8-tidy-imports](https://github.com/adamchainz/flake8-tidy-imports): + tidy module imports. + - [flake8-docstrings](https://github.com/PyCQA/flake8-docstrings): + pydocstyle extension of flake8. + - [flake8-rst-docstrings](https://github.com/peterjc/e8-rst-docstrings): + extension of `flake8-docstrings` for `rst` docs. + - [flake8-comprehensions](https://github.com/adamchainz/e8-comprehensions): + write better list/set/dict comprehensions. + - [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear): + find possible bugs and design issues in program. + - [flake8-blind-except](https://github.com/elijahandrews/flake8-blind-except): + checks for blind, catch-all `except` statements. +- [yesqa](https://github.com/asottile/yesqa): + remove unneccesary `# noqa` comments, follows additional dependencies listed above. +- [autoflake](https://github.com/PyCQA/autoflake): + remove unused imports and variables. +- [pre-commit-hooks](https://github.com/pre-commit/pre-commit-hooks): generic pre-commit hooks. + - **detect-private-key**: checks for the existence of private keys. + - **check-ast**: check whether files parse as valid python. + - **end-of-file-fixer**:check files end in a newline and only a newline. + - **mixed-line-ending**: checks mixed line ending. + - **trailing-whitespace**: trims trailing whitespace. + - **check-case-conflict**: check files that would conflict with case-insensitive file systems. +- [pyupgrade](https://github.com/asottile/pyupgrade): + upgrade syntax for newer versions of the language. +- **forbid-to-commit**: Make sure that `*.rej` files cannot be commited. These files are created by the + [automated template sync](#automated-template-sync) if there's a merge conflict and need to be addressed manually. ### How to disable or add pre-commit checks -- To ignore lint warnigs from **flake8**, see [Ignore certain lint warnings](#how-to-ignore-certain-lint-warnings). -- You can add or remove pre-commit checks by simply deleting relevant lines in the `.pre-commit-config.yaml` file. - Some pre-commit checks have additional options that can be specified either in the `pyproject.toml` or tool-specific - config files, such as `.prettierrc.yml` for **prettier** and `.flake8` for **flake8**. +- To ignore lint warnigs from **flake8**, see [Ignore certain lint warnings](#how-to-ignore-certain-lint-warnings). +- You can add or remove pre-commit checks by simply deleting relevant lines in the `.pre-commit-config.yaml` file. + Some pre-commit checks have additional options that can be specified either in the `pyproject.toml` or tool-specific + config files, such as `.prettierrc.yml` for **prettier** and `.flake8` for **flake8**. 
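Besides the config-level ignores covered in the next section, flake8 also supports silencing a single line with a `# noqa` comment; the yesqa hook listed above removes such comments again once they become unnecessary. A small illustrative sketch (the error codes are standard flake8 codes, and the snippet exists only to demonstrate the syntax):

```python
# F401 ("imported but unused") is silenced on this line only; project-wide ignores
# belong in the flake8 configuration instead (see the next section).
import os  # noqa: F401

value = None
# E711 ("comparison to None") is silenced here purely to show the per-line syntax;
# idiomatic code would write "value is None".
if value == None:  # noqa: E711
    print("value is unset")
```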
### How to ignore certain lint warnings @@ -220,10 +220,10 @@ W504 Scverse ecosystem packages should operate on [AnnData][] and/or [MuData][] data structures and typically use an API as originally [introduced by scanpy][scanpy-api] with the following submodules: -- `pp` for preprocessing -- `tl` for tools (that, compared to `pp` generate interpretable output, often associated with a corresponding plotting - function) -- `pl` for plotting functions +- `pp` for preprocessing +- `tl` for tools (that, compared to `pp` generate interpretable output, often associated with a corresponding plotting + function) +- `pl` for plotting functions You may add additional submodules as appropriate. While we encourage to follow a scanpy-like API for ecosystem packages, there may also be good reasons to choose a different approach, e.g. using an object-oriented API. @@ -280,12 +280,12 @@ The pull request can only be merged after all `*.rej` files have been removed. :::{tip} The following hints may be useful to work with the template sync: -- GitHub automatically disables scheduled actions if there has been not activity to the repository for 60 days. - You can re-enable or manually trigger the sync by navigating to `Actions` -> `Sync Template` in your GitHub repository. -- If you want to ignore certain files from the template update, you can add them to the `[tool.cruft]` section in the - `pyproject.toml` file in the root of your repository. More details are described in the - [cruft documentation][cruft-update-project]. -- To disable the sync entirely, simply remove the file `.github/workflows/sync.yaml`. +- GitHub automatically disables scheduled actions if there has been not activity to the repository for 60 days. + You can re-enable or manually trigger the sync by navigating to `Actions` -> `Sync Template` in your GitHub repository. +- If you want to ignore certain files from the template update, you can add them to the `[tool.cruft]` section in the + `pyproject.toml` file in the root of your repository. More details are described in the + [cruft documentation][cruft-update-project]. +- To disable the sync entirely, simply remove the file `.github/workflows/sync.yaml`. ::: From f219088cc055f5c6bf66f14232ffdbad05450b5a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Dec 2024 20:16:54 +0100 Subject: [PATCH 27/33] fix scale factor labels; fix table --- src/spatialdata_io/readers/seqfish.py | 133 ++++++++++++++++++-------- 1 file changed, 93 insertions(+), 40 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 187c178b..d7aa656a 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,8 +37,10 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + raster_models_scale_factors: list[float] | None = None, ) -> SpatialData: """ Read *seqfish* formatted dataset. @@ -69,38 +71,47 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + cells_as_circles + Whether to read cells also as circles instead of labels. rois Which ROIs (specified as integers) to load. 
Only necessary if multiple ROIs present. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. Returns ------- :class:`spatialdata.SpatialData` + + Examples + -------- + This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. + Please check that Roi1 is present in your dataset, otherwise adjust the code below. + >>> from spatialdata_io import seqfish + >>> sdata = seqfish("path/to/raw/data") + >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata.set_table_annotates_spatialelement( + ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... ) + >>> sdata.write("path/to/data.zarr") """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") - count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] - roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}{re.escape(SK.CSV_FILE)}$") + count_files = [f for f in os.listdir(path) if count_file_pattern.match(f)] if not count_files: raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) - matched = count_file_pattern.match(count_files[0]) - if matched is None: - raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - rois_str: list[str] = [] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + found_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if rois is None: - for roi in n_rois: - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in found_rois] elif isinstance(rois, list): for roi in rois: - if str(roi) not in n_rois: + if str(roi) not in found_rois: raise ValueError(f"ROI{roi} not found.") - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in rois] else: raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") @@ -119,33 +130,54 @@ def get_cell_mask_file(roi: str) -> str: def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" - scaled = {} - adatas: dict[str, ad.AnnData] = {} + # parse table information + tables: dict[str, ad.AnnData] = {} for roi_str in rois_str: - assert isinstance(roi_str, str) or roi_str is None - cell_file = get_cell_file(roi_str) + # parse cell gene expression data count_matrix = get_count_file(roi_str) - adata = ad.read_csv(path / count_matrix, delimiter=",") + df = pd.read_csv(path / count_matrix, delimiter=",") + instance_id = df.iloc[:, 0].astype(str) + expression = df.drop(columns=["Unnamed: 0"]) + expression.set_index(instance_id, inplace=True) + adata = ad.AnnData(expression) + + # parse cell spatial information + cell_file = get_cell_file(roi_str) cell_info = pd.read_csv(path / cell_file, delimiter=",") + cell_info["label"] = cell_info["label"].astype("str") + # below, the obsm are assigned by position, not by index. 
Here we check that we can do it + assert cell_info["label"].to_numpy().tolist() == adata.obs.index.to_numpy().tolist() + cell_info.set_index("label", inplace=True) + adata.obs[SK.AREA] = cell_info[SK.AREA] adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() - adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{roi_str}" + + # map tables to cell labels (defined later) + region = os.path.splitext(get_cell_mask_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region - adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[roi_str] = adata + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) + adata.obs = adata.obs.reset_index(drop=True) + tables[f"table_{roi_str}"] = TableModel.parse( + adata, + region=region, + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) + + # parse scale factors to scale images and labels + scaled = {} + for roi_str in rois_str: scaled[roi_str] = Scale( np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), axes=("y", "x"), ) - scale_factors = [2, 2, 2, 2] - if load_images: images = { f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), - scale_factors=scale_factors, + scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, ) for x in rois_str @@ -158,8 +190,8 @@ def get_transcript_file(roi: str) -> str: f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), - scale_factors=scale_factors, - transformations={"global": Identity()}, + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, ) for x in rois_str } @@ -172,7 +204,7 @@ def get_transcript_file(roi: str) -> str: pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + instance_key=None, transformations={"global": Identity()}, ) for x in rois_str @@ -180,17 +212,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - tables = {} - for name, adata in adatas.items(): - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - tables[name] = TableModel.parse( - adata, - region=f"cells_{name}", - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) - if load_shapes: shapes = { f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( @@ -200,7 +221,7 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, adatas.values()) + for x, adata in zip(rois_str, tables.values()) } else: shapes = {} @@ -247,3 +268,35 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] + + +if __name__ == "__main__": + path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" + sdata = seqfish( + path=path_read, + # load_images=True, + # load_labels=True, 
+ # load_points=True, + # rois=[1], + ) + # sdata.pl.render_labels(color='Arg1').pl.show() + import matplotlib.pyplot as plt + + # plt.show() + # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" + # sdata.set_table_annotates_spatialelement( + # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + # ) + + gene_name = "Arg1" + ( + sdata.pl.render_images("Roi1_DAPI", cmap="gray") + .pl.render_labels("Roi1_Segmentation", color=gene_name) + .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") + .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", figsize=(10, 5)) + ) + plt.show() + + from napari_spatialdata import Interactive + + Interactive(sdata) From ff14eed2b0b144085c4758f1a286c3c7d3b0b0f2 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 11:24:57 +0100 Subject: [PATCH 28/33] before removing 'load_additional_shapes' --- src/spatialdata_io/readers/seqfish.py | 90 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index d7aa656a..c41a940c 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -36,7 +36,7 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,9 +69,11 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_shapes - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + load_additional_geometries + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is + specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are + "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, + no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. 
rois @@ -198,19 +200,24 @@ def get_transcript_file(roi: str) -> str: else: labels = {} + points = {} if load_points: - points = { - f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( - pd.read_csv(path / get_transcript_file(x), delimiter=","), + for x in rois_str: + + + # prepare data + name = f"{os.path.splitext(get_transcript_file(x))[0]}" + p = pd.read_csv(path / get_transcript_file(x), delimiter=",") + instance_key_points = SK.INSTANCE_KEY_POINTS.value if SK.INSTANCE_KEY_POINTS.value in p.columns else None + + # call parser + points[name] = PointsModel.parse( + p, coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=None, + instance_key=instance_key_points, transformations={"global": Identity()}, ) - for x in rois_str - } - else: - points = {} if load_shapes: shapes = { @@ -226,33 +233,45 @@ def get_transcript_file(roi: str) -> str: else: shapes = {} - if load_additional_shapes is not None: - shape_file_names = [] + if load_additional_geometries is not None: + labels_filenames = [] + shapes_filenames = [] + for filename in os.listdir(path): - if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): - if load_additional_shapes == "all": - if not any(key in filename for key in images.keys()) and not any( - key in filename for key in labels.keys() - ): - shape_file_names.append(filename) - elif load_additional_shapes == "segmentation": - if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): - shape_file_names.append(filename) - elif load_additional_shapes == "boundaries": - if SK.BOUNDARIES in filename: - shape_file_names.append(filename) - elif isinstance(load_additional_shapes, str): - if load_additional_shapes in filename: - shape_file_names.append(filename) - else: - raise ValueError(f"No file found with identifier {load_additional_shapes}") - - for x in range(len(shape_file_names)): - shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( - path / shape_file_names[x], + if filename.endswith(SK.TIFF_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + labels_filenames.append(filename) + continue + if filename.endswith(SK.GEOJSON_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + shapes_filenames.append(filename) + continue + raise ValueError(f"No file found with identifier {load_additional_geometries}") + + for labels_filename in labels_filenames: + labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( + imread(path / labels_filename, **imread_kwargs).squeeze(), + dims=("y", "x"), + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, + ) + + for shape_filename in shapes_filenames: + # TODO: check that indices of newly parsed shapes match the indices of the existing shapes + shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( + path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) + pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,6 +306,7 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # 
sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) + import spatialdata_plot gene_name = "Arg1" ( From a4434695d18829171cd804a1eb932aef9ec71491 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:25:18 +0000 Subject: [PATCH 29/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/seqfish.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..996c6d4d 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -204,7 +204,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -241,16 +240,20 @@ def get_transcript_file(roi: str) -> str: if filename.endswith(SK.TIFF_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "segmentation" + and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): labels_filenames.append(filename) continue if filename.endswith(SK.GEOJSON_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "boundaries" + and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): shapes_filenames.append(filename) continue @@ -271,7 +274,6 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -306,7 +308,6 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) - import spatialdata_plot gene_name = "Arg1" ( From 375fa5cc25b3666d4fc7077d18bf256e0f1e7903 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 13:21:48 +0100 Subject: [PATCH 30/33] finished code review --- src/spatialdata_io/readers/seqfish.py | 115 +++++--------------------- 1 file changed, 22 insertions(+), 93 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..27431c09 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -6,7 +6,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any, Literal +from typing import Any import anndata as ad import numpy as np @@ -36,7 +36,6 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = 
None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,11 +68,6 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_geometries - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is - specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are - "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, - no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. rois @@ -87,13 +81,13 @@ def seqfish( Examples -------- - This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. - Please check that Roi1 is present in your dataset, otherwise adjust the code below. + This code shows how to change the annotation target of the table from the cell labels to the cell boundaries. + Please check that the string Roi1 is used in the naming of your dataset, otherwise adjust the code below. >>> from spatialdata_io import seqfish >>> sdata = seqfish("path/to/raw/data") - >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata["table_Roi1"].obs["region"] = "Roi1_Boundaries" >>> sdata.set_table_annotates_spatialelement( - ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... table_name="table_Roi1", region="Roi1_Boundaries", region_key="region", instance_key="instance_id" ... ) >>> sdata.write("path/to/data.zarr") """ @@ -126,9 +120,12 @@ def get_count_file(roi: str) -> str: def get_dapi_file(roi: str) -> str: return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(roi: str) -> str: + def get_cell_segmentation_labels_file(roi: str) -> str: return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" + def get_cell_segmentation_shapes_file(roi: str) -> str: + return f"{roi}_{SK.BOUNDARIES}{SK.GEOJSON_FILE}" + def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" @@ -154,7 +151,7 @@ def get_transcript_file(roi: str) -> str: adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() # map tables to cell labels (defined later) - region = os.path.splitext(get_cell_mask_file(roi_str))[0] + region = os.path.splitext(get_cell_segmentation_labels_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) @@ -189,8 +186,8 @@ def get_transcript_file(roi: str) -> str: if load_labels: labels = { - f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( - imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), + f"{os.path.splitext(get_cell_segmentation_labels_file(x))[0]}": Labels2DModel.parse( + imread(path / get_cell_segmentation_labels_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, @@ -204,7 +201,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -219,59 +215,25 @@ def get_transcript_file(roi: str) -> str: transformations={"global": Identity()}, ) - if load_shapes: - shapes = { - 
f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + shapes = {} + if cells_as_circles: + for x, adata in zip(rois_str, tables.values()): + shapes[f"{os.path.splitext(get_cell_file(x))[0]}"] = ShapesModel.parse( adata.obsm[SK.SPATIAL_KEY], geometry=0, radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, tables.values()) - } - else: - shapes = {} - - if load_additional_geometries is not None: - labels_filenames = [] - shapes_filenames = [] - - for filename in os.listdir(path): - if filename.endswith(SK.TIFF_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - labels_filenames.append(filename) - continue - if filename.endswith(SK.GEOJSON_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - shapes_filenames.append(filename) - continue - raise ValueError(f"No file found with identifier {load_additional_geometries}") - - for labels_filename in labels_filenames: - labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( - imread(path / labels_filename, **imread_kwargs).squeeze(), - dims=("y", "x"), - scale_factors=raster_models_scale_factors, + if load_shapes: + for x in rois_str: + # this assumes that the index matches the instance key of the table. A more robust approach could be + # implemented, as described here https://github.com/scverse/spatialdata-io/issues/249 + shapes[f"{os.path.splitext(get_cell_segmentation_shapes_file(x))[0]}"] = ShapesModel.parse( + path / get_cell_segmentation_shapes_file(x), transformations={"global": scaled[x]}, - ) - - for shape_filename in shapes_filenames: - # TODO: check that indices of newly parsed shapes match the indices of the existing shapes - shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( - path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,36 +249,3 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] - - -if __name__ == "__main__": - path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" - sdata = seqfish( - path=path_read, - # load_images=True, - # load_labels=True, - # load_points=True, - # rois=[1], - ) - # sdata.pl.render_labels(color='Arg1').pl.show() - import matplotlib.pyplot as plt - - # plt.show() - # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" - # sdata.set_table_annotates_spatialelement( - # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" - # ) - import spatialdata_plot - - gene_name = "Arg1" - ( - sdata.pl.render_images("Roi1_DAPI", cmap="gray") - .pl.render_labels("Roi1_Segmentation", color=gene_name) - .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") - .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", 
figsize=(10, 5)) - ) - plt.show() - - from napari_spatialdata import Interactive - - Interactive(sdata) From 28b0eb3b0d42bc014e75f53b61c316504d84c375 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 16:30:48 +0100 Subject: [PATCH 31/33] tests for seqfish --- tests/test_seqfish.py | 27 +++++++++++++++++++++++++++ tests/test_xenium.py | 8 ++------ 2 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 tests/test_seqfish.py diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py new file mode 100644 index 00000000..e0d66655 --- /dev/null +++ b/tests/test_seqfish.py @@ -0,0 +1,27 @@ +import math +from pathlib import Path + +import pytest + +from spatialdata_io.readers.seqfish import seqfish +from tests._utils import skip_if_below_python_version + + +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk +@skip_if_below_python_version() +@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize("rois", [[1], None]) +@pytest.mark.parametrize("cells_as_circles", [False, True]) +def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = seqfish(f, cells_as_circles=cells_as_circles, rois=rois) + from spatialdata import get_extent + + extent = get_extent(sdata, exact=False) + extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} + if cells_as_circles: + # manual correction required to take into account for the circle radii + expected = "{'y': (-2, 109), 'x': (-2, 109)}" + assert str(extent) == expected diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 2fb35ce1..32408323 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -40,12 +40,8 @@ def test_roundtrip_with_data_limits() -> None: assert np.array_equal(cell_id_str, f0(*f1(cell_id_str))) -# The datasets should be downloaded from -# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data#test-data -# and placed in the "data" directory; if you run the tests locally you may need to create a symlink in "tests/data" -# pointing to "data". 
-# The GitHub workflow "prepare_test_data.yaml" takes care of downloading the datasets and uploading an artifact for the -# tests to use +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk @skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", From 91efd4519072aab66b405cd69ec40c8a59b229a0 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 17:01:04 +0100 Subject: [PATCH 32/33] fix seqfish test path --- tests/test_seqfish.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py index e0d66655..bdbf2a60 100644 --- a/tests/test_seqfish.py +++ b/tests/test_seqfish.py @@ -10,7 +10,9 @@ # See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on # how to download and place the data on disk @skip_if_below_python_version() -@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize( + "dataset,expected", [("seqfish-2-test-dataset/instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")] +) @pytest.mark.parametrize("rois", [[1], None]) @pytest.mark.parametrize("cells_as_circles", [False, True]) def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: From e9b74c0a1ada826c1b3fc579401d776c887c8923 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:38:58 +0100 Subject: [PATCH 33/33] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed3d6456..b9f29809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning][]. - (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 - (Macsima) added support @berombau #224 +- (seqFISH) support for v2 instrument #227 ## [0.1.6] - 2024-11-26
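The new CHANGELOG entry corresponds to the `seqfish()` reader revised in the commits above and exercised by `tests/test_seqfish.py`; a minimal invocation sketch (the dataset path mirrors the layout assumed by the test and must exist locally):

```python
from spatialdata import get_extent

from spatialdata_io.readers.seqfish import seqfish

# Parse a seqFISH v2 dataset; all detected ROIs are loaded and cells are additionally read as circles.
sdata = seqfish(
    "data/seqfish-2-test-dataset/instrument 2 official",
    cells_as_circles=True,
    rois=None,
)

# Approximate bounding box of all elements in the 'global' coordinate system, as checked in the test.
print(get_extent(sdata, exact=False))
```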