From 55ef92cc553816628a15ee9a46754fe81ace7fa8 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Mon, 21 Oct 2024 17:54:27 +0200 Subject: [PATCH 01/33] add cleaned macsima reader --- src/spatialdata_io/readers/_utils/_utils.py | 53 ++- src/spatialdata_io/readers/macsima.py | 384 ++++++++++++++++++++ tests/test_macsima.py | 111 ++++++ 3 files changed, 546 insertions(+), 2 deletions(-) create mode 100644 src/spatialdata_io/readers/macsima.py create mode 100644 tests/test_macsima.py diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 63f5bdf2..84cce922 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -8,6 +8,9 @@ import numpy as np from anndata import AnnData, read_text from h5py import File +from ome_types import from_tiff +from ome_types.model import Pixels, UnitsLength +from spatialdata._logging import logger from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 @@ -18,8 +21,8 @@ NDArrayA = NDArray[Any] except (ImportError, TypeError): - NDArray = np.ndarray # type: ignore[misc] - NDArrayA = np.ndarray # type: ignore[misc] + NDArray = np.ndarray # type: ignore[unused-ignore] + NDArrayA = np.ndarray # type: ignore[unused-ignore] def _read_counts( @@ -84,3 +87,49 @@ def _initialize_raster_models_kwargs( if "scale_factors" not in labels_models_kwargs: labels_models_kwargs["scale_factors"] = [2, 2, 2, 2] return image_models_kwargs, labels_models_kwargs + + +def calc_scale_factors(stack: Any, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: + """Calculate scale factors based on image size to get lowest resolution under min_size pixels.""" + # get lowest dimension, ignoring channels + lower_scale_limit = min(stack.shape[1:]) + scale_factor = default_scale_factor + scale_factors = [scale_factor] + lower_scale_limit /= scale_factor + while lower_scale_limit >= min_size: + # scale_factors are cumulative, so we don't need to do e.g. scale_factor *= 2 + scale_factors.append(scale_factor) + lower_scale_limit /= scale_factor + return scale_factors + + +def parse_channels(path: Path) -> list[str]: + """Parse channel names from an OME-TIFF file.""" + images = from_tiff(path).images + if len(images) > 1: + logger.warning("Found multiple images in OME-TIFF file. 
Only the first one will be used.") + channels = images[0].pixels.channels + logger.debug(channels) + names = [c.name for c in channels if c.name is not None] + return names + + +def parse_physical_size(path: Path | None = None, ome_pixels: Pixels | None = None) -> float: + """Parse physical size from OME-TIFF to micrometer.""" + pixels = ome_pixels or from_tiff(path).images[0].pixels + logger.debug(pixels) + if pixels.physical_size_x_unit != pixels.physical_size_y_unit: + logger.error("Physical units for x and y dimensions are not the same.") + raise NotImplementedError + if pixels.physical_size_x != pixels.physical_size_y: + logger.error("Physical sizes for x and y dimensions are the same.") + raise NotImplementedError + # convert to micrometer if needed + if pixels.physical_size_x_unit == UnitsLength.NANOMETER: + physical_size = pixels.physical_size_x / 1000 + elif pixels.physical_size_x_unit == UnitsLength.MICROMETER: + physical_size = pixels.physical_size_x + else: + logger.error(f"Physical unit not recognized: '{pixels.physical_size_x_unit}'.") + raise NotImplementedError + return float(physical_size) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py new file mode 100644 index 00000000..fd497f19 --- /dev/null +++ b/src/spatialdata_io/readers/macsima.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +import warnings +from collections.abc import Mapping +from pathlib import Path +from types import MappingProxyType +from typing import Any + +import anndata as ad +import dask.array as da +import pandas as pd +import spatialdata as sd +from dask_image.imread import imread +from spatialdata import SpatialData +from spatialdata._logging import logger + +from spatialdata_io._constants._enum import ModeEnum +from spatialdata_io.readers._utils._utils import ( + calc_scale_factors, + parse_channels, + parse_physical_size, +) + +__all__ = ["macsima"] + + +class MACSimaParsingStyle(ModeEnum): + """Different parsing styles for MACSima data.""" + + PROCESSED_SINGLE_FOLDER = "processed_single_folder" + PROCESSED_MULTIPLE_FOLDERS = "processed_multiple_folders" + RAW = "raw" + AUTO = "auto" + + +def macsima( + path: str | Path, + parsing_style: MACSimaParsingStyle | str = MACSimaParsingStyle.AUTO, + filter_folder_names: list[str] | None = None, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + subset: int | None = None, + c_subset: int | None = None, + max_chunk_size: int = 1024, + c_chunks_size: int = 1, + multiscale: bool = True, + transformations: bool = True, + scale_factors: list[int] | None = None, + default_scale_factor: int = 2, + nuclei_channel_name: str = "DAPI", + skip_rounds: list[int] | None = None, +) -> SpatialData: + """ + Read *MACSima* formatted dataset. + + This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from the image names. The channel names are parsed from the OME metadata. + + .. seealso:: + + - `MACSima output `_. + + Parameters + ---------- + path + Path to the directory containing the data. + parsing_style + Parsing style to use. If ``auto``, the parsing style is determined based on the contents of the path. + filter_folder_names + List of folder names to filter out when parsing multiple folders. + imread_kwargs + Keyword arguments passed to :func:`dask_image.imread.imread`. + subset + Subset the image to the first ``subset`` pixels in x and y dimensions. + c_subset + Subset the image to the first ``c_subset`` channels. 
+ max_chunk_size + Maximum chunk size for x and y dimensions. + c_chunks_size + Chunk size for c dimension. + multiscale + Whether to create a multiscale image. + transformations + Whether to add a transformation from pixels to microns to the image. + scale_factors + Scale factors to use for downsampling. If None, scale factors are calculated based on image size. + default_scale_factor + Default scale factor to use for downsampling. + nuclei_channel_name + Common string of the nuclei channel to separate nuclei from other channels. + skip_rounds + List of round numbers to skip when parsing the data. + + Returns + ------- + :class:`spatialdata.SpatialData` + """ + path = Path(path) + if not isinstance(parsing_style, MACSimaParsingStyle): + parsing_style = MACSimaParsingStyle(parsing_style) + + if parsing_style == MACSimaParsingStyle.AUTO: + assert path.is_dir(), f"Path {path} is not a directory." + + if any(p.suffix in [".tif", ".tiff"] for p in path.iterdir()): + # if path contains tifs, do parse_processed_folder on path + parsing_style = MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER + elif all(p.is_dir() for p in path.iterdir() if not p.name.startswith(".")): + # if path contains only folders or hidden files, do parse_processed_folder on each folder + parsing_style = MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS + else: + raise ValueError(f"Cannot determine parsing style for path {path}. Please specify the parsing style.") + + if parsing_style == MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER: + return parse_processed_folder( + path, + imread_kwargs, + subset, + c_subset, + max_chunk_size, + c_chunks_size, + multiscale, + transformations, + scale_factors, + default_scale_factor, + nuclei_channel_name, + skip_rounds, + ) + if parsing_style == MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS: + sdatas = {} + # iterate over all non-filtered folders in path and parse each folder + for p in [ + p + for p in path.iterdir() + if p.is_dir() and (not filter_folder_names or not any(f in p.name for f in filter_folder_names)) + ]: + sdatas[p.stem] = parse_processed_folder( + p, + imread_kwargs, + subset, + c_subset, + max_chunk_size, + c_chunks_size, + multiscale, + transformations, + scale_factors, + default_scale_factor, + nuclei_channel_name, + skip_rounds, + ) + return sd.concatenate(list(sdatas.values())) + if parsing_style == MACSimaParsingStyle.RAW: + # TODO: see https://github.com/scverse/spatialdata-io/issues/155 + raise NotImplementedError("Parsing raw MACSima data is not yet implemented.") + + +def parse_name_to_cycle(name: str) -> int: + """Parse the cycle number from the name of the image.""" + cycle = name.split("_")[0] + if "-" in cycle: + cycle = cycle.split("-")[1] + return int(cycle) + + +def parse_processed_folder( + path: Path, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + subset: int | None = None, + c_subset: int | None = None, + max_chunk_size: int = 1024, + c_chunks_size: int = 1, + multiscale: bool = True, + transformations: bool = True, + scale_factors: list[int] | None = None, + default_scale_factor: int = 2, + nuclei_channel_name: str = "DAPI", + skip_rounds: list[int] | None = None, + file_pattern: str = "*.tif*", +) -> SpatialData: + """Parse a single folder containing images from a cyclical imaging platform.""" + # get list of image paths, get channel name from OME data and cycle round number from filename + # look for OME-TIFF files + path_files = list(path.glob(file_pattern)) + logger.debug(path_files[0]) + # make sure not to remove round 0 when parsing! 
+ cycles = [] + channels = [] + for p in path_files: + cycle = parse_name_to_cycle(p.stem) + cycles.append(cycle) + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + channel_names = parse_channels(p) + if len(channel_names) > 1: + logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") + channels.append(channel_names[0]) + except ValueError as e: + logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + stack, sorted_cycles, sorted_channels = get_stack( + path_files, + cycles, + channels, + imread_kwargs, + skip_rounds=skip_rounds, + ) + + sorted_cycle_channels: list[str] = [] + for c, ch in zip(sorted_cycles, sorted_channels): + sorted_cycle_channels.append(f"R{str(c)} {ch}") + + # do subsetting if needed + if subset: + stack = stack[:, :subset, :subset] + if c_subset: + stack = stack[:c_subset, :, :] + sorted_cycle_channels = sorted_cycle_channels[:c_subset] + if multiscale and not scale_factors: + scale_factors = calc_scale_factors(stack, default_scale_factor=default_scale_factor) + if not multiscale: + scale_factors = None + logger.debug(f"Scale factors: {scale_factors}") + + filtered_name = path.stem.replace(" ", "_") + + return create_sdata( + stack, + sorted_cycle_channels, + path_files, + max_chunk_size, + c_chunks_size, + transformations, + scale_factors, + nuclei_channel_name, + filtered_name, + ) + + +def create_sdata( + stack: da.Array, + sorted_cycle_channels: list[str], + path_files: list[Path], + max_chunk_size: int, + c_chunks_size: int, + transformations: bool, + scale_factors: list[int] | None, + nuclei_channel_name: str, + filtered_name: str, +) -> SpatialData: + # for stack and sorted_cycle_channels, if channel name is nuclei_channel_name, add to seperate nuclei stack + # keep the first nuclei channel in both the stack and the nuclei stack + nuclei_sorted_cycle_channels = [] + nuclei_idx = [] + for i, c in enumerate(sorted_cycle_channels): + if nuclei_channel_name in c: + nuclei_sorted_cycle_channels.append(c) + nuclei_idx.append(i) + if len(nuclei_idx) > 2: + has_nuclei_stack = True + # More than two nuclei channels found, keep only the first and last one in the stack + nuclei_stack = stack[nuclei_idx] + nuclei_idx_without_first_and_last = nuclei_idx[1:-1] + stack = stack[[i for i in range(len(sorted_cycle_channels)) if i not in nuclei_idx_without_first_and_last]] + sorted_cycle_channels = [ + c for i, c in enumerate(sorted_cycle_channels) if i not in nuclei_idx_without_first_and_last + ] + else: + has_nuclei_stack = False + # Only one or two nuclei channels found, keep all in the stack + pixels_to_microns = parse_physical_size(path_files[0]) + image_element = create_image_element( + stack, + sorted_cycle_channels, + max_chunk_size, + c_chunks_size, + transformations, + pixels_to_microns, + scale_factors, + name=filtered_name, + ) + if has_nuclei_stack: + nuclei_image_element = create_image_element( + nuclei_stack, + nuclei_sorted_cycle_channels, + max_chunk_size, + c_chunks_size, + transformations, + pixels_to_microns, + scale_factors, + name=filtered_name, + ) + table_nuclei = create_table(nuclei_sorted_cycle_channels) + table_channels = create_table(sorted_cycle_channels) + + sdata = sd.SpatialData( + images={ + f"{filtered_name}_image": image_element, + }, + tables={ + f"{filtered_name}_table": table_channels, + }, + ) + if has_nuclei_stack: + sdata.images[f"{filtered_name}_nuclei_image"] = nuclei_image_element + sdata.tables[f"{filtered_name}_nuclei_table"] = 
table_nuclei + + return sdata + + +def create_table(sorted_cycle_channels: list[str]) -> ad.AnnData: + df = pd.DataFrame( + { + "name": sorted_cycle_channels, + "cycle": [int(c.split(" ")[0][1:]) for c in sorted_cycle_channels], + } + ) + table = ad.AnnData(var=df) + table.var_names = sorted_cycle_channels + return sd.models.TableModel.parse(table) + + +def create_image_element( + stack: da.Array, + sorted_channels: list[str], + max_chunk_size: int, + c_chunks_size: int, + transformations: bool, + pixels_to_microns: float, + scale_factors: list[int] | None, + name: str | None = None, +) -> sd.models.Image2DModel: + t_dict = None + if transformations: + t_pixels_to_microns = sd.transformations.Scale([pixels_to_microns, pixels_to_microns], axes=("x", "y")) + # 'microns' is also used in merscope example + # no inverse needed as the transformation is already from pixels to microns + t_dict = {name: t_pixels_to_microns} + # # chunk_size can be 1 for channels + chunks = { + "x": max_chunk_size, + "y": max_chunk_size, + "c": c_chunks_size, + } + if t_dict: + logger.debug("Adding transformation: %s", t_dict) + el = sd.models.Image2DModel.parse( + stack, + # TODO: make sure y and x locations are correct + dims=["c", "y", "x"], + scale_factors=scale_factors, + chunks=chunks, + c_coords=sorted_channels, + transformations=t_dict, + ) + return el + + +def get_stack( + path_files: list[Path], + cycles: list[int], + channels: list[str], + imread_kwargs: Mapping[str, Any], + skip_rounds: list[int] | None = None, +) -> tuple[da.Array, list[int], list[str]]: + if len(path_files) != len(cycles) or len(path_files) != len(channels): + raise ValueError("Length of path_files, cycles and channels must be the same.") + # if any of round_channels is in skip_rounds, remove that round from the list and from path_files + if skip_rounds: + logger.info("Skipping cycles: %d", skip_rounds) + path_files, cycles, channels = map( + list, + zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + ) + imgs = [imread(img, **imread_kwargs) for img in path_files] + for img, path in zip(imgs, path_files): + if img.shape[1:] != imgs[0].shape[1:]: + raise ValueError( + f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + ) + # sort imgs, cycles and channels based on cycles + imgs, cycles, channels = map( + list, + zip(*sorted(zip(imgs, cycles, channels), key=lambda x: (x[1]))), + ) + stack = da.stack(imgs).squeeze() + return stack, cycles, channels diff --git a/tests/test_macsima.py b/tests/test_macsima.py new file mode 100644 index 00000000..759867b0 --- /dev/null +++ b/tests/test_macsima.py @@ -0,0 +1,111 @@ +import math +import sys +from pathlib import Path +from typing import Any + +import pytest +from spatialdata.models import get_channels + +from spatialdata_io.readers.macsima import macsima + +if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): + # The datasets should be downloaded and placed unzipped in the "data" directory; + # MACSimaData_HCA/ with e.g. 
unzipped HumanLiverH35/ inside: https://livercellatlas.org/download.php + # Lung_adc_demo/: Ask Miltenyi Biotec for the demo dataset + pytest.skip( + "requires the Lung_adc_demo or MACSimaData_HCA datasets", + allow_module_level=True, + ) + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", {"y": (0, 15460), "x": (0, 13864)}), + ("MACSimaData_HCA/HumanLiverH35", {"y": (0, 1154), "x": (0, 1396)}), + ], +) +def test_image_size(dataset: str, expected: dict[str, Any]) -> None: + from spatialdata import get_extent + + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + el = sdata[list(sdata.images.keys())[0]] + cs = sdata.coordinate_systems[0] + + extent: dict[str, tuple[float, float]] = get_extent(el, coordinate_system=cs) + extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} + assert extent == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [("Lung_adc_demo", 116), ("MACSimaData_HCA/HumanLiverH35", 102)], +) +def test_total_channels(dataset: str, expected: int) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + el = sdata[list(sdata.images.keys())[0]] + + # get the number of channels + channels: int = len(get_channels(el)) + assert channels == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", ["R0 DAPI", "R1 CD68", "R1 CD163"]), + ("MACSimaData_HCA/HumanLiverH35", ["R0 DAPI", "R1 PE", "R1 DAPI"]), + ], +) +def test_channel_names(dataset: str, expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, c_subset=3) + el = sdata[list(sdata.images.keys())[0]] + + # get the channel names + channels = get_channels(el) + assert list(channels) == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", [0, 1, 1]), + ("MACSimaData_HCA/HumanLiverH35", [0, 1, 1]), + ], +) +def test_cycle_metadata(dataset: str, expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, c_subset=3) + table = sdata[list(sdata.tables.keys())[0]] + + # get the channel names + cycles = table.var["cycle"] + assert list(cycles) == expected + + +def test_parsing_style() -> None: + with pytest.raises(ValueError): + macsima(Path("."), parsing_style="not_a_parsing_style") + + +# @pytest.mark.parametrize( +# "name,expected", +# [ +# ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), +# ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), +# ], +# ) +# def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: +# result = parse_name_to_cycle(name) +# assert result == expected From e001548699c38435dd9f52b61f0ce982bc1dc935 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 22 Oct 2024 09:38:18 +0200 Subject: [PATCH 02/33] add ome-types --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6b12efe6..e522bd8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "pyarrow", "readfcs", "tifffile>=2023.8.12", + "ome-types", ] [project.optional-dependencies] From 
47d651df62c65c6540e16d85c05c60068fcec81a Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 22 Oct 2024 11:41:05 +0200 Subject: [PATCH 03/33] refactor to MultiChannelImage, add parameter include_cycle_in_channel_name --- src/spatialdata_io/readers/_utils/_utils.py | 5 +- src/spatialdata_io/readers/macsima.py | 317 +++++++++++++------- tests/test_macsima.py | 24 +- 3 files changed, 220 insertions(+), 126 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 752d68c4..5aedf5d4 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -89,11 +89,10 @@ def _initialize_raster_models_kwargs( return image_models_kwargs, labels_models_kwargs -def calc_scale_factors(stack: Any, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: +def calc_scale_factors(lower_scale_limit: float, min_size: int = 1000, default_scale_factor: int = 2) -> list[int]: """Calculate scale factors based on image size to get lowest resolution under min_size pixels.""" # get lowest dimension, ignoring channels - lower_scale_limit = min(stack.shape[1:]) - scale_factor = default_scale_factor + scale_factor: int = default_scale_factor scale_factors = [scale_factor] lower_scale_limit /= scale_factor while lower_scale_limit >= min_size: diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index fd497f19..2916d79b 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -1,7 +1,9 @@ from __future__ import annotations import warnings +from collections import defaultdict from collections.abc import Mapping +from dataclasses import dataclass from pathlib import Path from types import MappingProxyType from typing import Any @@ -33,6 +35,139 @@ class MACSimaParsingStyle(ModeEnum): AUTO = "auto" +@dataclass +class ChannelMetadata: + """Metadata for a channel in a multichannel dataset.""" + + name: str + cycle: int + + +@dataclass +class MultiChannelImage: + """Multichannel image with metadata.""" + + data: list[da.Array] + metadata: list[ChannelMetadata] + include_cycle_in_channel_name: bool = False + + @classmethod + def from_paths( + cls, + path_files: list[Path], + imread_kwargs: Mapping[str, Any], + skip_rounds: list[int] | None = None, + ) -> MultiChannelImage: + cycles = [] + channels = [] + for p in path_files: + cycle = parse_name_to_cycle(p.stem) + cycles.append(cycle) + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + channel_names = parse_channels(p) + if len(channel_names) > 1: + logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") + channels.append(channel_names[0]) + except ValueError as e: + logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + + if len(path_files) != len(cycles) or len(path_files) != len(channels): + raise ValueError("Length of path_files, cycles and channels must be the same.") + # if any of round_channels is in skip_rounds, remove that round from the list and from path_files + if skip_rounds: + logger.info("Skipping cycles: %d", skip_rounds) + path_files, cycles, channels = map( + list, + zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + ) + imgs = [imread(img, **imread_kwargs) for img in path_files] + for img, path in zip(imgs, path_files): + if img.shape[1:] != imgs[0].shape[1:]: + raise ValueError( + f"Images are not all the same size. 
Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + ) + # sort imgs, cycles and channels based on cycles + output = cls( + data=imgs, + metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], + ) + return output + + @classmethod + def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: + """Create new MultiChannelImage with only the channels that contain c_name.""" + indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] + return MultiChannelImage.filter_by_index(mci, indices) + + @classmethod + def filter_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: + """Filter the image by index.""" + metadata = [c for i, c in enumerate(mci.metadata) if i in indices] + data = [d for i, d in enumerate(mci.data) if i in indices] + return cls( + data=data, + metadata=metadata, + include_cycle_in_channel_name=mci.include_cycle_in_channel_name, + ) + + def get_channel_names(self) -> list[str]: + """Get the channel names.""" + if self.include_cycle_in_channel_name: + return [f"R{c.cycle} {c.name}" for c in self.metadata] + else: + # if name is duplicated, add (i) to the name + names = [c.name for c in self.metadata] + name_dict: dict[str, int] = defaultdict(int) + name_counter: dict[str, int] = defaultdict(int) + for name in names: + name_dict[name] += 1 + output = [] + for name in names: + name_counter[name] += 1 + output.append(f"{name} ({name_counter[name]})" if name_dict[name] > 1 else name) + return output + + def get_cycles(self) -> list[int]: + """Get the cycle numbers.""" + return [c.cycle for c in self.metadata] + + def sort_by_channel(self) -> None: + """Sort the channels by cycle number.""" + self.metadata = sorted(self.metadata, key=lambda x: x.cycle) + self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] + + def subset(self, subset: int | None = None) -> MultiChannelImage: + """Subset the image.""" + if subset: + self.data = [d[:, :subset, :subset] for d in self.data] + return self + + def subset_channels(self, c_subset: int) -> MultiChannelImage: + """Subset the channels.""" + self.data = self.data[:c_subset] + self.metadata = self.metadata[:c_subset] + return self + + def subset_by_idx(self, indices: list[int]) -> MultiChannelImage: + """Subset the image by index.""" + data = [d for i, d in enumerate(self.data) if i in indices] + metadata = [c for i, c in enumerate(self.metadata) if i in indices] + return MultiChannelImage( + data=data, + metadata=metadata, + include_cycle_in_channel_name=self.include_cycle_in_channel_name, + ) + + def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: + lower_scale_limit = min(self.data[0].shape[1:]) + return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) + + def get_stack(self) -> da.Array: + return da.stack(self.data).squeeze() + + def macsima( path: str | Path, parsing_style: MACSimaParsingStyle | str = MACSimaParsingStyle.AUTO, @@ -47,7 +182,9 @@ def macsima( scale_factors: list[int] | None = None, default_scale_factor: int = 2, nuclei_channel_name: str = "DAPI", + split_threshold_nuclei_channel: int | None = 2, skip_rounds: list[int] | None = None, + include_cycle_in_channel_name: bool = False, ) -> SpatialData: """ Read *MACSima* formatted dataset. @@ -86,8 +223,12 @@ def macsima( Default scale factor to use for downsampling. 
nuclei_channel_name Common string of the nuclei channel to separate nuclei from other channels. + split_threshold_nuclei_channel + Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds List of round numbers to skip when parsing the data. + include_cycle_in_channel_name + Whether to include the cycle number in the channel name. Returns ------- @@ -121,8 +262,10 @@ def macsima( transformations, scale_factors, default_scale_factor, - nuclei_channel_name, - skip_rounds, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + skip_rounds=skip_rounds, + include_cycle_in_channel_name=include_cycle_in_channel_name, ) if parsing_style == MACSimaParsingStyle.PROCESSED_MULTIPLE_FOLDERS: sdatas = {} @@ -143,8 +286,10 @@ def macsima( transformations, scale_factors, default_scale_factor, - nuclei_channel_name, - skip_rounds, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + skip_rounds=skip_rounds, + include_cycle_in_channel_name=include_cycle_in_channel_name, ) return sd.concatenate(list(sdatas.values())) if parsing_style == MACSimaParsingStyle.RAW: @@ -172,49 +317,33 @@ def parse_processed_folder( scale_factors: list[int] | None = None, default_scale_factor: int = 2, nuclei_channel_name: str = "DAPI", + split_threshold_nuclei_channel: int | None = 2, skip_rounds: list[int] | None = None, file_pattern: str = "*.tif*", + include_cycle_in_channel_name: bool = False, ) -> SpatialData: """Parse a single folder containing images from a cyclical imaging platform.""" # get list of image paths, get channel name from OME data and cycle round number from filename # look for OME-TIFF files path_files = list(path.glob(file_pattern)) logger.debug(path_files[0]) - # make sure not to remove round 0 when parsing! - cycles = [] - channels = [] - for p in path_files: - cycle = parse_name_to_cycle(p.stem) - cycles.append(cycle) - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - channel_names = parse_channels(p) - if len(channel_names) > 1: - logger.warning(f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.") - channels.append(channel_names[0]) - except ValueError as e: - logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. 
Skipping this file.") - stack, sorted_cycles, sorted_channels = get_stack( + + mci = MultiChannelImage.from_paths( path_files, - cycles, - channels, imread_kwargs, - skip_rounds=skip_rounds, + skip_rounds, ) + mci.include_cycle_in_channel_name = include_cycle_in_channel_name - sorted_cycle_channels: list[str] = [] - for c, ch in zip(sorted_cycles, sorted_channels): - sorted_cycle_channels.append(f"R{str(c)} {ch}") + mci.sort_by_channel() # do subsetting if needed if subset: - stack = stack[:, :subset, :subset] + mci = mci.subset(subset) if c_subset: - stack = stack[:c_subset, :, :] - sorted_cycle_channels = sorted_cycle_channels[:c_subset] + mci = mci.subset_channels(c_subset) if multiscale and not scale_factors: - scale_factors = calc_scale_factors(stack, default_scale_factor=default_scale_factor) + scale_factors = mci.calc_scale_factors(default_scale_factor=default_scale_factor) if not multiscale: scale_factors = None logger.debug(f"Scale factors: {scale_factors}") @@ -222,73 +351,68 @@ def parse_processed_folder( filtered_name = path.stem.replace(" ", "_") return create_sdata( - stack, - sorted_cycle_channels, - path_files, - max_chunk_size, - c_chunks_size, - transformations, - scale_factors, - nuclei_channel_name, - filtered_name, + mci=mci, + path_files=path_files, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + transformations=transformations, + scale_factors=scale_factors, + nuclei_channel_name=nuclei_channel_name, + split_threshold_nuclei_channel=split_threshold_nuclei_channel, + filtered_name=filtered_name, ) def create_sdata( - stack: da.Array, - sorted_cycle_channels: list[str], + mci: MultiChannelImage, path_files: list[Path], max_chunk_size: int, c_chunks_size: int, transformations: bool, scale_factors: list[int] | None, nuclei_channel_name: str, + split_threshold_nuclei_channel: int | None, filtered_name: str, ) -> SpatialData: - # for stack and sorted_cycle_channels, if channel name is nuclei_channel_name, add to seperate nuclei stack - # keep the first nuclei channel in both the stack and the nuclei stack - nuclei_sorted_cycle_channels = [] - nuclei_idx = [] - for i, c in enumerate(sorted_cycle_channels): - if nuclei_channel_name in c: - nuclei_sorted_cycle_channels.append(c) - nuclei_idx.append(i) - if len(nuclei_idx) > 2: - has_nuclei_stack = True - # More than two nuclei channels found, keep only the first and last one in the stack - nuclei_stack = stack[nuclei_idx] - nuclei_idx_without_first_and_last = nuclei_idx[1:-1] - stack = stack[[i for i in range(len(sorted_cycle_channels)) if i not in nuclei_idx_without_first_and_last]] - sorted_cycle_channels = [ - c for i, c in enumerate(sorted_cycle_channels) if i not in nuclei_idx_without_first_and_last - ] + nuclei_idx = [i for i, c in enumerate(mci.get_channel_names()) if nuclei_channel_name in c] + n_nuclei_channels = len(nuclei_idx) + if not split_threshold_nuclei_channel: + # if split_threshold_nuclei_channel is None, do not split nuclei channels + split_nuclei = False else: - has_nuclei_stack = False - # Only one or two nuclei channels found, keep all in the stack + split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel + if split_nuclei: + # if channel name is nuclei_channel_name, add to seperate nuclei stack + nuclei_mci = mci.subset_by_idx(nuclei_idx) + # keep the first nuclei channel in both the stack and the nuclei stack + nuclei_idx_without_first_and_last = nuclei_idx[1:-1] + mci = MultiChannelImage.filter_by_index( + mci, + [i for i in range(len(mci.metadata)) if i not in 
nuclei_idx_without_first_and_last], + ) + pixels_to_microns = parse_physical_size(path_files[0]) image_element = create_image_element( - stack, - sorted_cycle_channels, + mci, max_chunk_size, c_chunks_size, transformations, pixels_to_microns, scale_factors, - name=filtered_name, + coordinate_system=filtered_name, ) - if has_nuclei_stack: + if split_nuclei: nuclei_image_element = create_image_element( - nuclei_stack, - nuclei_sorted_cycle_channels, + nuclei_mci, max_chunk_size, c_chunks_size, transformations, pixels_to_microns, scale_factors, - name=filtered_name, + coordinate_system=filtered_name, ) - table_nuclei = create_table(nuclei_sorted_cycle_channels) - table_channels = create_table(sorted_cycle_channels) + table_nuclei = create_table(nuclei_mci) + table_channels = create_table(mci) sdata = sd.SpatialData( images={ @@ -298,41 +422,43 @@ def create_sdata( f"{filtered_name}_table": table_channels, }, ) - if has_nuclei_stack: + + if split_nuclei: sdata.images[f"{filtered_name}_nuclei_image"] = nuclei_image_element sdata.tables[f"{filtered_name}_nuclei_table"] = table_nuclei return sdata -def create_table(sorted_cycle_channels: list[str]) -> ad.AnnData: +def create_table(mci: MultiChannelImage) -> ad.AnnData: + cycles = mci.get_cycles() + names = mci.get_channel_names() df = pd.DataFrame( { - "name": sorted_cycle_channels, - "cycle": [int(c.split(" ")[0][1:]) for c in sorted_cycle_channels], + "name": names, + "cycle": cycles, } ) table = ad.AnnData(var=df) - table.var_names = sorted_cycle_channels + table.var_names = names return sd.models.TableModel.parse(table) def create_image_element( - stack: da.Array, - sorted_channels: list[str], + mci: MultiChannelImage, max_chunk_size: int, c_chunks_size: int, transformations: bool, pixels_to_microns: float, scale_factors: list[int] | None, - name: str | None = None, + coordinate_system: str | None = None, ) -> sd.models.Image2DModel: t_dict = None if transformations: t_pixels_to_microns = sd.transformations.Scale([pixels_to_microns, pixels_to_microns], axes=("x", "y")) # 'microns' is also used in merscope example # no inverse needed as the transformation is already from pixels to microns - t_dict = {name: t_pixels_to_microns} + t_dict = {coordinate_system: t_pixels_to_microns} # # chunk_size can be 1 for channels chunks = { "x": max_chunk_size, @@ -342,43 +468,12 @@ def create_image_element( if t_dict: logger.debug("Adding transformation: %s", t_dict) el = sd.models.Image2DModel.parse( - stack, + mci.get_stack(), # TODO: make sure y and x locations are correct dims=["c", "y", "x"], scale_factors=scale_factors, chunks=chunks, - c_coords=sorted_channels, + c_coords=mci.get_channel_names(), transformations=t_dict, ) return el - - -def get_stack( - path_files: list[Path], - cycles: list[int], - channels: list[str], - imread_kwargs: Mapping[str, Any], - skip_rounds: list[int] | None = None, -) -> tuple[da.Array, list[int], list[str]]: - if len(path_files) != len(cycles) or len(path_files) != len(channels): - raise ValueError("Length of path_files, cycles and channels must be the same.") - # if any of round_channels is in skip_rounds, remove that round from the list and from path_files - if skip_rounds: - logger.info("Skipping cycles: %d", skip_rounds) - path_files, cycles, channels = map( - list, - zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), - ) - imgs = [imread(img, **imread_kwargs) for img in path_files] - for img, path in zip(imgs, path_files): - if img.shape[1:] != imgs[0].shape[1:]: - raise 
ValueError( - f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" - ) - # sort imgs, cycles and channels based on cycles - imgs, cycles, channels = map( - list, - zip(*sorted(zip(imgs, cycles, channels), key=lambda x: (x[1]))), - ) - stack = da.stack(imgs).squeeze() - return stack, cycles, channels diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 759867b0..886003b5 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -6,7 +6,7 @@ import pytest from spatialdata.models import get_channels -from spatialdata_io.readers.macsima import macsima +from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): # The datasets should be downloaded and placed unzipped in the "data" directory; @@ -67,7 +67,7 @@ def test_total_channels(dataset: str, expected: int) -> None: def test_channel_names(dataset: str, expected: list[str]) -> None: f = Path("./data") / dataset assert f.is_dir() - sdata = macsima(f, c_subset=3) + sdata = macsima(f, c_subset=3, include_cycle_in_channel_name=True) el = sdata[list(sdata.images.keys())[0]] # get the channel names @@ -99,13 +99,13 @@ def test_parsing_style() -> None: macsima(Path("."), parsing_style="not_a_parsing_style") -# @pytest.mark.parametrize( -# "name,expected", -# [ -# ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), -# ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), -# ], -# ) -# def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: -# result = parse_name_to_cycle(name) -# assert result == expected +@pytest.mark.parametrize( + "name,expected", + [ + ("C-002_S-000_S_FITC_R-01_W-C-1_ROI-01_A-CD147_C-REA282.tif", 2), + ("001_S_R-01_W-B-1_ROI-01_A-CD14REA599ROI1_C-REA599.ome.tif", 1), + ], +) +def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: + result = parse_name_to_cycle(name) + assert result == expected From 601a1223d7255e754b1f90046b08d2d1d8d0df75 Mon Sep 17 00:00:00 2001 From: LLehner Date: Thu, 24 Oct 2024 16:10:13 +0200 Subject: [PATCH 04/33] update seqfish reader --- src/spatialdata_io/_constants/_constants.py | 11 +- src/spatialdata_io/readers/_utils/_utils.py | 9 +- src/spatialdata_io/readers/seqfish.py | 136 +++++++++++++------- 3 files changed, 99 insertions(+), 57 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 34848137..57f50db0 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -66,14 +66,15 @@ class SeqfishKeys(ModeEnum): # file extensions CSV_FILE = ".csv" TIFF_FILE = ".tiff" - OME_TIFF_FILE = ".ome.tiff" + GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "section" - TRANSCRIPT_COORDINATES = "TranscriptCoordinates" + SECTION = "_section" + TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" - COUNTS_FILE = "CxG" - CELL_MASK_FILE = "CellMask" + COUNTS_FILE = "CellxGene" + SEGMENTATION = "Segmentation" CELL_COORDINATES = "CellCoordinates" + BOUNDARIES = "Boundaries" # transcripts TRANSCRIPTS_X = "x" TRANSCRIPTS_Y = "y" diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 4623fced..8c9a412a 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -3,7 +3,7 @@ import os from collections.abc import Mapping from pathlib 
import Path -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np from anndata import AnnData, read_text @@ -13,15 +13,14 @@ PathLike = Union[os.PathLike, str] # type:ignore[type-arg] -try: +if TYPE_CHECKING: from numpy.typing import NDArray - NDArrayA = NDArray[Any] -except (ImportError, TypeError): - NDArray = np.ndarray +else: NDArrayA = np.ndarray + def _read_counts( path: str | Path, counts_file: str, diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index b82b2eef..e1f97f42 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any +from typing import Any, Literal, Optional, Union import anndata as ad import numpy as np @@ -19,7 +19,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations import Identity +from spatialdata.transformations.transformations import Identity from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -33,7 +33,9 @@ def seqfish( load_images: bool = True, load_labels: bool = True, load_points: bool = True, - sections: list[int] | None = None, + load_shapes: bool = True, + load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, + sections: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,8 +45,8 @@ def seqfish( - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.OME_TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.CELL_MASK_FILE!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -58,11 +60,16 @@ def seqfish( load_images Whether to load the images. load_labels - Whether to load the labels. + Whether to load cell segmentation. load_points - Whether to load the points. + Whether to load the transcript locations. + load_shapes + Whether to load cells as shape. + load_additional_shapes + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. sections - Which sections (specified as integers) to load. By default, all sections are loaded. + Which sections (specified as integers) to load. Only necessary if multiple sections present. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. 
@@ -71,7 +78,7 @@ def seqfish( :class:`spatialdata.SpatialData` """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?)_{SK.CELL_COORDINATES}_{SK.SECTION}[0-9]+" + re.escape(SK.CSV_FILE)) + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] if not count_files: # no file matching tbe pattern found @@ -85,52 +92,56 @@ def seqfish( n = len(count_files) all_sections = list(range(1, n + 1)) - if sections is None: - sections = all_sections - else: + sections_str: list[Optional[str]] = [] + if sections is not None: for section in sections: if section not in all_sections: raise ValueError(f"Section {section} not found in the data.") - sections_str = [f"{SK.SECTION}{x}" for x in sections] + else: + sections_str.append(f"{SK.SECTION}{section}") + elif sections is None: + sections_str.append(None) + else: + raise ValueError("Invalid type for 'section'. Must be list[int] or None.") - def get_cell_file(section: str) -> str: - return f"{prefix}_{SK.CELL_COORDINATES}_{section}{SK.CSV_FILE}" + def get_cell_file(section: str | None) -> str: + return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" - def get_count_file(section: str) -> str: - return f"{prefix}_{SK.COUNTS_FILE}_{section}{SK.CSV_FILE}" + def get_count_file(section: str | None) -> str: + return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" - def get_dapi_file(section: str) -> str: - return f"{prefix}_{SK.DAPI}_{section}{SK.OME_TIFF_FILE}" + def get_dapi_file(section: str | None) -> str: + return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str) -> str: - return f"{prefix}_{SK.CELL_MASK_FILE}_{section}{SK.TIFF_FILE}" + def get_cell_mask_file(section: str | None) -> str: + return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" - def get_transcript_file(section: str) -> str: - return f"{prefix}_{SK.TRANSCRIPT_COORDINATES}_{section}{SK.CSV_FILE}" + def get_transcript_file(section: str | None) -> str: + return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" - adatas: dict[str, ad.AnnData] = {} - for section in sections_str: # type: ignore[assignment] - assert isinstance(section, str) - cell_file = get_cell_file(section) - count_matrix = get_count_file(section) + adatas: dict[Optional[str], ad.AnnData] = {} + for section_str in sections_str: + assert isinstance(section_str, str) or section_str is None + cell_file = get_cell_file(section_str) + count_matrix = get_count_file(section_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section}" + region = f"cells_{section_str}" adata.obs[SK.REGION_KEY] = region adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section] = adata + adatas[section_str] = adata scale_factors = [2, 2, 2, 2] if load_images: images = { - f"image_{x}": Image2DModel.parse( + f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -139,11 +150,11 @@ def get_transcript_file(section: str) -> str: if load_labels: labels = { - f"labels_{x}": 
Labels2DModel.parse( + f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=scale_factors, - transformations={x: Identity()}, + transformations={"global": Identity()}, ) for x in sections_str } @@ -152,12 +163,12 @@ def get_transcript_file(section: str) -> str: if load_points: points = { - f"transcripts_{x}": PointsModel.parse( + f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=SK.INSTANCE_KEY_POINTS.value, - transformations={x: Identity()}, + # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + transformations={"global": Identity()}, ) for x in sections_str } @@ -174,16 +185,47 @@ def get_transcript_file(section: str) -> str: instance_key=SK.INSTANCE_KEY_TABLE.value, ) - shapes = { - f"cells_{x}": ShapesModel.parse( - adata.obsm[SK.SPATIAL_KEY], - geometry=0, - radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), - index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={x: Identity()}, - ) - for x, adata in adatas.items() - } + if load_shapes: + shapes = { + f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + adata.obsm[SK.SPATIAL_KEY], + geometry=0, + radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) + for x, adata in adatas.items() + } + else: + shapes = {} + + if load_additional_shapes is not None: + shape_file_names = [] + for filename in os.listdir(path): + if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): + if load_additional_shapes == "all": + if not any(key in filename for key in images.keys()) and not any( + key in filename for key in labels.keys() + ): + shape_file_names.append(filename) + elif load_additional_shapes == "segmentation": + if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): + shape_file_names.append(filename) + elif load_additional_shapes == "boundaries": + if SK.BOUNDARIES in filename: + shape_file_names.append(filename) + elif isinstance(load_additional_shapes, str): + if load_additional_shapes in filename: + shape_file_names.append(filename) + else: + raise ValueError(f"No file found with identifier {load_additional_shapes}") + + for x in range(len(shape_file_names)): + shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( + path / shape_file_names[x], + index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), + transformations={"global": Identity()}, + ) sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) From 044ab9d168c2d755e44f4480e9f1f77b8d75a4d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 14:19:08 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/_utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 8c9a412a..f7f3abe5 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -15,12 +15,12 @@ if TYPE_CHECKING: from 
numpy.typing import NDArray + NDArrayA = NDArray[Any] else: NDArrayA = np.ndarray - def _read_counts( path: str | Path, counts_file: str, From 35232d07112b9baf75a5328dc7e62091c8cdd0a5 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 31 Oct 2024 16:49:57 +0100 Subject: [PATCH 06/33] detect full res image when path is relative --- src/spatialdata_io/readers/visium_hd.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py index aa443a37..c8215269 100644 --- a/src/spatialdata_io/readers/visium_hd.py +++ b/src/spatialdata_io/readers/visium_hd.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os import re import warnings from collections.abc import Mapping @@ -116,12 +115,12 @@ def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None) stacklevel=2, ) - def _get_bins(path: Path) -> list[str]: + def _get_bins(path_bins: Path) -> list[str]: return sorted( [ - bin_size - for bin_size in os.listdir(path) - if os.path.isdir(os.path.join(path, bin_size)) and bin_size.startswith(VisiumHDKeys.BIN_PREFIX) + bin_size.name + for bin_size in path_bins.iterdir() + if bin_size.is_dir() and bin_size.name.startswith(VisiumHDKeys.BIN_PREFIX) ] ) @@ -265,10 +264,7 @@ def _get_bins(path: Path) -> list[str]: else: path_fullres = path / VisiumHDKeys.MICROSCOPE_IMAGE if path_fullres.exists(): - fullres_image_filenames = [ - f for f in os.listdir(path_fullres) if os.path.isfile(os.path.join(path_fullres, f)) - ] - fullres_image_paths = [path_fullres / image_filename for image_filename in fullres_image_filenames] + fullres_image_paths = [file for file in path_fullres.iterdir() if file.is_file()] elif list((path_fullres := (path / f"{filename_prefix}tissue_image")).parent.glob(f"{path_fullres.name}.*")): fullres_image_paths = list(path_fullres.parent.glob(f"{path_fullres.name}.*")) else: @@ -291,7 +287,7 @@ def _get_bins(path: Path) -> list[str]: if fullres_image_file is not None: load_image( - path=path / fullres_image_file, + path=fullres_image_file, suffix="_full_image", scale_factors=[2, 2, 2, 2], ) @@ -356,7 +352,7 @@ def _get_bins(path: Path) -> list[str]: def _infer_dataset_id(path: Path) -> str: suffix = f"_{VisiumHDKeys.FEATURE_SLICE_FILE.value}" - files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith(suffix)] + files = [file.name for file in path.iterdir() if file.is_file() and file.name.endswith(suffix)] if len(files) == 0 or len(files) > 1: raise ValueError( f"Cannot infer `dataset_id` from the feature slice file in {path}, please pass `dataset_id` as an argument." 
From 233946ae30b97665274bf197544929aaab601252 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:32:59 +0100 Subject: [PATCH 07/33] Update prefixes; Add scalefactors --- src/spatialdata_io/_constants/_constants.py | 4 +- src/spatialdata_io/readers/seqfish.py | 129 ++++++++++++-------- 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 57f50db0..92d26db3 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -68,7 +68,7 @@ class SeqfishKeys(ModeEnum): TIFF_FILE = ".tiff" GEOJSON_FILE = ".geojson" # file identifiers - SECTION = "_section" + ROI = "Roi" TRANSCRIPT_COORDINATES = "TranscriptList" DAPI = "DAPI" COUNTS_FILE = "CellxGene" @@ -88,6 +88,8 @@ class SeqfishKeys(ModeEnum): SPATIAL_KEY = "spatial" REGION_KEY = "region" INSTANCE_KEY_TABLE = "instance_id" + SCALEFEFACTOR_X = "PhysicalSizeX" + SCALEFEFACTOR_Y = "PhysicalSizeY" @unique diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index e1f97f42..0ce6f144 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -2,6 +2,7 @@ import os import re +import xml.etree.ElementTree as ET from collections.abc import Mapping from pathlib import Path from types import MappingProxyType @@ -10,6 +11,7 @@ import anndata as ad import numpy as np import pandas as pd +import tifffile from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import ( @@ -19,7 +21,7 @@ ShapesModel, TableModel, ) -from spatialdata.transformations.transformations import Identity +from spatialdata.transformations.transformations import Identity, Scale from spatialdata_io._constants._constants import SeqfishKeys as SK from spatialdata_io._docs import inject_docs @@ -35,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Union[Literal["segmentation", "boundaries", "all"], str, None] = None, - sections: Optional[list[int]] = None, + ROIs: Optional[list[int]] = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -43,11 +45,11 @@ def seqfish( This function reads the following files: - - ```{vx.COUNTS_FILE!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Counts and metadata file. - - ```{vx.CELL_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Cell coordinates file. - - ```{vx.DAPI!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: High resolution tiff image. - - ```{vx.SEGMENTATION!r}{vx.SECTION!r}{vx.TIFF_FILE!r}```: Cell mask file. - - ```{vx.TRANSCRIPT_COORDINATES!r}{vx.SECTION!r}{vx.CSV_FILE!r}```: Transcript coordinates file. + - ```{vx.ROI!r}{vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file. + - ```{vx.ROI!r}{vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file. + - ```{vx.ROI!r}{vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image. + - ```{vx.ROI!r}{vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file. + - ```{vx.ROI!r}{vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file. .. seealso:: @@ -67,8 +69,8 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - sections - Which sections (specified as integers) to load. Only necessary if multiple sections present. + ROIs + Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. 
If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. @@ -80,6 +82,9 @@ def seqfish( path = Path(path) count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} + n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: # no file matching tbe pattern found raise ValueError( @@ -88,50 +93,53 @@ def seqfish( matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - prefix = matched.group(1) - - n = len(count_files) - all_sections = list(range(1, n + 1)) - sections_str: list[Optional[str]] = [] - if sections is not None: - for section in sections: - if section not in all_sections: - raise ValueError(f"Section {section} not found in the data.") - else: - sections_str.append(f"{SK.SECTION}{section}") - elif sections is None: - sections_str.append(None) + # prefix = matched.group(1) + + rois_str: list[str] = [] + if ROIs is None: + for roi in n_rois: + rois_str.append(f"{SK.ROI}{roi}") + elif isinstance(ROIs, list): + for roi in ROIs: + if str(roi) not in n_rois: + raise ValueError(f"ROI{roi} not found.") + rois_str.append(f"{SK.ROI}{roi}") else: - raise ValueError("Invalid type for 'section'. Must be list[int] or None.") + raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") - def get_cell_file(section: str | None) -> str: - return f"{prefix}{SK.CELL_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_cell_file(roi: str) -> str: + return f"{roi}_{SK.CELL_COORDINATES}{SK.CSV_FILE}" - def get_count_file(section: str | None) -> str: - return f"{prefix}{SK.COUNTS_FILE}{section or ''}{SK.CSV_FILE}" + def get_count_file(roi: str) -> str: + return f"{roi}_{SK.COUNTS_FILE}{SK.CSV_FILE}" - def get_dapi_file(section: str | None) -> str: - return f"{prefix}{SK.DAPI}{section or ''}{SK.TIFF_FILE}" + def get_dapi_file(roi: str) -> str: + return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(section: str | None) -> str: - return f"{prefix}{SK.SEGMENTATION}{section or ''}{SK.TIFF_FILE}" + def get_cell_mask_file(roi: str) -> str: + return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" - def get_transcript_file(section: str | None) -> str: - return f"{prefix}{SK.TRANSCRIPT_COORDINATES}{section or ''}{SK.CSV_FILE}" + def get_transcript_file(roi: str) -> str: + return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" + scaled = {} adatas: dict[Optional[str], ad.AnnData] = {} - for section_str in sections_str: - assert isinstance(section_str, str) or section_str is None - cell_file = get_cell_file(section_str) - count_matrix = get_count_file(section_str) + for roi_str in rois_str: + assert isinstance(roi_str, str) or roi_str is None + cell_file = get_cell_file(roi_str) + count_matrix = get_count_file(roi_str) adata = ad.read_csv(path / count_matrix, delimiter=",") cell_info = pd.read_csv(path / cell_file, delimiter=",") adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{section_str}" + region = f"cells_{roi_str}" adata.obs[SK.REGION_KEY] = region 
adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[section_str] = adata + adatas[roi_str] = adata + scaled[roi_str] = Scale( + np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), + axes=("y", "x"), + ) scale_factors = [2, 2, 2, 2] @@ -141,9 +149,9 @@ def get_transcript_file(section: str | None) -> str: imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), scale_factors=scale_factors, - transformations={"global": Identity()}, + transformations={"global": scaled[x]}, ) - for x in sections_str + for x in rois_str } else: images = {} @@ -156,7 +164,7 @@ def get_transcript_file(section: str | None) -> str: scale_factors=scale_factors, transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: labels = {} @@ -170,20 +178,22 @@ def get_transcript_file(section: str | None) -> str: # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway transformations={"global": Identity()}, ) - for x in sections_str + for x in rois_str } else: points = {} - adata = ad.concat(adatas.values()) - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - table = TableModel.parse( - adata, - region=[f"cells_{x}" for x in sections_str], - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) + # adata = ad.concat(adatas.values()) + tables = {} + for name, adata in adatas.items(): + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs = adata.obs.reset_index(drop=True) + tables[name] = TableModel.parse( + adata, + region=f"cells_{name}", + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) if load_shapes: shapes = { @@ -194,7 +204,7 @@ def get_transcript_file(section: str | None) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in adatas.items() + for x, adata in zip(rois_str, adatas.values()) } else: shapes = {} @@ -227,6 +237,17 @@ def get_transcript_file(section: str | None) -> str: transformations={"global": Identity()}, ) - sdata = SpatialData(images=images, labels=labels, points=points, table=table, shapes=shapes) + sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) return sdata + + +def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_key: str) -> list[float]: + with tifffile.TiffFile(DAPI_path) as tif: + ome_metadata = tif.ome_metadata + root = ET.fromstring(ome_metadata) + for element in root.iter(): + if scalefactor_x_key in element.attrib.keys(): + scalefactor_x = element.attrib[scalefactor_x_key] + scalefactor_y = element.attrib[scalefactor_y_key] + return [float(scalefactor_x), float(scalefactor_y)] From d9ec1f51dcd889ab8fd6919b22e3e42df78467b6 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Tue, 12 Nov 2024 18:42:56 +0100 Subject: [PATCH 08/33] Remove unused code --- src/spatialdata_io/readers/seqfish.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 0ce6f144..dac52f99 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -83,17 +83,14 @@ def seqfish( count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") count_files = [i for i in os.listdir(path) if 
count_file_pattern.match(i)] roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - # n_rois = {roi_pattern.match(i).group(1) for i in os.listdir(path) if roi_pattern.match(i)} n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if not count_files: - # no file matching tbe pattern found raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) matched = count_file_pattern.match(count_files[0]) if matched is None: raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - # prefix = matched.group(1) rois_str: list[str] = [] if ROIs is None: @@ -183,7 +180,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - # adata = ad.concat(adatas.values()) tables = {} for name, adata in adatas.items(): adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") From 148466dde21fdf624d3e4604b92b9d7e3e94b5a5 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 28 Nov 2024 12:30:02 +0100 Subject: [PATCH 09/33] set transformation to global for both hires and lowres --- src/spatialdata_io/readers/visium_hd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py index c8215269..3037ebff 100644 --- a/src/spatialdata_io/readers/visium_hd.py +++ b/src/spatialdata_io/readers/visium_hd.py @@ -306,7 +306,7 @@ def _get_bins(path_bins: Path) -> list[str]: ) set_transformation( images[dataset_id + "_hires_image"], - {"downscaled_hires": Identity()}, + {"downscaled_hires": Identity(), "global": transform_hires.inverse()}, set_all=True, ) @@ -324,7 +324,7 @@ def _get_bins(path_bins: Path) -> list[str]: ) set_transformation( images[dataset_id + "_lowres_image"], - {"downscaled_lowres": Identity()}, + {"downscaled_lowres": Identity(), "global": transform_lowres.inverse()}, set_all=True, ) From b08eb5913d44ba7ca85d49f9bdbb710c49f4cc6d Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 29 Nov 2024 10:39:30 +0100 Subject: [PATCH 10/33] lowres and hires images mapped to global also for Visium --- CHANGELOG.md | 4 ++++ src/spatialdata_io/readers/visium.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47f2900a..c1c824a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][]. 
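The `set_transformation` calls changed above now register each downscaled Visium HD image in two coordinate systems at once. Below is a minimal, self-contained sketch of that idea; the toy image, the coordinate-system name and the scale values are placeholders and are not taken from the reader itself.

```python
import numpy as np
from spatialdata.models import Image2DModel
from spatialdata.transformations import Identity, Scale, set_transformation

# toy stand-in for a downscaled hires image with (c, y, x) dimensions
hires_image = Image2DModel.parse(np.zeros((3, 50, 50), dtype=np.uint8), dims=("c", "y", "x"))

# assumed scaling between full-resolution ("global") space and hires pixel space
transform_hires = Scale([0.1, 0.1], axes=("y", "x"))

# the image keeps its own coordinate system and is additionally mapped back to
# "global" via the inverse of the downscaling transformation
set_transformation(
    hires_image,
    {"downscaled_hires": Identity(), "global": transform_hires.inverse()},
    set_all=True,
)
```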
[keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## incoming release + +- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 + ## [0.1.6] - 2024-11-26 - (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 diff --git a/src/spatialdata_io/readers/visium.py b/src/spatialdata_io/readers/visium.py index 22a75855..19bc740a 100644 --- a/src/spatialdata_io/readers/visium.py +++ b/src/spatialdata_io/readers/visium.py @@ -231,14 +231,14 @@ def visium( image_hires = imread(path / VisiumKeys.IMAGE_HIRES_FILE, **imread_kwargs).squeeze().transpose(2, 0, 1) image_hires = DataArray(image_hires, dims=("c", "y", "x")) images[dataset_id + "_hires_image"] = Image2DModel.parse( - image_hires, transformations={"downscaled_hires": Identity()}, rgb=None + image_hires, transformations={"downscaled_hires": Identity(), "global": transform_hires.inverse()}, rgb=None ) if (path / VisiumKeys.IMAGE_LOWRES_FILE).exists(): image_lowres = imread(path / VisiumKeys.IMAGE_LOWRES_FILE, **imread_kwargs).squeeze().transpose(2, 0, 1) image_lowres = DataArray(image_lowres, dims=("c", "y", "x")) images[dataset_id + "_lowres_image"] = Image2DModel.parse( image_lowres, - transformations={"downscaled_lowres": Identity()}, + transformations={"downscaled_lowres": Identity(), "global": transform_lowres.inverse()}, rgb=None, ) From df804972e2372c831fec09f6dfc8fa7f809c0557 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:17:40 +0100 Subject: [PATCH 11/33] Update README.md Added known limitations and contributing section. --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index f4032a1b..49629e7c 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,21 @@ This package contains reader functions to load common spatial omics formats into Note: all mentioned technologies are registered trademarks of their respective companies. +## Known limitations + +Contributions for addressing the below limitations are very welcomed. + +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 + +### How to Contribute + +1. **Open a GitHub Issue**: Start by opening a new issue or commenting on an existing one in the repository. Clearly describe the problem and your proposed changes to avoid overlapping efforts with others. + +2. **Submit a Pull Request (PR)**: Once the issue is discussed, submit a PR to the `spatialdata-io` repository. Ensure your PR includes information about a suitable dataset for testing the reader, ideally no larger than 10 GB. Include clear instructions for accessing the data, preferably with a `curl` or `wget` command for easy downloading. + +3. **Optional Enhancements**: To facilitate reproducibility and ease of data access, consider adding a folder in the [spatialdata-sandbox](https://github.com/giovp/spatialdata-sandbox) repository. Include a `download.py` and `to_zarr.py` script (refer to examples in the repository) to enable others to reproduce your reader by simply running these scripts sequentially. + + ## Getting started Please refer to the [documentation][link-docs]. 
In particular, the From 995a2bd577ad5a137a91412a5c54f00952ffd104 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:20:47 +0000 Subject: [PATCH 12/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 49629e7c..a6957764 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,9 @@ Note: all mentioned technologies are registered trademarks of their respective c ## Known limitations -Contributions for addressing the below limitations are very welcomed. +Contributions for addressing the below limitations are very welcomed. -- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 ### How to Contribute @@ -41,7 +41,6 @@ Contributions for addressing the below limitations are very welcomed. 3. **Optional Enhancements**: To facilitate reproducibility and ease of data access, consider adding a folder in the [spatialdata-sandbox](https://github.com/giovp/spatialdata-sandbox) repository. Include a `download.py` and `to_zarr.py` script (refer to examples in the repository) to enable others to reproduce your reader by simply running these scripts sequentially. - ## Getting started Please refer to the [documentation][link-docs]. In particular, the From e2341724a9bd7ab19f75c3cc4b13947c0fce5813 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 11:57:49 +0100 Subject: [PATCH 13/33] add docs --- README.md | 1 + src/spatialdata_io/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index a6957764..a25b245c 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ This package contains reader functions to load common spatial omics formats into - Steinbock (output data) - STOmics Stereo-seq® - Vizgen MERSCOPE® (MERFISH) +- MACSima® (MACS® iQ View output) Note: all mentioned technologies are registered trademarks of their respective companies. 
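With MACSima now listed among the supported technologies, a minimal usage sketch of the new reader follows. It assumes the reader is exported at the package top level (as done in the accompanying `__init__.py` change below); the dataset path is a placeholder and the keyword values are illustrative rather than recommended defaults.

```python
from spatialdata_io import macsima

# parse a single processed MACSima acquisition folder into a SpatialData object;
# the path below is a placeholder for a local copy of the demo data
sdata = macsima(
    "./data/MACSimaData_HCA/HumanLiverH35",
    c_subset=10,       # optionally keep only the first 10 channels
    multiscale=True,   # build a multiscale image pyramid
)
print(sdata)
```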
diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 48f784bd..618855a5 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -4,6 +4,7 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.curio import curio from spatialdata_io.readers.dbit import dbit +from spatialdata_io.readers.macsima import macsima from spatialdata_io.readers.mcmicro import mcmicro from spatialdata_io.readers.merscope import merscope from spatialdata_io.readers.seqfish import seqfish @@ -32,6 +33,7 @@ "xenium_explorer_selection", "dbit", "visium_hd", + "macsima", ] __version__ = version("spatialdata-io") From 914d2b78b60c6a15a1f4409d225a7cdb037184b7 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:50:20 +0100 Subject: [PATCH 14/33] Update src/spatialdata_io/readers/_utils/_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/spatialdata_io/readers/_utils/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 5891fd68..82676098 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -112,7 +112,7 @@ def parse_physical_size(path: Path | None = None, ome_pixels: Pixels | None = No logger.error("Physical units for x and y dimensions are not the same.") raise NotImplementedError if pixels.physical_size_x != pixels.physical_size_y: - logger.error("Physical sizes for x and y dimensions are the same.") + logger.error("Physical sizes for x and y dimensions are not the same.") raise NotImplementedError # convert to micrometer if needed if pixels.physical_size_x_unit == UnitsLength.NANOMETER: From 0b631850ba4453cfda2dfcb6717c7601e4a397b5 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 16:05:34 +0100 Subject: [PATCH 15/33] fix sorting --- src/spatialdata_io/readers/macsima.py | 4 ++-- tests/test_macsima.py | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2916d79b..e15c3e83 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -88,7 +88,7 @@ def from_paths( raise ValueError( f"Images are not all the same size. 
Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" ) - # sort imgs, cycles and channels based on cycles + # create MultiChannelImage object with imgs and metadata output = cls( data=imgs, metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], @@ -135,8 +135,8 @@ def get_cycles(self) -> list[int]: def sort_by_channel(self) -> None: """Sort the channels by cycle number.""" - self.metadata = sorted(self.metadata, key=lambda x: x.cycle) self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] + self.metadata = sorted(self.metadata, key=lambda x: x.cycle) def subset(self, subset: int | None = None) -> MultiChannelImage: """Subset the image.""" diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 886003b5..f77cbd19 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -3,10 +3,16 @@ from pathlib import Path from typing import Any +import dask.array as da import pytest from spatialdata.models import get_channels -from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle +from spatialdata_io.readers.macsima import ( + ChannelMetadata, + MultiChannelImage, + macsima, + parse_name_to_cycle, +) if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): # The datasets should be downloaded and placed unzipped in the "data" directory; @@ -109,3 +115,19 @@ def test_parsing_style() -> None: def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: result = parse_name_to_cycle(name) assert result == expected + + +def test_mci_sort_by_channel(): + rng = da.random.default_rng() + sizes = [100, 200, 300] + c_names = ["test11", "test3", "test2"] + cycles = [2, 0, 1] + mci = MultiChannelImage( + data=[rng.random((size, size), chunks=(10, 10)) for size in sizes], + metadata=[ChannelMetadata(name=c_name, cycle=cycle) for c_name, cycle in zip(c_names, cycles)], + ) + assert mci.get_channel_names() == c_names + assert [x.shape[0] for x in mci.data] == sizes + mci.sort_by_channel() + assert mci.get_channel_names() == ["test3", "test2", "test11"] + assert [x.shape[0] for x in mci.data] == [200, 300, 100] From f609882bbca9b1388c59065b73a9ae12bfd541e7 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:05:57 +0100 Subject: [PATCH 16/33] first set of fixes during code review --- CHANGELOG.md | 1 + src/spatialdata_io/readers/macsima.py | 57 +++++++++++++++------------ tests/_utils.py | 31 +++++++++++++++ tests/test_macsima.py | 22 +++++------ tests/test_xenium.py | 4 +- 5 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 tests/_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c1c824a1..aa47ea7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning][]. ## incoming release - (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 +- (Macsima) added support @berombau #224 ## [0.1.6] - 2024-11-26 diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2916d79b..4e5b6a17 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -68,10 +68,16 @@ def from_paths( warnings.simplefilter("ignore") channel_names = parse_channels(p) if len(channel_names) > 1: - logger.warning(f"Found multiple channels in OME-TIFF file {p}. 
Only the first one will be used.") + warnings.warn( + f"Found multiple channels in OME-TIFF file {p}. Only the first one will be used.", + UserWarning, + stacklevel=2, + ) channels.append(channel_names[0]) except ValueError as e: - logger.warning(f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.") + warnings.warn( + f"Cannot parse OME metadata from {p}. Error: {e}. Skipping this file.", UserWarning, stacklevel=2 + ) if len(path_files) != len(cycles) or len(path_files) != len(channels): raise ValueError("Length of path_files, cycles and channels must be the same.") @@ -80,29 +86,37 @@ def from_paths( logger.info("Skipping cycles: %d", skip_rounds) path_files, cycles, channels = map( list, - zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), + zip( + *[ + (p, c, ch) + for p, c, ch in zip(path_files, cycles, channels, strict=True) + if c not in skip_rounds + ], + strict=True, + ), ) imgs = [imread(img, **imread_kwargs) for img in path_files] - for img, path in zip(imgs, path_files): + for img, path in zip(imgs, path_files, strict=True): if img.shape[1:] != imgs[0].shape[1:]: raise ValueError( - f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image {path_files[0]} has shape {imgs[0].shape[1:]}" + f"Images are not all the same size. Image {path} has shape {img.shape[1:]} while the first image " + f"{path_files[0]} has shape {imgs[0].shape[1:]}" ) # sort imgs, cycles and channels based on cycles output = cls( data=imgs, - metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels)], + metadata=[ChannelMetadata(name=ch, cycle=c) for c, ch in zip(cycles, channels, strict=True)], ) return output @classmethod def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain c_name.""" + """Create new MultiChannelImage with only the channels that contain the string c_name.""" indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.filter_by_index(mci, indices) + return MultiChannelImage.subset_by_index(mci, indices) @classmethod - def filter_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: + def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: """Filter the image by index.""" metadata = [c for i, c in enumerate(mci.metadata) if i in indices] data = [d for i, d in enumerate(mci.data) if i in indices] @@ -135,37 +149,27 @@ def get_cycles(self) -> list[int]: def sort_by_channel(self) -> None: """Sort the channels by cycle number.""" + self.data = [d for _, d in sorted(zip(self.metadata, self.data, strict=True), key=lambda x: x[0].cycle)] self.metadata = sorted(self.metadata, key=lambda x: x.cycle) - self.data = [d for _, d in sorted(zip(self.metadata, self.data), key=lambda x: x[0].cycle)] def subset(self, subset: int | None = None) -> MultiChannelImage: - """Subset the image.""" + """Subsets the images to keep only the first `subset` x `subset` pixels.""" if subset: self.data = [d[:, :subset, :subset] for d in self.data] return self def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subset the channels.""" + """Subsets the channels to keep only the first `c_subset` channels.""" self.data = self.data[:c_subset] self.metadata = self.metadata[:c_subset] return self - def subset_by_idx(self, indices: list[int]) -> MultiChannelImage: - """Subset the image by index.""" - 
data = [d for i, d in enumerate(self.data) if i in indices] - metadata = [c for i, c in enumerate(self.metadata) if i in indices] - return MultiChannelImage( - data=data, - metadata=metadata, - include_cycle_in_channel_name=self.include_cycle_in_channel_name, - ) - def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: lower_scale_limit = min(self.data[0].shape[1:]) return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) def get_stack(self) -> da.Array: - return da.stack(self.data).squeeze() + return da.stack(self.data).squeeze(axis=0) def macsima( @@ -224,7 +228,8 @@ def macsima( nuclei_channel_name Common string of the nuclei channel to separate nuclei from other channels. split_threshold_nuclei_channel - Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. + Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is + greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds List of round numbers to skip when parsing the data. include_cycle_in_channel_name @@ -383,10 +388,10 @@ def create_sdata( split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: # if channel name is nuclei_channel_name, add to seperate nuclei stack - nuclei_mci = mci.subset_by_idx(nuclei_idx) + nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] - mci = MultiChannelImage.filter_by_index( + mci = MultiChannelImage.subset_by_index( mci, [i for i in range(len(mci.metadata)) if i not in nuclei_idx_without_first_and_last], ) diff --git a/tests/_utils.py b/tests/_utils.py new file mode 100644 index 00000000..5573ba44 --- /dev/null +++ b/tests/_utils.py @@ -0,0 +1,31 @@ +import sys + +import pytest + + +def skip_if_below_python_version() -> pytest.mark.skipif: + """ + Decorator to skip tests if the Python version is below a specified version. + + This decorator prevents running tests on unsupported Python versions. Update the `MIN_VERSION` + constant to change the minimum Python version required for the tests. + + Returns + ------- + pytest.mark.skipif + A pytest marker that skips the test if the current Python version is below the specified `MIN_VERSION`. + + Notes + ----- + The current minimum version is set to Python 3.10. Adjust the `MIN_VERSION` constant as needed + to accommodate newer Python versions. 
+ + Examples + -------- + >>> @skip_if_below_python_version() + >>> def test_some_feature(): + >>> assert True + """ + MIN_VERSION = (3, 10) + reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" + return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 886003b5..c64f1761 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -1,24 +1,22 @@ import math -import sys from pathlib import Path from typing import Any import pytest -from spatialdata.models import get_channels +from spatialdata.models import get_channel_names from spatialdata_io.readers.macsima import macsima, parse_name_to_cycle +from tests._utils import skip_if_below_python_version if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): - # The datasets should be downloaded and placed unzipped in the "data" directory; - # MACSimaData_HCA/ with e.g. unzipped HumanLiverH35/ inside: https://livercellatlas.org/download.php - # Lung_adc_demo/: Ask Miltenyi Biotec for the demo dataset pytest.skip( - "requires the Lung_adc_demo or MACSimaData_HCA datasets", + "Requires the Lung_adc_demo or MACSimaData_HCA datasets, please check " + "https://github.com/giovp/spatialdata-sandbox/macsima/Readme.md for instructions on how to get the data.", allow_module_level=True, ) -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ @@ -40,7 +38,7 @@ def test_image_size(dataset: str, expected: dict[str, Any]) -> None: assert extent == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [("Lung_adc_demo", 116), ("MACSimaData_HCA/HumanLiverH35", 102)], @@ -52,11 +50,11 @@ def test_total_channels(dataset: str, expected: int) -> None: el = sdata[list(sdata.images.keys())[0]] # get the number of channels - channels: int = len(get_channels(el)) + channels: int = len(get_channel_names(el)) assert channels == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ @@ -71,11 +69,11 @@ def test_channel_names(dataset: str, expected: list[str]) -> None: el = sdata[list(sdata.images.keys())[0]] # get the channel names - channels = get_channels(el) + channels = get_channel_names(el) assert list(channels) == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 1c64a912..fedb824b 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -1,5 +1,4 @@ import math -import sys from pathlib import Path import numpy as np @@ -10,6 +9,7 @@ prefix_suffix_uint32_from_cell_id_str, xenium, ) +from tests._utils import skip_if_below_python_version def test_cell_id_str_from_prefix_suffix_uint32() -> None: @@ -46,7 +46,7 @@ def test_roundtrip_with_data_limits() -> None: # pointing to "data". 
# The GitHub workflow "prepare_test_data.yaml" takes care of downloading the datasets and uploading an artifact for the # tests to use -@pytest.mark.skipif(sys.version_info < (3, 12), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ From c09f32947ed0becf41943b06833ea59c85b2d85b Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 16:29:29 +0100 Subject: [PATCH 17/33] add test for skip_rounds --- src/spatialdata_io/readers/macsima.py | 4 +-- tests/test_macsima.py | 40 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index e15c3e83..0a287671 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -77,7 +77,7 @@ def from_paths( raise ValueError("Length of path_files, cycles and channels must be the same.") # if any of round_channels is in skip_rounds, remove that round from the list and from path_files if skip_rounds: - logger.info("Skipping cycles: %d", skip_rounds) + logger.info(f"Skipping cycles: {skip_rounds}") path_files, cycles, channels = map( list, zip(*[(p, c, ch) for p, c, ch in zip(path_files, cycles, channels) if c not in skip_rounds]), @@ -226,7 +226,7 @@ def macsima( split_threshold_nuclei_channel Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds - List of round numbers to skip when parsing the data. + List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. skip_rounds=[1, 2] will parse only the first round 0 when there are only 3 cycles. include_cycle_in_channel_name Whether to include the cycle number in the channel name. 
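To make the `skip_rounds` semantics described above concrete, here is a small usage sketch. It mirrors the test added in the next hunk and assumes a local copy of the `Lung_adc_demo` dataset at the placeholder path shown.

```python
from spatialdata.models import get_channel_names

from spatialdata_io import macsima

# Cycles are counted from 0: with 68 rounds in total, skipping rounds 2..67
# keeps only the channels acquired in rounds 0 and 1.
sdata = macsima(
    "./data/Lung_adc_demo",             # placeholder path to the demo dataset
    skip_rounds=list(range(2, 68)),
)
image = sdata[list(sdata.images.keys())[0]]
print(get_channel_names(image))         # e.g. the two DAPI rounds plus their markers
```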
diff --git a/tests/test_macsima.py b/tests/test_macsima.py index f77cbd19..221b7d78 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -81,6 +81,46 @@ def test_channel_names(dataset: str, expected: list[str]) -> None: assert list(channels) == expected +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,expected", + [ + ("Lung_adc_demo", 68), + ("MACSimaData_HCA/HumanLiverH35", 51), + ], +) +def test_total_rounds(dataset: str, expected: list[int]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f) + table = sdata[list(sdata.tables)[0]] + max_cycle = table.var["cycle"].max() + assert max_cycle == expected + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@pytest.mark.parametrize( + "dataset,skip_rounds,expected", + [ + ("Lung_adc_demo", list(range(2, 68)), ["DAPI (1)", "CD68", "CD163", "DAPI (2)", "Control"]), + ( + "MACSimaData_HCA/HumanLiverH35", + list(range(2, 51)), + ["DAPI (1)", "PE", "CD14", "Vimentin", "DAPI (2)", "WT1"], + ), + ], +) +def test_skip_rounds(dataset: str, skip_rounds: list[int], expected: list[str]) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = macsima(f, skip_rounds=skip_rounds) + el = sdata[list(sdata.images.keys())[0]] + + # get the channel names + channels = get_channels(el) + assert list(channels) == expected, f"Expected {expected}, got {list(channels)}" + + @pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") @pytest.mark.parametrize( "dataset,expected", From d4fffec1c7e22b4dd07e3b9ab8b917ff6bfdba18 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:31:15 +0100 Subject: [PATCH 18/33] finished code review --- src/spatialdata_io/readers/macsima.py | 51 ++++++++++++++------------- tests/_utils.py | 2 +- tests/test_macsima.py | 5 +-- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index bca799df..d215e84f 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -169,7 +169,7 @@ def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) def get_stack(self) -> da.Array: - return da.stack(self.data).squeeze(axis=0) + return da.stack(self.data, axis=0).squeeze(axis=1) def macsima( @@ -193,7 +193,8 @@ def macsima( """ Read *MACSima* formatted dataset. - This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from the image names. The channel names are parsed from the OME metadata. + This function reads images from a MACSima cyclic imaging experiment. Metadata of the cycle rounds is parsed from + the image names. The channel names are parsed from the OME metadata. .. 
seealso:: @@ -257,16 +258,16 @@ def macsima( if parsing_style == MACSimaParsingStyle.PROCESSED_SINGLE_FOLDER: return parse_processed_folder( - path, - imread_kwargs, - subset, - c_subset, - max_chunk_size, - c_chunks_size, - multiscale, - transformations, - scale_factors, - default_scale_factor, + path=path, + imread_kwargs=imread_kwargs, + subset=subset, + c_subset=c_subset, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + multiscale=multiscale, + transformations=transformations, + scale_factors=scale_factors, + default_scale_factor=default_scale_factor, nuclei_channel_name=nuclei_channel_name, split_threshold_nuclei_channel=split_threshold_nuclei_channel, skip_rounds=skip_rounds, @@ -281,16 +282,16 @@ def macsima( if p.is_dir() and (not filter_folder_names or not any(f in p.name for f in filter_folder_names)) ]: sdatas[p.stem] = parse_processed_folder( - p, - imread_kwargs, - subset, - c_subset, - max_chunk_size, - c_chunks_size, - multiscale, - transformations, - scale_factors, - default_scale_factor, + path=p, + imread_kwargs=imread_kwargs, + subset=subset, + c_subset=c_subset, + max_chunk_size=max_chunk_size, + c_chunks_size=c_chunks_size, + multiscale=multiscale, + transformations=transformations, + scale_factors=scale_factors, + default_scale_factor=default_scale_factor, nuclei_channel_name=nuclei_channel_name, split_threshold_nuclei_channel=split_threshold_nuclei_channel, skip_rounds=skip_rounds, @@ -387,7 +388,7 @@ def create_sdata( else: split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: - # if channel name is nuclei_channel_name, add to seperate nuclei stack + # if channel name is nuclei_channel_name, add to separate nuclei stack nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] @@ -397,6 +398,7 @@ def create_sdata( ) pixels_to_microns = parse_physical_size(path_files[0]) + image_element = create_image_element( mci, max_chunk_size, @@ -406,6 +408,8 @@ def create_sdata( scale_factors, coordinate_system=filtered_name, ) + table_channels = create_table(mci) + if split_nuclei: nuclei_image_element = create_image_element( nuclei_mci, @@ -417,7 +421,6 @@ def create_sdata( coordinate_system=filtered_name, ) table_nuclei = create_table(nuclei_mci) - table_channels = create_table(mci) sdata = sd.SpatialData( images={ diff --git a/tests/_utils.py b/tests/_utils.py index 5573ba44..9787d1bb 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -26,6 +26,6 @@ def skip_if_below_python_version() -> pytest.mark.skipif: >>> def test_some_feature(): >>> assert True """ - MIN_VERSION = (3, 10) + MIN_VERSION = (3, 12) reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 960d27f8..51323cc3 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -14,6 +14,8 @@ ) from tests._utils import skip_if_below_python_version +RNG = da.random.default_rng(seed=0) + if not (Path("./data/Lung_adc_demo").exists() or Path("./data/MACSimaData_HCA").exists()): pytest.skip( "Requires the Lung_adc_demo or MACSimaData_HCA datasets, please check " @@ -116,12 +118,11 @@ def test_parsing_of_name_to_cycle(name: str, expected: int) -> None: def test_mci_sort_by_channel() -> None: - rng = da.random.default_rng() sizes = [100, 200, 300] c_names = ["test11", "test3", 
"test2"] cycles = [2, 0, 1] mci = MultiChannelImage( - data=[rng.random((size, size), chunks=(10, 10)) for size in sizes], + data=[RNG.random((size, size), chunks=(10, 10)) for size in sizes], metadata=[ChannelMetadata(name=c_name, cycle=cycle) for c_name, cycle in zip(c_names, cycles)], ) assert mci.get_channel_names() == c_names From 57e784b32c72e69776dd2cbfce5b01011568a209 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 16:34:07 +0100 Subject: [PATCH 19/33] fixed pre-commit --- tests/test_macsima.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_macsima.py b/tests/test_macsima.py index 2343f57d..c8b737e6 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -98,7 +98,7 @@ def test_total_rounds(dataset: str, expected: list[int]) -> None: assert max_cycle == expected -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,skip_rounds,expected", [ @@ -117,11 +117,11 @@ def test_skip_rounds(dataset: str, skip_rounds: list[int], expected: list[str]) el = sdata[list(sdata.images.keys())[0]] # get the channel names - channels = get_channels(el) + channels = get_channel_names(el) assert list(channels) == expected, f"Expected {expected}, got {list(channels)}" -@pytest.mark.skipif(sys.version_info < (3, 10), reason="Test requires Python 3.10 or higher") +@skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", [ From 0bcd47f7a1b77c4e22968b8d7e2ce3844305e759 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 6 Dec 2024 17:08:12 +0100 Subject: [PATCH 20/33] add test for array reference --- src/spatialdata_io/readers/macsima.py | 11 ++++++----- tests/test_macsima.py | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2ffeb5ca..75b8325e 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -3,6 +3,7 @@ import warnings from collections import defaultdict from collections.abc import Mapping +from copy import deepcopy from dataclasses import dataclass from pathlib import Path from types import MappingProxyType @@ -111,13 +112,13 @@ def from_paths( @classmethod def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain the string c_name.""" + """Create new MultiChannelImage with only the channels that contain the string c_name. The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.subset_by_index(mci, indices) + return cls.subset_by_index(mci, indices) @classmethod def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: - """Filter the image by index.""" + """Create new MultiChannelImage with only the channels selected by the indices. 
The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" metadata = [c for i, c in enumerate(mci.metadata) if i in indices] data = [d for i, d in enumerate(mci.data) if i in indices] return cls( @@ -159,7 +160,7 @@ def subset(self, subset: int | None = None) -> MultiChannelImage: return self def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subsets the channels to keep only the first `c_subset` channels.""" + """Subsets the channels to keep only the first `c_subset` channels. The underlying data will still be the same reference, use copy.deepcopy to make a new copy.""" self.data = self.data[:c_subset] self.metadata = self.metadata[:c_subset] return self @@ -389,7 +390,7 @@ def create_sdata( split_nuclei = n_nuclei_channels > split_threshold_nuclei_channel if split_nuclei: # if channel name is nuclei_channel_name, add to separate nuclei stack - nuclei_mci = MultiChannelImage.subset_by_index(mci, indices=nuclei_idx) + nuclei_mci = deepcopy(MultiChannelImage.subset_by_index(mci, indices=nuclei_idx)) # keep the first nuclei channel in both the stack and the nuclei stack nuclei_idx_without_first_and_last = nuclei_idx[1:-1] mci = MultiChannelImage.subset_by_index( diff --git a/tests/test_macsima.py b/tests/test_macsima.py index c8b737e6..66038e0b 100644 --- a/tests/test_macsima.py +++ b/tests/test_macsima.py @@ -1,4 +1,5 @@ import math +from copy import deepcopy from pathlib import Path from typing import Any @@ -170,3 +171,25 @@ def test_mci_sort_by_channel() -> None: mci.sort_by_channel() assert mci.get_channel_names() == ["test3", "test2", "test11"] assert [x.shape[0] for x in mci.data] == [200, 300, 100] + + +def test_mci_array_reference() -> None: + arr1 = RNG.random((100, 100), chunks=(10, 10)) + arr2 = RNG.random((200, 200), chunks=(10, 10)) + mci = MultiChannelImage( + data=[arr1, arr2], + metadata=[ChannelMetadata(name="test1", cycle=0), ChannelMetadata(name="test2", cycle=1)], + ) + orig_arr1 = arr1.copy() + + subset_mci = MultiChannelImage.subset_by_index(mci, [0]) + # test that the subset is a view + assert subset_mci.data[0] is arr1 + assert da.all(subset_mci.data[0] == orig_arr1) + # test that a deepcopy is not a view + deepcopy_mci: MultiChannelImage = deepcopy(mci) + deepcopy_mci.data[0][0, 0] = deepcopy_mci.data[0][0, 0] + 1 + assert deepcopy_mci.data[0] is not arr1 + assert not da.all(deepcopy_mci.data[0] == orig_arr1) + # test that the original mci is not changed + assert da.all(mci.data[0] == orig_arr1) From da30404c7a47ecf8109aeef4a920c7422836a478 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 18:48:58 +0100 Subject: [PATCH 21/33] removed subset_channels --- src/spatialdata_io/readers/macsima.py | 26 ++++++++++++-------------- tests/_utils.py | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 2ffeb5ca..1e376566 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -109,11 +109,12 @@ def from_paths( ) return output - @classmethod - def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: - """Create new MultiChannelImage with only the channels that contain the string c_name.""" - indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] - return MultiChannelImage.subset_by_index(mci, indices) + # unused + # @classmethod + # def subset_by_channel(cls, mci: MultiChannelImage, c_name: str) -> MultiChannelImage: + # 
"""Create new MultiChannelImage with only the channels that contain the string c_name.""" + # indices = [i for i, c in enumerate(mci.metadata) if c_name in c.name] + # return MultiChannelImage.subset_by_index(mci, indices) @classmethod def subset_by_index(cls, mci: MultiChannelImage, indices: list[int]) -> MultiChannelImage: @@ -158,12 +159,6 @@ def subset(self, subset: int | None = None) -> MultiChannelImage: self.data = [d[:, :subset, :subset] for d in self.data] return self - def subset_channels(self, c_subset: int) -> MultiChannelImage: - """Subsets the channels to keep only the first `c_subset` channels.""" - self.data = self.data[:c_subset] - self.metadata = self.metadata[:c_subset] - return self - def calc_scale_factors(self, default_scale_factor: int = 2) -> list[int]: lower_scale_limit = min(self.data[0].shape[1:]) return calc_scale_factors(lower_scale_limit, default_scale_factor=default_scale_factor) @@ -347,7 +342,7 @@ def parse_processed_folder( if subset: mci = mci.subset(subset) if c_subset: - mci = mci.subset_channels(c_subset) + mci = MultiChannelImage.subset_by_index(mci, indices=list(range(0, c_subset))) if multiscale and not scale_factors: scale_factors = mci.calc_scale_factors(default_scale_factor=default_scale_factor) if not multiscale: @@ -469,15 +464,18 @@ def create_image_element( t_dict = {coordinate_system: t_pixels_to_microns} # # chunk_size can be 1 for channels chunks = { - "x": max_chunk_size, "y": max_chunk_size, + "x": max_chunk_size, "c": c_chunks_size, } if t_dict: logger.debug("Adding transformation: %s", t_dict) el = sd.models.Image2DModel.parse( mci.get_stack(), - # TODO: make sure y and x locations are correct + # the data on disk is not always CYX, but imread takes care of parsing things correctly, so that we can assume + # mci to be CYX. Still, to make the code more robust, we could consider using a different backend, for instance + # bioio-ome-tiff, read both the data and its dimensions from disk, and let Image2DModel.parse() rearrange the + # dimensions into CYX. dims=["c", "y", "x"], scale_factors=scale_factors, chunks=chunks, diff --git a/tests/_utils.py b/tests/_utils.py index 9787d1bb..5573ba44 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -26,6 +26,6 @@ def skip_if_below_python_version() -> pytest.mark.skipif: >>> def test_some_feature(): >>> assert True """ - MIN_VERSION = (3, 12) + MIN_VERSION = (3, 10) reason = f"Test requires Python {'.'.join(map(str, MIN_VERSION))} or higher" return pytest.mark.skipif(sys.version_info < MIN_VERSION, reason=reason) From c4f6cf7cdd36ffe7f4c91492189d5875eac0bd42 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 6 Dec 2024 19:30:28 +0100 Subject: [PATCH 22/33] added todo for regexp approach --- src/spatialdata_io/readers/macsima.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/macsima.py b/src/spatialdata_io/readers/macsima.py index 611d5216..58e962eb 100644 --- a/src/spatialdata_io/readers/macsima.py +++ b/src/spatialdata_io/readers/macsima.py @@ -193,7 +193,7 @@ def macsima( .. seealso:: - - `MACSima output `_. + - `MACSima output `_. Parameters ---------- @@ -227,7 +227,8 @@ def macsima( Threshold for splitting nuclei channels. If the number of channels that include nuclei_channel_name is greater than this threshold, the nuclei channels are split into a separate stack. skip_rounds - List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. 
skip_rounds=[1, 2] will parse only the first round 0 when there are only 3 cycles. + List of round numbers to skip when parsing the data. Rounds or cycles are counted from 0 e.g. skip_rounds=[1, 2] + will parse only the first round 0 when there are only 3 cycles. include_cycle_in_channel_name Whether to include the cycle number in the channel name. @@ -326,6 +327,8 @@ def parse_processed_folder( """Parse a single folder containing images from a cyclical imaging platform.""" # get list of image paths, get channel name from OME data and cycle round number from filename # look for OME-TIFF files + # TODO: replace this pattern and the p.suffix in [".tif", ".tiff"] with a single function based on a regexp, like + # this one re.compile(r".*\.tif{1,2}$", re.IGNORECASE) path_files = list(path.glob(file_pattern)) logger.debug(path_files[0]) From c8fc779ca51559e2c2593953de93b50114b57ea6 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 10 Dec 2024 16:13:33 +0100 Subject: [PATCH 23/33] rename variable --- src/spatialdata_io/readers/seqfish.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 028753fb..187c178b 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,7 +37,7 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, - ROIs: list[int] | None = None, + rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ @@ -69,7 +69,7 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - ROIs + rois Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present. If "all" is specified, reads all remaining .tiff and .geojson files in the directory. 
imread_kwargs @@ -93,11 +93,11 @@ def seqfish( raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") rois_str: list[str] = [] - if ROIs is None: + if rois is None: for roi in n_rois: rois_str.append(f"{SK.ROI}{roi}") - elif isinstance(ROIs, list): - for roi in ROIs: + elif isinstance(rois, list): + for roi in rois: if str(roi) not in n_rois: raise ValueError(f"ROI{roi} not found.") rois_str.append(f"{SK.ROI}{roi}") From 5491545abc4f914b778cdcf979ed6d77c7975531 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 10 Dec 2024 16:22:09 +0100 Subject: [PATCH 24/33] new test datasets --- .github/workflows/prepare_test_data.yaml | 22 ++++++++++++++++++---- tests/test_xenium.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/prepare_test_data.yaml b/.github/workflows/prepare_test_data.yaml index 10d58c44..6c468b3a 100644 --- a/.github/workflows/prepare_test_data.yaml +++ b/.github/workflows/prepare_test_data.yaml @@ -18,15 +18,29 @@ jobs: run: | mkdir -p ./data cd ./data + + # 10x Genomics Xenium 2.0.0 curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip + # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation + curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip + + # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion + curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip + + # Spatial Genomics seqFISH v2 + curl -O https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip + - name: Unzip files run: | - unzip ./data/Xenium_V1_human_Breast_2fov_outs.zip -d ./data/Xenium_V1_human_Breast_2fov_outs - unzip ./data/Xenium_V1_human_Lung_2fov_outs.zip -d ./data/Xenium_V1_human_Lung_2fov_outs - rm ./data/Xenium_V1_human_Breast_2fov_outs.zip - rm ./data/Xenium_V1_human_Lung_2fov_outs.zip + cd ./data + for file in *.zip; do + dir="${file%.zip}" + mkdir -p "$dir" + unzip "$file" -d "$dir" + rm "$file" + done - name: Upload artifacts uses: actions/upload-artifact@v3 diff --git a/tests/test_xenium.py b/tests/test_xenium.py index fedb824b..2fb35ce1 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -63,3 +63,6 @@ def test_example_data(dataset: str, expected: str) -> None: extent = get_extent(sdata, exact=False) extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} assert str(extent) == expected + + +# TODO: add tests for Xenium 3.0.0 From 64f54f92843509b64407dadd3d5a591c9aa1fbfc Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Dec 2024 11:59:43 +0100 Subject: [PATCH 25/33] update test datasets --- .github/workflows/prepare_test_data.yaml | 3 ++- src/spatialdata_io/readers/xenium.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prepare_test_data.yaml b/.github/workflows/prepare_test_data.yaml index 6c468b3a..77f984fc 100644 --- a/.github/workflows/prepare_test_data.yaml +++ b/.github/workflows/prepare_test_data.yaml @@ -24,7 +24,8 @@ jobs: curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation - curl -O 
https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip + # this file seems to be corrupted; skipping it for now + # curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 54f911d2..2a5d6165 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -84,7 +84,7 @@ def xenium( .. seealso:: - - `10X Genomics Xenium file format `_. + - `10X Genomics Xenium file format `_. Parameters ---------- From 4c92edd53fe849a570ffe64338f73fce609551f7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:00:54 +0000 Subject: [PATCH 26/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 98 +++++++++++++++++------------------ README.md | 38 +++++++------- docs/contributing.md | 16 +++--- docs/template_usage.md | 114 ++++++++++++++++++++--------------------- 4 files changed, 133 insertions(+), 133 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa47ea7b..ed3d6456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,138 +10,138 @@ and this project adheres to [Semantic Versioning][]. ## incoming release -- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 -- (Macsima) added support @berombau #224 +- (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 +- (Macsima) added support @berombau #224 ## [0.1.6] - 2024-11-26 -- (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 -- (Visium HD) get transformation matrices even when only images are parsed #215 -- Support for `xarray.DataTree` (which was moved from `datatree.DataTree`) #232 +- (MERSCOPE) added `feature_key` attribute for points (i.e., the `'gene'` column) #210 +- (Visium HD) get transformation matrices even when only images are parsed #215 +- Support for `xarray.DataTree` (which was moved from `datatree.DataTree`) #232 ## [0.1.5] - 2024-09-25 ### Added -- (Xenium) added `dims` parameter for more control in `xenium_aligned_image()` +- (Xenium) added `dims` parameter for more control in `xenium_aligned_image()` ### Fixed -- Passing `rgb=None` to image model parser for both visium and visiumhd, leading to 3-4 channel images being - interpreted as RGB(A) -- Fix header bug Visium data #200 -- (Visium HD) Fix path parsing when images are missing #204 #206 +- Passing `rgb=None` to image model parser for both visium and visiumhd, leading to 3-4 channel images being + interpreted as RGB(A) +- Fix header bug Visium data #200 +- (Visium HD) Fix path parsing when images are missing #204 #206 ## [0.1.4] - 2024-08-07 ### Changed -- (Xenium) changed default target of table to labels; radii of circles computed from cells, not nuclei #179 -- (Visium HD) changed default geometry to squares from circles for the bins; added parameter to choose #183 -- (CosMx) dropping points element with zero-length from the cosmx reader #191 +- (Xenium) changed default target of table to labels; radii of circles computed 
from cells, not nuclei #179 +- (Visium HD) changed default geometry to squares from circles for the bins; added parameter to choose #183 +- (CosMx) dropping points element with zero-length from the cosmx reader #191 ## [0.1.3] - 2024-07-03 ### Added -- (Xenium) support reading multi-polygon selection files from the Xenium Explorer -- (ISS) An experimental loader to load elemental ISS data objects, e.g. raw.tif, label.tif and anndata.h5ad -- (Stereo-seq) Added reader @LLehner @timtreis @florianingelfinger #70 -- (MERSCOPE) Optional rioxarray backend for MERSCOPE data (reads chunks) -- (MERSCOPE) Can choose which elements should be loaded +- (Xenium) support reading multi-polygon selection files from the Xenium Explorer +- (ISS) An experimental loader to load elemental ISS data objects, e.g. raw.tif, label.tif and anndata.h5ad +- (Stereo-seq) Added reader @LLehner @timtreis @florianingelfinger #70 +- (MERSCOPE) Optional rioxarray backend for MERSCOPE data (reads chunks) +- (MERSCOPE) Can choose which elements should be loaded ### Fixed -- (Visium) Fixed issue with joining a SpatialElement with a table due to index values not being unique. - obs_names_make_unique is now called internally to enforce unique index values allowing for join operations. +- (Visium) Fixed issue with joining a SpatialElement with a table due to index values not being unique. + obs_names_make_unique is now called internally to enforce unique index values allowing for join operations. ### Changed -- (MERSCOPE) "global" coordinate system is used as a default instead of "microns" +- (MERSCOPE) "global" coordinate system is used as a default instead of "microns" ## [0.1.2] - 2024-03-30 ### Added -- (Visium HD) added reader, coauthored by @LLehner +- (Visium HD) added reader, coauthored by @LLehner ### Fixed -- (Xenium) reader for 1.0.1 (paper data) and unknown versions -- (Xenium) fix in reading "minimalistic" Xenium datasets #132 +- (Xenium) reader for 1.0.1 (paper data) and unknown versions +- (Xenium) fix in reading "minimalistic" Xenium datasets #132 ## [0.1.1] - 2024-03-24 ### Added -- (Xenium) support for post-xenium aligned images (IF, HE) -- (Xenium) reader for the selection coordinates file from the Xenium Explorer -- (Xenium) support for the new Xenium 2.0.0 (multimodal segmentation) -- (Xenium) reading multiscale labels from cells.zarr.zip -- (MCMICRO) support for TMAs (such as the data of exemplar-002) -- (DBiT-seq) reader -- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()` -- (Visium) support for raw reads (capture locations not under tissue) +- (Xenium) support for post-xenium aligned images (IF, HE) +- (Xenium) reader for the selection coordinates file from the Xenium Explorer +- (Xenium) support for the new Xenium 2.0.0 (multimodal segmentation) +- (Xenium) reading multiscale labels from cells.zarr.zip +- (MCMICRO) support for TMAs (such as the data of exemplar-002) +- (DBiT-seq) reader +- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()` +- (Visium) support for raw reads (capture locations not under tissue) ### Fixed -- (Xenium) fixed index (fail on write) -- (Xenium) renamed cells_as_shapes to cells_as_circles; set default to True -- (MERSCOPE) don't try to load unexisting elements #87 -- (Visium) fixed axes ordering +- (Xenium) fixed index (fail on write) +- (Xenium) renamed cells_as_shapes to cells_as_circles; set default to True +- (MERSCOPE) don't try to load unexisting elements #87 +- (Visium) fixed axes 
ordering ## [0.0.9] - 2023-11-06 ### Fixed -- (Xenium) bug when converting feature_name #81, from @fbnrst -- (Visium) visium() supports file counts without dataset_id #91 +- (Xenium) bug when converting feature_name #81, from @fbnrst +- (Visium) visium() supports file counts without dataset_id #91 ## [0.0.8] - 2023-10-02 ### Fixed -- (Xenium) coerce cell id to str #64 -- (MERSCOPE) fix coordinate transformation #68 -- (MERSCOPE) Improvements/fixes: merscope reader #73 +- (Xenium) coerce cell id to str #64 +- (MERSCOPE) fix coordinate transformation #68 +- (MERSCOPE) Improvements/fixes: merscope reader #73 ## [0.0.7] - 2023-07-23 ### Fixed -- Bugs in Xenium and MERSCOPE +- Bugs in Xenium and MERSCOPE ## [0.0.5] - 2023-06-21 ### Added -- MERFISH reader (from @quentinblampey) -- CODEX reader (from @LLehner) +- MERFISH reader (from @quentinblampey) +- CODEX reader (from @LLehner) ### Fixed -- Issues on Visium reader (thanks @ilia-kats) and Xenium reader +- Issues on Visium reader (thanks @ilia-kats) and Xenium reader ## [0.0.4] - 2023-05-23 ### Added -- Curio reader +- Curio reader ## [0.0.3] - 2023-05-22 ### Merged -- Merge pull request #40 from scverse/fix/categories +- Merge pull request #40 from scverse/fix/categories ## [0.0.2] - 2023-05-04 ### Changed -- Revert version regex (#37) +- Revert version regex (#37) ## [0.0.1] - 2023-05-04 ### Tested -- Test installation from pypi +- Test installation from pypi diff --git a/README.md b/README.md index a25b245c..2eaa490c 100644 --- a/README.md +++ b/README.md @@ -12,19 +12,19 @@ This package contains reader functions to load common spatial omics formats into SpatialData. Currently, we provide support for: -- 10x Genomics Visium® -- 10x Genomics Visium HD® -- 10x Genomics Xenium® -- Akoya PhenoCycler® (formerly CODEX®) -- Curio Seeker® -- DBiT-seq -- MCMICRO (output data) -- NanoString CosMx® -- Spatial Genomics GenePS® (seqFISH) -- Steinbock (output data) -- STOmics Stereo-seq® -- Vizgen MERSCOPE® (MERFISH) -- MACSima® (MACS® iQ View output) +- 10x Genomics Visium® +- 10x Genomics Visium HD® +- 10x Genomics Xenium® +- Akoya PhenoCycler® (formerly CODEX®) +- Curio Seeker® +- DBiT-seq +- MCMICRO (output data) +- NanoString CosMx® +- Spatial Genomics GenePS® (seqFISH) +- Steinbock (output data) +- STOmics Stereo-seq® +- Vizgen MERSCOPE® (MERFISH) +- MACSima® (MACS® iQ View output) Note: all mentioned technologies are registered trademarks of their respective companies. @@ -32,7 +32,7 @@ Note: all mentioned technologies are registered trademarks of their respective c Contributions for addressing the below limitations are very welcomed. -- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 +- Only Stereo-seq 7.x is supported, 8.x is not currently supported. https://github.com/scverse/spatialdata-io/issues/161 ### How to Contribute @@ -46,7 +46,7 @@ Contributions for addressing the below limitations are very welcomed. Please refer to the [documentation][link-docs]. In particular, the -- [API documentation][link-api]. +- [API documentation][link-api]. ## Installation @@ -76,10 +76,10 @@ If you found a bug, please use the [issue tracker][issue-tracker]. 
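A minimal usage sketch for the first-party readers listed above (shown here with the Xenium reader; the input and output paths are placeholders and must point to a local dataset):

```python
from spatialdata_io import xenium

# Parse a local Xenium output folder into a SpatialData object (placeholder path).
sdata = xenium("path/to/xenium_output")

# Persist the parsed object as a SpatialData Zarr store for downstream analysis.
sdata.write("xenium.zarr")
```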
Technologies that can be read into `SpatialData` objects using third-party libraries: -- METASPACE (MALDI, ...): [metaspace-converter](https://github.com/metaspace2020/metaspace-converter) -- PhenoCycler®: [SOPA](https://github.com/gustaveroussy/sopa) -- MACSima®: [SOPA](https://github.com/gustaveroussy/sopa) -- Hyperion® (Imaging Mass Cytometry): [SOPA](https://github.com/gustaveroussy/sopa) +- METASPACE (MALDI, ...): [metaspace-converter](https://github.com/metaspace2020/metaspace-converter) +- PhenoCycler®: [SOPA](https://github.com/gustaveroussy/sopa) +- MACSima®: [SOPA](https://github.com/gustaveroussy/sopa) +- Hyperion® (Imaging Mass Cytometry): [SOPA](https://github.com/gustaveroussy/sopa) ## Disclaimer diff --git a/docs/contributing.md b/docs/contributing.md index b678c149..7df41462 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -138,10 +138,10 @@ in the cookiecutter-scverse template. Please write documentation for new or changed features and use-cases. This project uses [sphinx][] with the following features: -- the [myst][] extension allows to write documentation in markdown/Markedly Structured Text -- [Numpy-style docstrings][numpydoc] (through the [napoloen][numpydoc-napoleon] extension). -- Jupyter notebooks as tutorials through [myst-nb][] (See [Tutorials with myst-nb](#tutorials-with-myst-nb-and-jupyter-notebooks)) -- [Sphinx autodoc typehints][], to automatically reference annotated input and output types +- the [myst][] extension allows to write documentation in markdown/Markedly Structured Text +- [Numpy-style docstrings][numpydoc] (through the [napoloen][numpydoc-napoleon] extension). +- Jupyter notebooks as tutorials through [myst-nb][] (See [Tutorials with myst-nb](#tutorials-with-myst-nb-and-jupyter-notebooks)) +- [Sphinx autodoc typehints][], to automatically reference annotated input and output types See the [scanpy developer docs](https://scanpy.readthedocs.io/en/latest/dev/documentation.html) for more information on how to write documentation. @@ -158,10 +158,10 @@ repository. #### Hints -- If you refer to objects from other packages, please add an entry to `intersphinx_mapping` in `docs/conf.py`. Only - if you do so can sphinx automatically create a link to the external documentation. -- If building the documentation fails because of a missing link that is outside your control, you can add an entry to - the `nitpick_ignore` list in `docs/conf.py` +- If you refer to objects from other packages, please add an entry to `intersphinx_mapping` in `docs/conf.py`. Only + if you do so can sphinx automatically create a link to the external documentation. +- If building the documentation fails because of a missing link that is outside your control, you can add an entry to + the `nitpick_ignore` list in `docs/conf.py` #### Building the docs locally diff --git a/docs/template_usage.md b/docs/template_usage.md index bf8152f6..f8efbc16 100644 --- a/docs/template_usage.md +++ b/docs/template_usage.md @@ -115,13 +115,13 @@ We recommend using [readthedocs.org][] (RTD) to build and host the documentation To enable readthedocs, head over to [their website][readthedocs.org] and sign in with your GitHub account. On the RTD dashboard choose "Import a Project" and follow the instructions to add your repository. -- Make sure to choose the correct name of the default branch. On GitHub, the name of the default branch should be `main` (it has - recently changed from `master` to `main`). -- We recommend to enable documentation builds for pull requests (PRs). 
This ensures that a PR doesn't introduce changes - that break the documentation. To do so, got to `Admin -> Advanced Settings`, check the - `Build pull requests for this projects` option, and click `Save`. For more information, please refer to - the [official RTD documentation](https://docs.readthedocs.io/en/stable/pull-requests.html). -- If you find the RTD builds are failing, you can disable the `fail_on_warning` option in `.readthedocs.yaml`. +- Make sure to choose the correct name of the default branch. On GitHub, the name of the default branch should be `main` (it has + recently changed from `master` to `main`). +- We recommend to enable documentation builds for pull requests (PRs). This ensures that a PR doesn't introduce changes + that break the documentation. To do so, got to `Admin -> Advanced Settings`, check the + `Build pull requests for this projects` option, and click `Save`. For more information, please refer to + the [official RTD documentation](https://docs.readthedocs.io/en/stable/pull-requests.html). +- If you find the RTD builds are failing, you can disable the `fail_on_warning` option in `.readthedocs.yaml`. If your project is private, there are ways to enable docs rendering on [readthedocs.org][] but it is more cumbersome and requires a different subscription for read the docs. See a guide [here](https://docs.readthedocs.io/en/stable/guides/importing-private-repositories.html). @@ -144,52 +144,52 @@ Once authorized, pre-commit.ci should automatically be activated. The following pre-commit checks are for code style and format: -- [black](https://black.readthedocs.io/en/stable/): standard code - formatter in Python. -- [isort](https://pycqa.github.io/isort/): sort module imports into - sections and types. -- [prettier](https://prettier.io/docs/en/index.html): standard code - formatter for non-Python files (e.g. YAML). -- [blacken-docs](https://github.com/asottile/blacken-docs): black on - python code in docs. +- [black](https://black.readthedocs.io/en/stable/): standard code + formatter in Python. +- [isort](https://pycqa.github.io/isort/): sort module imports into + sections and types. +- [prettier](https://prettier.io/docs/en/index.html): standard code + formatter for non-Python files (e.g. YAML). +- [blacken-docs](https://github.com/asottile/blacken-docs): black on + python code in docs. The following pre-commit checks are for errors and inconsistencies: -- [flake8](https://flake8.pycqa.org/en/latest/): standard check for errors in Python files. - - [flake8-tidy-imports](https://github.com/adamchainz/flake8-tidy-imports): - tidy module imports. - - [flake8-docstrings](https://github.com/PyCQA/flake8-docstrings): - pydocstyle extension of flake8. - - [flake8-rst-docstrings](https://github.com/peterjc/e8-rst-docstrings): - extension of `flake8-docstrings` for `rst` docs. - - [flake8-comprehensions](https://github.com/adamchainz/e8-comprehensions): - write better list/set/dict comprehensions. - - [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear): - find possible bugs and design issues in program. - - [flake8-blind-except](https://github.com/elijahandrews/flake8-blind-except): - checks for blind, catch-all `except` statements. -- [yesqa](https://github.com/asottile/yesqa): - remove unneccesary `# noqa` comments, follows additional dependencies listed above. -- [autoflake](https://github.com/PyCQA/autoflake): - remove unused imports and variables. -- [pre-commit-hooks](https://github.com/pre-commit/pre-commit-hooks): generic pre-commit hooks. 
- - **detect-private-key**: checks for the existence of private keys. - - **check-ast**: check whether files parse as valid python. - - **end-of-file-fixer**:check files end in a newline and only a newline. - - **mixed-line-ending**: checks mixed line ending. - - **trailing-whitespace**: trims trailing whitespace. - - **check-case-conflict**: check files that would conflict with case-insensitive file systems. -- [pyupgrade](https://github.com/asottile/pyupgrade): - upgrade syntax for newer versions of the language. -- **forbid-to-commit**: Make sure that `*.rej` files cannot be commited. These files are created by the - [automated template sync](#automated-template-sync) if there's a merge conflict and need to be addressed manually. +- [flake8](https://flake8.pycqa.org/en/latest/): standard check for errors in Python files. + - [flake8-tidy-imports](https://github.com/adamchainz/flake8-tidy-imports): + tidy module imports. + - [flake8-docstrings](https://github.com/PyCQA/flake8-docstrings): + pydocstyle extension of flake8. + - [flake8-rst-docstrings](https://github.com/peterjc/e8-rst-docstrings): + extension of `flake8-docstrings` for `rst` docs. + - [flake8-comprehensions](https://github.com/adamchainz/e8-comprehensions): + write better list/set/dict comprehensions. + - [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear): + find possible bugs and design issues in program. + - [flake8-blind-except](https://github.com/elijahandrews/flake8-blind-except): + checks for blind, catch-all `except` statements. +- [yesqa](https://github.com/asottile/yesqa): + remove unneccesary `# noqa` comments, follows additional dependencies listed above. +- [autoflake](https://github.com/PyCQA/autoflake): + remove unused imports and variables. +- [pre-commit-hooks](https://github.com/pre-commit/pre-commit-hooks): generic pre-commit hooks. + - **detect-private-key**: checks for the existence of private keys. + - **check-ast**: check whether files parse as valid python. + - **end-of-file-fixer**:check files end in a newline and only a newline. + - **mixed-line-ending**: checks mixed line ending. + - **trailing-whitespace**: trims trailing whitespace. + - **check-case-conflict**: check files that would conflict with case-insensitive file systems. +- [pyupgrade](https://github.com/asottile/pyupgrade): + upgrade syntax for newer versions of the language. +- **forbid-to-commit**: Make sure that `*.rej` files cannot be commited. These files are created by the + [automated template sync](#automated-template-sync) if there's a merge conflict and need to be addressed manually. ### How to disable or add pre-commit checks -- To ignore lint warnigs from **flake8**, see [Ignore certain lint warnings](#how-to-ignore-certain-lint-warnings). -- You can add or remove pre-commit checks by simply deleting relevant lines in the `.pre-commit-config.yaml` file. - Some pre-commit checks have additional options that can be specified either in the `pyproject.toml` or tool-specific - config files, such as `.prettierrc.yml` for **prettier** and `.flake8` for **flake8**. +- To ignore lint warnigs from **flake8**, see [Ignore certain lint warnings](#how-to-ignore-certain-lint-warnings). +- You can add or remove pre-commit checks by simply deleting relevant lines in the `.pre-commit-config.yaml` file. + Some pre-commit checks have additional options that can be specified either in the `pyproject.toml` or tool-specific + config files, such as `.prettierrc.yml` for **prettier** and `.flake8` for **flake8**. 
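Besides the config-level ignores covered in the next section, flake8 also supports silencing a single line with a `# noqa` comment; the yesqa hook listed above removes such comments again once they become unnecessary. A small illustrative sketch (the error codes are standard flake8 codes, and the snippet exists only to demonstrate the syntax):

```python
# F401 ("imported but unused") is silenced on this line only; project-wide ignores
# belong in the flake8 configuration instead (see the next section).
import os  # noqa: F401

value = None
# E711 ("comparison to None") is silenced here purely to show the per-line syntax;
# idiomatic code would write "value is None".
if value == None:  # noqa: E711
    print("value is unset")
```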
### How to ignore certain lint warnings @@ -220,10 +220,10 @@ W504 Scverse ecosystem packages should operate on [AnnData][] and/or [MuData][] data structures and typically use an API as originally [introduced by scanpy][scanpy-api] with the following submodules: -- `pp` for preprocessing -- `tl` for tools (that, compared to `pp` generate interpretable output, often associated with a corresponding plotting - function) -- `pl` for plotting functions +- `pp` for preprocessing +- `tl` for tools (that, compared to `pp` generate interpretable output, often associated with a corresponding plotting + function) +- `pl` for plotting functions You may add additional submodules as appropriate. While we encourage to follow a scanpy-like API for ecosystem packages, there may also be good reasons to choose a different approach, e.g. using an object-oriented API. @@ -280,12 +280,12 @@ The pull request can only be merged after all `*.rej` files have been removed. :::{tip} The following hints may be useful to work with the template sync: -- GitHub automatically disables scheduled actions if there has been not activity to the repository for 60 days. - You can re-enable or manually trigger the sync by navigating to `Actions` -> `Sync Template` in your GitHub repository. -- If you want to ignore certain files from the template update, you can add them to the `[tool.cruft]` section in the - `pyproject.toml` file in the root of your repository. More details are described in the - [cruft documentation][cruft-update-project]. -- To disable the sync entirely, simply remove the file `.github/workflows/sync.yaml`. +- GitHub automatically disables scheduled actions if there has been not activity to the repository for 60 days. + You can re-enable or manually trigger the sync by navigating to `Actions` -> `Sync Template` in your GitHub repository. +- If you want to ignore certain files from the template update, you can add them to the `[tool.cruft]` section in the + `pyproject.toml` file in the root of your repository. More details are described in the + [cruft documentation][cruft-update-project]. +- To disable the sync entirely, simply remove the file `.github/workflows/sync.yaml`. ::: From f219088cc055f5c6bf66f14232ffdbad05450b5a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Dec 2024 20:16:54 +0100 Subject: [PATCH 27/33] fix scale factor labels; fix table --- src/spatialdata_io/readers/seqfish.py | 133 ++++++++++++++++++-------- 1 file changed, 93 insertions(+), 40 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index 187c178b..d7aa656a 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -37,8 +37,10 @@ def seqfish( load_points: bool = True, load_shapes: bool = True, load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + raster_models_scale_factors: list[float] | None = None, ) -> SpatialData: """ Read *seqfish* formatted dataset. @@ -69,38 +71,47 @@ def seqfish( Whether to load cells as shape. load_additional_shapes Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. + If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + cells_as_circles + Whether to read cells also as circles instead of labels. rois Which ROIs (specified as integers) to load. 
Only necessary if multiple ROIs present. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. imread_kwargs Keyword arguments to pass to :func:`dask_image.imread.imread`. Returns ------- :class:`spatialdata.SpatialData` + + Examples + -------- + This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. + Please check that Roi1 is present in your dataset, otherwise adjust the code below. + >>> from spatialdata_io import seqfish + >>> sdata = seqfish("path/to/raw/data") + >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata.set_table_annotates_spatialelement( + ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... ) + >>> sdata.write("path/to/data.zarr") """ path = Path(path) - count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}.*{re.escape(SK.CSV_FILE)}$") - count_files = [i for i in os.listdir(path) if count_file_pattern.match(i)] - roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") - n_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} + count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}{re.escape(SK.CSV_FILE)}$") + count_files = [f for f in os.listdir(path) if count_file_pattern.match(f)] if not count_files: raise ValueError( f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme." ) - matched = count_file_pattern.match(count_files[0]) - if matched is None: - raise ValueError(f"File {count_files[0]} does not match the pattern {count_file_pattern}") - rois_str: list[str] = [] + roi_pattern = re.compile(f"^{SK.ROI}(\\d+)") + found_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))} if rois is None: - for roi in n_rois: - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in found_rois] elif isinstance(rois, list): for roi in rois: - if str(roi) not in n_rois: + if str(roi) not in found_rois: raise ValueError(f"ROI{roi} not found.") - rois_str.append(f"{SK.ROI}{roi}") + rois_str = [f"{SK.ROI}{roi}" for roi in rois] else: raise ValueError("Invalid type for 'roi'. Must be list[int] or None.") @@ -119,33 +130,54 @@ def get_cell_mask_file(roi: str) -> str: def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" - scaled = {} - adatas: dict[str, ad.AnnData] = {} + # parse table information + tables: dict[str, ad.AnnData] = {} for roi_str in rois_str: - assert isinstance(roi_str, str) or roi_str is None - cell_file = get_cell_file(roi_str) + # parse cell gene expression data count_matrix = get_count_file(roi_str) - adata = ad.read_csv(path / count_matrix, delimiter=",") + df = pd.read_csv(path / count_matrix, delimiter=",") + instance_id = df.iloc[:, 0].astype(str) + expression = df.drop(columns=["Unnamed: 0"]) + expression.set_index(instance_id, inplace=True) + adata = ad.AnnData(expression) + + # parse cell spatial information + cell_file = get_cell_file(roi_str) cell_info = pd.read_csv(path / cell_file, delimiter=",") + cell_info["label"] = cell_info["label"].astype("str") + # below, the obsm are assigned by position, not by index. 
Here we check that we can do it + assert cell_info["label"].to_numpy().tolist() == adata.obs.index.to_numpy().tolist() + cell_info.set_index("label", inplace=True) + adata.obs[SK.AREA] = cell_info[SK.AREA] adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() - adata.obs[SK.AREA] = np.reshape(cell_info[SK.AREA].to_numpy(), (-1, 1)) - region = f"cells_{roi_str}" + + # map tables to cell labels (defined later) + region = os.path.splitext(get_cell_mask_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region - adata.obs[SK.INSTANCE_KEY_TABLE] = adata.obs.index.astype(int) - adatas[roi_str] = adata + adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") + adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) + adata.obs = adata.obs.reset_index(drop=True) + tables[f"table_{roi_str}"] = TableModel.parse( + adata, + region=region, + region_key=SK.REGION_KEY.value, + instance_key=SK.INSTANCE_KEY_TABLE.value, + ) + + # parse scale factors to scale images and labels + scaled = {} + for roi_str in rois_str: scaled[roi_str] = Scale( np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)), axes=("y", "x"), ) - scale_factors = [2, 2, 2, 2] - if load_images: images = { f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse( imread(path / get_dapi_file(x), **imread_kwargs), dims=("c", "y", "x"), - scale_factors=scale_factors, + scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, ) for x in rois_str @@ -158,8 +190,8 @@ def get_transcript_file(roi: str) -> str: f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), - scale_factors=scale_factors, - transformations={"global": Identity()}, + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, ) for x in rois_str } @@ -172,7 +204,7 @@ def get_transcript_file(roi: str) -> str: pd.read_csv(path / get_transcript_file(x), delimiter=","), coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - # instance_key=SK.INSTANCE_KEY_POINTS.value, # TODO: should be optional but parameter might get deprecated anyway + instance_key=None, transformations={"global": Identity()}, ) for x in rois_str @@ -180,17 +212,6 @@ def get_transcript_file(roi: str) -> str: else: points = {} - tables = {} - for name, adata in adatas.items(): - adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") - adata.obs = adata.obs.reset_index(drop=True) - tables[name] = TableModel.parse( - adata, - region=f"cells_{name}", - region_key=SK.REGION_KEY.value, - instance_key=SK.INSTANCE_KEY_TABLE.value, - ) - if load_shapes: shapes = { f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( @@ -200,7 +221,7 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, adatas.values()) + for x, adata in zip(rois_str, tables.values()) } else: shapes = {} @@ -247,3 +268,35 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] + + +if __name__ == "__main__": + path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" + sdata = seqfish( + path=path_read, + # load_images=True, + # load_labels=True, 
+ # load_points=True, + # rois=[1], + ) + # sdata.pl.render_labels(color='Arg1').pl.show() + import matplotlib.pyplot as plt + + # plt.show() + # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" + # sdata.set_table_annotates_spatialelement( + # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + # ) + + gene_name = "Arg1" + ( + sdata.pl.render_images("Roi1_DAPI", cmap="gray") + .pl.render_labels("Roi1_Segmentation", color=gene_name) + .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") + .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", figsize=(10, 5)) + ) + plt.show() + + from napari_spatialdata import Interactive + + Interactive(sdata) From ff14eed2b0b144085c4758f1a286c3c7d3b0b0f2 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 11:24:57 +0100 Subject: [PATCH 28/33] before removing 'load_additional_shapes' --- src/spatialdata_io/readers/seqfish.py | 90 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index d7aa656a..c41a940c 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -36,7 +36,7 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_shapes: Literal["segmentation", "boundaries", "all"] | str | None = None, + load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,9 +69,11 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_shapes - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. - If "all" is specified, reads all remaining .tiff and .geojson files in the directory. + load_additional_geometries + Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is + specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are + "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, + no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. 
rois @@ -198,19 +200,24 @@ def get_transcript_file(roi: str) -> str: else: labels = {} + points = {} if load_points: - points = { - f"{os.path.splitext(get_transcript_file(x))[0]}": PointsModel.parse( - pd.read_csv(path / get_transcript_file(x), delimiter=","), + for x in rois_str: + + + # prepare data + name = f"{os.path.splitext(get_transcript_file(x))[0]}" + p = pd.read_csv(path / get_transcript_file(x), delimiter=",") + instance_key_points = SK.INSTANCE_KEY_POINTS.value if SK.INSTANCE_KEY_POINTS.value in p.columns else None + + # call parser + points[name] = PointsModel.parse( + p, coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y}, feature_key=SK.FEATURE_KEY.value, - instance_key=None, + instance_key=instance_key_points, transformations={"global": Identity()}, ) - for x in rois_str - } - else: - points = {} if load_shapes: shapes = { @@ -226,33 +233,45 @@ def get_transcript_file(roi: str) -> str: else: shapes = {} - if load_additional_shapes is not None: - shape_file_names = [] + if load_additional_geometries is not None: + labels_filenames = [] + shapes_filenames = [] + for filename in os.listdir(path): - if filename.endswith((SK.TIFF_FILE, SK.GEOJSON_FILE)): - if load_additional_shapes == "all": - if not any(key in filename for key in images.keys()) and not any( - key in filename for key in labels.keys() - ): - shape_file_names.append(filename) - elif load_additional_shapes == "segmentation": - if SK.SEGMENTATION in filename and not any(key in filename for key in labels.keys()): - shape_file_names.append(filename) - elif load_additional_shapes == "boundaries": - if SK.BOUNDARIES in filename: - shape_file_names.append(filename) - elif isinstance(load_additional_shapes, str): - if load_additional_shapes in filename: - shape_file_names.append(filename) - else: - raise ValueError(f"No file found with identifier {load_additional_shapes}") - - for x in range(len(shape_file_names)): - shapes[f"{os.path.splitext(shape_file_names[x])[0]}"] = ShapesModel.parse( - path / shape_file_names[x], + if filename.endswith(SK.TIFF_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + labels_filenames.append(filename) + continue + if filename.endswith(SK.GEOJSON_FILE): + if ( + load_additional_geometries == "all" + or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + ): + shapes_filenames.append(filename) + continue + raise ValueError(f"No file found with identifier {load_additional_geometries}") + + for labels_filename in labels_filenames: + labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( + imread(path / labels_filename, **imread_kwargs).squeeze(), + dims=("y", "x"), + scale_factors=raster_models_scale_factors, + transformations={"global": scaled[x]}, + ) + + for shape_filename in shapes_filenames: + # TODO: check that indices of newly parsed shapes match the indices of the existing shapes + shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( + path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) + pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,6 +306,7 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # 
sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) + import spatialdata_plot gene_name = "Arg1" ( From a4434695d18829171cd804a1eb932aef9ec71491 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:25:18 +0000 Subject: [PATCH 29/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/seqfish.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..996c6d4d 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -204,7 +204,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -241,16 +240,20 @@ def get_transcript_file(roi: str) -> str: if filename.endswith(SK.TIFF_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "segmentation" + and SK.SEGMENTATION in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): labels_filenames.append(filename) continue if filename.endswith(SK.GEOJSON_FILE): if ( load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename + or load_additional_geometries == "boundaries" + and SK.BOUNDARIES in filename + or isinstance(load_additional_geometries, str) + and load_additional_geometries in filename ): shapes_filenames.append(filename) continue @@ -271,7 +274,6 @@ def get_transcript_file(roi: str) -> str: index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -306,7 +308,6 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke # sdata.set_table_annotates_spatialelement( # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" # ) - import spatialdata_plot gene_name = "Arg1" ( From 375fa5cc25b3666d4fc7077d18bf256e0f1e7903 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 13:21:48 +0100 Subject: [PATCH 30/33] finished code review --- src/spatialdata_io/readers/seqfish.py | 115 +++++--------------------- 1 file changed, 22 insertions(+), 93 deletions(-) diff --git a/src/spatialdata_io/readers/seqfish.py b/src/spatialdata_io/readers/seqfish.py index c41a940c..27431c09 100644 --- a/src/spatialdata_io/readers/seqfish.py +++ b/src/spatialdata_io/readers/seqfish.py @@ -6,7 +6,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any, Literal +from typing import Any import anndata as ad import numpy as np @@ -36,7 +36,6 @@ def seqfish( load_labels: bool = True, load_points: bool = True, load_shapes: bool = True, - load_additional_geometries: Literal["segmentation", "boundaries", "all"] | str | None = 
None, cells_as_circles: bool = False, rois: list[int] | None = None, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -69,11 +68,6 @@ def seqfish( Whether to load the transcript locations. load_shapes Whether to load cells as shape. - load_additional_geometries - Whether to load additional shapes such as segmentation or boundaries for both cells and nuclei. If "all" is - specified (default), it reads all remaining .tiff and .geojson files in the directory. Other options are - "segmentation", "boundaries", or a string to match in the filename (as a subset). If `None` is passed, - no additional geometry is loaded. cells_as_circles Whether to read cells also as circles instead of labels. rois @@ -87,13 +81,13 @@ def seqfish( Examples -------- - This code shows how to change the annotation target of the table from the cell labels to the cell cirlces. - Please check that Roi1 is present in your dataset, otherwise adjust the code below. + This code shows how to change the annotation target of the table from the cell labels to the cell boundaries. + Please check that the string Roi1 is used in the naming of your dataset, otherwise adjust the code below. >>> from spatialdata_io import seqfish >>> sdata = seqfish("path/to/raw/data") - >>> sdata["table"].obs["region"] = "Roi1_CellCoordinates" + >>> sdata["table_Roi1"].obs["region"] = "Roi1_Boundaries" >>> sdata.set_table_annotates_spatialelement( - ... table_name="Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" + ... table_name="table_Roi1", region="Roi1_Boundaries", region_key="region", instance_key="instance_id" ... ) >>> sdata.write("path/to/data.zarr") """ @@ -126,9 +120,12 @@ def get_count_file(roi: str) -> str: def get_dapi_file(roi: str) -> str: return f"{roi}_{SK.DAPI}{SK.TIFF_FILE}" - def get_cell_mask_file(roi: str) -> str: + def get_cell_segmentation_labels_file(roi: str) -> str: return f"{roi}_{SK.SEGMENTATION}{SK.TIFF_FILE}" + def get_cell_segmentation_shapes_file(roi: str) -> str: + return f"{roi}_{SK.BOUNDARIES}{SK.GEOJSON_FILE}" + def get_transcript_file(roi: str) -> str: return f"{roi}_{SK.TRANSCRIPT_COORDINATES}{SK.CSV_FILE}" @@ -154,7 +151,7 @@ def get_transcript_file(roi: str) -> str: adata.obsm[SK.SPATIAL_KEY] = cell_info[[SK.CELL_X, SK.CELL_Y]].to_numpy() # map tables to cell labels (defined later) - region = os.path.splitext(get_cell_mask_file(roi_str))[0] + region = os.path.splitext(get_cell_segmentation_labels_file(roi_str))[0] adata.obs[SK.REGION_KEY] = region adata.obs[SK.REGION_KEY] = adata.obs[SK.REGION_KEY].astype("category") adata.obs[SK.INSTANCE_KEY_TABLE] = instance_id.to_numpy().astype(np.uint16) @@ -189,8 +186,8 @@ def get_transcript_file(roi: str) -> str: if load_labels: labels = { - f"{os.path.splitext(get_cell_mask_file(x))[0]}": Labels2DModel.parse( - imread(path / get_cell_mask_file(x), **imread_kwargs).squeeze(), + f"{os.path.splitext(get_cell_segmentation_labels_file(x))[0]}": Labels2DModel.parse( + imread(path / get_cell_segmentation_labels_file(x), **imread_kwargs).squeeze(), dims=("y", "x"), scale_factors=raster_models_scale_factors, transformations={"global": scaled[x]}, @@ -204,7 +201,6 @@ def get_transcript_file(roi: str) -> str: if load_points: for x in rois_str: - # prepare data name = f"{os.path.splitext(get_transcript_file(x))[0]}" p = pd.read_csv(path / get_transcript_file(x), delimiter=",") @@ -219,59 +215,25 @@ def get_transcript_file(roi: str) -> str: transformations={"global": Identity()}, ) - if load_shapes: - shapes = { - 
f"{os.path.splitext(get_cell_file(x))[0]}": ShapesModel.parse( + shapes = {} + if cells_as_circles: + for x, adata in zip(rois_str, tables.values()): + shapes[f"{os.path.splitext(get_cell_file(x))[0]}"] = ShapesModel.parse( adata.obsm[SK.SPATIAL_KEY], geometry=0, radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi), index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), transformations={"global": Identity()}, ) - for x, adata in zip(rois_str, tables.values()) - } - else: - shapes = {} - - if load_additional_geometries is not None: - labels_filenames = [] - shapes_filenames = [] - - for filename in os.listdir(path): - if filename.endswith(SK.TIFF_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "segmentation" and SK.SEGMENTATION in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - labels_filenames.append(filename) - continue - if filename.endswith(SK.GEOJSON_FILE): - if ( - load_additional_geometries == "all" - or load_additional_geometries == "boundaries" and SK.BOUNDARIES in filename - or isinstance(load_additional_geometries, str) and load_additional_geometries in filename - ): - shapes_filenames.append(filename) - continue - raise ValueError(f"No file found with identifier {load_additional_geometries}") - - for labels_filename in labels_filenames: - labels[f"{os.path.splitext(labels_filename)[0]}"] = Labels2DModel.parse( - imread(path / labels_filename, **imread_kwargs).squeeze(), - dims=("y", "x"), - scale_factors=raster_models_scale_factors, + if load_shapes: + for x in rois_str: + # this assumes that the index matches the instance key of the table. A more robust approach could be + # implemented, as described here https://github.com/scverse/spatialdata-io/issues/249 + shapes[f"{os.path.splitext(get_cell_segmentation_shapes_file(x))[0]}"] = ShapesModel.parse( + path / get_cell_segmentation_shapes_file(x), transformations={"global": scaled[x]}, - ) - - for shape_filename in shapes_filenames: - # TODO: check that indices of newly parsed shapes match the indices of the existing shapes - shapes[f"{os.path.splitext(shape_filename)[0]}"] = ShapesModel.parse( - path / shape_filename, index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(), - transformations={"global": Identity()}, ) - pass sdata = SpatialData(images=images, labels=labels, points=points, tables=tables, shapes=shapes) @@ -287,36 +249,3 @@ def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_ke scalefactor_x = element.attrib[scalefactor_x_key] scalefactor_y = element.attrib[scalefactor_y_key] return [float(scalefactor_x), float(scalefactor_y)] - - -if __name__ == "__main__": - path_read = "/Users/macbook/ssd/biodata/seqfish/instrument 2 official" - sdata = seqfish( - path=path_read, - # load_images=True, - # load_labels=True, - # load_points=True, - # rois=[1], - ) - # sdata.pl.render_labels(color='Arg1').pl.show() - import matplotlib.pyplot as plt - - # plt.show() - # sdata["table_Roi1"].obs["region"] = "Roi1_CellCoordinates" - # sdata.set_table_annotates_spatialelement( - # table_name="table_Roi1", region="Roi1_CellCoordinates", region_key="region", instance_key="instance_id" - # ) - import spatialdata_plot - - gene_name = "Arg1" - ( - sdata.pl.render_images("Roi1_DAPI", cmap="gray") - .pl.render_labels("Roi1_Segmentation", color=gene_name) - .pl.render_points("Roi1_TranscriptList", color="name", groups=gene_name, palette="orange") - .pl.show(title=f"{gene_name} expression over DAPI image", coordinate_systems="global", 
figsize=(10, 5)) - ) - plt.show() - - from napari_spatialdata import Interactive - - Interactive(sdata) From 28b0eb3b0d42bc014e75f53b61c316504d84c375 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 16:30:48 +0100 Subject: [PATCH 31/33] tests for seqfish --- tests/test_seqfish.py | 27 +++++++++++++++++++++++++++ tests/test_xenium.py | 8 ++------ 2 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 tests/test_seqfish.py diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py new file mode 100644 index 00000000..e0d66655 --- /dev/null +++ b/tests/test_seqfish.py @@ -0,0 +1,27 @@ +import math +from pathlib import Path + +import pytest + +from spatialdata_io.readers.seqfish import seqfish +from tests._utils import skip_if_below_python_version + + +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk +@skip_if_below_python_version() +@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize("rois", [[1], None]) +@pytest.mark.parametrize("cells_as_circles", [False, True]) +def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: + f = Path("./data") / dataset + assert f.is_dir() + sdata = seqfish(f, cells_as_circles=cells_as_circles, rois=rois) + from spatialdata import get_extent + + extent = get_extent(sdata, exact=False) + extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent} + if cells_as_circles: + # manual correction required to take into account for the circle radii + expected = "{'y': (-2, 109), 'x': (-2, 109)}" + assert str(extent) == expected diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 2fb35ce1..32408323 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -40,12 +40,8 @@ def test_roundtrip_with_data_limits() -> None: assert np.array_equal(cell_id_str, f0(*f1(cell_id_str))) -# The datasets should be downloaded from -# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data#test-data -# and placed in the "data" directory; if you run the tests locally you may need to create a symlink in "tests/data" -# pointing to "data". 
-# The GitHub workflow "prepare_test_data.yaml" takes care of downloading the datasets and uploading an artifact for the -# tests to use +# See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on +# how to download and place the data on disk @skip_if_below_python_version() @pytest.mark.parametrize( "dataset,expected", From 91efd4519072aab66b405cd69ec40c8a59b229a0 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 12 Dec 2024 17:01:04 +0100 Subject: [PATCH 32/33] fix seqfish test path --- tests/test_seqfish.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_seqfish.py b/tests/test_seqfish.py index e0d66655..bdbf2a60 100644 --- a/tests/test_seqfish.py +++ b/tests/test_seqfish.py @@ -10,7 +10,9 @@ # See https://github.com/scverse/spatialdata-io/blob/main/.github/workflows/prepare_test_data.yaml for instructions on # how to download and place the data on disk @skip_if_below_python_version() -@pytest.mark.parametrize("dataset,expected", [("instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]) +@pytest.mark.parametrize( + "dataset,expected", [("seqfish-2-test-dataset/instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")] +) @pytest.mark.parametrize("rois", [[1], None]) @pytest.mark.parametrize("cells_as_circles", [False, True]) def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None: From e9b74c0a1ada826c1b3fc579401d776c887c8923 Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:38:58 +0100 Subject: [PATCH 33/33] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed3d6456..b9f29809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning][]. - (Visium/Visium HD) lowres and hires images now mapped also to the 'global' coordinate system #230 - (Macsima) added support @berombau #224 +- (seqFISH) support for v2 instrument #227 ## [0.1.6] - 2024-11-26
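The new CHANGELOG entry corresponds to the `seqfish()` reader revised in the commits above and exercised by `tests/test_seqfish.py`; a minimal invocation sketch (the dataset path mirrors the layout assumed by the test and must exist locally):

```python
from spatialdata import get_extent

from spatialdata_io.readers.seqfish import seqfish

# Parse a seqFISH v2 dataset; all detected ROIs are loaded and cells are additionally read as circles.
sdata = seqfish(
    "data/seqfish-2-test-dataset/instrument 2 official",
    cells_as_circles=True,
    rois=None,
)

# Approximate bounding box of all elements in the 'global' coordinate system, as checked in the test.
print(get_extent(sdata, exact=False))
```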