From aa51fcb2ee8b507928f36cb43b8a156b146bf666 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 9 Jan 2025 09:02:51 -0800 Subject: [PATCH 1/5] Pin torchdata + fix some typing errors (#1325) * Pin torchdata * Fix type annotation * Fix reassignment of array with different type * Fix typing for CountsAccumulator assignment * Enforce 64bit result in MeanAccumulator (unclear if this was the case before) * Fix typing error * Don't prematurely close census in test * Revert "Fix typing for CountsAccumulator assignment" This reverts commit 7a7538bccfddc813fbc35f0b0b79159136d313fd. * Revert "Enforce 64bit result in MeanAccumulator (unclear if this was the case before)" This reverts commit 8ba276a0107a15ab50a8281d7a21ce3ae2f05575. * Unrevert * Pin tiledbsoma<1.15 --- api/python/cellxgene_census/pyproject.toml | 4 ++-- .../ml/huggingface/cell_dataset_builder.py | 2 +- .../src/cellxgene_census/experimental/pp/_online.py | 6 +++--- .../tests/experimental/pp/test_online.py | 2 +- api/python/cellxgene_census/tests/test_get_helpers.py | 2 -- .../build_soma/validate_soma.py | 10 ++++++---- tools/census_contrib/src/census_contrib/load.py | 4 +--- 7 files changed, 14 insertions(+), 16 deletions(-) diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index e04c3e144..03103d795 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -30,7 +30,7 @@ dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma>=1.12.3,!=1.14.1", + "tiledbsoma>=1.12.3,!=1.14.1,<1.15", "anndata", "numpy>=1.23,<2.0", "requests", @@ -41,7 +41,7 @@ dependencies= [ [project.optional-dependencies] experimental = [ "torch", - "torchdata~=0.7", + "torchdata~=0.7,<0.10", "scikit-learn>=1.2", "scikit-misc>=0.2,<0.4", # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels "datasets~=2.0", diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index 5a9c2d626..2d175bae9 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -8,7 +8,7 @@ from tiledbsoma import Experiment, ExperimentAxisQuery -class CellDatasetBuilder(ExperimentAxisQuery[Experiment], ABC): # type: ignore +class CellDatasetBuilder(ExperimentAxisQuery, ABC): # type: ignore """Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery results into a Hugging Face Dataset in which each item represents one cell. Subclasses implement the `cell_item()` method to process each row of an X layer diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index 13bbe76c5..455fc07a9 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -96,7 +96,7 @@ def __init__(self, n_samples: int, n_variables: int, nnz_only: bool = False): self.n_samples = n_samples self.nnz_only = nnz_only # If we want to exclude zeros, we need to keep track of the denominator - self.n = np.zeros(n_variables) + self.n = np.zeros(n_variables, dtype=np.float64) def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32]) -> None: if self.nnz_only: @@ -106,9 +106,9 @@ def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32 def finalize(self) -> npt.NDArray[np.float64]: if self.nnz_only: - return self.u / self.n + return np.divide(self.u, self.n, dtype=np.float64) else: - return self.u / self.n_samples + return np.divide(self.u, self.n_samples, dtype=np.float64) class CountsAccumulator: diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_online.py b/api/python/cellxgene_census/tests/experimental/pp/test_online.py index 1b75ed5c9..bd352fc88 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_online.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_online.py @@ -138,7 +138,7 @@ def test_counts(matrix: sparse.coo_matrix, n_batches: int, stride: int) -> None: assert n_samples.sum() == matrix.shape[0] assert len(n_samples) == n_batches - clip_val = 50 * np.random.rand(n_batches, matrix.shape[1]) + clip_val = (50 * np.random.rand(n_batches, matrix.shape[1])).astype(np.float64) ca = CountsAccumulator(n_batches, matrix.shape[1], clip_val) for i in range(0, matrix.nnz, stride): diff --git a/api/python/cellxgene_census/tests/test_get_helpers.py b/api/python/cellxgene_census/tests/test_get_helpers.py index de75e05d0..2237f4472 100644 --- a/api/python/cellxgene_census/tests/test_get_helpers.py +++ b/api/python/cellxgene_census/tests/test_get_helpers.py @@ -34,5 +34,3 @@ def test_get_presence_matrix(organism: str, census: soma.Collection) -> None: assert pm.shape[1] == len( census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas() ) - - census.close() diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py index 2aea90943..f112afcf6 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py @@ -761,10 +761,12 @@ def _validate_X_layers_raw_contents( # the expected_X matrix. raw_sum = np.zeros((len(obs_joinids_split),), dtype=np.float64) # 64 bit for numerical stability np.add.at(raw_sum, rows_by_position, X_raw_data) - raw_sum = raw_sum.astype( - CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype() - ) # cast to the storage type - assert np.allclose(raw_sum, obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy()) + assert np.allclose( + raw_sum.astype( + CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype() + ), # cast to the storage type + obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy(), + ) del raw_sum # Assertion 1 - the contents of the X matrix are EQUAL for all var values present in the AnnData diff --git a/tools/census_contrib/src/census_contrib/load.py b/tools/census_contrib/src/census_contrib/load.py index 0c2a9539e..63f186a76 100644 --- a/tools/census_contrib/src/census_contrib/load.py +++ b/tools/census_contrib/src/census_contrib/load.py @@ -230,15 +230,13 @@ def __next__(self) -> pa.Table: i = np.empty((n_embeddings, self.n_features), dtype=np.int64) i.T[:] = next_block - i = i.ravel() j = np.empty((n_embeddings, self.n_features), dtype=np.int64) j[:] = np.arange(self.n_features) - j = j.ravel() d = self._scale * self.rng.random((n_embeddings * self.n_features), dtype=np.float32) + self._offset - return pa.Table.from_pydict({"i": i, "j": j, "d": d}) + return pa.Table.from_pydict({"i": i.ravel(), "j": j.ravel(), "d": d}) @property def type(self) -> pa.DataType: From 7f617e36d586799a97e4ffb9587adebe17a2c988 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 10 Jan 2025 09:27:25 -0800 Subject: [PATCH 2/5] Update versions tested and package installation in LTS compat tests (#1326) --- .github/workflows/lts-compat-check.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/lts-compat-check.yml b/.github/workflows/lts-compat-check.yml index 9beb636bf..59b6d3a7d 100644 --- a/.github/workflows/lts-compat-check.yml +++ b/.github/workflows/lts-compat-check.yml @@ -20,14 +20,14 @@ jobs: census-build-version: # Add additional LTS releases as they occur - "latest" - "stable" + - "2024-07-01" - "2023-12-15" - "2023-07-25" - "2023-05-15" py-pkg-version: - - "~=1.10.0" - - "~=1.11.0" - - "~=1.12.0" - - "~=1.13.0" + - "~=1.14.0" + - "~=1.15.0" + - "~=1.16.0" - "head-of-main" runs-on: ${{matrix.os}} @@ -43,12 +43,11 @@ jobs: - name: Install dependencies run: | python -m pip install -U pip setuptools wheel - GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt if [ ${{matrix.py-pkg-version}} == "head-of-main" ]; then - pip install -e ./api/python/cellxgene_census/ + pip install -e ./api/python/cellxgene_census/ -r ./api/python/cellxgene_census/scripts/requirements-dev.txt else - pip install -U cellxgene_census${{ matrix.py-pkg-version }} + pip install -U cellxgene_census${{ matrix.py-pkg-version }} -r ./api/python/cellxgene_census/scripts/requirements-dev.txt fi - name: Test with pytest (API, main tests) From 3a83fb755773091da1c30604a93c2eb998bc955f Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 10 Jan 2025 15:21:54 -0800 Subject: [PATCH 3/5] chore: migrate acceptance tests to new GH runner (#1328) --- .github/workflows/full-unittests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/full-unittests.yml b/.github/workflows/full-unittests.yml index b41352d67..00082e8da 100644 --- a/.github/workflows/full-unittests.yml +++ b/.github/workflows/full-unittests.yml @@ -42,7 +42,7 @@ env: jobs: py_unit_tests: - runs-on: single-cell-1tb-runner + runs-on: sc-dev-64g-runner timeout-minutes: 1440 # 24 hour timeout strategy: fail-fast: false # prevent this job from killing other jobs @@ -88,7 +88,7 @@ jobs: PYTHONPATH=. pytest -v --durations=0 -rP --experimental --expensive ./api/python/cellxgene_census/tests/ r_unit_tests: - runs-on: single-cell-1tb-runner + runs-on: sc-dev-64g-runner timeout-minutes: 1440 # 24 hour timeout strategy: fail-fast: false # prevent this job from killing other jobs From d4db2325c39e20bbfcc60e00e0357900795bca1d Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sat, 11 Jan 2025 00:05:36 +0000 Subject: [PATCH 4/5] Bump version of tiledbsoma used to 1.15.3 --- tools/cellxgene_census_builder/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml index c7ab25b79..f48328f29 100644 --- a/tools/cellxgene_census_builder/pyproject.toml +++ b/tools/cellxgene_census_builder/pyproject.toml @@ -41,9 +41,9 @@ dependencies= [ # The compatibility matrix is defined here: # https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md # TODO (spatial): tiledbsoma pin to a PyPI release is temporarily commented out in favor git commit pin - # "tiledbsoma==1.9.3", + "tiledbsoma==1.15.3", # TODO (spatial): Pin tiledbsoma dependency to an actual released version after tiledbsoma spatial code has been released - "tiledbsoma @ git+https://github.com/single-cell-data/TileDB-SOMA.git@16467fa7405d59ab1763f103081318b839f87610#egg=tiledbsoma&subdirectory=apis/python/", + # "tiledbsoma @ git+https://github.com/single-cell-data/TileDB-SOMA.git@16467fa7405d59ab1763f103081318b839f87610#egg=tiledbsoma&subdirectory=apis/python/", # TODO (spatial): Deal with the following line before release "cellxgene-census @ {root:parent:parent:uri}/api/python/cellxgene_census", "cellxgene-ontology-guide>=1.2", From 8b54aea54420e92facaadf823a92719eec0295a6 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sat, 11 Jan 2025 00:40:01 +0000 Subject: [PATCH 5/5] Update required version of tiledbsoma for client --- api/python/cellxgene_census/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 03103d795..99f39102d 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -30,7 +30,7 @@ dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma>=1.12.3,!=1.14.1,<1.15", + "tiledbsoma>=1.15.3", "anndata", "numpy>=1.23,<2.0", "requests",