diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index e04c3e144..03103d795 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -30,7 +30,7 @@ dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma>=1.12.3,!=1.14.1", + "tiledbsoma>=1.12.3,!=1.14.1,<1.15", "anndata", "numpy>=1.23,<2.0", "requests", @@ -41,7 +41,7 @@ dependencies= [ [project.optional-dependencies] experimental = [ "torch", - "torchdata~=0.7", + "torchdata~=0.7,<0.10", "scikit-learn>=1.2", "scikit-misc>=0.2,<0.4", # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels "datasets~=2.0", diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index 5a9c2d626..2d175bae9 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -8,7 +8,7 @@ from tiledbsoma import Experiment, ExperimentAxisQuery -class CellDatasetBuilder(ExperimentAxisQuery[Experiment], ABC): # type: ignore +class CellDatasetBuilder(ExperimentAxisQuery, ABC): # type: ignore """Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery results into a Hugging Face Dataset in which each item represents one cell. Subclasses implement the `cell_item()` method to process each row of an X layer diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index 13bbe76c5..455fc07a9 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -96,7 +96,7 @@ def __init__(self, n_samples: int, n_variables: int, nnz_only: bool = False): self.n_samples = n_samples self.nnz_only = nnz_only # If we want to exclude zeros, we need to keep track of the denominator - self.n = np.zeros(n_variables) + self.n = np.zeros(n_variables, dtype=np.float64) def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32]) -> None: if self.nnz_only: @@ -106,9 +106,9 @@ def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32 def finalize(self) -> npt.NDArray[np.float64]: if self.nnz_only: - return self.u / self.n + return np.divide(self.u, self.n, dtype=np.float64) else: - return self.u / self.n_samples + return np.divide(self.u, self.n_samples, dtype=np.float64) class CountsAccumulator: diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_online.py b/api/python/cellxgene_census/tests/experimental/pp/test_online.py index 1b75ed5c9..bd352fc88 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_online.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_online.py @@ -138,7 +138,7 @@ def test_counts(matrix: sparse.coo_matrix, n_batches: int, stride: int) -> None: assert n_samples.sum() == matrix.shape[0] assert len(n_samples) == n_batches - clip_val = 50 * np.random.rand(n_batches, matrix.shape[1]) + clip_val = (50 * np.random.rand(n_batches, matrix.shape[1])).astype(np.float64) ca = CountsAccumulator(n_batches, matrix.shape[1], clip_val) for i in range(0, matrix.nnz, stride): diff --git a/api/python/cellxgene_census/tests/test_get_helpers.py b/api/python/cellxgene_census/tests/test_get_helpers.py index de75e05d0..2237f4472 100644 --- a/api/python/cellxgene_census/tests/test_get_helpers.py +++ b/api/python/cellxgene_census/tests/test_get_helpers.py @@ -34,5 +34,3 @@ def test_get_presence_matrix(organism: str, census: soma.Collection) -> None: assert pm.shape[1] == len( census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas() ) - - census.close() diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py index 2aea90943..f112afcf6 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py @@ -761,10 +761,12 @@ def _validate_X_layers_raw_contents( # the expected_X matrix. raw_sum = np.zeros((len(obs_joinids_split),), dtype=np.float64) # 64 bit for numerical stability np.add.at(raw_sum, rows_by_position, X_raw_data) - raw_sum = raw_sum.astype( - CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype() - ) # cast to the storage type - assert np.allclose(raw_sum, obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy()) + assert np.allclose( + raw_sum.astype( + CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype() + ), # cast to the storage type + obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy(), + ) del raw_sum # Assertion 1 - the contents of the X matrix are EQUAL for all var values present in the AnnData diff --git a/tools/census_contrib/src/census_contrib/load.py b/tools/census_contrib/src/census_contrib/load.py index 0c2a9539e..63f186a76 100644 --- a/tools/census_contrib/src/census_contrib/load.py +++ b/tools/census_contrib/src/census_contrib/load.py @@ -230,15 +230,13 @@ def __next__(self) -> pa.Table: i = np.empty((n_embeddings, self.n_features), dtype=np.int64) i.T[:] = next_block - i = i.ravel() j = np.empty((n_embeddings, self.n_features), dtype=np.int64) j[:] = np.arange(self.n_features) - j = j.ravel() d = self._scale * self.rng.random((n_embeddings * self.n_features), dtype=np.float32) + self._offset - return pa.Table.from_pydict({"i": i, "j": j, "d": d}) + return pa.Table.from_pydict({"i": i.ravel(), "j": j.ravel(), "d": d}) @property def type(self) -> pa.DataType: