Pin torchdata + fix some typing errors (#1325)

* Pin torchdata
* Fix type annotation
* Fix reassignment of array with different type
* Fix typing for CountsAccumulator assignment
* Enforce 64bit result in MeanAccumulator (unclear if this was the case before)
* Fix typing error
* Don't prematurely close census in test
* Revert "Fix typing for CountsAccumulator assignment"
  This reverts commit 7a7538b.
* Revert "Enforce 64bit result in MeanAccumulator (unclear if this was the case before)"
  This reverts commit 8ba276a.
* Unrevert
* Pin tiledbsoma<1.15
chanzuckerberg · Jan 9, 2025 · aa51fcb · aa51fcb
1 parent ec0e754
commit aa51fcb
Show file tree

Hide file tree

Showing 7 changed files with 14 additions and 16 deletions.
diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml
@@ -30,7 +30,7 @@ dependencies= [
     # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to
     # ensure that the assets are readable (tiledbsoma supports backward compatible reading).
     # Make sure this version does not fall behind the builder's tiledbsoma version.
-    "tiledbsoma>=1.12.3,!=1.14.1",
+    "tiledbsoma>=1.12.3,!=1.14.1,<1.15",
     "anndata",
     "numpy>=1.23,<2.0",
     "requests",
@@ -41,7 +41,7 @@ dependencies= [
 [project.optional-dependencies]
 experimental = [
     "torch",
-    "torchdata~=0.7",
+    "torchdata~=0.7,<0.10",
     "scikit-learn>=1.2",
     "scikit-misc>=0.2,<0.4",  # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels
     "datasets~=2.0",

diff --git a/...cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/...cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py
@@ -8,7 +8,7 @@
 from tiledbsoma import Experiment, ExperimentAxisQuery
 
 
-class CellDatasetBuilder(ExperimentAxisQuery[Experiment], ABC):  # type: ignore
+class CellDatasetBuilder(ExperimentAxisQuery, ABC):  # type: ignore
     """Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery
     results into a Hugging Face Dataset in which each item represents one cell.
     Subclasses implement the `cell_item()` method to process each row of an X layer

diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py
@@ -96,7 +96,7 @@ def __init__(self, n_samples: int, n_variables: int, nnz_only: bool = False):
         self.n_samples = n_samples
         self.nnz_only = nnz_only
         # If we want to exclude zeros, we need to keep track of the denominator
-        self.n = np.zeros(n_variables)
+        self.n = np.zeros(n_variables, dtype=np.float64)
 
     def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32]) -> None:
         if self.nnz_only:
@@ -106,9 +106,9 @@ def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32
 
     def finalize(self) -> npt.NDArray[np.float64]:
         if self.nnz_only:
-            return self.u / self.n
+            return np.divide(self.u, self.n, dtype=np.float64)
         else:
-            return self.u / self.n_samples
+            return np.divide(self.u, self.n_samples, dtype=np.float64)
 
 
 class CountsAccumulator:

diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_online.py b/api/python/cellxgene_census/tests/experimental/pp/test_online.py
@@ -138,7 +138,7 @@ def test_counts(matrix: sparse.coo_matrix, n_batches: int, stride: int) -> None:
     assert n_samples.sum() == matrix.shape[0]
     assert len(n_samples) == n_batches
 
-    clip_val = 50 * np.random.rand(n_batches, matrix.shape[1])
+    clip_val = (50 * np.random.rand(n_batches, matrix.shape[1])).astype(np.float64)
 
     ca = CountsAccumulator(n_batches, matrix.shape[1], clip_val)
     for i in range(0, matrix.nnz, stride):

diff --git a/api/python/cellxgene_census/tests/test_get_helpers.py b/api/python/cellxgene_census/tests/test_get_helpers.py
@@ -34,5 +34,3 @@ def test_get_presence_matrix(organism: str, census: soma.Collection) -> None:
     assert pm.shape[1] == len(
         census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
     )
-
-    census.close()
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py
@@ -761,10 +761,12 @@ def _validate_X_layers_raw_contents(
                     # the expected_X matrix.
                     raw_sum = np.zeros((len(obs_joinids_split),), dtype=np.float64)  # 64 bit for numerical stability
                     np.add.at(raw_sum, rows_by_position, X_raw_data)
-                    raw_sum = raw_sum.astype(
-                        CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype()
-                    )  # cast to the storage type
-                    assert np.allclose(raw_sum, obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy())
+                    assert np.allclose(
+                        raw_sum.astype(
+                            CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype()
+                        ),  # cast to the storage type
+                        obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy(),
+                    )
                     del raw_sum
 
                     # Assertion 1 - the contents of the X matrix are EQUAL for all var values present in the AnnData

diff --git a/tools/census_contrib/src/census_contrib/load.py b/tools/census_contrib/src/census_contrib/load.py
@@ -230,15 +230,13 @@ def __next__(self) -> pa.Table:
 
         i = np.empty((n_embeddings, self.n_features), dtype=np.int64)
         i.T[:] = next_block
-        i = i.ravel()
 
         j = np.empty((n_embeddings, self.n_features), dtype=np.int64)
         j[:] = np.arange(self.n_features)
-        j = j.ravel()
 
         d = self._scale * self.rng.random((n_embeddings * self.n_features), dtype=np.float32) + self._offset
 
-        return pa.Table.from_pydict({"i": i, "j": j, "d": d})
+        return pa.Table.from_pydict({"i": i.ravel(), "j": j.ravel(), "d": d})
 
     @property
     def type(self) -> pa.DataType: