Skip to content

Commit

Permalink
Pin torchdata + fix some typing errors (#1325)
Browse files Browse the repository at this point in the history
* Pin torchdata

* Fix type annotation

* Fix reassignment of array with different type

* Fix typing for CountsAccumulator assignment

* Enforce 64bit result in MeanAccumulator (unclear if this was the case before)

* Fix typing error

* Don't prematurely close census in test

* Revert "Fix typing for CountsAccumulator assignment"

This reverts commit 7a7538b.

* Revert "Enforce 64bit result in MeanAccumulator (unclear if this was the case before)"

This reverts commit 8ba276a.

* Unrevert

* Pin tiledbsoma<1.15
  • Loading branch information
ivirshup authored Jan 9, 2025
1 parent ec0e754 commit aa51fcb
Show file tree
Hide file tree
Showing 7 changed files with 14 additions and 16 deletions.
4 changes: 2 additions & 2 deletions api/python/cellxgene_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies= [
# NOTE: the tiledbsoma version must be >= the version used in the Census builder, to
# ensure that the assets are readable (tiledbsoma supports backward-compatible reading).
# Make sure this version does not fall behind the builder's tiledbsoma version.
"tiledbsoma>=1.12.3,!=1.14.1",
"tiledbsoma>=1.12.3,!=1.14.1,<1.15",
"anndata",
"numpy>=1.23,<2.0",
"requests",
Expand All @@ -41,7 +41,7 @@ dependencies= [
[project.optional-dependencies]
experimental = [
"torch",
"torchdata~=0.7",
"torchdata~=0.7,<0.10",
"scikit-learn>=1.2",
"scikit-misc>=0.2,<0.4", # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels
"datasets~=2.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from tiledbsoma import Experiment, ExperimentAxisQuery


class CellDatasetBuilder(ExperimentAxisQuery[Experiment], ABC): # type: ignore
class CellDatasetBuilder(ExperimentAxisQuery, ABC): # type: ignore
"""Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery
results into a Hugging Face Dataset in which each item represents one cell.
Subclasses implement the `cell_item()` method to process each row of an X layer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def __init__(self, n_samples: int, n_variables: int, nnz_only: bool = False):
self.n_samples = n_samples
self.nnz_only = nnz_only
# If we want to exclude zeros, we need to keep track of the denominator
self.n = np.zeros(n_variables)
self.n = np.zeros(n_variables, dtype=np.float64)

def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32]) -> None:
if self.nnz_only:
Expand All @@ -106,9 +106,9 @@ def update(self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32

def finalize(self) -> npt.NDArray[np.float64]:
if self.nnz_only:
return self.u / self.n
return np.divide(self.u, self.n, dtype=np.float64)
else:
return self.u / self.n_samples
return np.divide(self.u, self.n_samples, dtype=np.float64)


class CountsAccumulator:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_counts(matrix: sparse.coo_matrix, n_batches: int, stride: int) -> None:
assert n_samples.sum() == matrix.shape[0]
assert len(n_samples) == n_batches

clip_val = 50 * np.random.rand(n_batches, matrix.shape[1])
clip_val = (50 * np.random.rand(n_batches, matrix.shape[1])).astype(np.float64)

ca = CountsAccumulator(n_batches, matrix.shape[1], clip_val)
for i in range(0, matrix.nnz, stride):
Expand Down
2 changes: 0 additions & 2 deletions api/python/cellxgene_census/tests/test_get_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,3 @@ def test_get_presence_matrix(organism: str, census: soma.Collection) -> None:
assert pm.shape[1] == len(
census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
)

census.close()
Original file line number Diff line number Diff line change
Expand Up @@ -761,10 +761,12 @@ def _validate_X_layers_raw_contents(
# the expected_X matrix.
raw_sum = np.zeros((len(obs_joinids_split),), dtype=np.float64) # 64 bit for numerical stability
np.add.at(raw_sum, rows_by_position, X_raw_data)
raw_sum = raw_sum.astype(
CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype()
) # cast to the storage type
assert np.allclose(raw_sum, obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy())
assert np.allclose(
raw_sum.astype(
CENSUS_OBS_TABLE_SPEC.field("raw_sum").to_pandas_dtype()
), # cast to the storage type
obs_df.raw_sum.iloc[idx : idx + STRIDE].to_numpy(),
)
del raw_sum

# Assertion 1 - the contents of the X matrix are EQUAL for all var values present in the AnnData
Expand Down
4 changes: 1 addition & 3 deletions tools/census_contrib/src/census_contrib/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,13 @@ def __next__(self) -> pa.Table:

i = np.empty((n_embeddings, self.n_features), dtype=np.int64)
i.T[:] = next_block
i = i.ravel()

j = np.empty((n_embeddings, self.n_features), dtype=np.int64)
j[:] = np.arange(self.n_features)
j = j.ravel()

d = self._scale * self.rng.random((n_embeddings * self.n_features), dtype=np.float32) + self._offset

return pa.Table.from_pydict({"i": i, "j": j, "d": d})
return pa.Table.from_pydict({"i": i.ravel(), "j": j.ravel(), "d": d})

@property
def type(self) -> pa.DataType:
Expand Down

0 comments on commit aa51fcb

Please sign in to comment.