chanzuckerberg · bkmartinjr · Dec 21, 2023 · Dec 3, 2023 · Dec 3, 2023 · Dec 4, 2023
diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md
@@ -1,8 +1,8 @@
 # CZ CELLxGENE Discover Census Schema
 
-**Version**: 1.2.0
+**Version**: 1.3.0
 
-**Last edited**: Sept, 2023.
+**Last edited**: December, 2023.
 
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED" "MAY", and "OPTIONAL" in this document are to be interpreted as described in [BCP 14](https://tools.ietf.org/html/bcp14), [RFC2119](https://www.rfc-editor.org/rfc/rfc2119.txt), and [RFC8174](https://www.rfc-editor.org/rfc/rfc8174.txt) when, and only when, they appear in all capitals, as shown here.
 
@@ -339,7 +339,7 @@ An example of this `SOMADataFrame` is shown below:
 <tbody>
   <tr>
     <td>census_schema_version</td>
-    <td>1.2.0</td>
+    <td>1.3.0</td>
   </tr>
   <tr>
     <td>census_build_date</td>
@@ -381,10 +381,15 @@ All datasets used to build the Census MUST be included in a table modeled as a `
   </tr>
 </thead>
 <tbody>
+  <tr>
+    <td>citation</td>
+    <td>string</td>
+    <td>As defined in the CELLxGENE schema.</td>
+  </tr>
   <tr>
     <td>collection_id</td>
     <td>string</td>
-    <td rowspan="5">As defined in CELLxGENE Discover <a href="https://api.cellxgene.cziscience.com/curation/ui/">data schema</a> (see &quot;Schemas&quot; section for field definitions)".</td>
+    <td rowspan="6">As defined in CELLxGENE Discover <a href="https://api.cellxgene.cziscience.com/curation/ui/">data schema</a> (see &quot;Schemas&quot; section for field definitions).</td>
   </tr>
   <tr>
     <td>collection_name</td>
@@ -752,7 +757,7 @@ The following columns MUST be included:
   <tr>
     <td>feature_length</td>
     <td>int</td>
-    <td>Gene length in base pairs derived from the <a href="https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md#required-gene-annotations">gene reference files from the CELLxGENE dataset schema</a>.</td>
+    <td>As defined in CELLxGENE dataset schema.</td>
   </tr>
   <tr>
     <td>nnz</td>
@@ -838,7 +843,7 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
   </tr>
   <tr>
     <td>assay_ontology_term_id</td>
-    <td colspan="2" rowspan="17">As defined in CELLxGENE dataset schema</td>
+    <td colspan="2" rowspan="19">As defined in CELLxGENE dataset schema</td>
   </tr>
   <tr>
     <td>assay</td>
@@ -867,6 +872,9 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
   <tr>
     <td>is_primary_data</td>
   </tr>
+  <tr>
+    <td>observation_joinid</td>
+  </tr>
   <tr>
     <td>self_reported_ethnicity_ontology_term_id</td>
   </tr>
@@ -888,6 +896,9 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
   <tr>
     <td>tissue</td>
   </tr>
+  <tr>
+    <td>tissue_type</td>
+  </tr>
   <tr>
     <td>nnz</td>
     <td>int64</td>
@@ -918,6 +929,12 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
 
 ## Changelog
 
+### Version 1.3.0
+
+* Update to require [CELLxGENE schema version 4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md)
+* Adds `citation` to "Census table of CELLxGENE Discover datasets – `census_obj["census_info"]["datasets"]`"
+* Adds `observation_joinid` and `tissue_type` to `obs` dataframe
+
 ### Version 1.2.0
 
 * Update to require [CELLxGENE schema version 3.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md)

diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml
@@ -26,26 +26,26 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
 ]
 dependencies= [
-    "typing_extensions==4.8.0",
-    "pyarrow==13.0.0",
-    "pandas[performance]==2.0.3",
-    "anndata==0.9",
+    "typing_extensions==4.9.0",
+    "pyarrow==14.0.1",
+    "pandas[performance]==2.1.4",
+    "anndata==0.10.3",
     "numpy==1.23.5",
     # IMPORTANT: consider TileDB format compat before advancing this version. It is important that
     # IMPORTANT: the tiledbsoma version lag that used in cellxgene-census package.
-    "tiledbsoma==1.4.4",
-    "cellxgene-census==1.6.0",
-    "scipy==1.10.1",  # cellxgene-census==1.5.1 forces scipy<1.11
-    "fsspec==2023.9.2",
-    "s3fs==2023.9.2",
+    "tiledbsoma==1.6.1",
+    "cellxgene-census==1.9.1",
+    "scipy==1.11.4",
+    "fsspec==2023.12.2",
+    "s3fs==2023.12.2",
     "requests==2.31.0",
-    "aiohttp==3.9.0",
+    "aiohttp==3.9.1",
     "Cython", # required by owlready2
     "wheel",  # required by owlready2
     "owlready2==0.44",
-    "gitpython==3.1.37",
+    "gitpython==3.1.40",
     "attrs==23.1.0",
-    "psutil==5.9.5",
+    "psutil==5.9.6",
     "pyyaml==6.0.1",
     "numba==0.56.4",
 ]

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py
@@ -42,7 +42,7 @@ def open_anndata(
         # These are schema versions this code is known to work with. This is a
         # sanity check, which would be better implemented via a unit test at
         # some point in the future.
-        assert CXG_SCHEMA_VERSION in ["3.1.0", "3.0.0"]
+        assert CXG_SCHEMA_VERSION in ["4.0.0"]
 
         if h5ad.schema_version == "":
             h5ad.schema_version = get_cellxgene_schema_version(ad)
@@ -80,6 +80,7 @@ def open_anndata(
                 # TODO - these should be looked up in the ontology
                 raw_var["feature_name"] = "unknown"
                 raw_var["feature_reference"] = "unknown"
+                raw_var["feature_length"] = 0
                 var = pd.concat([ad.var, raw_var])
             else:
                 var = ad.raw.var
@@ -96,7 +97,7 @@ def open_anndata(
             not isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)) or X.has_canonical_format
         ), f"Found H5AD with non-canonical X matrix in {path}"
 
-        ad = anndata.AnnData(X=X if need_X else None, obs=ad.obs, var=var, raw=None, uns=ad.uns, dtype=np.float32)
+        ad = anndata.AnnData(X=X if need_X else None, obs=ad.obs, var=var, raw=None, uns=ad.uns)
         assert not need_X or ad.X.shape == (len(ad.obs), len(ad.var))
 
         # TODO: In principle, we could look up missing feature_name, but for now, just assert they exist
@@ -154,7 +155,7 @@ def _filter(ad: anndata.AnnData, need_X: Optional[bool] = True) -> anndata.AnnDa
         assert ad.raw is None
 
         # This discards all other ancillary state, eg, obsm/varm/....
-        ad = anndata.AnnData(X=X, obs=obs, var=var, dtype=np.float32)
+        ad = anndata.AnnData(X=X, obs=obs, var=var)
 
         assert (
             X is None or isinstance(X, np.ndarray) or X.has_canonical_format

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py
@@ -25,6 +25,7 @@ class Dataset:
 
     # Optional - as reported by REST API
     dataset_title: str = ""  # CELLxGENE dataset title
+    citation: str = ""  # CELLxGENE citation
     collection_id: str = ""  # CELLxGENE collection id
     collection_name: str = ""  # CELLxGENE collection name
     collection_doi: str = ""  # CELLxGENE collection doi

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py
@@ -1,6 +1,5 @@
 import concurrent.futures
 import gc
-import io
 import logging
 from contextlib import ExitStack
 from typing import (
@@ -54,13 +53,12 @@
     log_on_broken_process_pool,
     n_workers_from_memory_budget,
 )
-from .source_assets import cat_file
 from .stats import get_obs_stats, get_var_stats
 from .summary_cell_counts import (
     accumulate_summary_counts,
     init_summary_counts_accumulator,
 )
-from .util import anndata_ordered_bool_issue_853_workaround, array_chunker, is_nonnegative_integral
+from .util import array_chunker, is_nonnegative_integral
 
 
 @attrs.define
@@ -107,40 +105,15 @@ class ExperimentSpecification:
 
     name: str
     anndata_cell_filter_spec: AnnDataFilterSpec
-    gene_feature_length_uris: List[str]
-    gene_feature_length: pd.DataFrame
 
     @classmethod
     def create(
         cls,
         name: str,
         anndata_cell_filter_spec: AnnDataFilterSpec,
-        gene_feature_length_uris: List[str],
     ) -> Self:
         """Factory method. Do not instantiate the class directly."""
-        gene_feature_length = cls._load_gene_feature_length(gene_feature_length_uris)
-        logging.info(f"Loaded gene lengths external reference for {name}, {len(gene_feature_length)} genes.")
-        return cls(
-            name,
-            anndata_cell_filter_spec,
-            gene_feature_length_uris,
-            gene_feature_length,
-        )
-
-    @classmethod
-    def _load_gene_feature_length(cls, gene_feature_length_uris: Sequence[str]) -> pd.DataFrame:
-        """
-        Private. Load any external assets required to create the experiment.
-        """
-        return pd.concat(
-            pd.read_csv(
-                io.BytesIO(cat_file(uri)),
-                names=["feature_id", "feature_name", "gene_version", "feature_length"],
-            )
-            .set_index("feature_id")
-            .drop(columns=["feature_name", "gene_version"])
-            for uri in gene_feature_length_uris
-        )
+        return cls(name, anndata_cell_filter_spec)
 
 
 class ExperimentBuilder:
@@ -176,10 +149,6 @@ def name(self) -> str:
     def anndata_cell_filter_spec(self) -> AnnDataFilterSpec:
         return self.specification.anndata_cell_filter_spec
 
-    @property
-    def gene_feature_length(self) -> pd.DataFrame:
-        return self.specification.gene_feature_length
-
     def create(self, census_data: soma.Collection) -> None:
         """Create experiment within the specified Collection with a single Measurement."""
 
@@ -249,7 +218,6 @@ def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData) -> int:
 
         # drop columns we don't want to write (e.g., organism)
         obs_df = obs_df[list(CENSUS_OBS_TERM_COLUMNS)]
-        obs_df = anndata_ordered_bool_issue_853_workaround(obs_df)
 
         # accumulate obs
         self.obs_df_accumulation.append(obs_df)
@@ -259,7 +227,7 @@ def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData) -> int:
         # Accumulate the union of all var ids/names (for raw and processed), to be later persisted.
         # NOTE: assumes raw.var is None, OR has same index as var. Currently enforced in open_anndata(),
         # but may need to evolve this logic if that assumption is not scalable.
-        tv = ad.var.rename_axis("feature_id").reset_index()[["feature_id", "feature_name"]]
+        tv = ad.var.rename_axis("feature_id").reset_index()[["feature_id", "feature_name", "feature_length"]]
         for key in CENSUS_VAR_TERM_COLUMNS:
             if key not in tv:
                 tv[key] = np.full((len(tv),), 0, dtype=CENSUS_VAR_TERM_COLUMNS[key].to_pandas_dtype())
@@ -321,9 +289,6 @@ def populate_var_axis(self) -> None:
         # it is possible there is nothing to write
         if self.var_df is not None and len(self.var_df) > 0:
             self.var_df["soma_joinid"] = range(len(self.var_df))
-            self.var_df = self.var_df.drop(columns=["feature_length"]).join(self.gene_feature_length, on="feature_id")
-            self.var_df.feature_length.fillna(0, inplace=True)
-            self.var_df = anndata_ordered_bool_issue_853_workaround(self.var_df)
             self.var_df = self.var_df.set_index("soma_joinid", drop=False)
 
             self.global_var_joinids = self.var_df[["feature_id", "soma_joinid"]].set_index("feature_id")
@@ -370,7 +335,7 @@ def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
             max_dataset_joinid = max(d.soma_joinid for d in datasets)
 
             # LIL is fast way to create spmatrix
-            pm = sparse.lil_array((max_dataset_joinid + 1, self.n_var), dtype=bool)
+            pm = sparse.lil_matrix((max_dataset_joinid + 1, self.n_var), dtype=bool)
             for dataset_joinid, presence in self.presence.items():
                 data, cols = presence
                 pm[dataset_joinid, cols] = data
@@ -527,7 +492,7 @@ def _accumulate_all_X_layers(
             assert (row >= 0).all()
             col = local_var_joinids[X.col]
             assert (col >= 0).all()
-            X_remap = sparse.coo_array((X.data, (row, col)), shape=(eb.n_obs, eb.n_var))
+            X_remap = sparse.coo_matrix((X.data, (row, col)), shape=(eb.n_obs, eb.n_var))
             with soma.Experiment.open(eb.experiment_uri, "w", context=SOMA_TileDB_Context()) as experiment:
                 experiment.ms[ms_name].X[layer_name].write(pa.SparseCOOTensor.from_scipy(X_remap))
             gc.collect()

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py
@@ -13,25 +13,14 @@ def make_experiment_specs() -> List[ExperimentSpecification]:
     Functionally, this defines per-experiment name, anndata filter, etc.
     It also loads any required per-Experiment assets.
     """
-    GENE_LENGTH_BASE_URI = (
-        "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/"
-        "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/"
-    )
-    GENE_LENGTH_URIS = [
-        GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz",
-        GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz",
-        GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz",
-    ]
     return [  # The soma.Experiments we want to build
         ExperimentSpecification.create(
             name="homo_sapiens",
             anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ),
-            gene_feature_length_uris=GENE_LENGTH_URIS,
         ),
         ExperimentSpecification.create(
             name="mus_musculus",
             anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ),
-            gene_feature_length_uris=GENE_LENGTH_URIS,
         ),
     ]