Skip to content

Commit

Permalink
[python] Check for uniqueness of registration obs/var field-name inpu…
Browse files Browse the repository at this point in the history
…ts (#2380)

* [python] Check for uniqueness of registration obs/var field-name inputs

* conditional contextmanager for boilerplate reduction

---------

Co-authored-by: Ryan Williams <[email protected]>
  • Loading branch information
johnkerl and ryan-williams authored Apr 4, 2024
1 parent b1677be commit f0c483d
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 6 deletions.
18 changes: 14 additions & 4 deletions apis/python/src/tiledbsoma/io/_registration/id_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,22 @@ def from_isolated_anndata(
return cls(obs_axis=obs_mapping, var_axes=var_axes)


def _check_dataframe_values(values: List[str], field_name: str) -> List[str]:
if len(values) != len(set(values)):
raise ValueError(
"non-unique registration values have been provided in field {field_name}"
)
return values


def get_dataframe_values(df: pd.DataFrame, field_name: str) -> List[str]:
    """Extracts the label values (e.g. cell barcode, gene symbol) from an AnnData/H5AD
    ``obs`` or ``var`` dataframe.

    The values are taken from the column named ``field_name`` when the dataframe
    has one; otherwise from the index, provided the index is named ``field_name``,
    is named ``"index"``, or is unnamed.

    Args:
        df: The ``obs`` or ``var`` dataframe to read labels from.
        field_name: Name of the column (or index) holding the labels.

    Returns:
        The extracted label values, after the uniqueness check.

    Raises:
        ValueError: If neither a column nor the index matches ``field_name``, or
            if the extracted values are not unique.
    """
    if field_name in df:
        # Column values are stringified element by element.
        values = [str(e) for e in df[field_name]]
    elif df.index.name in (field_name, "index", None):
        # Index values are passed through as-is.
        values = list(df.index)
    else:
        raise ValueError(f"could not find field name {field_name} in dataframe")

    # Single exit point so that both sources go through the uniqueness check.
    return _check_dataframe_values(values, field_name)
74 changes: 72 additions & 2 deletions apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""
Test join-id registrations for ingesting multiple AnnData objects into a single SOMA Experiment.
"""

import math
import tempfile
from contextlib import nullcontext
from typing import Optional, Sequence

import anndata as ad
Expand Down Expand Up @@ -120,8 +120,40 @@ def create_anndata_canned(which: int, obs_field_name: str, var_field_name: str):
]
X_value_base = 400

elif which == 8:
obs_ids = ["TAAT", "TCTG", "TGAG", "DUP", "DUP"]
var_ids = ["AKT1", "APOE", "ESR1", "TP53", "VEGFA", "ZZZ3"]
raw_var_ids = [
"AKT1",
"APOE",
"ESR1",
"TP53",
"VEGFA",
"ZZZ3",
"RAW1",
"RAW3",
"RAW2",
]
X_value_base = 800

elif which == 9:
obs_ids = ["TAAT", "TCTG", "TGAG"]
var_ids = ["AKT1", "DUP", "ESR1", "TP53", "DUP", "ZZZ3"]
raw_var_ids = [
"AKT1",
"APOE",
"ESR1",
"TP53",
"VEGFA",
"ZZZ3",
"RAW1",
"RAW3",
"RAW2",
]
X_value_base = 900

else:
raise Exception(f"create_anndata_canned takes 1..4; got {which}")
raise Exception(f"create_anndata_canned got unrecognized which={which}")

return _create_anndata(
obs_ids=obs_ids,
Expand Down Expand Up @@ -1130,3 +1162,41 @@ def test_ealm_expose():
# However, the pre-commit hook will strip out this import statement as "unused".
# So, assert something.
assert tiledbsoma.io.ExperimentAmbientLabelMapping is not None


@pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"])
@pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"])
@pytest.mark.parametrize(
    "dataset_ids_and_exc",
    [
        [None, 2],
        [ValueError, 8],
        [ValueError, 9],
    ],
)
def test_append_with_nonunique_field_values(
    tmp_path,
    obs_field_name,
    var_field_name,
    dataset_ids_and_exc,
):
    """Registration must reject obs/var field values that are not unique.

    Each parametrized case is an ``[expected_exception, dataset_id]`` pair:
    canned dataset 1 is ingested first, then the canned dataset with the given
    id is registered against it. ``None`` as the expected exception means the
    registration is expected to succeed.
    """
    expected_exc, second_id = dataset_ids_and_exc
    first_id = 1
    measurement_name = "test"

    first_adata = create_anndata_canned(first_id, obs_field_name, var_field_name)
    second_adata = create_anndata_canned(second_id, obs_field_name, var_field_name)
    soma_uri = tmp_path.as_posix()

    tiledbsoma.io.from_anndata(
        soma_uri, first_adata, measurement_name=measurement_name
    )

    # Expect the given exception when there is one; otherwise use a no-op context.
    if expected_exc:
        expectation = pytest.raises(expected_exc)
    else:
        expectation = nullcontext()

    with expectation:
        tiledbsoma.io.register_anndatas(
            soma_uri,
            [second_adata],
            measurement_name=measurement_name,
            obs_field_name=obs_field_name,
            var_field_name=var_field_name,
        )

0 comments on commit f0c483d

Please sign in to comment.