diff --git a/src/otg/dataset/study_locus.py b/src/otg/dataset/study_locus.py index 64fcf4931..d84f40e55 100644 --- a/src/otg/dataset/study_locus.py +++ b/src/otg/dataset/study_locus.py @@ -37,6 +37,7 @@ class StudyLocusQualityCheck(Enum): AMBIGUOUS_STUDY (str): Association with ambiguous study UNRESOLVED_LD (str): Variant not found in LD reference LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped) + NOT_QUALIFYING_LD_BLOCK (str): LD block does not contain variants at the required R^2 threshold """ SUBSIGNIFICANT_FLAG = "Subsignificant p-value" @@ -49,6 +50,7 @@ class StudyLocusQualityCheck(Enum): UNRESOLVED_LD = "Variant not found in LD reference" LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" NO_POPULATION = "Study does not have population annotation to resolve LD" + NOT_QUALIFYING_LD_BLOCK = "LD block does not contain variants at the required R^2 threshold" class CredibleInterval(Enum): diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index daff8dbf2..6d38f4643 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -8,7 +8,7 @@ import pyspark.sql.types as t from scipy.stats import norm -from otg.dataset.study_locus import StudyLocus +from otg.dataset.study_locus import StudyLocus, StudyLocusQualityCheck if TYPE_CHECKING: from pyspark.sql import Row @@ -127,7 +127,7 @@ def _finemap( ... Row(variantId="var2", r2Overall=None), ... 
] >>> PICS._finemap(ld_set_with_no_r2, lead_neglog_p=10.0, k=6.4) - [{'variantId': 'var1', 'r2Overall': None}, {'variantId': 'var2', 'r2Overall': None}] + [] """ if ld_set is None: return None @@ -145,8 +145,7 @@ def _finemap( or tag_dict["r2Overall"] < 0.5 or not lead_neglog_p ): - # If PICS cannot be calculated, we'll return the original credible set - new_credible_set.append(tag_dict) + # If PICS cannot be calculated, we drop the variant from the credible set continue pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict["r2Overall"]) @@ -222,6 +221,9 @@ def finemap( lambda locus, neglog_p: PICS._finemap(locus, neglog_p, k), picsed_ldset_schema, ) + non_picsable_expr = ( + f.size(f.filter(f.col("ldSet"), lambda x: x.r2Overall >= 0.5)) == 0 + ) return StudyLocus( _df=( associations.df @@ -239,7 +241,18 @@ def finemap( ), ), ) - # Rename tagVariantId to variantId + .withColumn( + "qualityControls", + StudyLocus.update_quality_flag( + f.col("qualityControls"), + non_picsable_expr, + StudyLocusQualityCheck.NOT_QUALIFYING_LD_BLOCK, + ), + ) + .withColumn( + "finemappingMethod", + f.coalesce(f.col("finemappingMethod"), f.lit("pics")), + ) .drop("neglog_pvalue") ), _schema=StudyLocus.get_schema(), diff --git a/tests/method/test_pics.py b/tests/method/test_pics.py index 7d2f1f78e..41c9c5c20 100644 --- a/tests/method/test_pics.py +++ b/tests/method/test_pics.py @@ -30,11 +30,26 @@ def test_finemap_empty_array( def test_finemap_null_ld_set( self: TestFinemap, mock_study_locus: StudyLocus ) -> None: - """Test how we apply `finemap` when `locus` is null by returning a null field.""" + """Test how we apply `finemap` when `ldSet` is null by returning a null field.""" mock_study_locus.df = mock_study_locus.df.filter(f.col("ldSet").isNull()) observed_df = PICS.finemap(mock_study_locus).df.limit(1) assert observed_df.collect()[0]["locus"] is None + def test_finemap_quality_control( + self: TestFinemap, mock_study_locus: StudyLocus + ) -> None: + """Test that we add an `empty locus` 
flag when no variant in the locus meets PICS criteria.""" + mock_study_locus.df = mock_study_locus.df.withColumn( + # Association with an empty ldSet + "ldSet", + f.when(f.col("ldSet").isNull(), f.array()).otherwise(f.col("ldSet")), + ).filter(f.size("ldSet") == 0) + observed_df = PICS.finemap(mock_study_locus).df.limit(1) + qc_flag = "LD block does not contain variants at the required R^2 threshold" + assert ( + qc_flag in observed_df.collect()[0]["qualityControls"] + ), "Empty locus QC flag is missing." + def test__finemap_udf() -> None: """Test the _finemap UDF with a simple case."""