From 043c1bf2181eb94be525ea1b47ba1e030476538b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 18 Dec 2023 11:47:55 +0000 Subject: [PATCH 1/7] feat(pics): variants not in locus when if pips cant be calculated --- src/otg/method/pics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index daff8dbf2..433087b07 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -127,7 +127,7 @@ def _finemap( ... Row(variantId="var2", r2Overall=None), ... ] >>> PICS._finemap(ld_set_with_no_r2, lead_neglog_p=10.0, k=6.4) - [{'variantId': 'var1', 'r2Overall': None}, {'variantId': 'var2', 'r2Overall': None}] + [] """ if ld_set is None: return None @@ -146,7 +146,6 @@ def _finemap( or not lead_neglog_p ): # If PICS cannot be calculated, we'll return the original credible set - new_credible_set.append(tag_dict) continue pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict["r2Overall"]) From fb5bb5b4f683b767167b4f15926c12bd3a64c0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 18 Dec 2023 12:12:23 +0000 Subject: [PATCH 2/7] feat(pics): add empty_locus qc flag --- src/otg/dataset/study_locus.py | 2 ++ src/otg/method/pics.py | 10 +++++++++- tests/method/test_pics.py | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/otg/dataset/study_locus.py b/src/otg/dataset/study_locus.py index 64fcf4931..cdfc1a1ff 100644 --- a/src/otg/dataset/study_locus.py +++ b/src/otg/dataset/study_locus.py @@ -37,6 +37,7 @@ class StudyLocusQualityCheck(Enum): AMBIGUOUS_STUDY (str): Association with ambiguous study UNRESOLVED_LD (str): Variant not found in LD reference LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped) + EMPTY_LOCUS (str): Unable to calculate PIPs with the provided data """ SUBSIGNIFICANT_FLAG = "Subsignificant p-value" @@ -49,6 +50,7 @@ class StudyLocusQualityCheck(Enum): UNRESOLVED_LD = "Variant not 
found in LD reference" LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" NO_POPULATION = "Study does not have population annotation to resolve LD" + EMPTY_LOCUS = "Unable to calculate PIPs with the provided data" class CredibleInterval(Enum): diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index 433087b07..8aae04360 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -8,7 +8,7 @@ import pyspark.sql.types as t from scipy.stats import norm -from otg.dataset.study_locus import StudyLocus +from otg.dataset.study_locus import StudyLocus, StudyLocusQualityCheck if TYPE_CHECKING: from pyspark.sql import Row @@ -238,6 +238,14 @@ def finemap( ), ), ) + .withColumn( + "qualityControls", + StudyLocus.update_quality_flag( + f.col("qualityControls"), + f.size("locus") == 0, + StudyLocusQualityCheck.EMPTY_LOCUS, + ), + ) # Rename tagVariantId to variantId .drop("neglog_pvalue") ), diff --git a/tests/method/test_pics.py b/tests/method/test_pics.py index 7d2f1f78e..f87933e1a 100644 --- a/tests/method/test_pics.py +++ b/tests/method/test_pics.py @@ -30,11 +30,26 @@ def test_finemap_empty_array( def test_finemap_null_ld_set( self: TestFinemap, mock_study_locus: StudyLocus ) -> None: - """Test how we apply `finemap` when `locus` is null by returning a null field.""" + """Test how we apply `finemap` when `ldSet` is null by returning a null field.""" mock_study_locus.df = mock_study_locus.df.filter(f.col("ldSet").isNull()) observed_df = PICS.finemap(mock_study_locus).df.limit(1) assert observed_df.collect()[0]["locus"] is None + def test_finemap_quality_control( + self: TestFinemap, mock_study_locus: StudyLocus + ) -> None: + """Test that we add an `empty locus` flag when no variant in the locus meets PICS criteria.""" + mock_study_locus.df = mock_study_locus.df.withColumn( + # Association with an empty ldSet + "ldSet", + f.when(f.col("ldSet").isNull(), f.array()).otherwise(f.col("ldSet")), + ).filter(f.size("ldSet") == 0) +
observed_df = PICS.finemap(mock_study_locus).df.limit(1) + qc_flag = "Unable to calculate PIPs with the provided data" + assert ( + qc_flag in observed_df.collect()[0]["qualityControls"] + ), "Empty locus QC flag is missing." + def test__finemap_udf() -> None: """Test the _finemap UDF with a simple case.""" From e24b08a394585935e04e54388dc894b87a66b77c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 18 Dec 2023 12:16:39 +0000 Subject: [PATCH 3/7] chore(pics): add to finemappingMethod column --- src/otg/method/pics.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index 8aae04360..743a192e6 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -246,7 +246,10 @@ def finemap( StudyLocusQualityCheck.EMPTY_LOCUS, ), ) - # Rename tagVariantId to variantId + .withColumn( + "finemappingMethod", + f.coalesce(f.col("finemappingMethod"), f.lit("pics")), + ) .drop("neglog_pvalue") ), _schema=StudyLocus.get_schema(), From 8b1abc62969ffed3d7f05ebaffad262a0e81e970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 18 Dec 2023 17:06:44 +0000 Subject: [PATCH 4/7] refactor(pics): change definition of non picsable based on ldset --- src/otg/dataset/study_locus.py | 4 ++-- src/otg/method/pics.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/otg/dataset/study_locus.py b/src/otg/dataset/study_locus.py index cdfc1a1ff..c7f911e16 100644 --- a/src/otg/dataset/study_locus.py +++ b/src/otg/dataset/study_locus.py @@ -37,7 +37,7 @@ class StudyLocusQualityCheck(Enum): AMBIGUOUS_STUDY (str): Association with ambiguous study UNRESOLVED_LD (str): Variant not found in LD reference LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped) - EMPTY_LOCUS (str): Unable to calculate PIPs with the provided data + UNPICSABLE (str): Unable to calculate PIPs with the provided data """ SUBSIGNIFICANT_FLAG = "Subsignificant 
p-value" @@ -50,7 +50,7 @@ class StudyLocusQualityCheck(Enum): UNRESOLVED_LD = "Variant not found in LD reference" LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" NO_POPULATION = "Study does not have population annotation to resolve LD" - EMPTY_LOCUS = "Unable to calculate PIPs with the provided data" + UNPICSABLE = "Unable to calculate PIPs with the provided data" class CredibleInterval(Enum): diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index 743a192e6..739e64205 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -145,7 +145,7 @@ def _finemap( or tag_dict["r2Overall"] < 0.5 or not lead_neglog_p ): - # If PICS cannot be calculated, we'll return the original credible set + # If PICS cannot be calculated, we drop the variant from the credible set continue pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict["r2Overall"]) @@ -221,6 +221,9 @@ def finemap( lambda locus, neglog_p: PICS._finemap(locus, neglog_p, k), picsed_ldset_schema, ) + non_picsable_expr = ( + f.size(f.filter(f.col("ldSet"), lambda x: x.r2Overall >= 0.5)) == 0 + ) return StudyLocus( _df=( associations.df @@ -242,8 +245,8 @@ def finemap( "qualityControls", StudyLocus.update_quality_flag( f.col("qualityControls"), - f.size("locus") == 0, - StudyLocusQualityCheck.EMPTY_LOCUS, + non_picsable_expr, + StudyLocusQualityCheck.UNPICSABLE, ), ) .withColumn( From 4408fdf39d838db13066f68711df6a09d648deb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:57:08 +0000 Subject: [PATCH 5/7] Update src/otg/dataset/study_locus.py Co-authored-by: David Ochoa --- src/otg/dataset/study_locus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otg/dataset/study_locus.py b/src/otg/dataset/study_locus.py index c7f911e16..d84f40e55 100644 --- a/src/otg/dataset/study_locus.py +++ b/src/otg/dataset/study_locus.py @@ -50,7 +50,7 @@ class 
StudyLocusQualityCheck(Enum): UNRESOLVED_LD = "Variant not found in LD reference" LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" NO_POPULATION = "Study does not have population annotation to resolve LD" - UNPICSABLE = "Unable to calculate PIPs with the provided data" + NOT_QUALIFYING_LD_BLOCK = "LD block does not contain variants at the required R^2 threshold" class CredibleInterval(Enum): From bdeffd6b93c61612504dff51d53eec8fd159883e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:57:45 +0000 Subject: [PATCH 6/7] Update src/otg/method/pics.py Co-authored-by: David Ochoa --- src/otg/method/pics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index 739e64205..6d38f4643 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -246,7 +246,7 @@ def finemap( StudyLocus.update_quality_flag( f.col("qualityControls"), non_picsable_expr, - StudyLocusQualityCheck.UNPICSABLE, + StudyLocusQualityCheck.NOT_QUALIFYING_LD_BLOCK, ), ) .withColumn( From 9d13c2d960e135c867b06ca31b585c6141233c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:57:58 +0000 Subject: [PATCH 7/7] Update tests/method/test_pics.py Co-authored-by: David Ochoa --- tests/method/test_pics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/method/test_pics.py b/tests/method/test_pics.py index f87933e1a..41c9c5c20 100644 --- a/tests/method/test_pics.py +++ b/tests/method/test_pics.py @@ -45,7 +45,7 @@ def test_finemap_quality_control( f.when(f.col("ldSet").isNull(), f.array()).otherwise(f.col("ldSet")), ).filter(f.size("ldSet") == 0) observed_df = PICS.finemap(mock_study_locus).df.limit(1) - qc_flag = "Unable to calculate PIPs with the provided data" + qc_flag = "LD block does not contain variants at the 
required R^2 threshold" assert ( qc_flag in observed_df.collect()[0]["qualityControls"] ), "Empty locus QC flag is missing."