Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(l2g): add features based on predicted variant consequences #360

Merged
merged 8 commits into from
Dec 21, 2023
12 changes: 12 additions & 0 deletions src/otg/assets/schemas/l2g_feature_matrix.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@
"nullable": true,
"type": "float"
},
{
"metadata": {},
"name": "vepMaximumNeighborhood",
"nullable": true,
"type": "float"
},
{
"metadata": {},
"name": "vepMaximum",
"nullable": true,
"type": "float"
},
{
"metadata": {},
"name": "eqtlColocClppLocalMaximum",
Expand Down
1 change: 1 addition & 0 deletions src/otg/dataset/l2g_feature_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def generate_features(
# study_locus, study_index, colocalisation
# ).df,
StudyLocusFactory._get_tss_distance_features(study_locus, variant_gene).df,
StudyLocusFactory._get_vep_features(study_locus, variant_gene).df,
]:
fm = reduce(
lambda x, y: x.unionByName(y),
Expand Down
6 changes: 5 additions & 1 deletion src/otg/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,11 @@ class LocusToGeneStep:
# average distance of all tagging variants to gene TSS
"distanceTssMean",
# # minimum distance of all tagging variants to gene TSS
# "distanceTssMinimum",
"distanceTssMinimum",
# # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity
"vepMaximumNeighborhood",
# # maximum vep consequence score of the locus 95% credible set split by gene
"vepMaximum",
# # max clpp for each (study, locus, gene) aggregating over all eQTLs
# "eqtlColocClppLocalMaximum",
# # max clpp for each (study, locus) aggregating over all eQTLs
Expand Down
126 changes: 126 additions & 0 deletions src/otg/method/l2g/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from otg.dataset.study_locus import CredibleInterval, StudyLocus

if TYPE_CHECKING:
from pyspark.sql import Column, DataFrame

from otg.dataset.colocalisation import Colocalisation
from otg.dataset.study_index import StudyIndex
from otg.dataset.v2g import V2G
Expand Down Expand Up @@ -232,3 +234,127 @@ def _get_tss_distance_features(
),
_schema=L2GFeature.get_schema(),
)

@staticmethod
def _get_vep_features(
credible_set: StudyLocus,
v2g: V2G,
) -> L2GFeature:
"""Get the maximum VEP score for all variants in a locus's 95% credible set.

This informs about functional impact of the variants in the locus. For more information on variant consequences, see: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
Two metrics: max VEP score per study locus and gene, and max VEP score per study locus.


Args:
credible_set (StudyLocus): Study locus dataset with the associations to be annotated
v2g (V2G): V2G dataset with the variant/gene relationships and their consequences

Returns:
L2GFeature: Stores the features with the max VEP score.
"""

def _aggregate_vep_feature(
df: DataFrame,
aggregation_expr: Column,
aggregation_cols: list[str],
feature_name: str,
) -> DataFrame:
"""Extracts the maximum or average VEP score after grouping by the given columns. Different aggregations return different predictive annotations.

If the group_cols include "geneId", the maximum/mean VEP score per gene is returned.
Otherwise, the maximum/mean VEP score for all genes in the neighborhood of the locus is returned.

Args:
df (DataFrame): DataFrame with the VEP scores for each variant in a studyLocus
aggregation_expr (Column): Aggregation expression to apply
aggregation_cols (list[str]): Columns to group by
feature_name (str): Name of the feature to be returned

Returns:
DataFrame: DataFrame with the maximum VEP score per locus or per locus/gene
"""
if "geneId" in aggregation_cols:
return df.groupBy(aggregation_cols).agg(
aggregation_expr.alias(feature_name)
)
return (
df.groupBy(aggregation_cols)
.agg(
aggregation_expr.alias(feature_name),
f.collect_set("geneId").alias("geneId"),
)
.withColumn("geneId", f.explode("geneId"))
)

credible_set_w_variant_consequences = (
credible_set.filter_credible_set(CredibleInterval.IS95)
.df.withColumn("variantInLocusId", f.explode(f.col("locus.variantId")))
.withColumn(
"variantInLocusPosteriorProbability",
f.explode(f.col("locus.posteriorProbability")),
)
.join(
# Join with V2G to get variant consequences
v2g.df.filter(
f.col("datasourceId") == "variantConsequence"
).withColumnRenamed("variantId", "variantInLocusId"),
on="variantInLocusId",
)
.withColumn(
"weightedScore",
f.col("score") * f.col("variantInLocusPosteriorProbability"),
)
.select(
"studyLocusId",
"variantId",
"studyId",
"geneId",
"score",
"weightedScore",
)
.distinct()
.persist()
)

return L2GFeature(
_df=convert_from_wide_to_long(
reduce(
lambda x, y: x.unionByName(y, allowMissingColumns=True),
[
# Calculate overall max VEP score for all genes in the vicinity
credible_set_w_variant_consequences.transform(
_aggregate_vep_feature,
f.max("score"),
["studyLocusId"],
"vepMaximumNeighbourhood",
),
# Calculate overall max VEP score per gene
credible_set_w_variant_consequences.transform(
_aggregate_vep_feature,
f.max("score"),
["studyLocusId", "geneId"],
"vepMaximum",
),
# Calculate mean VEP score for all genes in the vicinity
credible_set_w_variant_consequences.transform(
_aggregate_vep_feature,
f.mean("weightedScore"),
["studyLocusId"],
"vepMeanNeighbourhood",
),
# Calculate mean VEP score per gene
credible_set_w_variant_consequences.transform(
_aggregate_vep_feature,
f.mean("weightedScore"),
["studyLocusId", "geneId"],
"vepMean",
),
],
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
).filter(f.col("featureValue").isNotNull()),
_schema=L2GFeature.get_schema(),
)
9 changes: 9 additions & 0 deletions tests/method/test_locus_to_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,12 @@ def test_get_tss_distance_features(
assert isinstance(
tss_distance, L2GFeature
), "Unexpected model type returned from _get_tss_distance_features"

def test_get_vep_features(
self: TestStudyLocusFactory, mock_study_locus: StudyLocus, mock_v2g: V2G
) -> None:
"""Test the function that extracts the VEP features."""
vep_features = StudyLocusFactory._get_vep_features(mock_study_locus, mock_v2g)
assert isinstance(
vep_features, L2GFeature
), "Unexpected model type returned from _get_vep_features"