Commit 00fe023
Merge branch 'main' into feat-code-of-conduct
d0choa authored Dec 12, 2023
2 parents 95ae5f1 + 47fb71f
Showing 15 changed files with 166 additions and 104 deletions.
19 changes: 7 additions & 12 deletions config/datasets/gcp.yaml
@@ -32,23 +32,18 @@ variant_annotation: ${datasets.outputs}/variant_annotation
variant_index: ${datasets.outputs}/variant_index
study_locus: ${datasets.outputs}/study_locus
credible_set: ${datasets.outputs}/credible_set
+study_index: ${datasets.outputs}/study_index
+summary_statistics: ${datasets.outputs}/summary_statistics
study_locus_overlap: ${datasets.outputs}/study_locus_overlap
colocalisation: ${datasets.outputs}/colocalisation
v2g: ${datasets.outputs}/v2g
ld_index: ${datasets.outputs}/ld_index
-catalog_study_index: ${datasets.outputs}/catalog_study_index
-catalog_study_locus: ${datasets.study_locus}/catalog_curated
-finngen_study_index: ${datasets.outputs}/finngen_study_index
-finngen_summary_stats: ${datasets.outputs}/finngen_summary_stats
+catalog_study_index: ${datasets.study_index}/catalog_curated
+catalog_study_locus: ${datasets.credible_set}/catalog_curated
+finngen_study_index: ${datasets.study_index}/finngen
+finngen_summary_stats: ${datasets.summary_statistics}/finngen
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
from_sumstats_pics: ${datasets.credible_set}/from_sumstats
-ukbiobank_study_index: ${datasets.outputs}/ukbiobank_study_index
+ukbiobank_study_index: ${datasets.study_index}/ukbiobank
l2g_model: ${datasets.outputs}/l2g_model
l2g_predictions: ${datasets.outputs}/l2g_predictions
-eqtl_catalogue_study_index_out: ${datasets.outputs}/preprocess/eqtl_catalogue/study_index
-eqtl_catalogue_summary_stats_out: ${datasets.outputs}/preprocess/eqtl_catalogue/summary_stats
-
-# Constants
-finngen_release_prefix: FINNGEN_R9
-finngen_sumstat_url_prefix: gs://finngen-public-data-r9/summary_stats/finngen_R9_
-finngen_sumstat_url_suffix: .gz
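
The dataset paths above are config values resolved by interpolation, so the FinnGen and catalog outputs now nest under the shared study_index, credible_set, and summary_statistics roots instead of sitting directly under outputs. A minimal sketch of how this resolution works, assuming OmegaConf-style interpolation as used by Hydra configs; the bucket name is illustrative:

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "datasets": {
            "outputs": "gs://example-bucket/outputs",  # illustrative root
            "study_index": "${datasets.outputs}/study_index",
            "finngen_study_index": "${datasets.study_index}/finngen",
        }
    }
)
# Interpolations resolve lazily on access:
print(cfg.datasets.finngen_study_index)
# -> gs://example-bucket/outputs/study_index/finngen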
4 changes: 0 additions & 4 deletions config/step/finngen.yaml
@@ -1,7 +1,3 @@
_target_: otg.finngen.FinnGenStep
-finngen_phenotype_table_url: ${datasets.finngen_phenotype_table_url}
-finngen_release_prefix: ${datasets.finngen_release_prefix}
-finngen_sumstat_url_prefix: ${datasets.finngen_sumstat_url_prefix}
-finngen_sumstat_url_suffix: ${datasets.finngen_sumstat_url_suffix}
finngen_study_index_out: ${datasets.finngen_study_index}
finngen_summary_stats_out: ${datasets.finngen_summary_stats}
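
With the FinnGen source constants moved into the datasource class (see the study_index.py diff below), the step config shrinks to the target class and its two output paths. A hedged sketch of how a `_target_` config like this is typically instantiated, assuming Hydra's instantiate utility; the paths are illustrative:

from hydra.utils import instantiate
from omegaconf import OmegaConf

step_cfg = OmegaConf.create(
    {
        "_target_": "otg.finngen.FinnGenStep",
        "finngen_study_index_out": "gs://example-bucket/study_index/finngen",
        "finngen_summary_stats_out": "gs://example-bucket/summary_statistics/finngen",
    }
)
step = instantiate(step_cfg)  # constructs FinnGenStep with only the two output paths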
7 changes: 4 additions & 3 deletions src/airflow/dags/common_airflow.py
@@ -50,7 +50,7 @@
# Shared DAG construction parameters.
shared_dag_args = dict(
    owner="Open Targets Data Team",
-    retries=1,
+    retries=0,
)
shared_dag_kwargs = dict(
    tags=["genetics_etl", "experimental"],
@@ -68,6 +68,7 @@ def create_cluster(
    num_preemptible_workers: int = 0,
    num_local_ssds: int = 1,
    autoscaling_policy: str = GCP_AUTOSCALING_POLICY,
+    master_disk_size: int = 500,
) -> DataprocCreateClusterOperator:
    """Generate an Airflow task to create a Dataproc cluster. Common parameters are reused, and varying parameters can be specified as needed.
@@ -79,6 +80,7 @@
        num_preemptible_workers (int): Number of preemptible worker nodes. Defaults to 0.
        num_local_ssds (int): How many local SSDs to attach to each worker node, both primary and secondary. Defaults to 1.
        autoscaling_policy (str): Name of the autoscaling policy to use. Defaults to GCP_AUTOSCALING_POLICY.
+        master_disk_size (int): Size of the master node's boot disk in GB. Defaults to 500.

    Returns:
        DataprocCreateClusterOperator: Airflow task to create a Dataproc cluster.
@@ -89,7 +91,7 @@ def create_cluster(
        zone=GCP_ZONE,
        master_machine_type=master_machine_type,
        worker_machine_type=worker_machine_type,
-        master_disk_size=500,
+        master_disk_size=master_disk_size,
        worker_disk_size=500,
        num_preemptible_workers=num_preemptible_workers,
        num_workers=num_workers,
@@ -273,7 +275,6 @@ def delete_cluster(cluster_name: str) -> DataprocDeleteClusterOperator:
        cluster_name=cluster_name,
        region=GCP_REGION,
        trigger_rule=TriggerRule.ALL_DONE,
-        deferrable=True,
    )


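The new master_disk_size parameter is consumed by the FinnGen preprocess DAG added below, which needs a larger master boot disk to ingest the raw summary statistics. A sketch of the call, with values taken from that DAG:

import common_airflow as common

cluster_task = common.create_cluster(
    "otg-preprocess-finngen",
    autoscaling_policy="finngen-preprocess",
    master_disk_size=2000,  # GB; falls back to the default of 500 when omitted
)
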
1 change: 1 addition & 0 deletions src/airflow/dags/dag_genetics_etl.py
@@ -25,6 +25,7 @@
        this_task = common.submit_step(
            cluster_name=CLUSTER_NAME,
            step_id=step_id,
+            task_id=step_id,
        )
        # Chain prerequisites.
        tasks[step_id] = this_task
4 changes: 2 additions & 2 deletions src/airflow/dags/dag_preprocess.py
@@ -8,7 +8,7 @@

CLUSTER_NAME = "otg-preprocess"

-ALL_STEPS = ["finngen", "eqtl_catalogue", "ld_index", "variant_annotation"]
+ALL_STEPS = ["eqtl_catalogue", "ld_index", "variant_annotation"]


with DAG(
@@ -18,7 +18,7 @@
    **common.shared_dag_kwargs,
):
    all_tasks = [
-        common.submit_step(cluster_name=CLUSTER_NAME, step_id=step)
+        common.submit_step(cluster_name=CLUSTER_NAME, step_id=step, task_id=step)
        for step in ALL_STEPS
    ]
    dag = common.generate_dag(cluster_name=CLUSTER_NAME, tasks=all_tasks)
75 changes: 75 additions & 0 deletions src/airflow/dags/finngen_preprocess.py
@@ -0,0 +1,75 @@
"""Airflow DAG for the Preprocess part of the pipeline."""
from __future__ import annotations

from pathlib import Path

import common_airflow as common
from airflow.models.dag import DAG
from airflow.utils.trigger_rule import TriggerRule

CLUSTER_NAME = "otg-preprocess-finngen"
AUTOSCALING = "finngen-preprocess"

RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX"
SUMSTATS = "{RELEASEBUCKET}/summary_statistics/finngen"
WINDOWBASED_CLUMPED = (
"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_window_clumped/finngen"
)
LD_CLUMPED = "{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_ld_clumped/finngen"
PICSED = "{RELEASEBUCKET}/credible_set/from_sumstats_study_locus/finngen"

with DAG(
dag_id=Path(__file__).stem,
description="Open Targets Genetics — Finngen preprocess",
default_args=common.shared_dag_args,
**common.shared_dag_kwargs,
):
study_and_sumstats = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="finngen",
task_id="finngen_sumstats_and_study_index",
)

window_based_clumping = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="clump",
task_id="finngen_window_based_clumping",
other_args=[
"step.input_path={SUMSTATS}",
"step.clumped_study_locus_path={WINDOWBASED_CLUMPED}",
],
)
ld_clumping = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="clump",
task_id="finngen_ld_clumping",
other_args=[
"step.input_path={WINDOWBASED_CLUMPED}",
"step.clumped_study_locus_path={LD_CLUMPED}",
],
trigger_rule=TriggerRule.ALL_DONE,
)

pics = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="pics",
task_id="finngen_pics",
other_args=[
f"step.study_locus_ld_annotated_in={LD_CLUMPED}",
f"step.picsed_study_locus_out={PICSED}",
],
# This allows to attempt running the task when above step fails do to failifexists
trigger_rule=TriggerRule.ALL_DONE,
)

(
common.create_cluster(
CLUSTER_NAME, autoscaling_policy=AUTOSCALING, master_disk_size=2000
)
>> common.install_dependencies(CLUSTER_NAME)
>> study_and_sumstats
>> window_based_clumping
>> ld_clumping
>> pics
>> common.delete_cluster(CLUSTER_NAME)
)
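
A quick local sanity check one might run against the new DAG, assuming Airflow is installed and the dags folder is loadable; the printed task inventory should match the chain above:

from airflow.models import DagBag

bag = DagBag(dag_folder="src/airflow/dags", include_examples=False)
dag = bag.get_dag("finngen_preprocess")  # dag_id comes from Path(__file__).stem
print(sorted(task.task_id for task in dag.tasks))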
2 changes: 1 addition & 1 deletion src/otg/dataset/study_locus.py
@@ -13,14 +13,14 @@
    order_array_of_structs_by_field,
)
from otg.dataset.dataset import Dataset
-from otg.dataset.ld_index import LDIndex
from otg.dataset.study_locus_overlap import StudyLocusOverlap
from otg.method.clump import LDclumping

if TYPE_CHECKING:
    from pyspark.sql import Column, DataFrame
    from pyspark.sql.types import StructType

+    from otg.dataset.ld_index import LDIndex
    from otg.dataset.study_index import StudyIndex


4 changes: 2 additions & 2 deletions src/otg/dataset/summary_statistics.py
@@ -65,10 +65,10 @@ def window_based_clumping(
        """Generate study-locus from summary statistics by distance based clumping + collect locus.

        Args:
-            distance (int): Distance in base pairs to be used for clumping.
+            distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
            gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.
            baseline_significance (float, optional): Baseline significance threshold for inclusion in the locus. Defaults to 0.05.
-            locus_collect_distance (int | None): The distance to collect locus around semi-indices.
+            locus_collect_distance (int | None): The distance to collect locus around semi-indices. If not provided, locus is not collected.

        Returns:
            StudyLocus: Clumped study-locus containing variants based on window.
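For context, a hedged call sketch of the documented parameters, assuming sumstats is a SummaryStatistics instance; the values shown are just the documented defaults:

study_locus = sumstats.window_based_clumping(
    distance=500_000,  # clumping window in base pairs
    gwas_significance=5e-8,  # genome-wide significance threshold
    baseline_significance=0.05,  # inclusion threshold for the locus
    locus_collect_distance=None,  # None: locus is not collected
)
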
40 changes: 21 additions & 19 deletions src/otg/datasource/finngen/study_index.py
@@ -1,15 +1,13 @@
"""Study Index for Finngen data source."""
from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.request import urlopen

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

from otg.dataset.study_index import StudyIndex

if TYPE_CHECKING:
from pyspark.sql import DataFrame


class FinnGenStudyIndex:
    """Study index dataset from FinnGen.
@@ -24,35 +22,39 @@ class FinnGenStudyIndex:
    Some fields are also populated as constants, such as study type and the initial sample size.
    """

+    finngen_phenotype_table_url: str = "https://r9.finngen.fi/api/phenos"
+    finngen_release_prefix: str = "FINNGEN_R9"
+    finngen_summary_stats_url_prefix: str = (
+        "gs://finngen-public-data-r9/summary_stats/finngen_R9_"
+    )
+    finngen_summary_stats_url_suffix: str = ".gz"
+
    @classmethod
    def from_source(
        cls: type[FinnGenStudyIndex],
-        finngen_studies: DataFrame,
-        finngen_release_prefix: str,
-        finngen_summary_stats_url_prefix: str,
-        finngen_summary_stats_url_suffix: str,
+        spark: SparkSession,
    ) -> StudyIndex:
        """This function ingests study level metadata from FinnGen.

        Args:
-            finngen_studies (DataFrame): FinnGen raw study table
-            finngen_release_prefix (str): Release prefix pattern.
-            finngen_summary_stats_url_prefix (str): URL prefix for summary statistics location.
-            finngen_summary_stats_url_suffix (str): URL suffix for summary statistics location.
+            spark (SparkSession): Spark session object.

        Returns:
            StudyIndex: Parsed and annotated FinnGen study table.
        """
+        json_data = urlopen(cls.finngen_phenotype_table_url).read().decode("utf-8")
+        rdd = spark.sparkContext.parallelize([json_data])
+        raw_df = spark.read.json(rdd)
        return StudyIndex(
-            _df=finngen_studies.select(
-                f.concat(f.lit(f"{finngen_release_prefix}_"), f.col("phenocode")).alias(
-                    "studyId"
-                ),
+            _df=raw_df.select(
+                f.concat(
+                    f.lit(f"{cls.finngen_release_prefix}_"), f.col("phenocode")
+                ).alias("studyId"),
                f.col("phenostring").alias("traitFromSource"),
                f.col("num_cases").alias("nCases"),
                f.col("num_controls").alias("nControls"),
                (f.col("num_cases") + f.col("num_controls")).alias("nSamples"),
-                f.lit(finngen_release_prefix).alias("projectId"),
+                f.lit(cls.finngen_release_prefix).alias("projectId"),
                f.lit("gwas").alias("studyType"),
                f.lit(True).alias("hasSumstats"),
                f.lit("377,277 (210,870 females and 166,407 males)").alias(
@@ -67,9 +69,9 @@ def from_source(
                # Cohort label is consistent with GWAS Catalog curation.
                f.array(f.lit("FinnGen")).alias("cohorts"),
                f.concat(
-                    f.lit(finngen_summary_stats_url_prefix),
+                    f.lit(cls.finngen_summary_stats_url_prefix),
                    f.col("phenocode"),
-                    f.lit(finngen_summary_stats_url_suffix),
+                    f.lit(cls.finngen_summary_stats_url_suffix),
                ).alias("summarystatsLocation"),
            ).withColumn(
                "ldPopulationStructure",
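After this refactor the ingestion is self-contained: callers supply only a SparkSession, and the phenotype table is fetched from the class-level URL. A hedged usage sketch; the .df accessor is an assumption about the Dataset wrapper:

from pyspark.sql import SparkSession

from otg.datasource.finngen.study_index import FinnGenStudyIndex

spark = SparkSession.builder.getOrCreate()
study_index = FinnGenStudyIndex.from_source(spark=spark)
study_index.df.show(5)  # assumes Dataset exposes the underlying DataFrame as .df
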
35 changes: 28 additions & 7 deletions src/otg/datasource/finngen/summary_stats.py
@@ -3,38 +3,59 @@
from __future__ import annotations

from dataclasses import dataclass
-from typing import TYPE_CHECKING

import pyspark.sql.functions as f
import pyspark.sql.types as t
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StringType, StructField, StructType

from otg.common.utils import parse_pvalue
from otg.dataset.summary_statistics import SummaryStatistics

-if TYPE_CHECKING:
-    from pyspark.sql import DataFrame


@dataclass
class FinnGenSummaryStats:
    """Summary statistics dataset for FinnGen."""

+    raw_schema: t.StructType = StructType(
+        [
+            StructField("#chrom", StringType(), True),
+            StructField("pos", StringType(), True),
+            StructField("ref", StringType(), True),
+            StructField("alt", StringType(), True),
+            StructField("rsids", StringType(), True),
+            StructField("nearest_genes", StringType(), True),
+            StructField("pval", StringType(), True),
+            StructField("mlogp", StringType(), True),
+            StructField("beta", StringType(), True),
+            StructField("sebeta", StringType(), True),
+            StructField("af_alt", StringType(), True),
+            StructField("af_alt_cases", StringType(), True),
+            StructField("af_alt_controls", StringType(), True),
+        ]
+    )

    @classmethod
    def from_source(
        cls: type[FinnGenSummaryStats],
-        summary_stats_df: DataFrame,
+        spark: SparkSession,
+        raw_files: list[str],
    ) -> SummaryStatistics:
        """Ingests all summary stats for all FinnGen studies.

        Args:
-            summary_stats_df (DataFrame): Raw summary statistics dataframe
+            spark (SparkSession): Spark session object.
+            raw_files (list[str]): Paths to raw summary statistics .gz files.

        Returns:
            SummaryStatistics: Processed summary statistics dataset
        """
        processed_summary_stats_df = (
-            summary_stats_df
+            spark.read.schema(cls.raw_schema)
+            .option("delimiter", "\t")
+            .csv(raw_files, header=True)
            # Drop rows which don't have proper position.
            .filter(f.col("pos").cast(t.IntegerType()).isNotNull())
            .select(
                # From the full path, extracts just the filename, and converts to upper case to get the study ID.
                f.upper(f.regexp_extract(f.input_file_name(), r"([^/]+)\.gz", 1)).alias(
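Likewise, the summary-statistics entry point now reads the raw .gz files directly with the explicit schema instead of receiving a pre-loaded DataFrame. A hedged usage sketch; the phenocode in the path is illustrative, with the prefix and suffix taken from the class constants shown above:

from pyspark.sql import SparkSession

from otg.datasource.finngen.summary_stats import FinnGenSummaryStats

spark = SparkSession.builder.getOrCreate()
raw_file = "gs://finngen-public-data-r9/summary_stats/finngen_R9_AB1_ARTHROPOD.gz"
summary_stats = FinnGenSummaryStats.from_source(spark=spark, raw_files=[raw_file])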
