Skip to content

Commit

Permalink
refactor: moving sanitizing logic to qtl ingestion
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Jan 14, 2025
1 parent 1afcd2f commit e9af39b
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 28 deletions.
15 changes: 0 additions & 15 deletions src/gentropy/dataset/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from gentropy.assets import data
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import (
clean_strings_from_symbols,
convert_from_wide_to_long,
)
from gentropy.dataset.dataset import Dataset
Expand Down Expand Up @@ -383,20 +382,6 @@ def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex:
_schema=StudyIndex.get_schema(),
)

def url_safe_study_id(self: StudyIndex) -> StudyIndex:
    """Normalise study identifiers to be URL safe.

    The `studyId` column is rewritten with the output of
    `clean_strings_from_symbols` and the resulting dataframe is wrapped
    in a new `StudyIndex` that uses the standard study index schema.

    Returns:
        StudyIndex: with normalised study identifiers.
    """
    # Build the cleaned identifier column expression first, then swap it in.
    sanitised_study_id = clean_strings_from_symbols(f.col("studyId"))
    normalised_df = self.df.withColumn("studyId", sanitised_study_id)
    return StudyIndex(_df=normalised_df, _schema=StudyIndex.get_schema())

def validate_study_type(self: StudyIndex) -> StudyIndex:
"""Validating study type and flag unsupported types.
Expand Down
16 changes: 10 additions & 6 deletions src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)

from gentropy.common.session import Session
from gentropy.common.spark_helpers import clean_strings_from_symbols
from gentropy.common.utils import parse_pvalue
from gentropy.dataset.study_locus import FinemappingMethod, StudyLocus
from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex
Expand Down Expand Up @@ -171,12 +172,15 @@ def parse_susie_results(
f.col("molecular_trait_id").alias("traitFromSource"),
f.col("gene_id").alias("geneId"),
f.col("dataset_id"),
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
# Upon creation, the studyId is cleaned from symbols:
clean_strings_from_symbols(
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
)
).alias("studyId"),
f.col("tissue_id").alias("biosampleFromSourceId"),
EqtlCatalogueStudyIndex._identify_study_type().alias("studyType"),
Expand Down
10 changes: 3 additions & 7 deletions src/gentropy/datasource/eqtl_catalogue/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,9 @@ def from_susie_results(
for field in StudyIndex.get_schema().fields
if field.name in processed_finemapping_df.columns
]
return (
StudyIndex(
_df=processed_finemapping_df.select(study_index_cols).distinct(),
_schema=StudyIndex.get_schema(),
)
# Convert study identifier to a URL safe format:
.url_safe_study_id()
return StudyIndex(
_df=processed_finemapping_df.select(study_index_cols).distinct(),
_schema=StudyIndex.get_schema(),
)

@classmethod
Expand Down

0 comments on commit e9af39b

Please sign in to comment.