From 5e1a475a3bc46153cb8e74c4f5112e326aff83be Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:20:32 +0000 Subject: [PATCH 1/3] refactor: rename all gene_index related files to target_index --- docs/python_api/datasets/{gene_index.md => target_index.md} | 0 docs/python_api/steps/{gene_index.md => target_index.md} | 0 .../assets/schemas/{gene_index.json => target_index.json} | 0 src/gentropy/dataset/{gene_index.py => target_index.py} | 0 src/gentropy/{gene_index.py => target_index.py} | 0 .../gentropy/dataset/{test_gene_index.py => test_target_index.py} | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename docs/python_api/datasets/{gene_index.md => target_index.md} (100%) rename docs/python_api/steps/{gene_index.md => target_index.md} (100%) rename src/gentropy/assets/schemas/{gene_index.json => target_index.json} (100%) rename src/gentropy/dataset/{gene_index.py => target_index.py} (100%) rename src/gentropy/{gene_index.py => target_index.py} (100%) rename tests/gentropy/dataset/{test_gene_index.py => test_target_index.py} (100%) diff --git a/docs/python_api/datasets/gene_index.md b/docs/python_api/datasets/target_index.md similarity index 100% rename from docs/python_api/datasets/gene_index.md rename to docs/python_api/datasets/target_index.md diff --git a/docs/python_api/steps/gene_index.md b/docs/python_api/steps/target_index.md similarity index 100% rename from docs/python_api/steps/gene_index.md rename to docs/python_api/steps/target_index.md diff --git a/src/gentropy/assets/schemas/gene_index.json b/src/gentropy/assets/schemas/target_index.json similarity index 100% rename from src/gentropy/assets/schemas/gene_index.json rename to src/gentropy/assets/schemas/target_index.json diff --git a/src/gentropy/dataset/gene_index.py b/src/gentropy/dataset/target_index.py similarity index 100% rename from src/gentropy/dataset/gene_index.py rename to src/gentropy/dataset/target_index.py diff --git a/src/gentropy/gene_index.py b/src/gentropy/target_index.py similarity index 100% rename from src/gentropy/gene_index.py rename to src/gentropy/target_index.py diff --git a/tests/gentropy/dataset/test_gene_index.py b/tests/gentropy/dataset/test_target_index.py similarity index 100% rename from tests/gentropy/dataset/test_gene_index.py rename to tests/gentropy/dataset/test_target_index.py From 26abb1b6f8f8acf82d65dfe61c149a941ea1aa31 Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:08:56 +0000 Subject: [PATCH 2/3] refactor: rename gene_index to target_index in various files --- docs/python_api/datasets/target_index.md | 6 ++-- docs/python_api/steps/target_index.md | 4 +-- src/gentropy/biosample_index.py | 2 +- src/gentropy/config.py | 12 +++---- src/gentropy/dataset/intervals.py | 8 ++--- .../dataset/l2g_features/colocalisation.py | 28 ++++++++-------- src/gentropy/dataset/l2g_features/distance.py | 16 +++++----- src/gentropy/dataset/l2g_features/other.py | 32 +++++++++---------- src/gentropy/dataset/l2g_features/vep.py | 12 +++---- src/gentropy/dataset/study_index.py | 6 ++-- src/gentropy/dataset/target_index.py | 22 ++++++------- .../datasource/intervals/andersson.py | 10 +++--- src/gentropy/datasource/intervals/javierre.py | 8 ++--- src/gentropy/datasource/intervals/jung.py | 8 ++--- src/gentropy/datasource/intervals/thurman.py | 8 ++--- .../datasource/open_targets/target.py | 16 +++++----- src/gentropy/l2g.py | 14 ++++---- src/gentropy/study_validation.py | 4 +-- src/gentropy/target_index.py | 18 +++++------ 19 files changed, 117 insertions(+), 117 deletions(-) diff --git a/docs/python_api/datasets/target_index.md b/docs/python_api/datasets/target_index.md index d66d3a6bc..1995e8a4d 100644 --- a/docs/python_api/datasets/target_index.md +++ b/docs/python_api/datasets/target_index.md @@ -1,9 +1,9 @@ --- -title: Gene Index +title: Target Index --- -::: gentropy.dataset.gene_index.GeneIndex +::: gentropy.dataset.target_index.TargetIndex ## Schema ---8<-- "assets/schemas/gene_index.md" +--8<-- "assets/schemas/target_index.md" diff --git a/docs/python_api/steps/target_index.md b/docs/python_api/steps/target_index.md index a0808dcad..4a572eca7 100644 --- a/docs/python_api/steps/target_index.md +++ b/docs/python_api/steps/target_index.md @@ -1,5 +1,5 @@ --- -title: gene_index +title: target_index --- -::: gentropy.gene_index.GeneIndexStep +::: gentropy.target_index.TargetIndexStep diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index a6e8b5223..846bb97ce 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -27,7 +27,7 @@ def __init__( cell_ontology_input_path (str): Input cell ontology dataset path. uberon_input_path (str): Input uberon dataset path. efo_input_path (str): Input efo dataset path. - biosample_index_path (str): Output gene index dataset path. + biosample_index_path (str): Output biosample index dataset path. """ cell_ontology_index = extract_ontology_from_json( cell_ontology_input_path, session.spark diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 65fdb5897..f5233100f 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -44,12 +44,12 @@ class ColocalisationConfig(StepConfig): @dataclass -class GeneIndexConfig(StepConfig): - """Gene index step configuration.""" +class TargetIndexConfig(StepConfig): + """Target index step configuration.""" target_path: str = MISSING - gene_index_path: str = MISSING - _target_: str = "gentropy.gene_index.GeneIndexStep" + target_index_path: str = MISSING + _target_: str = "gentropy.target_index.TargetIndexStep" @dataclass @@ -298,7 +298,7 @@ class LocusToGeneFeatureMatrixConfig(StepConfig): variant_index_path: str | None = None colocalisation_path: str | None = None study_index_path: str | None = None - gene_index_path: str | None = None + target_index_path: str | None = None feature_matrix_path: str = MISSING features_list: list[str] = field( default_factory=lambda: [ @@ -688,7 +688,7 @@ def register_config() -> None: cs.store(group="step/session", name="base_session", node=SessionConfig) cs.store(group="step", name="colocalisation", node=ColocalisationConfig) cs.store(group="step", name="eqtl_catalogue", node=EqtlCatalogueConfig) - cs.store(group="step", name="gene_index", node=GeneIndexConfig) + cs.store(group="step", name="target_index", node=TargetIndexConfig) cs.store(group="step", name="biosample_index", node=BiosampleIndexConfig) cs.store( group="step", diff --git a/src/gentropy/dataset/intervals.py b/src/gentropy/dataset/intervals.py index 37158810b..e38960b27 100644 --- a/src/gentropy/dataset/intervals.py +++ b/src/gentropy/dataset/intervals.py @@ -8,7 +8,7 @@ from gentropy.common.Liftover import LiftOverSpark from gentropy.common.schemas import parse_spark_schema from gentropy.dataset.dataset import Dataset -from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.target_index import TargetIndex if TYPE_CHECKING: from pyspark.sql import SparkSession @@ -35,7 +35,7 @@ def from_source( spark: SparkSession, source_name: str, source_path: str, - gene_index: GeneIndex, + target_index: TargetIndex, lift: LiftOverSpark, ) -> Intervals: """Collect interval data for a particular source. @@ -44,7 +44,7 @@ def from_source( spark (SparkSession): Spark session source_name (str): Name of the interval source source_path (str): Path to the interval source file - gene_index (GeneIndex): Gene index + target_index (TargetIndex): Target index lift (LiftOverSpark): LiftOverSpark instance to convert coordinats from hg37 to hg38 Returns: @@ -70,4 +70,4 @@ def from_source( source_class = source_to_class[source_name] data = source_class.read(spark, source_path) # type: ignore - return source_class.parse(data, gene_index, lift) # type: ignore + return source_class.parse(data, target_index, lift) # type: ignore diff --git a/src/gentropy/dataset/l2g_features/colocalisation.py b/src/gentropy/dataset/l2g_features/colocalisation.py index 68509ca79..39f35835d 100644 --- a/src/gentropy/dataset/l2g_features/colocalisation.py +++ b/src/gentropy/dataset/l2g_features/colocalisation.py @@ -9,11 +9,11 @@ from gentropy.common.spark_helpers import convert_from_wide_to_long from gentropy.dataset.colocalisation import Colocalisation -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_features.l2g_feature import L2GFeature from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: @@ -74,7 +74,7 @@ def extend_missing_colocalisation_to_neighbourhood_genes( feature_name: str, local_features: DataFrame, variant_index: VariantIndex, - gene_index: GeneIndex, + target_index: TargetIndex, study_locus: StudyLocus, ) -> DataFrame: """This function creates an artificial dataset of features that represents the missing colocalisation to the neighbourhood genes. @@ -83,7 +83,7 @@ def extend_missing_colocalisation_to_neighbourhood_genes( feature_name (str): The name of the feature to extend local_features (DataFrame): The dataframe of features to extend variant_index (VariantIndex): Variant index containing all variant/gene relationships - gene_index (GeneIndex): Gene index to fetch the gene information + target_index (TargetIndex): Target index to fetch the gene information study_locus (StudyLocus): Study locus to traverse between colocalisation and variant index Returns: @@ -94,7 +94,7 @@ def extend_missing_colocalisation_to_neighbourhood_genes( "variantId", f.explode("transcriptConsequences").alias("tc") ) .select(f.col("tc.targetId").alias("geneId"), "variantId") - .join(gene_index.df.select("geneId", "biotype"), "geneId", "left") + .join(target_index.df.select("geneId", "biotype"), "geneId", "left") .filter(f.col("biotype") == "protein_coding") .drop("biotype") .distinct() @@ -127,7 +127,7 @@ def common_neighbourhood_colocalisation_feature_logic( *, colocalisation: Colocalisation, study_index: StudyIndex, - gene_index: GeneIndex, + target_index: TargetIndex, study_locus: StudyLocus, variant_index: VariantIndex, ) -> DataFrame: @@ -141,7 +141,7 @@ def common_neighbourhood_colocalisation_feature_logic( qtl_types (list[str] | str): The types of QTL to filter the data by colocalisation (Colocalisation): Dataset with the colocalisation results study_index (StudyIndex): Study index to fetch study type and gene - gene_index (GeneIndex): Gene index to add gene type + target_index (TargetIndex): Target index to add gene type study_locus (StudyLocus): Study locus to traverse between colocalisation and study index variant_index (VariantIndex): Variant index to annotate all overlapping genes @@ -165,7 +165,7 @@ def common_neighbourhood_colocalisation_feature_logic( local_feature_name, local_max, variant_index, - gene_index, + target_index, study_locus, ) ) @@ -173,7 +173,7 @@ def common_neighbourhood_colocalisation_feature_logic( extended_local_max.join( # Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus) # (non protein coding genes in the vicinity are excluded see #3552) - gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), + target_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), "geneId", "inner", ) @@ -242,7 +242,7 @@ class EQtlColocClppMaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] @@ -333,7 +333,7 @@ class PQtlColocClppMaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] @@ -423,7 +423,7 @@ class SQtlColocClppMaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] @@ -513,7 +513,7 @@ class EQtlColocH4MaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] @@ -603,7 +603,7 @@ class PQtlColocH4MaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] @@ -693,7 +693,7 @@ class SQtlColocH4MaximumNeighbourhoodFeature(L2GFeature): feature_dependency_type = [ Colocalisation, StudyIndex, - GeneIndex, + TargetIndex, StudyLocus, VariantIndex, ] diff --git a/src/gentropy/dataset/l2g_features/distance.py b/src/gentropy/dataset/l2g_features/distance.py index 40ad568ac..f95862ddd 100644 --- a/src/gentropy/dataset/l2g_features/distance.py +++ b/src/gentropy/dataset/l2g_features/distance.py @@ -8,10 +8,10 @@ from pyspark.sql import Window from gentropy.common.spark_helpers import convert_from_wide_to_long -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_features.l2g_feature import L2GFeature from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: @@ -85,7 +85,7 @@ def common_neighbourhood_distance_feature_logic( variant_index: VariantIndex, feature_name: str, distance_type: str, - gene_index: GeneIndex, + target_index: TargetIndex, genomic_window: int = 500_000, ) -> DataFrame: """Calculate the distance feature that correlates any variant in a credible set with any protein coding gene nearby the locus. The distance is weighted by the posterior probability of the variant to factor in its contribution to the trait. @@ -95,7 +95,7 @@ def common_neighbourhood_distance_feature_logic( variant_index (VariantIndex): The dataset containing distance to gene information feature_name (str): The name of the feature distance_type (str): The type of distance to gene - gene_index (GeneIndex): The dataset containing gene information + target_index (TargetIndex): The dataset containing gene information genomic_window (int): The maximum window size to consider Returns: @@ -113,7 +113,7 @@ def common_neighbourhood_distance_feature_logic( return ( # Then compute mean distance in the vicinity (feature will be the same for any gene associated with a studyLocus) local_metric.join( - gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), + target_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), "geneId", "inner", ) @@ -185,7 +185,7 @@ def compute( class DistanceTssMeanNeighbourhoodFeature(L2GFeature): """Minimum mean distance to TSS for all genes in the vicinity of a studyLocus.""" - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "distanceTssMeanNeighbourhood" @classmethod @@ -261,7 +261,7 @@ def compute( class DistanceSentinelTssNeighbourhoodFeature(L2GFeature): """Distance between the sentinel variant and a gene TSS as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability.""" - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "distanceSentinelTssNeighbourhood" @classmethod @@ -342,7 +342,7 @@ def compute( class DistanceFootprintMeanNeighbourhoodFeature(L2GFeature): """Minimum mean distance to footprint for all genes in the vicinity of a studyLocus.""" - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "distanceFootprintMeanNeighbourhood" @classmethod @@ -418,7 +418,7 @@ def compute( class DistanceSentinelFootprintNeighbourhoodFeature(L2GFeature): """Distance between the sentinel variant and a gene footprint as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability.""" - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "distanceSentinelFootprintNeighbourhood" @classmethod diff --git a/src/gentropy/dataset/l2g_features/other.py b/src/gentropy/dataset/l2g_features/other.py index 2fc32592b..4c9b5520d 100644 --- a/src/gentropy/dataset/l2g_features/other.py +++ b/src/gentropy/dataset/l2g_features/other.py @@ -7,10 +7,10 @@ import pyspark.sql.functions as f from gentropy.common.spark_helpers import convert_from_wide_to_long -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_features.l2g_feature import L2GFeature from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.study_locus import CredibleSetConfidenceClasses, StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: @@ -20,7 +20,7 @@ def common_genecount_feature_logic( study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, - gene_index: GeneIndex, + target_index: TargetIndex, feature_name: str, genomic_window: int, protein_coding_only: bool = False, @@ -30,7 +30,7 @@ def common_genecount_feature_logic( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation - gene_index (GeneIndex): Dataset containing information related to all genes in release. + target_index (TargetIndex): Dataset containing information related to all genes in release. feature_name (str): The name of the feature genomic_window (int): The maximum window size to consider protein_coding_only (bool): Whether to only consider protein coding genes in calculation. @@ -45,16 +45,16 @@ def common_genecount_feature_logic( .withColumn("window_end", f.col("position") + (genomic_window / 2)) .withColumnRenamed("chromosome", "SL_chromosome") ) - gene_index_filter = gene_index.df + target_index_filter = target_index.df if protein_coding_only: - gene_index_filter = gene_index_filter.filter( + target_index_filter = target_index_filter.filter( f.col("biotype") == "protein_coding" ) distinct_gene_counts = ( study_loci_window.join( - gene_index_filter.alias("genes"), + target_index_filter.alias("genes"), on=( (f.col("SL_chromosome") == f.col("genes.chromosome")) & (f.col("genes.tss") >= f.col("window_start")) @@ -68,7 +68,7 @@ def common_genecount_feature_logic( return ( study_loci_window.join( - gene_index_filter.alias("genes"), + target_index_filter.alias("genes"), on=( (f.col("SL_chromosome") == f.col("genes.chromosome")) & (f.col("genes.tss") >= f.col("window_start")) @@ -85,7 +85,7 @@ def common_genecount_feature_logic( def is_protein_coding_feature_logic( study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, - gene_index: GeneIndex, + target_index: TargetIndex, feature_name: str, genomic_window: int, ) -> DataFrame: @@ -94,7 +94,7 @@ def is_protein_coding_feature_logic( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation - gene_index (GeneIndex): Dataset containing information related to all genes in release. + target_index (TargetIndex): Dataset containing information related to all genes in release. feature_name (str): The name of the feature genomic_window (int): The maximum window size to consider @@ -110,7 +110,7 @@ def is_protein_coding_feature_logic( ) return ( study_loci_window.join( - gene_index.df.alias("genes"), + target_index.df.alias("genes"), on=( (f.col("SL_chromosome") == f.col("genes.chromosome")) & (f.col("genes.tss") >= f.col("window_start")) @@ -130,7 +130,7 @@ def is_protein_coding_feature_logic( class GeneCountFeature(L2GFeature): """Counts the number of genes within a specified window size from the study locus.""" - feature_dependency_type = GeneIndex + feature_dependency_type = TargetIndex feature_name = "geneCount500kb" @classmethod @@ -143,7 +143,7 @@ def compute( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation - feature_dependency (dict[str, Any]): Dictionary containing dependencies, with gene index and window size + feature_dependency (dict[str, Any]): Dictionary containing dependencies, with target index and window size Returns: GeneCountFeature: Feature dataset @@ -170,7 +170,7 @@ def compute( class ProteinGeneCountFeature(L2GFeature): """Counts the number of protein coding genes within a specified window size from the study locus.""" - feature_dependency_type = GeneIndex + feature_dependency_type = TargetIndex feature_name = "proteinGeneCount500kb" @classmethod @@ -183,7 +183,7 @@ def compute( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation - feature_dependency (dict[str, Any]): Dictionary containing dependencies, with gene index and window size + feature_dependency (dict[str, Any]): Dictionary containing dependencies, with target index and window size Returns: ProteinGeneCountFeature: Feature dataset @@ -211,7 +211,7 @@ def compute( class ProteinCodingFeature(L2GFeature): """Indicates whether a gene is protein-coding within a specified window size from the study locus.""" - feature_dependency_type = GeneIndex + feature_dependency_type = TargetIndex feature_name = "isProteinCoding" @classmethod @@ -224,7 +224,7 @@ def compute( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation - feature_dependency (dict[str, Any]): Dictionary containing dependencies, including gene index + feature_dependency (dict[str, Any]): Dictionary containing dependencies, including target index Returns: ProteinCodingFeature: Feature dataset with 1 if the gene is protein-coding, 0 otherwise diff --git a/src/gentropy/dataset/l2g_features/vep.py b/src/gentropy/dataset/l2g_features/vep.py index 4f8dd6779..11f056ec5 100644 --- a/src/gentropy/dataset/l2g_features/vep.py +++ b/src/gentropy/dataset/l2g_features/vep.py @@ -8,10 +8,10 @@ from pyspark.sql import Window from gentropy.common.spark_helpers import convert_from_wide_to_long -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_features.l2g_feature import L2GFeature from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: @@ -77,7 +77,7 @@ def common_neighbourhood_vep_feature_logic( study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, variant_index: VariantIndex, - gene_index: GeneIndex, + target_index: TargetIndex, feature_name: str, ) -> DataFrame: """Extracts variant severity score computed from VEP for any gene, based on what is the max score for protein coding genes that are nearby the locus. @@ -85,7 +85,7 @@ def common_neighbourhood_vep_feature_logic( Args: study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation variant_index (VariantIndex): The dataset containing functional consequence information - gene_index (GeneIndex): The dataset containing the gene biotype + target_index (TargetIndex): The dataset containing the gene biotype feature_name (str): The name of the feature Returns: @@ -102,7 +102,7 @@ def common_neighbourhood_vep_feature_logic( # Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus) # (non protein coding genes in the vicinity are excluded see #3552) .join( - gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), + target_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"), "geneId", "inner", ) @@ -161,7 +161,7 @@ def compute( class VepMaximumNeighbourhoodFeature(L2GFeature): """Maximum functional consequence score among all variants in a credible set for a studyLocus/gene relative to the mean VEP score across all protein coding genes in the vicinity.""" - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "vepMaximumNeighbourhood" @classmethod @@ -239,7 +239,7 @@ class VepMeanNeighbourhoodFeature(L2GFeature): The mean severity score is weighted by the posterior probability of each variant. """ - feature_dependency_type = [VariantIndex, GeneIndex] + feature_dependency_type = [VariantIndex, TargetIndex] feature_name = "vepMeanNeighbourhood" @classmethod diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index da310f6f1..3f973bb9f 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -23,7 +23,7 @@ from pyspark.sql.types import StructType from gentropy.dataset.biosample_index import BiosampleIndex - from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.target_index import TargetIndex class StudyQualityCheck(Enum): @@ -392,11 +392,11 @@ def validate_study_type(self: StudyIndex) -> StudyIndex: ) return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) - def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex: + def validate_target(self: StudyIndex, target_index: TargetIndex) -> StudyIndex: """Validating gene identifiers in the study index against the provided target index. Args: - target_index (GeneIndex): gene index containing the reference gene identifiers (Ensembl gene identifiers). + target_index (TargetIndex): target index containing the reference gene identifiers (Ensembl gene identifiers). Returns: StudyIndex: with flagged studies if geneId could not be validated. diff --git a/src/gentropy/dataset/target_index.py b/src/gentropy/dataset/target_index.py index 31259d2d1..8248f5f44 100644 --- a/src/gentropy/dataset/target_index.py +++ b/src/gentropy/dataset/target_index.py @@ -1,4 +1,4 @@ -"""Gene index dataset.""" +"""Target index dataset.""" from __future__ import annotations from dataclasses import dataclass @@ -15,34 +15,34 @@ @dataclass -class GeneIndex(Dataset): - """Gene index dataset. +class TargetIndex(Dataset): + """Target index dataset. Gene-based annotation. """ @classmethod - def get_schema(cls: type[GeneIndex]) -> StructType: - """Provides the schema for the GeneIndex dataset. + def get_schema(cls: type[TargetIndex]) -> StructType: + """Provides the schema for the TargetIndex dataset. Returns: - StructType: Schema for the GeneIndex dataset + StructType: Schema for the TargetIndex dataset """ - return parse_spark_schema("gene_index.json") + return parse_spark_schema("target_index.json") - def filter_by_biotypes(self: GeneIndex, biotypes: list[str]) -> GeneIndex: + def filter_by_biotypes(self: TargetIndex, biotypes: list[str]) -> TargetIndex: """Filter by approved biotypes. Args: biotypes (list[str]): List of Ensembl biotypes to keep. Returns: - GeneIndex: Gene index dataset filtered by biotypes. + TargetIndex: Target index dataset filtered by biotypes. """ self.df = self._df.filter(f.col("biotype").isin(biotypes)) return self - def locations_lut(self: GeneIndex) -> DataFrame: + def locations_lut(self: TargetIndex) -> DataFrame: """Gene location information. Returns: @@ -57,7 +57,7 @@ def locations_lut(self: GeneIndex) -> DataFrame: "tss", ) - def symbols_lut(self: GeneIndex) -> DataFrame: + def symbols_lut(self: TargetIndex) -> DataFrame: """Gene symbol lookup table. Pre-processess gene/target dataset to create lookup table of gene symbols, including diff --git a/src/gentropy/datasource/intervals/andersson.py b/src/gentropy/datasource/intervals/andersson.py index a6e92470c..254f334a8 100644 --- a/src/gentropy/datasource/intervals/andersson.py +++ b/src/gentropy/datasource/intervals/andersson.py @@ -15,7 +15,7 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark - from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.target_index import TargetIndex class IntervalsAndersson: @@ -49,14 +49,14 @@ def read(spark: SparkSession, path: str) -> DataFrame: def parse( cls: type[IntervalsAndersson], raw_anderson_df: DataFrame, - gene_index: GeneIndex, + target_index: TargetIndex, lift: LiftOverSpark, ) -> Intervals: """Parse Andersson et al. 2014 dataset. Args: raw_anderson_df (DataFrame): Raw Andersson et al. dataset - gene_index (GeneIndex): Gene index + target_index (TargetIndex): Target index lift (LiftOverSpark): LiftOverSpark instance Returns: @@ -108,10 +108,10 @@ def parse( .withColumnRenamed("mapped_start", "start") .withColumnRenamed("mapped_end", "end") .distinct() - # Joining with the gene index + # Joining with the target index .alias("intervals") .join( - gene_index.symbols_lut().alias("genes"), + target_index.symbols_lut().alias("genes"), on=[ f.col("intervals.gene_symbol") == f.col("genes.geneSymbol"), # Drop rows where the TSS is far from the start of the region diff --git a/src/gentropy/datasource/intervals/javierre.py b/src/gentropy/datasource/intervals/javierre.py index a05fa9886..6ecd9351a 100644 --- a/src/gentropy/datasource/intervals/javierre.py +++ b/src/gentropy/datasource/intervals/javierre.py @@ -12,7 +12,7 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark - from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.target_index import TargetIndex class IntervalsJavierre: @@ -35,14 +35,14 @@ def read(spark: SparkSession, path: str) -> DataFrame: def parse( cls: type[IntervalsJavierre], javierre_raw: DataFrame, - gene_index: GeneIndex, + target_index: TargetIndex, lift: LiftOverSpark, ) -> Intervals: """Parse Javierre et al. 2016 dataset. Args: javierre_raw (DataFrame): Raw Javierre data - gene_index (GeneIndex): Gene index + target_index (TargetIndex): Target index lift (LiftOverSpark): LiftOverSpark instance Returns: @@ -115,7 +115,7 @@ def parse( .distinct() .alias("intervals") .join( - gene_index.locations_lut().alias("genes"), + target_index.locations_lut().alias("genes"), on=[ f.col("intervals.chrom") == f.col("genes.chromosome"), ( diff --git a/src/gentropy/datasource/intervals/jung.py b/src/gentropy/datasource/intervals/jung.py index 64b56967a..62f9963a6 100644 --- a/src/gentropy/datasource/intervals/jung.py +++ b/src/gentropy/datasource/intervals/jung.py @@ -12,7 +12,7 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark - from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.target_index import TargetIndex class IntervalsJung: @@ -35,14 +35,14 @@ def read(spark: SparkSession, path: str) -> DataFrame: def parse( cls: type[IntervalsJung], jung_raw: DataFrame, - gene_index: GeneIndex, + target_index: TargetIndex, lift: LiftOverSpark, ) -> Intervals: """Parse the Jung et al. 2019 dataset. Args: jung_raw (DataFrame): raw Jung et al. 2019 dataset - gene_index (GeneIndex): gene index + target_index (TargetIndex): Target index lift (LiftOverSpark): LiftOverSpark instance Returns: @@ -81,7 +81,7 @@ def parse( .alias("intervals") # Joining with genes: .join( - gene_index.symbols_lut().alias("genes"), + target_index.symbols_lut().alias("genes"), on=[f.col("intervals.gene_name") == f.col("genes.geneSymbol")], how="inner", ) diff --git a/src/gentropy/datasource/intervals/thurman.py b/src/gentropy/datasource/intervals/thurman.py index a8113e5a6..459e72f2d 100644 --- a/src/gentropy/datasource/intervals/thurman.py +++ b/src/gentropy/datasource/intervals/thurman.py @@ -12,7 +12,7 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark - from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.target_index import TargetIndex class IntervalsThurman: @@ -47,14 +47,14 @@ def read(spark: SparkSession, path: str) -> DataFrame: def parse( cls: type[IntervalsThurman], thurman_raw: DataFrame, - gene_index: GeneIndex, + target_index: TargetIndex, lift: LiftOverSpark, ) -> Intervals: """Parse the Thurman et al. 2012 dataset. Args: thurman_raw (DataFrame): raw Thurman et al. 2019 dataset - gene_index (GeneIndex): gene index + target_index (TargetIndex): Target index lift (LiftOverSpark): LiftOverSpark instance Returns: @@ -80,7 +80,7 @@ def parse( .alias("intervals") # Map gene names to gene IDs: .join( - gene_index.symbols_lut().alias("genes"), + target_index.symbols_lut().alias("genes"), on=[ f.col("intervals.gene_name") == f.col("genes.geneSymbol"), f.col("intervals.chrom") == f.col("genes.chromosome"), diff --git a/src/gentropy/datasource/open_targets/target.py b/src/gentropy/datasource/open_targets/target.py index 5e795c19b..40c1cca7d 100644 --- a/src/gentropy/datasource/open_targets/target.py +++ b/src/gentropy/datasource/open_targets/target.py @@ -4,13 +4,13 @@ import pyspark.sql.functions as f from pyspark.sql import Column, DataFrame -from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.target_index import TargetIndex class OpenTargetsTarget: """Parser for OTPlatform target dataset. - Genomic data from Open Targets provides gene identification and genomic coordinates that are integrated into the gene index of our ETL pipeline. + Genomic data from Open Targets provides gene identification and genomic coordinates that are integrated into the target index of our ETL pipeline. The EMBL-EBI Ensembl database is used as a source for human targets in the Platform, with the Ensembl gene ID as the primary identifier. The criteria for target inclusion is: - Genes from all biotypes encoded in canonical chromosomes @@ -44,18 +44,18 @@ def _get_gene_tss(strand_col: Column, start_col: Column, end_col: Column) -> Col return f.when(strand_col == 1, start_col).when(strand_col == -1, end_col) @classmethod - def as_gene_index( + def as_target_index( cls: type[OpenTargetsTarget], target_index: DataFrame - ) -> GeneIndex: - """Initialise GeneIndex from source dataset. + ) -> TargetIndex: + """Initialise TargetIndex from source dataset. Args: target_index (DataFrame): Target index dataframe Returns: - GeneIndex: Gene index dataset + TargetIndex: Target index dataset """ - return GeneIndex( + return TargetIndex( _df=target_index.select( f.coalesce(f.col("id"), f.lit("unknown")).alias("geneId"), "approvedSymbol", @@ -74,5 +74,5 @@ def as_gene_index( f.col("genomicLocation.end").alias("end"), f.col("genomicLocation.strand").alias("strand"), ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index 16922ef78..9c2da149b 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -14,12 +14,12 @@ from gentropy.common.spark_helpers import calculate_harmonic_sum from gentropy.common.utils import access_gcp_secret from gentropy.dataset.colocalisation import Colocalisation -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.l2g_prediction import L2GPrediction from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader from gentropy.method.l2g.model import LocusToGeneModel @@ -38,7 +38,7 @@ def __init__( variant_index_path: str | None = None, colocalisation_path: str | None = None, study_index_path: str | None = None, - gene_index_path: str | None = None, + target_index_path: str | None = None, feature_matrix_path: str, ) -> None: """Initialise the step and run the logic based on mode. @@ -50,7 +50,7 @@ def __init__( variant_index_path (str | None): Path to the variant index dataset colocalisation_path (str | None): Path to the colocalisation dataset study_index_path (str | None): Path to the study index dataset - gene_index_path (str | None): Path to the gene index dataset + target_index_path (str | None): Path to the target index dataset feature_matrix_path (str): Path to the L2G feature matrix output dataset """ credible_set = StudyLocus.from_parquet( @@ -73,9 +73,9 @@ def __init__( if colocalisation_path else None ) - gene_index = ( - GeneIndex.from_parquet(session, gene_index_path, recursiveFileLookup=True) - if gene_index_path + target_index = ( + TargetIndex.from_parquet(session, target_index_path, recursiveFileLookup=True) + if target_index_path else None ) features_input_loader = L2GFeatureInputLoader( @@ -83,7 +83,7 @@ def __init__( colocalisation=coloc, study_index=studies, study_locus=credible_set, - gene_index=gene_index, + target_index=target_index, ) fm = credible_set.filter(f.col("studyType") == "gwas").build_feature_matrix( diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py index 3d2fdd060..1aecbabb4 100644 --- a/src/gentropy/study_validation.py +++ b/src/gentropy/study_validation.py @@ -6,8 +6,8 @@ from gentropy.common.session import Session from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.study_index import StudyIndex +from gentropy.dataset.target_index import TargetIndex class StudyValidationStep: @@ -41,7 +41,7 @@ def __init__( invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyQualityCheck` (e.g. ['DUPLICATED_STUDY']). """ # Reading datasets: - target_index = GeneIndex.from_parquet(session, target_index_path) + target_index = TargetIndex.from_parquet(session, target_index_path) biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path) # Reading disease index and pre-process. # This logic does not belong anywhere, but gentorpy has no disease dataset yet. diff --git a/src/gentropy/target_index.py b/src/gentropy/target_index.py index 0a317d077..b57b0e835 100644 --- a/src/gentropy/target_index.py +++ b/src/gentropy/target_index.py @@ -1,4 +1,4 @@ -"""Step to generate gene index dataset.""" +"""Step to generate target index dataset.""" from __future__ import annotations @@ -6,29 +6,29 @@ from gentropy.datasource.open_targets.target import OpenTargetsTarget -class GeneIndexStep: - """Gene index step. +class TargetIndexStep: + """Target index step. - This step generates a gene index dataset from an Open Targets Platform target dataset. + This step generates a target index dataset from an Open Targets Platform target dataset. """ def __init__( self, session: Session, target_path: str, - gene_index_path: str, + target_index_path: str, ) -> None: """Initialize step. Args: session (Session): Session object. target_path (str): Input Open Targets Platform target dataset path. - gene_index_path (str): Output gene index dataset path. + target_index_path (str): Output target index dataset path. """ platform_target = session.spark.read.parquet(target_path) # Transform - gene_index = OpenTargetsTarget.as_gene_index(platform_target) + target_index = OpenTargetsTarget.as_target_index(platform_target) # Load - gene_index.df.coalesce(session.output_partitions).write.mode( + target_index.df.coalesce(session.output_partitions).write.mode( session.write_mode - ).parquet(gene_index_path) + ).parquet(target_index_path) From 19bb7c98afcfeaa538fe926f93d58cf85d4f1b1d Mon Sep 17 00:00:00 2001 From: vivienho <56025826+vivienho@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:14:53 +0000 Subject: [PATCH 3/3] refactor: rename gene_index to target_index in tests --- tests/gentropy/conftest.py | 12 ++--- tests/gentropy/dataset/test_l2g_feature.py | 54 +++++++++---------- .../dataset/test_l2g_feature_matrix.py | 8 +-- tests/gentropy/dataset/test_study_index.py | 14 ++--- .../dataset/test_summary_statistics.py | 2 +- tests/gentropy/dataset/test_target_index.py | 28 +++++----- tests/gentropy/dataset/test_variant_index.py | 2 +- .../datasource/intervals/test_andersson.py | 6 +-- .../datasource/intervals/test_javierre.py | 6 +-- .../datasource/intervals/test_jung.py | 6 +-- .../datasource/intervals/test_thurman.py | 6 +-- .../open_targets/test_l2g_gold_standard.py | 6 +-- .../datasource/open_targets/test_target.py | 10 ++-- tests/gentropy/test_schemas.py | 22 ++++---- 14 files changed, 92 insertions(+), 90 deletions(-) diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index f19c28623..c4178ba37 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -15,7 +15,6 @@ from gentropy.common.session import Session from gentropy.dataset.biosample_index import BiosampleIndex from gentropy.dataset.colocalisation import Colocalisation -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix from gentropy.dataset.l2g_gold_standard import L2GGoldStandard @@ -25,6 +24,7 @@ from gentropy.dataset.study_locus import StudyLocus from gentropy.dataset.study_locus_overlap import StudyLocusOverlap from gentropy.dataset.summary_statistics import SummaryStatistics +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex @@ -379,7 +379,7 @@ def mock_summary_statistics( @pytest.fixture() def mock_ld_index(spark: SparkSession) -> LDIndex: - """Mock gene index.""" + """Mock ld index.""" ld_schema = LDIndex.get_schema() data_spec = ( @@ -519,9 +519,9 @@ def sample_target_index(spark: SparkSession) -> DataFrame: @pytest.fixture() -def mock_gene_index(spark: SparkSession) -> GeneIndex: - """Mock gene index dataset.""" - gi_schema = GeneIndex.get_schema() +def mock_target_index(spark: SparkSession) -> TargetIndex: + """Mock target index dataset.""" + gi_schema = TargetIndex.get_schema() data_spec = ( dg.DataGenerator( @@ -540,7 +540,7 @@ def mock_gene_index(spark: SparkSession) -> GeneIndex: .withColumnSpec("strand", percentNulls=0.1) ) - return GeneIndex(_df=data_spec.build(), _schema=gi_schema) + return TargetIndex(_df=data_spec.build(), _schema=gi_schema) @pytest.fixture() diff --git a/tests/gentropy/dataset/test_l2g_feature.py b/tests/gentropy/dataset/test_l2g_feature.py index feb8e449a..0ae9fea85 100644 --- a/tests/gentropy/dataset/test_l2g_feature.py +++ b/tests/gentropy/dataset/test_l2g_feature.py @@ -21,7 +21,7 @@ ) from gentropy.dataset.colocalisation import Colocalisation -from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.target_index import TargetIndex from gentropy.dataset.l2g_features.colocalisation import ( EQtlColocClppMaximumFeature, EQtlColocClppMaximumNeighbourhoodFeature, @@ -116,7 +116,7 @@ def test_feature_factory_return_type( mock_colocalisation: Colocalisation, mock_study_index: StudyIndex, mock_variant_index: VariantIndex, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, ) -> None: """Test that every feature factory returns a L2GFeature dataset.""" loader = L2GFeatureInputLoader( @@ -124,7 +124,7 @@ def test_feature_factory_return_type( study_index=mock_study_index, variant_index=mock_variant_index, study_locus=mock_study_locus, - gene_index=mock_gene_index, + target_index=mock_target_index, ) feature_dataset = feature_class.compute( study_loci_to_annotate=mock_study_locus, @@ -136,9 +136,9 @@ def test_feature_factory_return_type( @pytest.fixture(scope="module") -def sample_gene_index(spark: SparkSession) -> GeneIndex: - """Create a sample gene index for testing.""" - return GeneIndex( +def sample_target_index(spark: SparkSession) -> TargetIndex: + """Create a sample target index for testing.""" + return TargetIndex( _df=spark.createDataFrame( [ { @@ -157,9 +157,9 @@ def sample_gene_index(spark: SparkSession) -> GeneIndex: "chromosome": "1", }, ], - GeneIndex.get_schema(), + TargetIndex.get_schema(), ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) @@ -294,7 +294,7 @@ def test__common_colocalisation_feature_logic( def test_extend_missing_colocalisation_to_neighbourhood_genes( self: TestCommonColocalisationFeatureLogic, spark: SparkSession, - sample_gene_index: GeneIndex, + sample_target_index: TargetIndex, sample_variant_index: VariantIndex, ) -> None: """Test the extend_missing_colocalisation_to_neighbourhood_genes function.""" @@ -316,7 +316,7 @@ def test_extend_missing_colocalisation_to_neighbourhood_genes( feature_name="eQtlColocH4Maximum", local_features=local_features, variant_index=sample_variant_index, - gene_index=sample_gene_index, + target_index=sample_target_index, study_locus=self.sample_study_locus, ).select("studyLocusId", "geneId", "eQtlColocH4Maximum") expected_df = spark.createDataFrame( @@ -329,7 +329,7 @@ def test_extend_missing_colocalisation_to_neighbourhood_genes( def test_common_neighbourhood_colocalisation_feature_logic( self: TestCommonColocalisationFeatureLogic, spark: SparkSession, - sample_gene_index: GeneIndex, + sample_target_index: TargetIndex, sample_variant_index: VariantIndex, ) -> None: """Test the common logic of the neighbourhood colocalisation features.""" @@ -343,7 +343,7 @@ def test_common_neighbourhood_colocalisation_feature_logic( colocalisation=self.sample_colocalisation, study_index=self.sample_studies, study_locus=self.sample_study_locus, - gene_index=sample_gene_index, + target_index=sample_target_index, variant_index=sample_variant_index, ).withColumn(feature_name, f.round(f.col(feature_name), 3)) # expected max is 0.81 @@ -561,7 +561,7 @@ def test_common_neighbourhood_distance_feature_logic( common_neighbourhood_distance_feature_logic( self.sample_study_locus, variant_index=self.sample_variant_index, - gene_index=self.sample_gene_index, + target_index=self.sample_target_index, feature_name=feature_name, distance_type=self.distance_type, genomic_window=10, @@ -653,7 +653,7 @@ def _setup( ), _schema=VariantIndex.get_schema(), ) - self.sample_gene_index = GeneIndex( + self.sample_target_index = TargetIndex( _df=spark.createDataFrame( [ { @@ -675,9 +675,9 @@ def _setup( "biotype": "non_coding", }, ], - GeneIndex.get_schema(), + TargetIndex.get_schema(), ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) @@ -760,7 +760,7 @@ def test_common_vep_feature_logic( def test_common_neighbourhood_vep_feature_logic( self: TestCommonVepFeatureLogic, spark: SparkSession, - sample_gene_index: GeneIndex, + sample_target_index: TargetIndex, sample_variant_index: VariantIndex, ) -> None: """Test the logic of the function that extracts the maximum severity score for a gene given the maximum of the maximum scores for all protein coding genes in the vicinity.""" @@ -769,7 +769,7 @@ def test_common_neighbourhood_vep_feature_logic( common_neighbourhood_vep_feature_logic( self.sample_study_locus, variant_index=sample_variant_index, - gene_index=sample_gene_index, + target_index=sample_target_index, feature_name=feature_name, ) .withColumn(feature_name, f.round(f.col(feature_name), 2)) @@ -859,7 +859,7 @@ def test_common_genecount_feature_logic( """Test the common logic of the gene count features.""" observed_df = common_genecount_feature_logic( study_loci_to_annotate=self.sample_study_locus, - gene_index=self.sample_gene_index, + target_index=self.sample_target_index, feature_name=feature_name, genomic_window=500000, protein_coding_only=protein_coding_only, @@ -892,7 +892,7 @@ def _setup(self: TestCommonGeneCountFeatureLogic, spark: SparkSession) -> None: ), _schema=StudyLocus.get_schema(), ) - self.sample_gene_index = GeneIndex( + self.sample_target_index = TargetIndex( _df=spark.createDataFrame( [ { @@ -914,9 +914,9 @@ def _setup(self: TestCommonGeneCountFeatureLogic, spark: SparkSession) -> None: "biotype": "non_coding", }, ], - GeneIndex.get_schema(), + TargetIndex.get_schema(), ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) @@ -944,7 +944,7 @@ def test_is_protein_coding_feature_logic( observed_df = ( is_protein_coding_feature_logic( study_loci_to_annotate=self.sample_study_locus, - gene_index=self.sample_gene_index, + target_index=self.sample_target_index, feature_name="isProteinCoding500kb", genomic_window=500000, ) @@ -981,8 +981,8 @@ def _setup(self: TestCommonProteinCodingFeatureLogic, spark: SparkSession) -> No _schema=StudyLocus.get_schema(), ) - # Sample gene index data with biotype - self.sample_gene_index = GeneIndex( + # Sample target index data with biotype + self.sample_target_index = TargetIndex( _df=spark.createDataFrame( [ { @@ -1004,9 +1004,9 @@ def _setup(self: TestCommonProteinCodingFeatureLogic, spark: SparkSession) -> No "biotype": "non_coding", }, ], - GeneIndex.get_schema(), + TargetIndex.get_schema(), ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py index 6677d123e..8d63bc5ee 100644 --- a/tests/gentropy/dataset/test_l2g_feature_matrix.py +++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py @@ -16,11 +16,11 @@ ) from gentropy.dataset.colocalisation import Colocalisation -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix from gentropy.dataset.l2g_gold_standard import L2GGoldStandard from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.target_index import TargetIndex from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader if TYPE_CHECKING: @@ -54,7 +54,7 @@ def test_study_locus( colocalisation=self.sample_colocalisation, study_index=self.sample_study_index, study_locus=self.sample_study_locus, - gene_index=self.sample_gene_index, + target_index=self.sample_target_index, ) fm = L2GFeatureMatrix.from_features_list( self.sample_study_locus, features_list, loader @@ -170,7 +170,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: ), _schema=Colocalisation.get_schema(), ) - self.sample_gene_index = GeneIndex( + self.sample_target_index = TargetIndex( _df=spark.createDataFrame( [ ("g1", "X", "protein_coding", 200), @@ -183,7 +183,7 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: "tss", ], ), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) diff --git a/tests/gentropy/dataset/test_study_index.py b/tests/gentropy/dataset/test_study_index.py index 05b652752..22391e8ea 100644 --- a/tests/gentropy/dataset/test_study_index.py +++ b/tests/gentropy/dataset/test_study_index.py @@ -7,8 +7,8 @@ from pyspark.sql import functions as f from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.study_index import StudyIndex +from gentropy.dataset.target_index import TargetIndex def test_study_index_creation(mock_study_index: StudyIndex) -> None: @@ -188,9 +188,9 @@ def create_study_index(drop_column: str) -> StudyIndex: self.study_index_no_gene = create_study_index("geneId") self.study_index_no_biosample_id = create_study_index("biosampleFromSourceId") - self.gene_index = GeneIndex( + self.target_index = TargetIndex( _df=spark.createDataFrame(self.GENE_DATA, self.GENE_COLUMNS), - _schema=GeneIndex.get_schema(), + _schema=TargetIndex.get_schema(), ) self.biosample_index = BiosampleIndex( _df=spark.createDataFrame(self.BIOSAMPLE_DATA, self.BIOSAMPLE_COLUMNS), @@ -199,7 +199,7 @@ def create_study_index(drop_column: str) -> StudyIndex: def test_gene_validation_type(self: TestQTLValidation) -> None: """Testing if the target validation runs and returns the expected type.""" - validated = self.study_index.validate_target(self.gene_index) + validated = self.study_index.validate_target(self.target_index) assert isinstance(validated, StudyIndex) def test_biosample_validation_type(self: TestQTLValidation) -> None: @@ -211,7 +211,7 @@ def test_biosample_validation_type(self: TestQTLValidation) -> None: def test_qtl_validation_correctness(self: TestQTLValidation, test: str) -> None: """Testing if the QTL validation only flags the expected studies.""" if test == "gene": - validated = self.study_index.validate_target(self.gene_index).persist() + validated = self.study_index.validate_target(self.target_index).persist() bad_study = "s2" if test == "biosample": validated = self.study_index.validate_biosample( @@ -252,7 +252,7 @@ def test_qtl_validation_drop_relevant_column( """Testing what happens if an expected column is not present.""" if drop == "gene": if test == "gene": - validated = self.study_index_no_gene.validate_target(self.gene_index) + validated = self.study_index_no_gene.validate_target(self.target_index) if test == "biosample": validated = self.study_index_no_gene.validate_biosample( self.biosample_index @@ -260,7 +260,7 @@ def test_qtl_validation_drop_relevant_column( if drop == "biosample": if test == "gene": validated = self.study_index_no_biosample_id.validate_target( - self.gene_index + self.target_index ) if test == "biosample": validated = self.study_index_no_biosample_id.validate_biosample( diff --git a/tests/gentropy/dataset/test_summary_statistics.py b/tests/gentropy/dataset/test_summary_statistics.py index b1b06442b..033fba663 100644 --- a/tests/gentropy/dataset/test_summary_statistics.py +++ b/tests/gentropy/dataset/test_summary_statistics.py @@ -17,7 +17,7 @@ def test_summary_statistics__creation( mock_summary_statistics: SummaryStatistics, ) -> None: - """Test gene index creation with mock gene index.""" + """Test summary statistics creation with mock summary statistics.""" assert isinstance(mock_summary_statistics, SummaryStatistics) diff --git a/tests/gentropy/dataset/test_target_index.py b/tests/gentropy/dataset/test_target_index.py index e4ae8e581..070bf7d8a 100644 --- a/tests/gentropy/dataset/test_target_index.py +++ b/tests/gentropy/dataset/test_target_index.py @@ -4,29 +4,29 @@ from pyspark.sql import DataFrame -from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.target_index import TargetIndex -def test_gene_index_creation(mock_gene_index: GeneIndex) -> None: - """Test gene index creation with mock gene index.""" - assert isinstance(mock_gene_index, GeneIndex) +def test_target_index_creation(mock_target_index: TargetIndex) -> None: + """Test target index creation with mock target index.""" + assert isinstance(mock_target_index, TargetIndex) -def test_gene_index_location_lut(mock_gene_index: GeneIndex) -> None: - """Test gene index location lut.""" - assert isinstance(mock_gene_index.locations_lut(), DataFrame) +def test_target_index_location_lut(mock_target_index: TargetIndex) -> None: + """Test target index location lut.""" + assert isinstance(mock_target_index.locations_lut(), DataFrame) -def test_gene_index_symbols_lut(mock_gene_index: GeneIndex) -> None: - """Test gene index symbols lut.""" - assert isinstance(mock_gene_index.symbols_lut(), DataFrame) +def test_target_index_symbols_lut(mock_target_index: TargetIndex) -> None: + """Test target index symbols lut.""" + assert isinstance(mock_target_index.symbols_lut(), DataFrame) -def test_gene_index_filter_by_biotypes(mock_gene_index: GeneIndex) -> None: - """Test gene index filter by biotypes.""" +def test_target_index_filter_by_biotypes(mock_target_index: TargetIndex) -> None: + """Test target index filter by biotypes.""" assert isinstance( - mock_gene_index.filter_by_biotypes( + mock_target_index.filter_by_biotypes( biotypes=["protein_coding", "3prime_overlapping_ncRNA", "antisense"] ), - GeneIndex, + TargetIndex, ) diff --git a/tests/gentropy/dataset/test_variant_index.py b/tests/gentropy/dataset/test_variant_index.py index 43c409ea6..11f1c966f 100644 --- a/tests/gentropy/dataset/test_variant_index.py +++ b/tests/gentropy/dataset/test_variant_index.py @@ -15,7 +15,7 @@ def test_variant_index_creation(mock_variant_index: VariantIndex) -> None: - """Test gene index creation with mock gene index.""" + """Test variant index creation with mock variant index.""" assert isinstance(mock_variant_index, VariantIndex) diff --git a/tests/gentropy/datasource/intervals/test_andersson.py b/tests/gentropy/datasource/intervals/test_andersson.py index 69575b7c3..1820c8322 100644 --- a/tests/gentropy/datasource/intervals/test_andersson.py +++ b/tests/gentropy/datasource/intervals/test_andersson.py @@ -6,8 +6,8 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals +from gentropy.dataset.target_index import TargetIndex from gentropy.datasource.intervals.andersson import IntervalsAndersson @@ -26,13 +26,13 @@ def test_read_andersson(sample_intervals_andersson: DataFrame) -> None: def test_andersson_intervals_from_source( sample_intervals_andersson: DataFrame, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, liftover_chain_37_to_38: LiftOverSpark, ) -> None: """Test AnderssonIntervals creation with mock data.""" assert isinstance( IntervalsAndersson.parse( - sample_intervals_andersson, mock_gene_index, liftover_chain_37_to_38 + sample_intervals_andersson, mock_target_index, liftover_chain_37_to_38 ), Intervals, ) diff --git a/tests/gentropy/datasource/intervals/test_javierre.py b/tests/gentropy/datasource/intervals/test_javierre.py index 886a28c52..4fdd9db7a 100644 --- a/tests/gentropy/datasource/intervals/test_javierre.py +++ b/tests/gentropy/datasource/intervals/test_javierre.py @@ -6,8 +6,8 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals +from gentropy.dataset.target_index import TargetIndex from gentropy.datasource.intervals.javierre import IntervalsJavierre @@ -26,13 +26,13 @@ def test_read_javierre(sample_intervals_javierre: DataFrame) -> None: def test_javierre_intervals_from_source( sample_intervals_javierre: DataFrame, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, liftover_chain_37_to_38: LiftOverSpark, ) -> None: """Test JavierreIntervals creation with mock data.""" assert isinstance( IntervalsJavierre.parse( - sample_intervals_javierre, mock_gene_index, liftover_chain_37_to_38 + sample_intervals_javierre, mock_target_index, liftover_chain_37_to_38 ), Intervals, ) diff --git a/tests/gentropy/datasource/intervals/test_jung.py b/tests/gentropy/datasource/intervals/test_jung.py index e391b8f96..bac6918b8 100644 --- a/tests/gentropy/datasource/intervals/test_jung.py +++ b/tests/gentropy/datasource/intervals/test_jung.py @@ -6,8 +6,8 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals +from gentropy.dataset.target_index import TargetIndex from gentropy.datasource.intervals.jung import IntervalsJung @@ -24,13 +24,13 @@ def test_read_jung(sample_intervals_jung: DataFrame) -> None: def test_jung_intervals_from_source( sample_intervals_jung: DataFrame, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, liftover_chain_37_to_38: LiftOverSpark, ) -> None: """Test JungIntervals creation with mock data.""" assert isinstance( IntervalsJung.parse( - sample_intervals_jung, mock_gene_index, liftover_chain_37_to_38 + sample_intervals_jung, mock_target_index, liftover_chain_37_to_38 ), Intervals, ) diff --git a/tests/gentropy/datasource/intervals/test_thurman.py b/tests/gentropy/datasource/intervals/test_thurman.py index 616e1abec..a6f4074b0 100644 --- a/tests/gentropy/datasource/intervals/test_thurman.py +++ b/tests/gentropy/datasource/intervals/test_thurman.py @@ -6,8 +6,8 @@ from pyspark.sql import DataFrame, SparkSession from gentropy.common.Liftover import LiftOverSpark -from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals +from gentropy.dataset.target_index import TargetIndex from gentropy.datasource.intervals.thurman import IntervalsThurman @@ -26,13 +26,13 @@ def test_read_thurman(sample_intervals_thurman: DataFrame) -> None: def test_thurman_intervals_from_source( sample_intervals_thurman: DataFrame, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, liftover_chain_37_to_38: LiftOverSpark, ) -> None: """Test IntervalsThurman creation with mock data.""" assert isinstance( IntervalsThurman.parse( - sample_intervals_thurman, mock_gene_index, liftover_chain_37_to_38 + sample_intervals_thurman, mock_target_index, liftover_chain_37_to_38 ), Intervals, ) diff --git a/tests/gentropy/datasource/open_targets/test_l2g_gold_standard.py b/tests/gentropy/datasource/open_targets/test_l2g_gold_standard.py index 79f9d925a..3c8ff1aed 100644 --- a/tests/gentropy/datasource/open_targets/test_l2g_gold_standard.py +++ b/tests/gentropy/datasource/open_targets/test_l2g_gold_standard.py @@ -29,8 +29,8 @@ from pyspark.sql.session import SparkSession from gentropy.dataset.colocalisation import Colocalisation - from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.study_locus import StudyLocus + from gentropy.dataset.target_index import TargetIndex def test_open_targets_as_l2g_gold_standard( @@ -162,7 +162,7 @@ def test_build_feature_matrix( mock_study_locus: StudyLocus, mock_colocalisation: Colocalisation, mock_study_index: StudyIndex, - mock_gene_index: GeneIndex, + mock_target_index: TargetIndex, ) -> None: """Test building feature matrix with the eQtlColocH4Maximum feature.""" features_list = ["eQtlColocH4Maximum", "isProteinCoding"] @@ -170,7 +170,7 @@ def test_build_feature_matrix( colocalisation=mock_colocalisation, study_index=mock_study_index, study_locus=mock_study_locus, - gene_index=mock_gene_index, + target_index=mock_target_index, ) fm = mock_study_locus.build_feature_matrix(features_list, loader) assert isinstance( diff --git a/tests/gentropy/datasource/open_targets/test_target.py b/tests/gentropy/datasource/open_targets/test_target.py index 091dcea53..b32886a4b 100644 --- a/tests/gentropy/datasource/open_targets/test_target.py +++ b/tests/gentropy/datasource/open_targets/test_target.py @@ -4,10 +4,12 @@ from pyspark.sql import DataFrame -from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.target_index import TargetIndex from gentropy.datasource.open_targets.target import OpenTargetsTarget -def test_open_targets_as_gene_index(sample_target_index: DataFrame) -> None: - """Test gene index from source.""" - assert isinstance(OpenTargetsTarget.as_gene_index(sample_target_index), GeneIndex) +def test_open_targets_as_target_index(sample_target_index: DataFrame) -> None: + """Test target index from source.""" + assert isinstance( + OpenTargetsTarget.as_target_index(sample_target_index), TargetIndex + ) diff --git a/tests/gentropy/test_schemas.py b/tests/gentropy/test_schemas.py index 1b06076d0..500fbcd69 100644 --- a/tests/gentropy/test_schemas.py +++ b/tests/gentropy/test_schemas.py @@ -17,8 +17,8 @@ if TYPE_CHECKING: from _pytest.fixtures import FixtureRequest - from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.l2g_prediction import L2GPrediction + from gentropy.dataset.target_index import TargetIndex SCHEMA_DIR = "src/gentropy/assets/schemas" @@ -75,23 +75,23 @@ def test_schema_columns_camelcase(schema_json: str) -> None: class TestValidateSchema: - """Test validate_schema method using L2GPrediction (unnested) and GeneIndex (nested) as a testing dataset.""" + """Test validate_schema method using L2GPrediction (unnested) and TargetIndex (nested) as a testing dataset.""" @pytest.fixture() def mock_dataset_instance( self: TestValidateSchema, request: FixtureRequest - ) -> L2GPrediction | GeneIndex: + ) -> L2GPrediction | TargetIndex: """Meta fixture to return the value of any requested fixture.""" return request.getfixturevalue(request.param) @pytest.mark.parametrize( "mock_dataset_instance", - ["mock_l2g_predictions", "mock_gene_index"], + ["mock_l2g_predictions", "mock_target_index"], indirect=True, ) def test_validate_schema_extra_field( self: TestValidateSchema, - mock_dataset_instance: L2GPrediction | GeneIndex, + mock_dataset_instance: L2GPrediction | TargetIndex, ) -> None: """Test that validate_schema raises an error if the observed schema has an extra field.""" with pytest.raises(SchemaValidationError, match="extraField"): @@ -101,12 +101,12 @@ def test_validate_schema_extra_field( @pytest.mark.parametrize( "mock_dataset_instance", - ["mock_l2g_predictions", "mock_gene_index"], + ["mock_l2g_predictions", "mock_target_index"], indirect=True, ) def test_validate_schema_missing_field( self: TestValidateSchema, - mock_dataset_instance: L2GPrediction | GeneIndex, + mock_dataset_instance: L2GPrediction | TargetIndex, ) -> None: """Test that validate_schema raises an error if the observed schema is missing a required field, geneId in this case.""" with pytest.raises(SchemaValidationError, match="geneId"): @@ -114,12 +114,12 @@ def test_validate_schema_missing_field( @pytest.mark.parametrize( "mock_dataset_instance", - ["mock_l2g_predictions", "mock_gene_index"], + ["mock_l2g_predictions", "mock_target_index"], indirect=True, ) def test_validate_schema_duplicated_field( self: TestValidateSchema, - mock_dataset_instance: L2GPrediction | GeneIndex, + mock_dataset_instance: L2GPrediction | TargetIndex, ) -> None: """Test that validate_schema raises an error if the observed schema has a duplicated field, geneId in this case.""" with pytest.raises(SchemaValidationError, match="geneId"): @@ -129,12 +129,12 @@ def test_validate_schema_duplicated_field( @pytest.mark.parametrize( "mock_dataset_instance", - ["mock_l2g_predictions", "mock_gene_index"], + ["mock_l2g_predictions", "mock_target_index"], indirect=True, ) def test_validate_schema_different_datatype( self: TestValidateSchema, - mock_dataset_instance: L2GPrediction | GeneIndex, + mock_dataset_instance: L2GPrediction | TargetIndex, ) -> None: """Test that validate_schema raises an error if any field in the observed schema has a different type than expected.""" with pytest.raises(SchemaValidationError, match="geneId"):