From aacb00aee0b1e5982cf9d3195471992f470e6eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 19 Oct 2023 11:46:27 +0100 Subject: [PATCH 1/3] fix(gene_index): typo in gene_index output dataset --- config/datasets/gcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/datasets/gcp.yaml b/config/datasets/gcp.yaml index 9b7af54a0..578e76b14 100644 --- a/config/datasets/gcp.yaml +++ b/config/datasets/gcp.yaml @@ -27,7 +27,7 @@ ld_index_raw_template: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.geno ld_matrix_template: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm # Output datasets -gene_index: ${datasets.outputs}/gene_index/gene_index +gene_index: ${datasets.outputs}/gene_index variant_annotation: ${datasets.outputs}/variant_annotation variant_index: ${datasets.outputs}/variant_index study_locus: ${datasets.outputs}/study_locus From 68a280b484d465601238ab5dc4381e80e990b4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 19 Oct 2023 12:36:38 +0100 Subject: [PATCH 2/3] fix(gene_index): add `approvedName`, `approvedSymbol`, `biotype` to `as_gene_index` --- src/otg/datasource/open_targets/target.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/otg/datasource/open_targets/target.py b/src/otg/datasource/open_targets/target.py index 3d334d34d..64172a99c 100644 --- a/src/otg/datasource/open_targets/target.py +++ b/src/otg/datasource/open_targets/target.py @@ -49,6 +49,9 @@ def as_gene_index(cls: type[GeneIndex], target_index: DataFrame) -> GeneIndex: return GeneIndex( _df=target_index.select( f.coalesce(f.col("id"), f.lit("unknown")).alias("geneId"), + "approvedSymbol", + "approvedName", + "biotype", f.coalesce(f.col("genomicLocation.chromosome"), f.lit("unknown")).alias( "chromosome" ), From ec237667faf6d307b8d95693ff85f5c3ffdd999e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 19 Oct 2023 12:40:47 +0100 Subject: 
[PATCH 3/3] fix(gene_index): add `obsoleteSymbols` and simplify schema --- src/otg/assets/schemas/gene_index.json | 28 +++++------------------ src/otg/dataset/gene_index.py | 2 +- src/otg/datasource/open_targets/target.py | 1 + 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/otg/assets/schemas/gene_index.json b/src/otg/assets/schemas/gene_index.json index cd901a995..c8139c20d 100644 --- a/src/otg/assets/schemas/gene_index.json +++ b/src/otg/assets/schemas/gene_index.json @@ -32,30 +32,14 @@ "metadata": {} }, { + "metadata": {}, "name": "obsoleteSymbols", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "label", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "source", - "type": "string", - "nullable": true, - "metadata": {} - } - ] - }, - "containsNull": true - }, "nullable": true, - "metadata": {} + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } }, { "name": "tss", diff --git a/src/otg/dataset/gene_index.py b/src/otg/dataset/gene_index.py index 3b67560da..979ef959e 100644 --- a/src/otg/dataset/gene_index.py +++ b/src/otg/dataset/gene_index.py @@ -64,7 +64,7 @@ def symbols_lut(self: GeneIndex) -> DataFrame: """ return self.df.select( f.explode( - f.array_union(f.array("approvedSymbol"), f.col("obsoleteSymbols.label")) + f.array_union(f.array("approvedSymbol"), f.col("obsoleteSymbols")) ).alias("geneSymbol"), "*", ) diff --git a/src/otg/datasource/open_targets/target.py b/src/otg/datasource/open_targets/target.py index 64172a99c..78096c6e3 100644 --- a/src/otg/datasource/open_targets/target.py +++ b/src/otg/datasource/open_targets/target.py @@ -52,6 +52,7 @@ def as_gene_index(cls: type[GeneIndex], target_index: DataFrame) -> GeneIndex: "approvedSymbol", "approvedName", "biotype", + f.col("obsoleteSymbols.label").alias("obsoleteSymbols"), f.coalesce(f.col("genomicLocation.chromosome"), f.lit("unknown")).alias( "chromosome" ),