Skip to content

Commit

Permalink
Revamped config
Browse files Browse the repository at this point in the history
  • Loading branch information
javfg committed Jan 10, 2025
1 parent 0b5ffb0 commit ea79c6d
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 132 deletions.
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "ontoform"
version = "25.0.0-rc.3"
version = "25.0.0-rc.4"
description = "Open Targets pipeline converter tool"
readme = "README.md"
requires-python = ">=3.12"
Expand All @@ -21,6 +21,10 @@ ontoform = "ontoform.main:main"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.coverage.report]
omit = ["tests/**"]
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]

[tool.pytest.ini_options]
testpaths = ["tests/pis"]

Expand Down
2 changes: 1 addition & 1 deletion src/ontoform/steps/disease.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
name='disease',
transformations=[
FileTransformation(
src_path='input/ontology-inputs/efo_otar_slim.json',
src_path='input/disease/efo_otar_slim.json',
dst_path=lambda _, f: f'output/disease/disease.{stem(f)}',
transformer=DiseaseTransformer,
),
Expand Down
8 changes: 4 additions & 4 deletions src/ontoform/steps/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
name='expression',
transformations=[
FileTransformation(
src_path='input/expression-inputs/normal_tissue.tsv.zip',
dst_path='input/expression-inputs/normal_tissue.tsv.gz',
src_path='input/expression/normal_tissue.tsv.zip',
dst_path='intermediate/expression/normal_tissue.tsv.gz',
transformer=NormalTissueTransformer,
),
FileTransformation(
src_path='input/expression-inputs/map_with_efos.json',
dst_path=lambda _, f: f'input/expression-inputs/tissue-translation-map.{stem(f)}',
src_path='input/expression/map_with_efos.json',
dst_path=lambda _, f: f'intermediate/expression/tissue-translation-map.{stem(f)}',
transformer=TissueTransformer,
),
],
Expand Down
17 changes: 0 additions & 17 deletions src/ontoform/steps/homologue.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/ontoform/steps/openfda.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
name='openfda',
transformations=[
GlobTransformation(
src_prefix='input/fda-inputs',
dst_path=lambda _, f: f'input/fda-inputs/{uuid4()}.{stem(f)}',
src_prefix='input/openfda',
dst_path=lambda _, f: f'intermediate/openfda/{uuid4()}.{stem(f)}',
glob='**/*.zip',
transformer=OpenFdaTransformer,
),
Expand Down
35 changes: 22 additions & 13 deletions src/ontoform/steps/target.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,54 @@
from ontoform.file_format import stem
from ontoform.model import FileTransformation, Step
from ontoform.model import FileTransformation, GlobTransformation, Step
from ontoform.transformers.target import (
EnsemblTransformer,
EssentialityMatricesTransformer,
GeneIdentifiersTransformer,
GnomadTransformer,
HomologueTransformer,
SubcellularLocationSSLTransformer,
SubcellularLocationTransformer,
)

target = Step(
name='expression',
transformations=[
GlobTransformation(
src_prefix='input/target/homologue/gene_dictionary',
dst_path=lambda p, f: f'intermediate/target/homologue/gene_dictionary/{p.rsplit("/")[-1]}'.replace(
'.json', f'.{stem(f)}'
),
glob='**/*.json',
transformer=HomologueTransformer,
),
FileTransformation(
src_path='input/target-inputs/hpa/subcellular_location.tsv.zip',
dst_path='input/target-inputs/hpa/subcellular_location.tsv.gz',
src_path='input/target/hpa/subcellular_location.tsv.zip',
dst_path='intermediate/target/hpa/subcellular_location.tsv.gz',
transformer=SubcellularLocationTransformer,
),
FileTransformation(
src_path='input/target-inputs/hpa/subcellular_locations_ssl.tsv',
dst_path=lambda _, f: f'input/target-inputs/hpa/subcellular_locations_ssl.{stem(f)}',
src_path='input/target/hpa/subcellular_locations_ssl.tsv',
dst_path=lambda _, f: f'intermediate/target/hpa/subcellular_locations_ssl.{stem(f)}',
transformer=SubcellularLocationSSLTransformer,
),
FileTransformation(
src_path='input/target-inputs/project-scores/essentiality_matrices.zip',
dst_path=lambda _, f: f'input/target-inputs/project-scores/04_binaryDepScores.{stem(f)}',
src_path='input/target/project-scores/essentiality_matrices.zip',
dst_path=lambda _, f: f'intermediate/target/project-scores/04_binaryDepScores.{stem(f)}',
transformer=EssentialityMatricesTransformer,
),
FileTransformation(
src_path='input/target-inputs/project-scores/gene_identifiers_latest.csv.gz',
dst_path=lambda _, f: f'input/target-inputs/project-scores/gene_identifiers_latest.{stem(f)}',
src_path='input/target/project-scores/gene_identifiers_latest.csv.gz',
dst_path=lambda _, f: f'intermediate/target/project-scores/gene_identifiers_latest.{stem(f)}',
transformer=GeneIdentifiersTransformer,
),
FileTransformation(
src_path='input/target-inputs/ensembl/homo_sapiens.json',
dst_path=lambda _, f: f'input/target-inputs/ensembl/homo_sapiens.{stem(f)}',
src_path='input/target/ensembl/homo_sapiens.json',
dst_path=lambda _, f: f'intermediate/target/ensembl/homo_sapiens.{stem(f)}',
transformer=EnsemblTransformer,
),
FileTransformation(
src_path='input/target-inputs/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz',
dst_path='input/target-inputs/gnomad/gnomad_lof_by_gene.txt.gz',
src_path='input/target/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz',
dst_path='intermediate/target/gnomad/gnomad_lof_by_gene.txt.gz',
transformer=GnomadTransformer,
),
],
Expand Down
42 changes: 0 additions & 42 deletions src/ontoform/transformers/homologue.py

This file was deleted.

67 changes: 51 additions & 16 deletions src/ontoform/transformers/target.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import gc
import gzip
import json
import zipfile
from typing import BinaryIO

Expand All @@ -9,6 +11,29 @@
from ontoform.schemas.ensembl import schema


class FilteredJSONDecoder(json.JSONDecoder):
    """JSON decoder that filters keys out of every decoded JSON object.

    An ``object_hook`` is invoked on each JSON object as it is decoded: any
    key not in ``allowed_keys`` is dropped, and an object containing
    ``root_key`` is collapsed to just that entry so we can get to the data.
    This saves a lot of memory and time, and serves as a workaround for the
    bug in polars that causes it to crash when loading large JSON objects:
    https://github.com/pola-rs/polars/issues/17677
    """

    def __init__(self, root_key='genes', allowed_keys=None, *args, **kwargs):
        # Key whose value holds the payload at the document root.
        self.root_key = root_key
        # `is None` (not truthiness) so an explicit empty set is honored and
        # filters out every key, instead of silently reverting to the defaults.
        self.allowed_keys = {'id', 'name'} if allowed_keys is None else allowed_keys
        super().__init__(*args, **kwargs, object_hook=self.filter_keys)

    def filter_keys(self, obj: dict) -> dict:
        """Hook applied bottom-up to each decoded object.

        Collapses the root object to its ``root_key`` entry; otherwise keeps
        only the allowed keys.
        """
        if self.root_key in obj:
            return {self.root_key: obj[self.root_key]}
        return {k: v for k, v in obj.items() if k in self.allowed_keys}


class SubcellularLocationTransformer:
def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) -> None:
logger.info(f'transforming to gzip, ignoring format argument {output_format.name}')
Expand Down Expand Up @@ -46,22 +71,20 @@ def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) ->
.drop('id')
.explode('genes')
.unnest('genes')
.select(
[
'id',
'biotype',
'description',
'end',
'start',
'strand',
pl.col('seq_region_name').alias('chromosome'),
pl.col('name').alias('approvedSymbol'),
'transcripts',
'SignalP',
pl.col('Uniprot/SPTREMBL').alias('uniprot_trembl'),
pl.col('Uniprot/SWISSPROT').alias('uniprot_swissprot'),
]
)
.select([
'id',
'biotype',
'description',
'end',
'start',
'strand',
pl.col('seq_region_name').alias('chromosome'),
pl.col('name').alias('approvedSymbol'),
'transcripts',
'SignalP',
pl.col('Uniprot/SPTREMBL').alias('uniprot_trembl'),
pl.col('Uniprot/SWISSPROT').alias('uniprot_swissprot'),
])
)

logger.debug(f'transformation complete, writing file to {dst.name}')
Expand All @@ -74,3 +97,15 @@ def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) ->
with gzip.open(src) as file:
with gzip.open(dst, 'wb') as gzip_file:
gzip_file.write(file.read())


class HomologueTransformer:
    """Transforms a homologue gene-dictionary JSON file into the output format."""

    def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) -> None:
        # Decode with FilteredJSONDecoder so unwanted keys never survive
        # parsing; the raw payload can be very large.
        parsed = json.loads(src.read(), cls=FilteredJSONDecoder)
        frame = pl.from_dicts(parsed['genes'])

        # Drop the parsed dict eagerly to keep peak memory down before writing.
        del parsed
        gc.collect()

        write_format(frame, dst, output_format)
Loading

0 comments on commit ea79c6d

Please sign in to comment.