Skip to content

Commit

Permalink
Revamped config
Browse files Browse the repository at this point in the history
  • Loading branch information
javfg committed Jan 10, 2025
1 parent 0b5ffb0 commit ea79c6d
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 132 deletions.
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "ontoform"
version = "25.0.0-rc.3"
version = "25.0.0-rc.4"
description = "Open Targets pipeline converter tool"
readme = "README.md"
requires-python = ">=3.12"
Expand All @@ -21,6 +21,10 @@ ontoform = "ontoform.main:main"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.coverage.report]
omit = ["tests/**"]
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]

[tool.pytest.ini_options]
testpaths = ["tests/pis"]

Expand Down
2 changes: 1 addition & 1 deletion src/ontoform/steps/disease.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
name='disease',
transformations=[
FileTransformation(
src_path='input/ontology-inputs/efo_otar_slim.json',
src_path='input/disease/efo_otar_slim.json',
dst_path=lambda _, f: f'output/disease/disease.{stem(f)}',
transformer=DiseaseTransformer,
),
Expand Down
8 changes: 4 additions & 4 deletions src/ontoform/steps/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
name='expression',
transformations=[
FileTransformation(
src_path='input/expression-inputs/normal_tissue.tsv.zip',
dst_path='input/expression-inputs/normal_tissue.tsv.gz',
src_path='input/expression/normal_tissue.tsv.zip',
dst_path='intermediate/expression/normal_tissue.tsv.gz',
transformer=NormalTissueTransformer,
),
FileTransformation(
src_path='input/expression-inputs/map_with_efos.json',
dst_path=lambda _, f: f'input/expression-inputs/tissue-translation-map.{stem(f)}',
src_path='input/expression/map_with_efos.json',
dst_path=lambda _, f: f'intermediate/expression/tissue-translation-map.{stem(f)}',
transformer=TissueTransformer,
),
],
Expand Down
17 changes: 0 additions & 17 deletions src/ontoform/steps/homologue.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/ontoform/steps/openfda.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
name='openfda',
transformations=[
GlobTransformation(
src_prefix='input/fda-inputs',
dst_path=lambda _, f: f'input/fda-inputs/{uuid4()}.{stem(f)}',
src_prefix='input/openfda',
dst_path=lambda _, f: f'intermediate/openfda/{uuid4()}.{stem(f)}',
glob='**/*.zip',
transformer=OpenFdaTransformer,
),
Expand Down
35 changes: 22 additions & 13 deletions src/ontoform/steps/target.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,54 @@
from ontoform.file_format import stem
from ontoform.model import FileTransformation, Step
from ontoform.model import FileTransformation, GlobTransformation, Step
from ontoform.transformers.target import (
EnsemblTransformer,
EssentialityMatricesTransformer,
GeneIdentifiersTransformer,
GnomadTransformer,
HomologueTransformer,
SubcellularLocationSSLTransformer,
SubcellularLocationTransformer,
)

target = Step(
name='expression',
transformations=[
GlobTransformation(
src_prefix='input/target/homologue/gene_dictionary',
dst_path=lambda p, f: f'intermediate/target/homologue/gene_dictionary/{p.rsplit("/")[-1]}'.replace(
'.json', f'.{stem(f)}'
),
glob='**/*.json',
transformer=HomologueTransformer,
),
FileTransformation(
src_path='input/target-inputs/hpa/subcellular_location.tsv.zip',
dst_path='input/target-inputs/hpa/subcellular_location.tsv.gz',
src_path='input/target/hpa/subcellular_location.tsv.zip',
dst_path='intermediate/target/hpa/subcellular_location.tsv.gz',
transformer=SubcellularLocationTransformer,
),
FileTransformation(
src_path='input/target-inputs/hpa/subcellular_locations_ssl.tsv',
dst_path=lambda _, f: f'input/target-inputs/hpa/subcellular_locations_ssl.{stem(f)}',
src_path='input/target/hpa/subcellular_locations_ssl.tsv',
dst_path=lambda _, f: f'intermediate/target/hpa/subcellular_locations_ssl.{stem(f)}',
transformer=SubcellularLocationSSLTransformer,
),
FileTransformation(
src_path='input/target-inputs/project-scores/essentiality_matrices.zip',
dst_path=lambda _, f: f'input/target-inputs/project-scores/04_binaryDepScores.{stem(f)}',
src_path='input/target/project-scores/essentiality_matrices.zip',
dst_path=lambda _, f: f'intermediate/target/project-scores/04_binaryDepScores.{stem(f)}',
transformer=EssentialityMatricesTransformer,
),
FileTransformation(
src_path='input/target-inputs/project-scores/gene_identifiers_latest.csv.gz',
dst_path=lambda _, f: f'input/target-inputs/project-scores/gene_identifiers_latest.{stem(f)}',
src_path='input/target/project-scores/gene_identifiers_latest.csv.gz',
dst_path=lambda _, f: f'intermediate/target/project-scores/gene_identifiers_latest.{stem(f)}',
transformer=GeneIdentifiersTransformer,
),
FileTransformation(
src_path='input/target-inputs/ensembl/homo_sapiens.json',
dst_path=lambda _, f: f'input/target-inputs/ensembl/homo_sapiens.{stem(f)}',
src_path='input/target/ensembl/homo_sapiens.json',
dst_path=lambda _, f: f'intermediate/target/ensembl/homo_sapiens.{stem(f)}',
transformer=EnsemblTransformer,
),
FileTransformation(
src_path='input/target-inputs/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz',
dst_path='input/target-inputs/gnomad/gnomad_lof_by_gene.txt.gz',
src_path='input/target/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz',
dst_path='intermediate/target/gnomad/gnomad_lof_by_gene.txt.gz',
transformer=GnomadTransformer,
),
],
Expand Down
42 changes: 0 additions & 42 deletions src/ontoform/transformers/homologue.py

This file was deleted.

67 changes: 51 additions & 16 deletions src/ontoform/transformers/target.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import gc
import gzip
import json
import zipfile
from typing import BinaryIO

Expand All @@ -9,6 +11,29 @@
from ontoform.schemas.ensembl import schema


class FilteredJSONDecoder(json.JSONDecoder):
    """JSON decoder that filters keys out of every decoded JSON object.

    An ``object_hook`` is invoked on each JSON object as it is decoded: any
    key not in ``allowed_keys`` is dropped, and an object containing
    ``root_key`` is collapsed to just that entry so we can get to the data.
    This saves a lot of memory and time, and serves as a workaround for the
    bug in polars that causes it to crash when loading large JSON objects:
    https://github.com/pola-rs/polars/issues/17677
    """

    def __init__(self, root_key='genes', allowed_keys=None, *args, **kwargs):
        # Key whose value holds the payload at the document root.
        self.root_key = root_key
        # `is None` (not truthiness) so an explicit empty set is honored and
        # filters out every key, instead of silently reverting to the defaults.
        self.allowed_keys = {'id', 'name'} if allowed_keys is None else allowed_keys
        super().__init__(*args, **kwargs, object_hook=self.filter_keys)

    def filter_keys(self, obj: dict) -> dict:
        """Hook applied bottom-up to each decoded object.

        Collapses the root object to its ``root_key`` entry; otherwise keeps
        only the allowed keys.
        """
        if self.root_key in obj:
            return {self.root_key: obj[self.root_key]}
        return {k: v for k, v in obj.items() if k in self.allowed_keys}


class SubcellularLocationTransformer:
def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) -> None:
logger.info(f'transforming to gzip, ignoring format argument {output_format.name}')
Expand Down Expand Up @@ -46,22 +71,20 @@ def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) ->
.drop('id')
.explode('genes')
.unnest('genes')
.select(
[
'id',
'biotype',
'description',
'end',
'start',
'strand',
pl.col('seq_region_name').alias('chromosome'),
pl.col('name').alias('approvedSymbol'),
'transcripts',
'SignalP',
pl.col('Uniprot/SPTREMBL').alias('uniprot_trembl'),
pl.col('Uniprot/SWISSPROT').alias('uniprot_swissprot'),
]
)
.select([
'id',
'biotype',
'description',
'end',
'start',
'strand',
pl.col('seq_region_name').alias('chromosome'),
pl.col('name').alias('approvedSymbol'),
'transcripts',
'SignalP',
pl.col('Uniprot/SPTREMBL').alias('uniprot_trembl'),
pl.col('Uniprot/SWISSPROT').alias('uniprot_swissprot'),
])
)

logger.debug(f'transformation complete, writing file to {dst.name}')
Expand All @@ -74,3 +97,15 @@ def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) ->
with gzip.open(src) as file:
with gzip.open(dst, 'wb') as gzip_file:
gzip_file.write(file.read())


class HomologueTransformer:
    """Transforms a homologue gene-dictionary JSON file into the output format."""

    def transform(self, src: BinaryIO, dst: BinaryIO, output_format: FileFormat) -> None:
        # Decode with FilteredJSONDecoder so unwanted keys never survive
        # parsing; the raw payload can be very large.
        parsed = json.loads(src.read(), cls=FilteredJSONDecoder)
        frame = pl.from_dicts(parsed['genes'])

        # Drop the parsed dict eagerly to keep peak memory down before writing.
        del parsed
        gc.collect()

        write_format(frame, dst, output_format)
Loading

0 comments on commit ea79c6d

Please sign in to comment.