Skip to content

Commit

Permalink
Fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
javfg committed Jan 15, 2025
1 parent 26ae92d commit 1d13af1
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 250 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ htmlcov

# input dir can be used for tests
input
work
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "ontoform"
version = "25.0.0-rc.4"
version = "25.0.0-rc.5"
description = "Open Targets pipeline converter tool"
readme = "README.md"
requires-python = ">=3.12"
Expand Down
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions tests/test_disease.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ work_dir=/tmp
# set up stuff
set -x
set -e
mkdir -p $work_dir/input/ontology-inputs
mkdir -p $work_dir/input/disease

# get efo
# DiseaseTransformer —
# input/ontology-inputs/efo_otar_slim.json -> output/disease/disease.parquet
curl -Ls https://github.com/EBISPOT/efo/releases/download/v3.70.0/efo_otar_slim.json > $work_dir/input/ontology-inputs/efo_otar_slim.json
# input/disease/efo_otar_slim.json -> output/disease/disease.parquet
curl -Ls https://github.com/EBISPOT/efo/releases/download/v3.70.0/efo_otar_slim.json > $work_dir/input/disease/efo_otar_slim.json
gsutil cp gs://open-targets-pre-data-releases/24.09dev/output/oldetl_diseases.jsonl $work_dir/oldetl-disease.jsonl

# run ontoform
Expand Down
14 changes: 7 additions & 7 deletions tests/test_expression.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,28 @@ work_dir=/tmp
# set up stuff
set -x
set -e
mkdir -p $work_dir/input/expression-inputs
mkdir -p $work_dir/input/expression

# get expression files
# NormalTissueTransformer —
# input/expression-inputs/normal_tissue.tsv.zip -> input/expression-inputs-transformed/normal_tissue.tsv.gz
curl -Ls https://www.proteinatlas.org/download/tsv/normal_tissue.tsv.zip > $work_dir/input/expression-inputs/normal_tissue.tsv.zip
# input/expression/normal_tissue.tsv.zip -> intermediate/expression/normal_tissue.tsv.gz
curl -Ls https://www.proteinatlas.org/download/tsv/normal_tissue.tsv.zip > $work_dir/input/expression/normal_tissue.tsv.zip

# TissueTransformer —
# input/expression-inputs/map_with_efos.json -> input/expression-inputs-transformed/tissue-translation-map.parquet
curl -Ls https://raw.githubusercontent.com/opentargets/expression_hierarchy/master/process/map_with_efos.json > $work_dir/input/expression-inputs/map_with_efos.json
# input/expression/map_with_efos.json -> intermediate/expression/tissue-translation-map.parquet
curl -Ls https://raw.githubusercontent.com/opentargets/expression_hierarchy/master/process/map_with_efos.json > $work_dir/input/expression/map_with_efos.json
gsutil cp gs://open-targets-pre-data-releases/24.09dev/input/expression-inputs/tissue-translation-map.json $work_dir/oldpis-tissue-translation-map.json

# run ontoform
uv run ontoform --work-dir /tmp expression --output-format ndjson

# we cannot compare with the old pis output because we do not know of any version
# control in protein atlas, so we can't get the same original file
diff <(unzip -p $work_dir/input/expression-inputs/normal_tissue.tsv.zip normal_tissue.tsv) <(gzip -d $work_dir/input/expression-inputs/normal_tissue.tsv.gz -c)
diff <(unzip -p $work_dir/input/expression/normal_tissue.tsv.zip normal_tissue.tsv) <(gzip -d $work_dir/intermediate/expression/normal_tissue.tsv.gz -c)

# sort rows
sort < $work_dir/oldpis-tissue-translation-map.json > /tmp/oldpis-tissue-translation-map-sort.jsonl
sort < $work_dir/input/expression-inputs/tissue-translation-map.jsonl > /tmp/ontoform-tissue-translation-map-sort.jsonl
sort < $work_dir/intermediate/expression/tissue-translation-map.jsonl > /tmp/ontoform-tissue-translation-map-sort.jsonl

# remove whitespace
jq -c . /tmp/oldpis-tissue-translation-map-sort.jsonl > /tmp/oldpis-tissue-translation-map-sort_nowhite.jsonl
Expand Down
42 changes: 0 additions & 42 deletions tests/test_homologue.sh

This file was deleted.

10 changes: 5 additions & 5 deletions tests/test_so.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,27 @@ work_dir=/tmp
# set up stuff
set -x
set -e
mkdir -p $work_dir/input/so-inputs
mkdir -p $work_dir/input/so

# get so
# SOTransformer —
# input/so-inputs/so.json -> input/so-inputs/so.parquet
curl -Ls https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/refs/heads/master/Ontology_Files/so.json > $work_dir/input/so-inputs/so.json
# input/so/so.json -> intermediate/so/so.parquet
curl -Ls https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/refs/heads/master/Ontology_Files/so.json > $work_dir/input/so/so.json
gsutil cp gs://open-targets-pre-data-releases/24.09dev/input/so-inputs/so.json $work_dir/oldpis-so.jsonl

# run the transformation
uv run ontoform --work-dir $work_dir so --output-format ndjson

# sort rows
sort < $work_dir/oldpis-so.jsonl > /tmp/oldpis-so-sort.jsonl
sort < $work_dir/input/so-inputs/so.jsonl > /tmp/ontoform-so-sort.jsonl
sort < $work_dir/intermediate/so/so.jsonl > /tmp/ontoform-so-sort.jsonl

# sort the arrays
jq -cf ./tools/diff.jq /tmp/oldpis-so-sort.jsonl > /tmp/oldpis-so-sort_keysort.jsonl
jq -cf ./tools/diff.jq /tmp/ontoform-so-sort.jsonl > /tmp/ontoform-so-sort_keysort.jsonl

# sort the object keys
jq -s "." /tmp/oldpis-so-sort_keysort.jsonl | jq --sort-keys "." | jq -c ".[]" > /tmp/oldpis-so-sort_keysort_objsort.jsonl
jq -s "." /tmp/oldpis-so-sort_keysort.jsonl | jq --sort-keys "." | jq -c ".[]" > /tmp/oldpis-so-sort_keysort_objsort.jsonl
jq -s "." /tmp/ontoform-so-sort_keysort.jsonl | jq --sort-keys "." | jq -c ".[]" > /tmp/ontoform-so-sort_keysort_objsort.jsonl

# we only need id and label so let's drop everything else from the old pis output
Expand Down
54 changes: 29 additions & 25 deletions tests/test_target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,52 @@ if [ ! -d ./tests ]; then
exit 1
fi

work_dir=/tmp
work_dir=./work # /tmp might be too small if it uses tmpfs

# set up stuff
set -x
set -e
mkdir -p $work_dir/input/target-inputs/hpa
mkdir -p $work_dir/input/target-inputs/project-scores
mkdir -p $work_dir/input/target-inputs/ensembl
mkdir -p $work_dir/input/target-inputs/gnomad
mkdir -p $work_dir/input/target/hpa
mkdir -p $work_dir/input/target/project-scores
mkdir -p $work_dir/input/target/ensembl
mkdir -p $work_dir/input/target/gnomad
mkdir -p $work_dir/input/target/homologue/gene_dictionary

# get target files

# SubcellularLocationTransformer —
# input/target-inputs/hpa/subcellular_locations.tsv.zip -> input/target-inputs/hpa/subcellular_locations.tsv.gz
curl -Ls https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip -o $work_dir/input/target-inputs/hpa/subcellular_location.tsv.zip
# input/target/hpa/subcellular_location.tsv.zip -> intermediate/target/hpa/subcellular_locations.tsv.gz
curl -Ls https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip -o $work_dir/input/target/hpa/subcellular_location.tsv.zip

# SubcellularLocationSSLTransformer —
# input/target-inputs/hpa/subcellular_locations_ssl.tsv -> target-inputs/hpa/subcellular_locations_ssl.parquet
curl -Ls https://storage.googleapis.com/otar001-core/subcellularLocations/HPA_subcellular_locations_SL-2021-08-19.tsv -o $work_dir/input/target-inputs/hpa/subcellular_locations_ssl.tsv
# input/target/hpa/subcellular_locations_ssl.tsv -> intermediate/target/hpa/subcellular_locations_ssl.parquet
curl -Ls https://storage.googleapis.com/otar001-core/subcellularLocations/HPA_subcellular_locations_SL-2021-08-19.tsv -o $work_dir/input/target/hpa/subcellular_locations_ssl.tsv

# EssentialityMatricesTransformer —
# input/target-inputs/project-scores/essentiality_matrices.zip -> input/target-inputs/project-scores/04_binaryDepScores.parquet
curl -Ls https://cog.sanger.ac.uk/cmp/download/essentiality_matrices.zip -o $work_dir/input/target-inputs/project-scores/essentiality_matrices.zip
# input/target/project-scores/essentiality_matrices.zip -> intermediate/target/project-scores/04_binaryDepScores.parquet
curl -Ls https://cog.sanger.ac.uk/cmp/download/essentiality_matrices.zip -o $work_dir/input/target/project-scores/essentiality_matrices.zip

# GeneIdentifiersTransformer —
# input/target-inputs/project-scores/gene_identifiers_latest.csv.gz -> input/target-inputs/project-scores/gene_identifiers_latest.parquet
curl -Ls https://cog.sanger.ac.uk/cmp/download/gene_identifiers_latest.csv.gz -o $work_dir/input/target-inputs/project-scores/gene_identifiers_latest.csv.gz

# Ensembl — download conditionally
# input/target-inputs/ensembl/homo_sapiens.json -> input/target-inputs/ensembl/homo_sapiens.parquet
if [ ! -f $work_dir/input/target-inputs/ensembl/homo_sapiens.json ]; then
curl -Ls https://ftp.ensembl.org/pub/release-113/json/homo_sapiens/homo_sapiens.json -o $work_dir/input/target-inputs/ensembl/homo_sapiens.json
fi
# input/target/project-scores/gene_identifiers_latest.csv.gz -> intermediate/target/project-scores/gene_identifiers_latest.parquet
curl -Ls https://cog.sanger.ac.uk/cmp/download/gene_identifiers_latest.csv.gz -o $work_dir/input/target/project-scores/gene_identifiers_latest.csv.gz

# GnomAD —
# input/target-inputs/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz -> input/target-inputs/gnomad/gnomad_lof_by_gene.txt.gz
curl -Ls https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz -o $work_dir/input/target-inputs/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
# input/target/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz -> intermediate/target/gnomad/gnomad_lof_by_gene.txt.gz
curl -Ls https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz -o $work_dir/input/target/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz

# Ensembl — download conditionally
# input/target/ensembl/homo_sapiens.json -> intermediate/target/ensembl/homo_sapiens.parquet
if [ ! -f $work_dir/input/target/ensembl/homo_sapiens.json ]; then
curl -Ls https://ftp.ensembl.org/pub/release-113/json/homo_sapiens/homo_sapiens.json -o $work_dir/input/target/ensembl/homo_sapiens.json
fi
# homologue — input/target/homologue/gene_dictionary/homo_sapiens.json -> intermediate/target/homologue/gene_dictionary/homo_sapiens.parquet
cp $work_dir/input/target/ensembl/homo_sapiens.json $work_dir/input/target/homologue/gene_dictionary/homo_sapiens.json
gsutil cp gs://open-targets-pre-data-releases/24.12-uo_test-3/input/target-inputs/homologue/gene_dictionary/homo_sapiens.parquet $work_dir/previous_homologue.parquet

# run ontoform
uv run ontoform --work-dir /tmp target --output-format ndjson
# run the target transformation
uv run ontoform --work-dir $work_dir target

# compare the outputs
diff $work_dir/previous_homologue.parquet $work_dir/intermediate/target/homologue/gene_dictionary/homo_sapiens.parquet || :

# we don't need to check outputs, just knowing everything runs is fine
# except for ensembl, TODO once regular pis runs for release 113
# we don't need to check other outputs (those just convert formats), just knowing everything runs is fine
Loading

0 comments on commit 1d13af1

Please sign in to comment.