Skip to content

Commit

Permalink
Merge pull request #348 from monarch-initiative/346-duckdb-for-benchm…
Browse files Browse the repository at this point in the history
…arking

346 duckdb for benchmarking
  • Loading branch information
yaseminbridges authored Sep 10, 2024
2 parents 339f492 + dfe23e3 commit 3116bfa
Show file tree
Hide file tree
Showing 31 changed files with 1,805 additions and 3,151 deletions.
49 changes: 39 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,6 @@ $(TMP_DATA)/semsim/%.sql:
wget $(SEMSIM_BASE_URL)/$*.sql -O $@


# Legacy run manifest for the report step: created as an empty file when it does not exist.
# The pheval-run recipe appends a "<phenopacket dir>\t<results dir>" line to
# results/run_data.txt (presumably this same file when make runs from $(ROOT_DIR) — confirm).
$(ROOT_DIR)/results/run_data.txt:
touch $@

# Legacy gene-rank report: runs `pheval-utils benchmark-comparison` on the run manifest ($<)
# with --gene-analysis and the bar_cumulative plot type, then moves the SVG onto the target.
# NOTE(review): $@ already begins with $(ROOT_DIR), so the -o argument expands to
# $(ROOT_DIR)/$(ROOT_DIR)/results/results, while the mv reads from $(ROOT_DIR)/ — confirm
# where the tool actually writes gene_rank_stats.svg.
$(ROOT_DIR)/results/gene_rank_stats.svg: $(ROOT_DIR)/results/run_data.txt
pheval-utils benchmark-comparison -r $< -o $(ROOT_DIR)/$(shell dirname $@)/results --gene-analysis -y bar_cumulative
mv $(ROOT_DIR)/gene_rank_stats.svg $@

# Phony alias so `make pheval-report` builds the gene-rank plot.
.PHONY: pheval-report
pheval-report: $(ROOT_DIR)/results/gene_rank_stats.svg


$(ROOT_DIR)/results/template-1.0.0/results.yml: configurations/template-1.0.0/config.yaml corpora/lirical/default/corpus.yml
Expand All @@ -88,10 +79,48 @@ $(ROOT_DIR)/results/template-1.0.0/results.yml: configurations/template-1.0.0/co
--output-dir $(shell dirname $@)

touch $@
echo -e "$(ROOT_DIR)/corpora/lirical/default/phenopackets\t$(shell dirname $@)" >> results/run_data.txt

# Phony entry point: `make pheval-run` builds the template-1.0.0 results.yml target.
.PHONY: pheval-run
pheval-run: $(ROOT_DIR)/results/template-1.0.0/results.yml


# Generate the benchmark configuration (run_data.yaml) that is passed to
# `pheval-utils generate-benchmark-stats` for the template-1.0.0 run.
#
# The file describes a single run whose results_dir is this target's own directory and
# enables gene analysis only. Every printf argument becomes one line of YAML, so the
# leading spaces inside the quotes ARE the YAML nesting and must be kept:
#   2 spaces -> list item / plot group, 4 spaces -> keys belonging to that item / group.
# `threshold:` is intentionally left empty (the score threshold is optional).
$(ROOT_DIR)/results/template-1.0.0/run_data.yaml:
	mkdir -p "$(@D)"
	printf '%s\n' \
	"benchmark_name: fake_predictor_benchmark" \
	"runs:" \
	"  - run_identifier: run_identifier_1" \
	"    results_dir: $(@D)" \
	"    phenopacket_dir: $(ROOT_DIR)/corpora/lirical/default/phenopackets" \
	"    gene_analysis: True" \
	"    variant_analysis: False" \
	"    disease_analysis: False" \
	"    threshold:" \
	"    score_order: descending" \
	"plot_customisation:" \
	"  gene_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	"  disease_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	"  variant_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	> $@

# Benchmark report: passes the generated run_data.yaml ($<) to
# `pheval-utils generate-benchmark-stats`.
# NOTE(review): the recipe gives no output location, so it is assumed the tool itself writes
# gene_rank_stats.svg at the target path — confirm; if it does not, the target never
# appears and this rule re-runs on every invocation.
$(ROOT_DIR)/results/template-1.0.0/gene_rank_stats.svg: $(ROOT_DIR)/results/template-1.0.0/run_data.yaml
pheval-utils generate-benchmark-stats -r $<

# Phony alias so `make pheval-report` runs the benchmark for template-1.0.0.
.PHONY: pheval-report
pheval-report: $(ROOT_DIR)/results/template-1.0.0/gene_rank_stats.svg


corpora/lirical/default/corpus.yml:
test -d $(ROOT_DIR)/corpora/lirical/default/ || mkdir -p $(ROOT_DIR)/corpora/lirical/default/

Expand Down
107 changes: 107 additions & 0 deletions docs/executing_a_benchmark.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Executing a Benchmark

PhEval is designed for benchmarking algorithms across various datasets. To execute a benchmark using PhEval, you need to:

1. Execute your runner, generating the PhEval standardised TSV outputs for gene/variant/disease prioritisation.
2. Configure the benchmarking parameters.
3. Run the benchmark.

PhEval will generate various performance reports, allowing you to easily compare the effectiveness of different algorithms.

## After the Runner Execution

After executing a run, you may be left with an output directory structure like so:

```tree
.
├── pheval_disease_results
│   ├── patient_1-pheval_disease_result.tsv
├── pheval_gene_results
│   ├── patient_1-pheval_gene_result.tsv
├── pheval_variant_results
│   ├── patient_1-pheval_variant_result.tsv
├── raw_results
│   ├── patient_1.json
├── results.yml
└── tool_input_commands
    └── tool_input_commands.txt
```
Whether the `pheval_disease_results`, `pheval_gene_results`, and `pheval_variant_results` directories are populated depends on what is specified in the `config.yaml` for the runner execution. The results in these directories are what the benchmarking consumes to produce the statistical comparison reports.

## Benchmarking Configuration File

To configure the benchmarking parameters, a YAML configuration file should be created and supplied to the CLI command.

An outline of the configuration file structure follows below:

```yaml
benchmark_name: exomiser_14_benchmark
runs:
  - run_identifier: run_identifier_1
    results_dir: /path/to/results_dir_1
    phenopacket_dir: /path/to/phenopacket_dir
    gene_analysis: True
    variant_analysis: False
    disease_analysis: True
    threshold:
    score_order: descending
  - run_identifier: run_identifier_2
    results_dir: /path/to/results_dir_2
    phenopacket_dir: /path/to/phenopacket_dir
    gene_analysis: True
    variant_analysis: True
    disease_analysis: True
    threshold:
    score_order: descending
plot_customisation:
  gene_plots:
    plot_type: bar_cumulative
    rank_plot_title:
    roc_curve_title:
    precision_recall_title:
  disease_plots:
    plot_type: bar_cumulative
    rank_plot_title:
    roc_curve_title:
    precision_recall_title:
  variant_plots:
    plot_type: bar_cumulative
    rank_plot_title:
    roc_curve_title:
    precision_recall_title:

```

The `benchmark_name` is what will be used to name the duckdb database that will contain all the ranking and binary statistics as well as comparisons between runs. The name provided should not have any whitespace or special characters.

### Runs section

The `runs` section specifies which run configurations should be included in the benchmarking. For each run configuration you will need to populate the following parameters:

- `run_identifier`: The identifier associated with the run - this should be meaningful as it will be used in the naming in tables and plots.
- `results_dir`: The full path to the root directory where the directories `pheval_gene_results`/`pheval_variant_results`/`pheval_disease_results` can be found.
- `phenopacket_dir`: The full path to the phenopacket directory used during the runner execution.
- `gene_analysis`: Boolean specifying whether to perform benchmarking for gene prioritisation analysis.
- `variant_analysis`: Boolean specifying whether to perform benchmarking for variant prioritisation analysis.
- `disease_analysis`: Boolean specifying whether to perform benchmarking for disease prioritisation analysis.
- `threshold`: OPTIONAL score threshold to consider for inclusion of results.
- `score_order`: Ordering of results for ranking. Either ascending or descending.

### Plot customisation section

The `plot_customisation` section specifies any additional customisation to the plots output from the benchmarking. Here you can specify titles for all the plots output, as well as the plot type for displaying the summary ranking stats. This section is split by the plots output from the gene, variant and disease prioritisation benchmarking. The parameters in this section do not need to be populated; if left blank, generic default titles are used. The parameters are as follows:

- `plot_type`: The plot type output for the summary rank stats plot. This can be one of `bar_cumulative`, `bar_non_cumulative` or `bar_stacked`.
- `rank_plot_title`: The customised title for the summary rank stats plot.
- `roc_curve_title`: The customised title for the ROC curve plot.
- `precision_recall_title`: The customised title for the precision-recall curve plot.

## Executing the benchmark

After configuring the benchmarking YAML, executing the benchmark is relatively simple.

```bash
pheval-utils generate-benchmark-stats --run-yaml benchmarking_config.yaml
```


1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ nav:
- "styleguide.md"
- "CODE_OF_CONDUCT.md"
- Plugins: "plugins.md"
- Executing a Benchmark: "executing_a_benchmark.md"
- "roadmap.md"


Expand Down
63 changes: 59 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ matplotlib = "^3.7.0"
pyserde = "^0.9.8"
polars = "^0.19.15"
scikit-learn = "^1.4.0"
duckdb = "^1.0.0"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
Expand Down
49 changes: 39 additions & 10 deletions resources/Makefile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,6 @@ $(TMP_DATA)/semsim/%.sql:
wget $(SEMSIM_BASE_URL)/$*.sql -O $@


# Legacy run manifest for the report step: created as an empty file when it does not exist.
# Each rendered pheval-run recipe appends a "<phenopacket dir>\t<results dir>" line to
# results/run_data.txt (presumably this same file when make runs from $(ROOT_DIR) — confirm).
$(ROOT_DIR)/results/run_data.txt:
touch $@

# Legacy gene-rank report: runs `pheval-utils benchmark-comparison` on the run manifest ($<)
# with --gene-analysis and the bar_cumulative plot type, then moves the SVG onto the target.
# NOTE(review): $@ already begins with $(ROOT_DIR), so the -o argument expands to
# $(ROOT_DIR)/$(ROOT_DIR)/results/results, while the mv reads from $(ROOT_DIR)/ — confirm
# where the tool actually writes gene_rank_stats.svg.
$(ROOT_DIR)/results/gene_rank_stats.svg: $(ROOT_DIR)/results/run_data.txt
pheval-utils benchmark-comparison -r $< -o $(ROOT_DIR)/$(shell dirname $@)/results --gene-analysis -y bar_cumulative
mv $(ROOT_DIR)/gene_rank_stats.svg $@

# Phony alias so `make pheval-report` builds the gene-rank plot.
.PHONY: pheval-report
pheval-report: $(ROOT_DIR)/results/gene_rank_stats.svg

{% for run in runs %}
$(ROOT_DIR)/results/{{ run.configuration }}/results.yml: configurations/{{ run.configuration }}/config.yaml corpora/{{ run.corpus }}/{{ run.corpusvariant }}/corpus.yml
Expand All @@ -125,10 +116,48 @@ $(ROOT_DIR)/results/{{ run.configuration }}/results.yml: configurations/{{ run.c
--output-dir $(shell dirname $@)

touch $@
echo -e "$(ROOT_DIR)/corpora/{{ run.corpus }}/default/phenopackets\t$(shell dirname $@)" >> results/run_data.txt

# Phony entry point, emitted once per run by the enclosing template loop, so each
# configured run's results.yml is added as a prerequisite of `make pheval-run`.
.PHONY: pheval-run
pheval-run: $(ROOT_DIR)/results/{{ run.configuration }}/results.yml


# Generate the benchmark configuration (run_data.yaml) that is passed to
# `pheval-utils generate-benchmark-stats` for the {{ run.configuration }} run.
#
# The file describes a single run whose results_dir is this target's own directory and
# enables gene analysis only. phenopacket_dir follows the run's own corpus and corpus
# variant (the same pair the results.yml rule depends on) instead of a fixed corpus.
# Every printf argument becomes one line of YAML, so the leading spaces inside the quotes
# ARE the YAML nesting and must be kept:
#   2 spaces -> list item / plot group, 4 spaces -> keys belonging to that item / group.
# `threshold:` is intentionally left empty (the score threshold is optional).
# NOTE(review): benchmark_name and run_identifier are the same for every rendered run;
# confirm that separate runs do not clash on the benchmark database name.
$(ROOT_DIR)/results/{{ run.configuration }}/run_data.yaml:
	mkdir -p "$(@D)"
	printf '%s\n' \
	"benchmark_name: fake_predictor_benchmark" \
	"runs:" \
	"  - run_identifier: run_identifier_1" \
	"    results_dir: $(@D)" \
	"    phenopacket_dir: $(ROOT_DIR)/corpora/{{ run.corpus }}/{{ run.corpusvariant }}/phenopackets" \
	"    gene_analysis: True" \
	"    variant_analysis: False" \
	"    disease_analysis: False" \
	"    threshold:" \
	"    score_order: descending" \
	"plot_customisation:" \
	"  gene_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	"  disease_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	"  variant_plots:" \
	"    plot_type: bar_cumulative" \
	"    rank_plot_title:" \
	"    roc_curve_title:" \
	"    precision_recall_title:" \
	> $@

# Benchmark report: passes the generated run_data.yaml ($<) to
# `pheval-utils generate-benchmark-stats`.
# NOTE(review): the recipe gives no output location, so it is assumed the tool itself writes
# gene_rank_stats.svg at the target path — confirm; if it does not, the target never
# appears and this rule re-runs on every invocation.
$(ROOT_DIR)/results/{{ run.configuration }}/gene_rank_stats.svg: $(ROOT_DIR)/results/{{ run.configuration }}/run_data.yaml
pheval-utils generate-benchmark-stats -r $<

# Phony alias, emitted once per run by the enclosing template loop, so `make pheval-report`
# builds the benchmark plot of every configured run.
.PHONY: pheval-report
pheval-report: $(ROOT_DIR)/results/{{ run.configuration }}/gene_rank_stats.svg


{% endfor %}


Expand Down
Loading

0 comments on commit 3116bfa

Please sign in to comment.