From c8899c6393d61960757d1dba81896812854e0ba9 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 15:40:14 +0200 Subject: [PATCH 1/9] Initial commit of data_manager_clair3_models --- .../data_manager_clair3_models/.shed.yml | 11 ++ .../data_manager/install_clair3_models.xml | 53 ++++++++ .../data_manager/model_fetcher.py | 122 ++++++++++++++++++ .../data_manager_conf.xml | 19 +++ .../test-data/clair3_models.loc | 8 ++ .../tool-data/clair3_models.loc.sample | 8 ++ .../tool_data_table_conf.xml.sample | 6 + .../tool_data_table_conf.xml.test | 6 + 8 files changed, 233 insertions(+) create mode 100644 data_managers/data_manager_clair3_models/.shed.yml create mode 100644 data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml create mode 100644 data_managers/data_manager_clair3_models/data_manager/model_fetcher.py create mode 100644 data_managers/data_manager_clair3_models/data_manager_conf.xml create mode 100644 data_managers/data_manager_clair3_models/test-data/clair3_models.loc create mode 100644 data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample create mode 100644 data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample create mode 100644 data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test diff --git a/data_managers/data_manager_clair3_models/.shed.yml b/data_managers/data_manager_clair3_models/.shed.yml new file mode 100644 index 00000000000..f0bf0f3509a --- /dev/null +++ b/data_managers/data_manager_clair3_models/.shed.yml @@ -0,0 +1,11 @@ +categories: +- Data Managers +description: Install Clair3 models from the Oxford Nanopore Rerio repository +long_description: | + This data manager downloads the Clair3 models from the Oxford Nanopore Rerio repository and installs + them in the Galaxy instance. Note that these models are licensed according to the terms of the + "Oxford Nanopore Technologies, Ltd. Public License Version 1.0" +name: data_manager_clair3_models +owner: iuc +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_clair3_models +type: unrestricted \ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml new file mode 100644 index 00000000000..c37cb8f1663 --- /dev/null +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -0,0 +1,53 @@ + + + python + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py new file mode 100644 index 00000000000..5f2961ee27e --- /dev/null +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +import argparse +import json +import sys +import tarfile + + +from io import StringIO, BytesIO +from pathlib import Path +from urllib.request import urlopen, Request + +DATA_TABLE_NAME = 'clair3_models' + + +def find_latest_models(): + # based on the README.rst of the rerio repository as of 7 January 2025 + url = 'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/README.rst' + httprequest = Request(url) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the latest models: {response.status}') + data = response.read().decode('utf-8') + init_line_seen = False + config_line_seen = False + read_lines = False + models = [] + for line in StringIO(data): + if read_lines: + if line.startswith('=========================='): + read_lines = False + break + model = line[:break1-1] + models.append(model) + if config_line_seen and line.startswith('=========================='): + break1 = line.find(' ') + read_lines = True + continue + if init_line_seen and line.startswith('Config'): + config_line_seen = True + continue + if line.startswith('Clair3 models for the following configurations are available:'): + init_line_seen = True + continue + return models + + +def fetch_model(model_name): + # the model files are tar gzipped, with a structure like: + # model_name/pileup.index + # model_name/full_alignment.index + # and other files, with the key point being that the model_name becoomes the model_directory + + url = f'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/clair3_models/{model_name}_model' + httprequest = Request(url) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the model {model_name}: {response.status}') + final_url = response.read().decode('utf-8').strip() + httprequest = Request(final_url) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the model {model_name} from CDN URL {final_url}: {response.status}') + data = response.read() + return data + + +def unpack_model(data, outdir): + with tarfile.open(fileobj=BytesIO(data), mode='r:*') as tar: + tar.extractall(outdir) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('dm_filename', type=str, help='The filename of the data manager file to read parameters from and write outputs to') + parser.add_argument('--download_latest', action='store_true', default=False, help='Download the latest models as per the rerio repository') + parser.add_argument('--download_models', type=str, help='Comma separated list of models to download') + args = parser.parse_args() + + # parameters to a data manager are passed in a JSON file (see https://docs.galaxyproject.org/en/latest/dev/data_managers.html) and + # similarily a JSON file is created to pass the output back to Galaxy + models = [] + if args.download_latest: + models.extend(find_latest_models()) + if args.download_models: + models.extend(args.download_models.split(',')) + + if not models: + sys.exit('No models to download, please specify either --download_latest or --download_models') + + with open(args.galaxy_datamanager_filename) as fh: + config = json.load(fh) + if 'extra_files_path' not in config.get('output_data', [{}])[0]: + sys.exit('Please specify the output directory in the data manager configuration (the extra_files_path)') + output_directory = config["output_data"]["extra_files_path"] + if not Path(output_directory).exists(): + Path(output_directory).mkdir(parents=True) + + data_manager_dict = {} + data_manager_dict["data_tables"] = config.get("data_tables", {}) + data_manager_dict["data_tables"][DATA_TABLE_NAME] = [] + + for model in models: + model_dir = Path(output_directory) / model + # In the test below we assume that the contents of the model are uniquely identified by the model name. It is possible + # that Oxford Nanopore will change the contents of the model tarball without changing the name. Hopefully this will never + # happen. + if model_dir.exists(): + print(f'Model {model} already exists, skipping', file=sys.stderr) + continue + data = fetch_model(model) + unpack_model(data, output_directory) + + data_manager_dict["data_tables"][DATA_TABLE_NAME].append( + dict( + value=model, + path=str(model_dir) + ) + ) + + with open(args.output_dm_filename, 'w') as fh: + json.dump(data_manager_dict, fh, sort_keys=True, indent=4) diff --git a/data_managers/data_manager_clair3_models/data_manager_conf.xml b/data_managers/data_manager_clair3_models/data_manager_conf.xml new file mode 100644 index 00000000000..2830e0a5fc9 --- /dev/null +++ b/data_managers/data_manager_clair3_models/data_manager_conf.xml @@ -0,0 +1,19 @@ + + + + + + + + + + ${path} + clair3_models/#echo str($value)# + + ${GALAXY_DATA_MANAGER_DATA_PATH}/clair3_models/#echo str($value)# + abspath + + + + + \ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc new file mode 100644 index 00000000000..5bdacf10348 --- /dev/null +++ b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc @@ -0,0 +1,8 @@ +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Revio site and provided as directories) +# +# the columns are: +# 1. value +# 2. path (path to directory containing model) +# for example +# +# r1041_e82_400bps_hac_v500 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample new file mode 100644 index 00000000000..5bdacf10348 --- /dev/null +++ b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample @@ -0,0 +1,8 @@ +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Revio site and provided as directories) +# +# the columns are: +# 1. value +# 2. path (path to directory containing model) +# for example +# +# r1041_e82_400bps_hac_v500 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample new file mode 100644 index 00000000000..2bfc1788e80 --- /dev/null +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample @@ -0,0 +1,6 @@ + + + value, path + +
+
\ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test new file mode 100644 index 00000000000..782c1597826 --- /dev/null +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test @@ -0,0 +1,6 @@ + + + value, path + +
+
\ No newline at end of file From 746a2c3f53b73c6252432e9c597dc38b04ba42e4 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 16:17:28 +0200 Subject: [PATCH 2/9] Fix typos and planemo linting errors --- .../data_manager/install_clair3_models.xml | 40 +++++++++++++++---- .../test-data/clair3_models.loc | 2 +- .../tool-data/clair3_models.loc.sample | 2 +- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml index c37cb8f1663..4cbeb54881f 100644 --- a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -1,6 +1,6 @@ - + - python + python Latest models from rerio page - - - - - - + + + + + ^[a-z_0-9,]+$ + + + @@ -49,5 +51,27 @@ + + + + + 10.1101/2021.12.29.474431v2 + + \ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc index 5bdacf10348..37c1e6ec6f3 100644 --- a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc +++ b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc @@ -1,4 +1,4 @@ -# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Revio site and provided as directories) +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Rerio site and provided as directories) # # the columns are: # 1. value diff --git a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample index 5bdacf10348..37c1e6ec6f3 100644 --- a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample +++ b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample @@ -1,4 +1,4 @@ -# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Revio site and provided as directories) +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Rerio site and provided as directories) # # the columns are: # 1. value From 8a096944f755a3d93f0214be48773e91c32740c4 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 16:23:15 +0200 Subject: [PATCH 3/9] Fix flake8 errors --- .../data_manager/model_fetcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index 5f2961ee27e..3cdf9f5d1fd 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -25,12 +25,13 @@ def find_latest_models(): config_line_seen = False read_lines = False models = [] + break1 = 0 for line in StringIO(data): if read_lines: if line.startswith('=========================='): read_lines = False break - model = line[:break1-1] + model = line[:break1 - 1] models.append(model) if config_line_seen and line.startswith('=========================='): break1 = line.find(' ') @@ -84,7 +85,7 @@ def unpack_model(data, outdir): models.extend(find_latest_models()) if args.download_models: models.extend(args.download_models.split(',')) - + if not models: sys.exit('No models to download, please specify either --download_latest or --download_models') @@ -95,7 +96,7 @@ def unpack_model(data, outdir): output_directory = config["output_data"]["extra_files_path"] if not Path(output_directory).exists(): Path(output_directory).mkdir(parents=True) - + data_manager_dict = {} data_manager_dict["data_tables"] = config.get("data_tables", {}) data_manager_dict["data_tables"][DATA_TABLE_NAME] = [] From c3c4c820239a931e54e1db21d6c4c97113092646 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 16:45:38 +0200 Subject: [PATCH 4/9] Fix more flake8 errors --- .../data_manager_clair3_models/data_manager/model_fetcher.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index 3cdf9f5d1fd..f0b3f2e1129 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -5,10 +5,9 @@ import sys import tarfile - -from io import StringIO, BytesIO +from io import BytesIO, StringIO from pathlib import Path -from urllib.request import urlopen, Request +from urllib.request import Request, urlopen DATA_TABLE_NAME = 'clair3_models' From f81151a5f4a347da0ba441a12fe14f74e24d33f5 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 18:29:48 +0200 Subject: [PATCH 5/9] Fix more flake8 errors and bugs --- .../data_manager/install_clair3_models.xml | 8 ++++---- .../data_manager/model_fetcher.py | 6 +++--- .../data_manager_clair3_models/data_manager_conf.xml | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml index 4cbeb54881f..cabdf45e453 100644 --- a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -3,12 +3,12 @@ python @@ -74,4 +74,4 @@ commit = {c0c8ce6} }]]> - \ No newline at end of file + diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index f0b3f2e1129..567e994202a 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -88,11 +88,11 @@ def unpack_model(data, outdir): if not models: sys.exit('No models to download, please specify either --download_latest or --download_models') - with open(args.galaxy_datamanager_filename) as fh: + with open(args.dm_filename) as fh: config = json.load(fh) if 'extra_files_path' not in config.get('output_data', [{}])[0]: sys.exit('Please specify the output directory in the data manager configuration (the extra_files_path)') - output_directory = config["output_data"]["extra_files_path"] + output_directory = config["output_data"][0]["extra_files_path"] if not Path(output_directory).exists(): Path(output_directory).mkdir(parents=True) @@ -118,5 +118,5 @@ def unpack_model(data, outdir): ) ) - with open(args.output_dm_filename, 'w') as fh: + with open(args.dm_filename, 'w') as fh: json.dump(data_manager_dict, fh, sort_keys=True, indent=4) diff --git a/data_managers/data_manager_clair3_models/data_manager_conf.xml b/data_managers/data_manager_clair3_models/data_manager_conf.xml index 2830e0a5fc9..28367c6c940 100644 --- a/data_managers/data_manager_clair3_models/data_manager_conf.xml +++ b/data_managers/data_manager_clair3_models/data_manager_conf.xml @@ -1,6 +1,6 @@ - + @@ -16,4 +16,4 @@ - \ No newline at end of file + From 46f89ba39af94c4004fea05db98f2125d820d6e2 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 18:39:25 +0200 Subject: [PATCH 6/9] Fix output format, more flake8 and linting errors --- data_managers/data_manager_clair3_models/.shed.yml | 1 + .../data_manager/install_clair3_models.xml | 2 +- .../data_manager_clair3_models/data_manager/model_fetcher.py | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data_managers/data_manager_clair3_models/.shed.yml b/data_managers/data_manager_clair3_models/.shed.yml index f0bf0f3509a..769e843cb05 100644 --- a/data_managers/data_manager_clair3_models/.shed.yml +++ b/data_managers/data_manager_clair3_models/.shed.yml @@ -7,5 +7,6 @@ long_description: | "Oxford Nanopore Technologies, Ltd. Public License Version 1.0" name: data_manager_clair3_models owner: iuc +homepage_url: https://github.com/nanoporetech/rerio?tab=readme-ov-file#clair3-models remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_clair3_models type: unrestricted \ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml index cabdf45e453..b5fb54e71b9 100644 --- a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -27,7 +27,7 @@ - + diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index 567e994202a..dfa7e828171 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -4,7 +4,6 @@ import json import sys import tarfile - from io import BytesIO, StringIO from pathlib import Path from urllib.request import Request, urlopen From 2006dea877c4e817117ca12122499a983aeedf57 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Tue, 7 Jan 2025 19:33:36 +0200 Subject: [PATCH 7/9] Fix detection of duplicate models --- .../data_manager/install_clair3_models.xml | 24 +++++++++- .../data_manager/model_fetcher.py | 48 ++++++++++++++----- .../data_manager_conf.xml | 1 + .../test-data/clair3_models.loc | 3 +- .../tool-data/clair3_models.loc.sample | 3 +- .../tool_data_table_conf.xml.sample | 2 +- .../tool_data_table_conf.xml.test | 4 +- 7 files changed, 67 insertions(+), 18 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml index b5fb54e71b9..8dbd3006ba9 100644 --- a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -3,8 +3,30 @@ python 0: + #set $known_models = ','.join([ row[0] for row in $data_table.get_fields() ]) + #set $sha256_sums = ','.join([ row[1] for row in $data_table.get_fields() ]) + #else + #set $known_models = None + #set $sha256_sums = None + #end if + python '$__tool_directory__/model_fetcher.py' '${output_file}' + #if $known_models is not None + --known_models '$known_models' + --sha256_sums '$sha256_sums' + #end if #if $model_selection.source == 'latest' --download_latest #elif $model_selection.source == 'chosen' @@ -14,7 +36,7 @@ - + diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index dfa7e828171..5252a06a49f 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -4,8 +4,10 @@ import json import sys import tarfile +from hashlib import sha256 from io import BytesIO, StringIO from pathlib import Path +from urllib.error import HTTPError from urllib.request import Request, urlopen DATA_TABLE_NAME = 'clair3_models' @@ -52,11 +54,16 @@ def fetch_model(model_name): url = f'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/clair3_models/{model_name}_model' httprequest = Request(url) - with urlopen(httprequest) as response: - if response.status != 200: - raise IOError(f'Failed to fetch the model {model_name}: {response.status}') - final_url = response.read().decode('utf-8').strip() - httprequest = Request(final_url) + try: + # urlopen throws a HTTPError if it gets a 404 status (and perhaps other non-200 status?) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the model {model_name}: {response.status}') + final_url = response.read().decode('utf-8').strip() + httprequest = Request(final_url) + except HTTPError as e: + raise IOError(f'Failed to fetch the model {model_name}: {e}') + with urlopen(httprequest) as response: if response.status != 200: raise IOError(f'Failed to fetch the model {model_name} from CDN URL {final_url}: {response.status}') @@ -72,7 +79,9 @@ def unpack_model(data, outdir): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('dm_filename', type=str, help='The filename of the data manager file to read parameters from and write outputs to') - parser.add_argument('--download_latest', action='store_true', default=False, help='Download the latest models as per the rerio repository') + parser.add_argument('--known_models', type=str, help='List of models already known in the Galaxy data table') + parser.add_argument('--sha256_sums', type=str, help='List of sha256sums of the models already known in the Galaxy data table') + parser.add_argument('--download_latest', action='store_true', default=False, help='Download the latest models as per the Rerio repository') parser.add_argument('--download_models', type=str, help='Comma separated list of models to download') args = parser.parse_args() @@ -99,23 +108,38 @@ def unpack_model(data, outdir): data_manager_dict["data_tables"] = config.get("data_tables", {}) data_manager_dict["data_tables"][DATA_TABLE_NAME] = [] + known_models = set(args.known_models.split(',')) if args.known_models else set() + model_to_sha256 = {} + if args.known_models: + sha256_sums = args.sha256_sums.split(',') + for (i, model) in enumerate(known_models): + model_to_sha256[model] = sha256_sums[i] + for model in models: model_dir = Path(output_directory) / model - # In the test below we assume that the contents of the model are uniquely identified by the model name. It is possible - # that Oxford Nanopore will change the contents of the model tarball without changing the name. Hopefully this will never - # happen. - if model_dir.exists(): + # The data table cannot handle duplicate entries, so we skip models that are already in the data table + if model in known_models: print(f'Model {model} already exists, skipping', file=sys.stderr) continue data = fetch_model(model) + sha256sum = sha256(data).hexdigest() + + # Since we skip models that are already known we cannot test the sha256sum here. This code is retained to illustrate that an + # alternative logic would be to download the model each time and check if the sha256sum matches what is already known. Hopefully + # ONT does not update the models while keeping the same name, so this is not needed. The sha256sum is stored in the data table + # in case it is needed in the future. + # if model in model_to_sha256 and sha256sum != model_to_sha256[model]: + # sys.exit(f'Model {model} already exists with a different sha256sum {model_to_sha256[model]}. This is a serious error, inform the Galaxy admin') + unpack_model(data, output_directory) data_manager_dict["data_tables"][DATA_TABLE_NAME].append( dict( value=model, + sha256=sha256sum, path=str(model_dir) ) ) - with open(args.dm_filename, 'w') as fh: - json.dump(data_manager_dict, fh, sort_keys=True, indent=4) + with open(args.dm_filename, 'w') as fh: + json.dump(data_manager_dict, fh, sort_keys=True, indent=4) diff --git a/data_managers/data_manager_clair3_models/data_manager_conf.xml b/data_managers/data_manager_clair3_models/data_manager_conf.xml index 28367c6c940..0d9156c8735 100644 --- a/data_managers/data_manager_clair3_models/data_manager_conf.xml +++ b/data_managers/data_manager_clair3_models/data_manager_conf.xml @@ -4,6 +4,7 @@ + diff --git a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc index 37c1e6ec6f3..b6461d35717 100644 --- a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc +++ b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc @@ -2,7 +2,8 @@ # # the columns are: # 1. value +# 2. sha256sum (sha256 hash of the downloaded model, before unpacking) # 2. path (path to directory containing model) # for example # -# r1041_e82_400bps_hac_v500 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 +# r1041_e82_400bps_hac_v500 a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample index 37c1e6ec6f3..b6461d35717 100644 --- a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample +++ b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample @@ -2,7 +2,8 @@ # # the columns are: # 1. value +# 2. sha256sum (sha256 hash of the downloaded model, before unpacking) # 2. path (path to directory containing model) # for example # -# r1041_e82_400bps_hac_v500 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 +# r1041_e82_400bps_hac_v500 a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample index 2bfc1788e80..879fd789196 100644 --- a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample @@ -1,6 +1,6 @@ - value, path + value, sha256, path
\ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test index 782c1597826..a1a749556b2 100644 --- a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test @@ -1,6 +1,6 @@ - value, path - + value, sha256, path +
\ No newline at end of file From c703412b82d82f8c97eb6e3cbca9a3c8b48d31d3 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Wed, 8 Jan 2025 10:05:23 +0200 Subject: [PATCH 8/9] Add platform to data table --- .../data_manager/model_fetcher.py | 1 + .../data_manager_clair3_models/data_manager_conf.xml | 1 + .../data_manager_clair3_models/test-data/clair3_models.loc | 7 ++++--- .../tool-data/clair3_models.loc.sample | 7 ++++--- .../tool_data_table_conf.xml.sample | 2 +- .../tool_data_table_conf.xml.test | 2 +- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py index 5252a06a49f..98343b445bf 100644 --- a/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py +++ b/data_managers/data_manager_clair3_models/data_manager/model_fetcher.py @@ -136,6 +136,7 @@ def unpack_model(data, outdir): data_manager_dict["data_tables"][DATA_TABLE_NAME].append( dict( value=model, + platform="ont", sha256=sha256sum, path=str(model_dir) ) diff --git a/data_managers/data_manager_clair3_models/data_manager_conf.xml b/data_managers/data_manager_clair3_models/data_manager_conf.xml index 0d9156c8735..2315e90aed5 100644 --- a/data_managers/data_manager_clair3_models/data_manager_conf.xml +++ b/data_managers/data_manager_clair3_models/data_manager_conf.xml @@ -4,6 +4,7 @@ + diff --git a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc index b6461d35717..2fa44f98f8e 100644 --- a/data_managers/data_manager_clair3_models/test-data/clair3_models.loc +++ b/data_managers/data_manager_clair3_models/test-data/clair3_models.loc @@ -2,8 +2,9 @@ # # the columns are: # 1. value -# 2. sha256sum (sha256 hash of the downloaded model, before unpacking) -# 2. path (path to directory containing model) +# 2. platform +# 3. sha256sum (sha256 hash of the downloaded model, before unpacking) +# 4. path (path to directory containing model) # for example # -# r1041_e82_400bps_hac_v500 a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 +# r1041_e82_400bps_hac_v500 ont a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample index b6461d35717..2fa44f98f8e 100644 --- a/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample +++ b/data_managers/data_manager_clair3_models/tool-data/clair3_models.loc.sample @@ -2,8 +2,9 @@ # # the columns are: # 1. value -# 2. sha256sum (sha256 hash of the downloaded model, before unpacking) -# 2. path (path to directory containing model) +# 2. platform +# 3. sha256sum (sha256 hash of the downloaded model, before unpacking) +# 4. path (path to directory containing model) # for example # -# r1041_e82_400bps_hac_v500 a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 +# r1041_e82_400bps_hac_v500 ont a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample index 879fd789196..aa195b3ad81 100644 --- a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.sample @@ -1,6 +1,6 @@ - value, sha256, path + value, platform, sha256, path
\ No newline at end of file diff --git a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test index a1a749556b2..59c1fe758c9 100644 --- a/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test +++ b/data_managers/data_manager_clair3_models/tool_data_table_conf.xml.test @@ -1,6 +1,6 @@ - value, sha256, path + value, platform, sha256, path
\ No newline at end of file From afe5f53ae387dcc584e75e13409680f3c7ea4852 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Sat, 11 Jan 2025 13:59:45 +0200 Subject: [PATCH 9/9] Explain why the list of known models is produced --- .../data_manager/install_clair3_models.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml index 8dbd3006ba9..8fd6fb4d072 100644 --- a/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml +++ b/data_managers/data_manager_clair3_models/data_manager/install_clair3_models.xml @@ -4,6 +4,8 @@