diff --git a/.gitignore b/.gitignore
index abf4e20..c21d868 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ upload_json
 .snakemake
 __pycache__
 .DS_Store
+logs
+data/validate
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 3670b59..8152cd7 100644
--- a/Makefile
+++ b/Makefile
@@ -7,4 +7,8 @@ upload:
 update: upload
 
 clean:
-	rm -fr output_pieces_* upload_json
+	rm -fr output_pieces_* upload_json logs/*txt
+
+log:
+	for file in output_pieces*/*/; do echo $$file; echo $$file >> logs/chunks.txt; find $$file -maxdepth 1 -name "*md" | wc -l >> logs/chunks.txt; done
+	for file in upload_json/*json; do echo $$file >> logs/aggregated.txt; grep -o '{"id":' $$file | wc -l >> logs/aggregated.txt; done
diff --git a/Snakefile b/Snakefile
index 2057d0a..97552a3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -7,6 +7,20 @@
 
 TERM_TYPES = ['anatomy', 'compound', 'disease', 'gene', 'protein']
 
+# dictionary mapping each term type to the URL for its list of valid IDs.
+#
+# note: could do further magic by just building the URL directly, but this
+# is simpler to grok, I think.
+VALID_ID_URLS = {
+    'anatomy': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:anatomy/id@sort(id)?accept=csv',
+    'gene': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:gene/id@sort(id)?accept=csv',
+    'protein': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:protein/id@sort(id)?accept=csv',
+    'disease': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:disease/id@sort(id)?accept=csv',
+    'compound': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:compound/id@sort(id)?accept=csv',
+    }
+
+
+
 rule all:
     message:
         f"Building content for all {len(TERM_TYPES)} controlled vocab types."
@@ -31,6 +45,26 @@ rule upload:
     """
 
 
+rule retrieve:
+    message:
+        f"retrieve list of ids in the registry"
+    input:
+        expand("data/validate/{term}.csv", term=TERM_TYPES)
+
+
+# use wildcards to pull down the valid IDs file for each term
+rule retrieve_term_wc:
+    output:
+        "data/validate/{term}.csv",
+    params:
+        # construct url by looking up term in VALID_ID_URLS dynamically
+        url = lambda w: VALID_ID_URLS[w.term]
+    shell: """
+        curl -L "{params.url}" -o {output}
+    """
+
+
+
 rule gene_json:
     message:
         "build markdown content for genes."
@@ -126,6 +160,7 @@ rule gene_json_alias_widget:
         script = "scripts/build-markdown-pieces-gene-translate.py",
         id_list = "data/inputs/gene_IDs_for_alias_tables.txt",
         alias_info = "data/inputs/Homo_sapiens.gene_info_20220304.txt_conv_wNCBI_AC.txt",
+        validate_csv = expand("data/validate/{term}.csv", term=TERM_TYPES),
     output:
         directory("output_pieces_gene/00-alias")
     params:
diff --git a/scripts/aggregate-markdown-pieces.py b/scripts/aggregate-markdown-pieces.py
index 744b217..c285a4c 100755
--- a/scripts/aggregate-markdown-pieces.py
+++ b/scripts/aggregate-markdown-pieces.py
@@ -93,8 +93,9 @@ def main():
     print(f"Loaded {n_loaded} chunks total.", file=sys.stderr)
     print(F"Skipped {n_skipped} files for not ending in .json.", file=sys.stderr)
-    print(f"Wrote {len(chunks)} chunks to {args.output_json}", file=sys.stderr)
-
+    print(f"Wrote {len(chunks)} chunks to {args.output_json}", file=sys.stderr)
+    print(f"Aggregated {n_loaded} chunks of information for {len(chunks)} IDs into {args.output_json}.", file=sys.stderr)
+
 
 
 if __name__ == '__main__':
     sys.exit(main())
diff --git a/scripts/cfde_common.py b/scripts/cfde_common.py
index f001cc6..e3edd29 100755
--- a/scripts/cfde_common.py
+++ b/scripts/cfde_common.py
@@ -14,6 +14,14 @@
     'protein': 'data/validate/protein.tsv',
     }
 
+ID_FILES = {
+    'anatomy': 'data/validate/anatomy.csv',
+    'compound': 'data/validate/compound.csv',
+    'disease': 'data/validate/disease.csv',
+    'gene': 'data/validate/gene.csv',
+    'protein': 'data/validate/protein.csv',
+    }
+
 
 def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
     output_filename = f"{widget_name}_{urllib.parse.quote(cv_id)}.json"
@@ -35,3 +43,23 @@ def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
 
     if verbose:
         print(f"Wrote markdown to {output_filename}")
+
+def get_validation_ids(term):
+    # get the list of valid IDs retrieved from the portal pages
+    validation_file = ID_FILES.get(term)
+    if validation_file is None:
+        print(f"ERROR: no validation file for term '{term}'. Run `make retrieve`.", file=sys.stderr)
+        sys.exit(-1)
+
+    # load valid IDs from the 'id' column of the CSV
+    validation_ids = set()
+    with open(validation_file, 'r', newline='') as fp:
+        r = csv.DictReader(fp, delimiter=',')
+        for row in r:
+            validation_id = row['id']
+            validation_ids.add(validation_id)
+
+    print(f"Loaded {len(validation_ids)} IDs from {validation_file}.",
+          file=sys.stderr)
+
+    return validation_ids
\ No newline at end of file
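
For context, a minimal sketch of how a build script (e.g. scripts/build-markdown-pieces-gene-translate.py) might consume the new cfde_common.get_validation_ids() helper. The actual call site is not part of this diff; the example IDs and the filtering loop below are illustrative assumptions only:

    #!/usr/bin/env python
    # hypothetical caller -- not part of this change; shows the intended use
    # of get_validation_ids() after `make retrieve` has populated
    # data/validate/{term}.csv.
    import sys
    import cfde_common

    term = 'gene'
    valid_ids = cfde_common.get_validation_ids(term)   # set of registry IDs

    # example ID list; the real scripts read these from data/inputs/ files
    id_list = ['ENSG00000139618', 'NOT_A_REAL_ID']
    for cv_id in id_list:
        if cv_id not in valid_ids:
            # skip IDs that are not in the portal registry
            print(f"WARNING: {cv_id} not in {term} ID list; skipping.",
                  file=sys.stderr)
            continue
        # ... build markdown and call cfde_common.write_output_pieces(...)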