Retrieve ids #75

Open · wants to merge 8 commits into main
Changes from 1 commit
add script and rule to retrieve ids
raynamharris committed Oct 25, 2022
commit 167927014e96ab2f5a6b61f6424ac778a3174548
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ upload_json
 .snakemake
 __pycache__
 .DS_Store
-logs
+logs
+data/validate
34 changes: 34 additions & 0 deletions Snakefile
@@ -7,6 +7,20 @@
 
 TERM_TYPES = ['anatomy', 'compound', 'disease', 'gene', 'protein']
 
+# dictionary mapping terms to list of valid IDs.
+#
+# note: could do further magic by just building the URL directly, but this
+# is simpler to grok, I think.
+VALID_ID_URLS = {
+    'anatomy': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:anatomy/id@sort(id)?accept=csv',
+    'gene': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:gene/id@sort(id)?accept=csv',
+    'protein': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:protein/id@sort(id)?accept=csv',
+    'disease': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:disease/id@sort(id)?accept=csv',
+    'compound': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:compound/id@sort(id)?accept=csv',
+}
+
+
+
 rule all:
     message:
         f"Building content for all {len(TERM_TYPES)} controlled vocab types."
@@ -31,6 +45,26 @@ rule upload:
     """
 
 
+rule retrieve:
+    message:
+        f"retrieve list of ids in the registry"
+    input:
+        expand("data/validate/{term}.csv", term=TERM_TYPES)
+
+
+# use wildcards to pull down the valid IDs file for each term
+rule retrieve_term_wc:
+    output:
+        "data/validate/{term}.csv",
+    params:
+        # construct url by looking up term in VALID_ID_URLS dynamically
+        url = lambda w: VALID_ID_URLS[w.term]
+    shell: """
+        curl -L "{params.url}" -o {output}
+    """
+
+
+
 rule gene_json:
     message:
         "build markdown content for genes."
28 changes: 28 additions & 0 deletions scripts/cfde_common.py
@@ -14,6 +14,14 @@
     'protein': 'data/validate/protein.tsv',
 }
 
+ID_FILES = {
+    'anatomy': 'data/validate/anatomy.csv',
+    'compound': 'data/validate/compound.csv',
+    'disease': 'data/validate/disease.csv',
+    'gene': 'data/validate/gene.csv',
+    'protein': 'data/validate/protein.csv',
+}
+
 
 def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
     output_filename = f"{widget_name}_{urllib.parse.quote(cv_id)}.json"
@@ -35,3 +43,23 @@ def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
 
     if verbose:
         print(f"Wrote markdown to {output_filename}")
+
+def get_validation_ids(term):
+    # get list of validation IDs retrieved from the portal pages
+    validation_file = ID_FILES.get(term)
+    if validation_file is None:
+        print(f"ERROR: no validation file. Run `make retrieve`.", file=sys.stderr)
+        sys.exit(-1)
+
+    # load validation IDs; the ID is the first column
+    validation_ids = set()
+    with open(validation_file, 'r', newline='') as fp:
+        r = csv.DictReader(fp, delimiter=',')
+        for row in r:
+            validation_id = row['id']
+            validation_ids.add(validation_id)
+
+    print(f"Loaded {len(validation_ids)} IDs from {validation_file}.",
+          file=sys.stderr)
+
+    return validation_ids
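For reference, a short usage sketch of get_validation_ids from a content-building script (the term, example ID, and calling code here are illustrative and not part of this diff):

    import sys
    import cfde_common   # assuming the caller lives alongside scripts/cfde_common.py

    valid_ids = cfde_common.get_validation_ids('anatomy')
    cv_id = 'UBERON:0002113'   # example ID only; not guaranteed to be in the registry
    if cv_id not in valid_ids:
        print(f"skipping {cv_id}: not in the registry", file=sys.stderr)

Because the IDs are returned as a set, each membership check is O(1), so a script can validate every ID in a large input file before building any markdown.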