Retrieve ids #75

Open · wants to merge 8 commits into main
Changes from 1 commit
add script and rule to retrieve ids
raynamharris committed Oct 25, 2022
commit 167927014e96ab2f5a6b61f6424ac778a3174548
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ upload_json
 .snakemake
 __pycache__
 .DS_Store
-logs
+logs
+data/validate
34 changes: 34 additions & 0 deletions Snakefile
@@ -7,6 +7,20 @@
 
 TERM_TYPES = ['anatomy', 'compound', 'disease', 'gene', 'protein']
 
+# dictionary mapping terms to list of valid IDs.
+#
+# note: could do further magic by just building the URL directly, but this
+# is simpler to grok, I think.
+VALID_ID_URLS = {
+    'anatomy': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:anatomy/id@sort(id)?accept=csv',
+    'gene': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:gene/id@sort(id)?accept=csv',
+    'protein': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:protein/id@sort(id)?accept=csv',
+    'disease': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:disease/id@sort(id)?accept=csv',
+    'compound': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:compound/id@sort(id)?accept=csv',
+}
+
+
+
 rule all:
     message:
         f"Building content for all {len(TERM_TYPES)} controlled vocab types."
@@ -31,6 +45,26 @@ rule upload:
     """
 
 
+rule retrieve:
+    message:
+        f"retrieve list of ids in the registry"
+    input:
+        expand("data/validate/{term}.csv", term=TERM_TYPES)
+
+
+# use wildcards to pull down the valid IDs file for each term
+rule retrieve_term_wc:
+    output:
+        "data/validate/{term}.csv",
+    params:
+        # construct url by looking up term in VALID_ID_URLS dynamically
+        url = lambda w: VALID_ID_URLS[w.term]
+    shell: """
+        curl -L "{params.url}" -o {output}
+    """
+
+
+
 rule gene_json:
     message:
         "build markdown content for genes."
28 changes: 28 additions & 0 deletions scripts/cfde_common.py
@@ -14,6 +14,14 @@
     'protein': 'data/validate/protein.tsv',
 }
 
+ID_FILES = {
+    'anatomy': 'data/validate/anatomy.csv',
+    'compound': 'data/validate/compound.csv',
+    'disease': 'data/validate/disease.csv',
+    'gene': 'data/validate/gene.csv',
+    'protein': 'data/validate/protein.csv',
+}
+
 
 def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
     output_filename = f"{widget_name}_{urllib.parse.quote(cv_id)}.json"
@@ -35,3 +43,23 @@ def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
 
     if verbose:
         print(f"Wrote markdown to {output_filename}")
+
+def get_validation_ids(term):
+    # get list of validation IDs retrieved from the portal pages
+    validation_file = ID_FILES.get(term)
+    if validation_file is None:
+        print(f"ERROR: no validation file. Run `make retrieve`.", file=sys.stderr)
+        sys.exit(-1)
+
+    # load validation IDs; the ID is the first column
+    validation_ids = set()
+    with open(validation_file, 'r', newline='') as fp:
+        r = csv.DictReader(fp, delimiter=',')
+        for row in r:
+            validation_id = row['id']
+            validation_ids.add(validation_id)
+
+    print(f"Loaded {len(validation_ids)} IDs from {validation_file}.",
+          file=sys.stderr)
+
+    return validation_ids
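For reference, a short usage sketch of get_validation_ids from a content-building script (the term, example ID, and calling code here are illustrative and not part of this diff):

    import sys
    import cfde_common   # assuming the caller lives alongside scripts/cfde_common.py

    valid_ids = cfde_common.get_validation_ids('anatomy')
    cv_id = 'UBERON:0002113'   # example ID only; not guaranteed to be in the registry
    if cv_id not in valid_ids:
        print(f"skipping {cv_id}: not in the registry", file=sys.stderr)

Because the IDs are returned as a set, each membership check is O(1), so a script can validate every ID in a large input file before building any markdown.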