diff --git a/.gitignore b/.gitignore
index abf4e20..c21d868 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ upload_json
 .snakemake
 __pycache__
 .DS_Store
+logs
+data/validate
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 3670b59..8152cd7 100644
--- a/Makefile
+++ b/Makefile
@@ -7,4 +7,8 @@ upload:
 update: upload
 
 clean:
-	rm -fr output_pieces_* upload_json
+	rm -fr output_pieces_* upload_json logs/*txt
+
+log:
+	for file in output_pieces*/*/; do echo $$file; echo $$file >> logs/chunks.txt; find $$file -maxdepth 1 -name "*md" | wc -l >> logs/chunks.txt; done
+	for file in upload_json/*json; do echo $$file >> logs/aggregated.txt; grep -o '{"id":' $$file | wc -l >> logs/aggregated.txt; done
diff --git a/Snakefile b/Snakefile
index 2057d0a..97552a3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -7,6 +7,20 @@
 
 TERM_TYPES = ['anatomy', 'compound', 'disease', 'gene', 'protein']
 
+# dictionary mapping each term type to the URL for its list of valid IDs.
+#
+# note: could do further magic by just building the URL directly, but this
+# is simpler to grok, I think.
+VALID_ID_URLS = {
+    'anatomy': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:anatomy/id@sort(id)?accept=csv',
+    'gene': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:gene/id@sort(id)?accept=csv',
+    'protein': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:protein/id@sort(id)?accept=csv',
+    'disease': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:disease/id@sort(id)?accept=csv',
+    'compound': 'https://app.nih-cfde.org/ermrest/catalog/1/attribute/CFDE:compound/id@sort(id)?accept=csv',
+    }
+
+
+
 rule all:
     message:
         f"Building content for all {len(TERM_TYPES)} controlled vocab types."
@@ -31,6 +45,26 @@ rule upload:
     """
 
 
+rule retrieve:
+    message:
+        f"retrieve list of ids in the registry"
+    input:
+        expand("data/validate/{term}.csv", term=TERM_TYPES)
+
+
+# use wildcards to pull down the valid IDs file for each term
+rule retrieve_term_wc:
+    output:
+        "data/validate/{term}.csv",
+    params:
+        # construct url by looking up term in VALID_ID_URLS dynamically
+        url = lambda w: VALID_ID_URLS[w.term]
+    shell: """
+        curl -L "{params.url}" -o {output}
+    """
+
+
+
 rule gene_json:
     message:
         "build markdown content for genes."
@@ -126,6 +160,7 @@ rule gene_json_alias_widget:
         script = "scripts/build-markdown-pieces-gene-translate.py",
         id_list = "data/inputs/gene_IDs_for_alias_tables.txt",
         alias_info = "data/inputs/Homo_sapiens.gene_info_20220304.txt_conv_wNCBI_AC.txt",
+        validate_csv = expand("data/validate/{term}.csv", term=TERM_TYPES),
     output:
         directory("output_pieces_gene/00-alias")
     params:
diff --git a/scripts/aggregate-markdown-pieces.py b/scripts/aggregate-markdown-pieces.py
index 744b217..c285a4c 100755
--- a/scripts/aggregate-markdown-pieces.py
+++ b/scripts/aggregate-markdown-pieces.py
@@ -93,8 +93,9 @@ def main():
     print(f"Loaded {n_loaded} chunks total.", file=sys.stderr)
     print(F"Skipped {n_skipped} files for not ending in .json.", file=sys.stderr)
-    print(f"Wrote {len(chunks)} chunks to {args.output_json}", file=sys.stderr)
-
+    print(f"Wrote {len(chunks)} chunks to {args.output_json}", file=sys.stderr)
+    print(f"Aggregated {n_loaded} chunks of information for {len(chunks)} IDs into {args.output_json}.", file=sys.stderr)
+
 
 
 if __name__ == '__main__':
     sys.exit(main())
diff --git a/scripts/cfde_common.py b/scripts/cfde_common.py
index f001cc6..e3edd29 100755
--- a/scripts/cfde_common.py
+++ b/scripts/cfde_common.py
@@ -14,6 +14,14 @@
     'protein': 'data/validate/protein.tsv',
     }
 
+ID_FILES = {
+    'anatomy': 'data/validate/anatomy.csv',
+    'compound': 'data/validate/compound.csv',
+    'disease': 'data/validate/disease.csv',
+    'gene': 'data/validate/gene.csv',
+    'protein': 'data/validate/protein.csv',
+    }
+
 
 def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
     output_filename = f"{widget_name}_{urllib.parse.quote(cv_id)}.json"
@@ -35,3 +43,23 @@ def write_output_pieces(output_dir, widget_name, cv_id, md, *, verbose=False):
 
     if verbose:
         print(f"Wrote markdown to {output_filename}")
+
+def get_validation_ids(term):
+    # get the list of valid IDs retrieved from the portal pages
+    validation_file = ID_FILES.get(term)
+    if validation_file is None:
+        print(f"ERROR: no validation file for term '{term}'. Run `make retrieve`.", file=sys.stderr)
+        sys.exit(-1)
+
+    # load valid IDs from the 'id' column of the CSV
+    validation_ids = set()
+    with open(validation_file, 'r', newline='') as fp:
+        r = csv.DictReader(fp, delimiter=',')
+        for row in r:
+            validation_id = row['id']
+            validation_ids.add(validation_id)
+
+    print(f"Loaded {len(validation_ids)} IDs from {validation_file}.",
+          file=sys.stderr)
+
+    return validation_ids
\ No newline at end of file
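
For context, a minimal sketch of how a build script (e.g. scripts/build-markdown-pieces-gene-translate.py) might consume the new cfde_common.get_validation_ids() helper. The actual call site is not part of this diff; the example IDs and the filtering loop below are illustrative assumptions only:

    #!/usr/bin/env python
    # hypothetical caller -- not part of this change; shows the intended use
    # of get_validation_ids() after `make retrieve` has populated
    # data/validate/{term}.csv.
    import sys
    import cfde_common

    term = 'gene'
    valid_ids = cfde_common.get_validation_ids(term)   # set of registry IDs

    # example ID list; the real scripts read these from data/inputs/ files
    id_list = ['ENSG00000139618', 'NOT_A_REAL_ID']
    for cv_id in id_list:
        if cv_id not in valid_ids:
            # skip IDs that are not in the portal registry
            print(f"WARNING: {cv_id} not in {term} ID list; skipping.",
                  file=sys.stderr)
            continue
        # ... build markdown and call cfde_common.write_output_pieces(...)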