made plot nicer, linted, other minor things

monarch-initiative · Jan 6, 2025 · c914cd2 · c914cd2
1 parent 194a648
commit c914cd2
Show file tree

Hide file tree

Showing 11 changed files with 214 additions and 152 deletions.
diff --git a/src/malco/analysis/check_grounding_by_GPT.py b/src/malco/analysis/check_grounding_by_GPT.py
@@ -1,27 +1,31 @@
 # Check grounding of replies by GPT
-# Loop over replies, for each MONDO ID make dict entry with 
+# Loop over replies, for each MONDO ID make dict entry with
 # mondo_id: [gpt reply]
 # it's an array: a mondo id returns a list of labels, check how many are non-unique,
 # then put into df and add a third column with actual mondo_id and save to excel
-from malco.post_process.post_process_results_format import read_raw_result_yaml
 import os
+
 import pandas as pd
 
-filepath = "/Users/leonardo/git/malco/out_multlingual_nov24/raw_results/multilingual/en/results.yaml"
+from malco.post_process.post_process_results_format import read_raw_result_yaml
+
+filepath = (
+    "/Users/leonardo/git/malco/out_multlingual_nov24/raw_results/multilingual/en/results.yaml"
+)
 
 
 if os.path.isfile(filepath):
     data = []
 
     all_results = read_raw_result_yaml(filepath)
-    
+
     for this_result in all_results:
         extracted_object = this_result.get("extracted_object")
         ne = this_result.get("named_entities")
         if extracted_object and ne:
             label = extracted_object.get("label")
             gptinput = this_result.get("input_text")
-            data.append({"label": label, "gpt_reply": gptinput ,"grounded_response": ne})
+            data.append({"label": label, "gpt_reply": gptinput, "grounded_response": ne})
 
 df = pd.DataFrame(data, index=1)
-df['gpt_reply'].iloc[0].rstrip().split('\n')
+df["gpt_reply"].iloc[0].rstrip().split("\n")
diff --git a/src/malco/analysis/check_lens.py b/src/malco/analysis/check_lens.py
@@ -26,14 +26,14 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
 
 unique_ppkts = {}
 # model=str(sys.argv[1])
-#models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
+# models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
 models = ["en", "cs", "nl", "zh", "it", "es", "de", "tr"]
-with_post = False # if also post_process had been run, this can be True
+with_post = False  # if also post_process had been run, this can be True
 
 for model in models:
     print("===" * 10, "\nEvaluating now: ", model, "\n" + "===" * 10)
 
-    #yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
+    # yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
     yamlfile = f"out_multlingual_nov24/raw_results/multilingual/{model}/results.yaml"
     all_results = read_raw_result_yaml(yamlfile)
 
@@ -50,7 +50,6 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
             if terms:
                 counter += 1
 
-
     # The first should be equivalent to grepping "raw_" in some results.yaml
     print("The number of prompts that have something in results.yaml are: ", len(labelvec))
     print(

diff --git a/src/malco/analysis/count_translated_prompts_and_copy.py b/src/malco/analysis/count_translated_prompts_and_copy.py
@@ -1,6 +1,6 @@
-import shutil
 import os
 import re
+import shutil
 
 fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"
 
@@ -20,7 +20,7 @@
     promptfiles[lang] = []
     for dirpath, dirnames, filenames in os.walk(fp + lang):
         for fn in filenames:
-            fn = fn.replace('_' + lang +'-prompt.txt','')
+            fn = fn.replace("_" + lang + "-prompt.txt", "")
             promptfiles[lang].append(fn)
         break
 
@@ -44,4 +44,4 @@
 dst_dir = "/Users/leonardo/git/malco/in_multlingual_nov24/prompts/"
 for id in intersection:
     for lang in langs:
-        shutil.copy(fp + lang + "/" + id + '_' + lang +'-prompt.txt', dst_dir + lang) 
+        shutil.copy(fp + lang + "/" + id + "_" + lang + "-prompt.txt", dst_dir + lang)
diff --git a/src/malco/analysis/ita_grounding_analysis.py b/src/malco/analysis/ita_grounding_analysis.py
@@ -1,16 +1,17 @@
-from malco.post_process.post_process_results_format import read_raw_result_yaml
+import os
 from pathlib import Path
+
 import pandas as pd
-import os
+
+from malco.post_process.post_process_results_format import read_raw_result_yaml
+
 # Each row has
 #    c1      *       c2         *  c3   *       c4         *        c5              *            c6              *  c7                       * c8
 # PMID (str) * label/term (str) *       *   rank           * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label  *  MONDO label (if applicable) * correct? 0/1 (in excel)
 
 # Correct results
 file = "/Users/leonardo/git/malco/in_ita_reply/correct_results.tsv"
-answers = pd.read_csv(
-        file, sep="\t", header=None, names=["description", "term", "label"]
-    )
+answers = pd.read_csv(file, sep="\t", header=None, names=["description", "term", "label"])
 
 # Mapping each label to its correct term
 cres = answers.set_index("label").to_dict()
@@ -26,26 +27,37 @@
 for ppkt_out in ita_result:
     extracted_object = ppkt_out.get("extracted_object")
     if extracted_object:
-        label = extracted_object.get("label").replace('_it-prompt', '_en-prompt')
+        label = extracted_object.get("label").replace("_it-prompt", "_en-prompt")
         terms = extracted_object.get("terms")
         if terms:
             num_terms = len(terms)
             rank_list = [i + 1 for i in range(num_terms)]
             for term, rank in zip(terms, rank_list):
-                data.append({"pubmedid": label, "term": term, "mondo_label": float('Nan'), "rank": rank, "ita_reply": True, "correct_omim_id": cres['term'][label], 
-                             "correct_omim_description": cres['description'][label]})
+                data.append(
+                    {
+                        "pubmedid": label,
+                        "term": term,
+                        "mondo_label": float("Nan"),
+                        "rank": rank,
+                        "ita_reply": True,
+                        "correct_omim_id": cres["term"][label],
+                        "correct_omim_description": cres["description"][label],
+                    }
+                )
 
 
 # load eng replies
-eng_file = Path("/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it_w_en/results.yaml")
+eng_file = Path(
+    "/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it_w_en/results.yaml"
+)
 eng_result = read_raw_result_yaml(eng_file)
 
 # extract named_entities, id and label from yaml for eng
 # extract input_text from yaml for ita, or extracted_object, terms
 for ppkt_out in eng_result:
     extracted_object = ppkt_out.get("extracted_object")
     if extracted_object:
-        label = extracted_object.get("label").replace('_it-prompt', '_en-prompt')
+        label = extracted_object.get("label").replace("_it-prompt", "_en-prompt")
         terms = extracted_object.get("terms")
         if terms:
             num_terms = len(terms)
@@ -54,13 +66,22 @@
                 if term.startswith("MONDO"):
                     ne = ppkt_out.get("named_entities")
                     for entity in ne:
-                        if entity.get('id')==term:
-                            mlab = entity.get('label')
+                        if entity.get("id") == term:
+                            mlab = entity.get("label")
                 else:
-                    mlab = float('Nan')
+                    mlab = float("Nan")
 
-                data.append({"pubmedid": label, "term": mlab, "mondo_label": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres["term"][label], 
-                             "correct_omim_description": cres['description'][label]})
+                data.append(
+                    {
+                        "pubmedid": label,
+                        "term": mlab,
+                        "mondo_label": term,
+                        "rank": rank,
+                        "ita_reply": False,
+                        "correct_omim_id": cres["term"][label],
+                        "correct_omim_description": cres["description"][label],
+                    }
+                )
 
 # Create DataFrame
 column_names = [
@@ -75,5 +96,5 @@
 
 df = pd.DataFrame(data)
 df.columns = column_names
-df.sort_values(by = ['PMID', 'ita_reply', 'rank'], inplace=True) 
-#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") # does not work, wrong path, not important
+df.sort_values(by=["PMID", "ita_reply", "rank"], inplace=True)
+# df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") # does not work, wrong path, not important
diff --git a/src/malco/analysis/main_analysis.py b/src/malco/analysis/main_analysis.py
@@ -1,10 +1,12 @@
 # Main Analysis of output
-import pandas as pd 
-from pathlib import Path 
+from pathlib import Path
+
+import pandas as pd
+
 from malco.post_process.df_save_util import safe_save_tsv
 from malco.post_process.generate_plots import make_plots
 
-# MALCO langs check output. 
+# MALCO langs check output.
 # If the number of grounding failures is not too different across languages, we could use
 # v
 
@@ -15,55 +17,60 @@
 mrr_file = data_dir / "mrr_result.tsv"
 
 df = pd.read_csv(topn_file, delimiter="\t")
-#TODO add strict scoring here
-#valid_cases = df["num_cases"] - df["grounding_failed"]
+# TODO add strict scoring here
+# valid_cases = df["num_cases"] - df["grounding_failed"]
 valid_cases = df["num_cases"]
-df["top1"] = (df["n1"]) / valid_cases
-df["top3"] = (df["n1"] + df["n2"] + df["n3"]) / valid_cases
-df["top5"] = (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / valid_cases
-df["top10"] = (
-    df["n1"]
-    + df["n2"]
-    + df["n3"]
-    + df["n4"]
-    + df["n5"]
-    + df["n6"]
-    + df["n7"]
-    + df["n8"]
-    + df["n9"]
-    + df["n10"]
-) / valid_cases
-df["not_found"] = (df["nf"] + df["grounding_failed"]) / valid_cases
+df["Top-1"] = 100 * (df["n1"]) / valid_cases
+df["Top-3"] = 100 * (df["n1"] + df["n2"] + df["n3"]) / valid_cases
+df["Top-5"] = 100 * (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / valid_cases
+df["Top-10"] = (
+    100
+    * (
+        df["n1"]
+        + df["n2"]
+        + df["n3"]
+        + df["n4"]
+        + df["n5"]
+        + df["n6"]
+        + df["n7"]
+        + df["n8"]
+        + df["n9"]
+        + df["n10"]
+    )
+    / valid_cases
+)
+df["Not Found"] = 100 * (df["nf"] + df["grounding_failed"]) / valid_cases
 
 df_aggr = pd.DataFrame()
 df_aggr = pd.melt(
     df,
     id_vars=comparing,
-    value_vars=["top1", "top3", "top5", "top10", "not_found"],
+    value_vars=["Top-1", "Top-3", "Top-5", "Top-10", "Not Found"],
     var_name="Rank_in",
     value_name="percentage",
 )
 
+
 # If "topn_aggr.tsv" already exists, prepend "old_"
 # It's the user's responsibility to know only up to 2 versions can exist, then data is lost
 topn_aggr_file_name = "topn_aggr.tsv"
 topn_aggr_file = data_dir / topn_aggr_file_name
 safe_save_tsv(data_dir, topn_aggr_file_name, df_aggr)
 
-languages = ["en","es","cs","tr","de","it","zh","nl"]
+languages = ["en", "es", "cs", "tr", "de", "it", "zh", "nl"]
 
 num_ppkt = {}
 for lang in languages:
-    num_ppkt[lang] = df[df[comparing]==lang]['num_cases'].iloc[0]
+    num_ppkt[lang] = df[df[comparing] == lang]["num_cases"].iloc[0]
 
-models = "USELESS FOR LANGUAGES" # !
+models = "USELESS FOR LANGUAGES"  # !
 
 make_plots(
-            mrr_file,
-            data_dir,
-            languages,
-            num_ppkt,
-            models,
-            topn_aggr_file,
-            comparing,
-        )
+    mrr_file,
+    data_dir,
+    languages,
+    num_ppkt,
+    models,
+    topn_aggr_file,
+    comparing,
+)