Skip to content

Commit

Permalink
Babelon tables subsetprompts (#61)
Browse files Browse the repository at this point in the history
* trying to edit formatting with autopep8 to conform to pep8

* fixed several things, among which flake8 version, tox.ini formatting, lots in the source code format, ran it once to make sure it still works

* added babelon table creation, also fixed creation of prompts without missing translations in any language, added a dropna to make sure #52 is solved
  • Loading branch information
leokim-l authored Nov 26, 2024
1 parent 0f71c4b commit 6586222
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 7 deletions.
16 changes: 12 additions & 4 deletions src/malco/analysis/count_translated_prompts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
import os
import re

Expand All @@ -11,16 +12,15 @@
"nl",
"tr",
"zh",
"cs",
]

# Collect, per language, the phenopacket IDs that have a prompt file on disk.
# NOTE(review): the scraped diff retained both the old slicing line
# (`fn = fn[0:-14]`) and its replacement; keeping both would strip the
# suffix twice. Only the explicit suffix replacement is kept — it is also
# robust to filenames that contain extra underscores.
promptfiles = {}
for lang in langs:
    promptfiles[lang] = []
    for dirpath, dirnames, filenames in os.walk(fp + lang):
        for fn in filenames:
            # "Pxxx_en-prompt.txt" -> "Pxxx"
            fn = fn.replace('_' + lang + '-prompt.txt', '')
            promptfiles[lang].append(fn)
        break  # only scan the top-level directory for this language

Expand All @@ -33,7 +33,15 @@
nlset = set(promptfiles["nl"])
zhset = set(promptfiles["zh"])
trset = set(promptfiles["tr"])
csset = set(promptfiles["cs"])

# IDs that have a prompt in *every* language (the scraped diff kept the
# stale pre-"cs" intersection line as well; only the cs-inclusive one is
# meaningful, the duplicate assignment is dropped).
intersection = enset & esset & deset & itset & nlset & zhset & trset & csset

print("Common ppkts are: ", len(intersection))


# COPY: replicate the common prompts into one consolidated directory,
# one subdirectory per language.
dst_dir = "/Users/leonardo/git/malco/in_multlingual_nov24/prompts/"
for ppkt_id in intersection:  # renamed from `id` to avoid shadowing the builtin
    for lang in langs:
        shutil.copy(fp + lang + "/" + ppkt_id + '_' + lang + '-prompt.txt', dst_dir + lang)
41 changes: 41 additions & 0 deletions src/malco/analysis/xlsx2babelon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Create babelon complying tables from .xlsx that was sent to us
"""

import pandas as pd
#tr_lang_code = "es"
#tr_lang_code = "nl"
#tr_lang_code = "de"
tr_lang_code = "tr"
#tr_lang = "spanish-1"
#tr_lang = "dutch"
#tr_lang = "german"
tr_lang = "turkish-1"


data_path = "/Users/leonardo/data/translate_missing/"
data_file = data_path + "missing_hp_" + tr_lang + ".xlsx"

langs = ["it", "de", "es", "cs", "tr", ] # "zh" has a different structure, it's already been done
# German, manually edit based on excel as well

babelon_names = ["source_language", "source_value", "subject_id", "predicate_id",
"translation_language", "translation_value", "translation_status",
"translator", "translator_expertise", "translation_date",
]


df = pd.DataFrame(columns=babelon_names)
df[["source_value", "translation_value"]] = pd.read_excel(data_file, header=None, usecols="A:B")
df[["source_value", "subject_id"]] = df["source_value"].str.split('(', n=1, expand=True)
df['subject_id'] = df['subject_id'].str.replace(')','')
df['translation_value'] = df['translation_value'].str.replace(r'\(.*\)', '', regex=True)
df["source_language"] = "en"
df["predicate_id"] = "rdfs:label"
df["translation_language"] = tr_lang_code
df["translation_status"] = "CANDIDATE"
df["translator"] = "DeepL"
df["translator_expertise"] = "ALGORITHM"
df["translation_date"] = "2024-09-11"

df.to_excel(data_path + "hp-" + tr_lang_code + "-babelon.xlsx", index=False)
13 changes: 10 additions & 3 deletions src/malco/post_process/ranking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from oaklib import get_adapter
from oaklib.interfaces import OboGraphInterface
from shelved_cache import PersistentCache
from typing import Tuple

from malco.post_process.df_save_util import safe_save_tsv
from malco.post_process.mondo_score_utils import score_grounded_result
Expand Down Expand Up @@ -39,7 +40,12 @@ def compute_mrr_and_ranks(
out_subdir: str,
prompt_dir: str,
correct_answer_file: str,
) -> Path:
) -> Tuple[Path, Path, dict, Path]:
"""
Go from the slightly preprocessed data to a dataframe with ranks, correct results, and most importantly, score the results.
The scoring happens in score_grounded_result().
"""

# Read in results TSVs from self.output_dir that match glob results*tsv
out_caches = Path("caches")
Expand Down Expand Up @@ -108,11 +114,11 @@ def compute_mrr_and_ranks(
df["rank"] = df.groupby("label")["score"].rank(ascending=False, method="first")
label_4_non_eng = df["label"].str.replace(
"_[a-z][a-z]-prompt", "_en-prompt", regex=True
) # TODO is bug here?
)

# df['correct_term'] is an OMIM
# df['term'] is Mondo or OMIM ID, or even disease label
df["correct_term"] = label_4_non_eng.map(label_to_correct_term)
df["correct_term"] = label_4_non_eng.map(label_to_correct_term, na_action='ignore')

# Make sure caching is used in the following by unwrapping explicitly
results = []
Expand All @@ -136,6 +142,7 @@ def compute_mrr_and_ranks(
lambda row: 1 / row["rank"] if row["is_correct"] else 0, axis=1
)

# Drop rows whose correct term could not be mapped (issue #52).
# BUG FIX: DataFrame.dropna is NOT in-place by default — the original call
# discarded its return value and was a no-op. Assign the result back.
df = df.dropna(subset=["correct_term"])
# Save full data frame
full_df_path = output_dir / results_files[i].split("/")[0]
full_df_filename = "full_df_results.tsv"
Expand Down

0 comments on commit 6586222

Please sign in to comment.