Skip to content

Commit

Permalink
Babelon tables subsetprompts (#61)
Browse files Browse the repository at this point in the history
* trying to edit formatting with autopep8 to conform to pep8

* fixed several things, among which flake8 version, tox.ini formatting, lots in the source code format, ran it once to make sure it still works

* added babelon table creation, also fixed creation of prompts without missing translations in any language, added a dropna to make sure #52 is solved
  • Loading branch information
leokim-l authored Nov 26, 2024
1 parent 0f71c4b commit 6586222
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 7 deletions.
16 changes: 12 additions & 4 deletions src/malco/analysis/count_translated_prompts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
import os
import re

Expand All @@ -11,16 +12,15 @@
"nl",
"tr",
"zh",
"cs",
]

# Collect, per language, the phenopacket IDs that have a prompt file on disk.
# NOTE(review): the scraped diff retained both the old slicing line
# (`fn = fn[0:-14]`) and its replacement; keeping both would strip the
# suffix twice. Only the explicit suffix replacement is kept — it is also
# robust to filenames that contain extra underscores.
promptfiles = {}
for lang in langs:
    promptfiles[lang] = []
    for dirpath, dirnames, filenames in os.walk(fp + lang):
        for fn in filenames:
            # "Pxxx_en-prompt.txt" -> "Pxxx"
            fn = fn.replace('_' + lang + '-prompt.txt', '')
            promptfiles[lang].append(fn)
        break  # only scan the top-level directory for this language

Expand All @@ -33,7 +33,15 @@
nlset = set(promptfiles["nl"])
zhset = set(promptfiles["zh"])
trset = set(promptfiles["tr"])
csset = set(promptfiles["cs"])

# IDs that have a prompt in *every* language (the scraped diff kept the
# stale pre-"cs" intersection line as well; only the cs-inclusive one is
# meaningful, the duplicate assignment is dropped).
intersection = enset & esset & deset & itset & nlset & zhset & trset & csset

print("Common ppkts are: ", len(intersection))


# COPY: replicate the common prompts into one consolidated directory,
# one subdirectory per language.
dst_dir = "/Users/leonardo/git/malco/in_multlingual_nov24/prompts/"
for ppkt_id in intersection:  # renamed from `id` to avoid shadowing the builtin
    for lang in langs:
        shutil.copy(fp + lang + "/" + ppkt_id + '_' + lang + '-prompt.txt', dst_dir + lang)
41 changes: 41 additions & 0 deletions src/malco/analysis/xlsx2babelon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Create babelon complying tables from .xlsx that was sent to us
"""

import pandas as pd
#tr_lang_code = "es"
#tr_lang_code = "nl"
#tr_lang_code = "de"
tr_lang_code = "tr"
#tr_lang = "spanish-1"
#tr_lang = "dutch"
#tr_lang = "german"
tr_lang = "turkish-1"


data_path = "/Users/leonardo/data/translate_missing/"
data_file = data_path + "missing_hp_" + tr_lang + ".xlsx"

langs = ["it", "de", "es", "cs", "tr", ] # "zh" has a different structure, it's already been done
# German, manually edit based on excel as well

babelon_names = ["source_language", "source_value", "subject_id", "predicate_id",
"translation_language", "translation_value", "translation_status",
"translator", "translator_expertise", "translation_date",
]


df = pd.DataFrame(columns=babelon_names)
df[["source_value", "translation_value"]] = pd.read_excel(data_file, header=None, usecols="A:B")
df[["source_value", "subject_id"]] = df["source_value"].str.split('(', n=1, expand=True)
df['subject_id'] = df['subject_id'].str.replace(')','')
df['translation_value'] = df['translation_value'].str.replace(r'\(.*\)', '', regex=True)
df["source_language"] = "en"
df["predicate_id"] = "rdfs:label"
df["translation_language"] = tr_lang_code
df["translation_status"] = "CANDIDATE"
df["translator"] = "DeepL"
df["translator_expertise"] = "ALGORITHM"
df["translation_date"] = "2024-09-11"

df.to_excel(data_path + "hp-" + tr_lang_code + "-babelon.xlsx", index=False)
13 changes: 10 additions & 3 deletions src/malco/post_process/ranking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from oaklib import get_adapter
from oaklib.interfaces import OboGraphInterface
from shelved_cache import PersistentCache
from typing import Tuple

from malco.post_process.df_save_util import safe_save_tsv
from malco.post_process.mondo_score_utils import score_grounded_result
Expand Down Expand Up @@ -39,7 +40,12 @@ def compute_mrr_and_ranks(
out_subdir: str,
prompt_dir: str,
correct_answer_file: str,
) -> Path:
) -> Tuple[Path, Path, dict, Path]:
"""
Go from the slightly preprocessed data to a dataframe with ranks, correct results, and most importantly, score the results.
The scoring happens in score_grounded_result().
"""

# Read in results TSVs from self.output_dir that match glob results*tsv
out_caches = Path("caches")
Expand Down Expand Up @@ -108,11 +114,11 @@ def compute_mrr_and_ranks(
df["rank"] = df.groupby("label")["score"].rank(ascending=False, method="first")
label_4_non_eng = df["label"].str.replace(
"_[a-z][a-z]-prompt", "_en-prompt", regex=True
) # TODO is bug here?
)

# df['correct_term'] is an OMIM
# df['term'] is Mondo or OMIM ID, or even disease label
df["correct_term"] = label_4_non_eng.map(label_to_correct_term)
df["correct_term"] = label_4_non_eng.map(label_to_correct_term, na_action='ignore')

# Make sure caching is used in the following by unwrapping explicitly
results = []
Expand All @@ -136,6 +142,7 @@ def compute_mrr_and_ranks(
lambda row: 1 / row["rank"] if row["is_correct"] else 0, axis=1
)

# Drop rows whose correct term could not be mapped (issue #52).
# BUG FIX: DataFrame.dropna is NOT in-place by default — the original call
# discarded its return value and was a no-op. Assign the result back.
df = df.dropna(subset=["correct_term"])
# Save full data frame
full_df_path = output_dir / results_files[i].split("/")[0]
full_df_filename = "full_df_results.tsv"
Expand Down

0 comments on commit 6586222

Please sign in to comment.