
Commit

fix
ofilangi committed Oct 23, 2024
1 parent 229cea5 commit 33d8857
Showing 7 changed files with 215 additions and 103 deletions.
14 changes: 3 additions & 11 deletions exec-sbatch-gpu.sh
@@ -15,15 +15,7 @@ source ./check_slurm_memory.sh
. env/bin/activate

conffile=config/igepp.json
-#export TOKENIZERS_PARALLELISM=false
-#rm -rf igepp_w*/

-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_owl_tag_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_gbif_taxon_tag_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_abstract_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile compute_tag_chunk_similarities

+python -m llm_semantic_annotator $conffile 1


102 changes: 70 additions & 32 deletions exec.sh
@@ -1,11 +1,61 @@
#!/bin/bash

if [ "$#" -ne 1 ]; then
echo "Usage: $0 <config_file>"
help() {
cat << EOF
Usage: $0 <config_file> <int_commande>
Commands:
1. Pseudo workflow [2,4,5,6,7]
2. Populate OWL tag embeddings
3. Populate NCBI Taxon tag embeddings
4. Populate abstract embeddings
5. Compute similarities between tags and abstract chunks
6. Display similarities information
7. Build turtle knowledge graph
8. Build dataset abstracts annotations CSV file
9. Evaluate encoder with MeSH descriptors (experimental)
Details:
2: Compute TAG embeddings for all ontologies defined in the populate_owl_tag_embeddings section
3: Compute TAG embeddings for NCBI Taxon
4: Compute ABSTRACT embeddings (title + sentences) for all abstracts in the dataset
5: Compute similarities between TAGS and ABSTRACTS
6: Display similarities information on the console
7: Generate turtle file with information {score, tag} for each DOI
8: Generate CSV file with [doi, tag, pmid, reference_id]
EOF
}

+# Check for help option
+if [[ "$1" == "-h" ]]; then
+    help
+    exit 0
+fi

+# Check for correct number of arguments
+if [ "$#" -lt 2 ]; then
+    echo "Error: Not enough arguments."
+    echo "Usage: $0 <config_file> <command_int> [options]"
+    echo "Use '$0 -h' for more information."
+    exit 1
+fi

+config_file=$1
+command=$2

+# Validate config file
+if [ ! -f "$config_file" ]; then
+    echo "Error: Config file '$config_file' does not exist."
+    exit 1
+fi

+# Validate command is an integer
+if ! [[ "$command" =~ ^[0-9]+$ ]]; then
+    echo "Error: Command must be an integer."
+    exit 1
+fi

conffile="$1"
venv_name="llm_semantic_annotator_env"

# Function to create the virtual environment if it does not exist
@@ -31,44 +81,32 @@ run_command() {

execute_command() {
    case $1 in
-        1) run_command python3 -m llm_semantic_annotator "$conffile" populate_owl_tag_embeddings ;;
-        2) run_command python3 -m llm_semantic_annotator "$conffile" populate_ncbi_taxon_tag_embeddings ;;
-        3) run_command python3 -m llm_semantic_annotator "$conffile" populate_abstract_embeddings ;;
-        4) run_command python3 -m llm_semantic_annotator "$conffile" compute_tag_chunk_similarities ;;
-        5) run_command python3 -m llm_semantic_annotator "$conffile" display_summary ;;
-        6) run_command python3 -m llm_semantic_annotator "$conffile" build_rdf_graph ;;
-        7) run_command python3 -m llm_semantic_annotator "$conffile" build_dataset_abstracts_annotations ;;
-        8) run_command python3 -m llm_semantic_annotator "$conffile" evaluate_encoder ;;
-        *) echo "Invalid option" ;;
+        2) run_command python3 -m llm_semantic_annotator "$config_file" populate_owl_tag_embeddings ;;
+        3) run_command python3 -m llm_semantic_annotator "$config_file" populate_ncbi_taxon_tag_embeddings ;;
+        4) run_command python3 -m llm_semantic_annotator "$config_file" populate_abstract_embeddings ;;
+        5) run_command python3 -m llm_semantic_annotator "$config_file" compute_tag_chunk_similarities ;;
+        6) run_command python3 -m llm_semantic_annotator "$config_file" display_summary ;;
+        7) run_command python3 -m llm_semantic_annotator "$config_file" build_rdf_graph ;;
+        8) run_command python3 -m llm_semantic_annotator "$config_file" build_dataset_abstracts_annotations ;;
+        9) run_command python3 -m llm_semantic_annotator "$config_file" evaluate_encoder ;;
+        *) echo "Invalid option" ;;
    esac
}

# Create the virtual environment if it does not exist
create_venv_if_not_exists

echo "What would you like to execute?"
echo "1. Pseudo workflow [2,4,5,6,7]"
echo "2. populate_owl_tag_embeddings"
echo "3. populate_ncbi_taxon_tag_embeddings"
echo "4. populate_abstract_embeddings"
echo "5. compute similarities between tags and chunks abstracts"
echo "6. display similarities information"
echo "7. build turtle knowledge graph"
echo "8. build dataset abstracts annotations"
echo "9. evaluate encoder with mesh descriptors (experimental)"
read -p "Enter your choice (1-9): " choice

case $choice in
case $command in
1)
run_command python3 -m llm_semantic_annotator "$conffile" populate_owl_tag_embeddings
#run_command python3 -m llm_semantic_annotator "$conffile" populate_ncbi_taxon_tag_embeddings
run_command python3 -m llm_semantic_annotator "$conffile" populate_abstract_embeddings
run_command python3 -m llm_semantic_annotator "$conffile" compute_tag_chunk_similarities
run_command python3 -m llm_semantic_annotator "$conffile" build_rdf_graph
run_command python3 -m llm_semantic_annotator "$conffile" display_summary
run_command python3 -m llm_semantic_annotator "$config_file" populate_owl_tag_embeddings
#run_command python3 -m llm_semantic_annotator "$config_file" populate_ncbi_taxon_tag_embeddings
run_command python3 -m llm_semantic_annotator "$config_file" populate_abstract_embeddings
run_command python3 -m llm_semantic_annotator "$config_file" compute_tag_chunk_similarities
run_command python3 -m llm_semantic_annotator "$config_file" build_rdf_graph
run_command python3 -m llm_semantic_annotator "$config_file" display_summary
;;
2|3|4|5|6|7|8|9)
execute_command $((choice - 1))
execute_command $command
;;
*)
echo "Invalid choice"
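With this change exec.sh is no longer interactive: the command number is taken from the arguments instead of a read prompt. For example, ./exec.sh config/igepp.json 1 runs the pseudo workflow, ./exec.sh config/igepp.json 5 maps to python3 -m llm_semantic_annotator config/igepp.json compute_tag_chunk_similarities, and ./exec.sh -h prints the help text above.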
80 changes: 69 additions & 11 deletions llm_semantic_annotator/abstract/abstract_manager.py
@@ -5,13 +5,17 @@
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd

+import rdflib
+from collections import defaultdict

class AbstractManager:
-    def __init__(self, config, model_embedding_manager):
+    def __init__(self, config,model_embedding_manager,tags_manager):
        self.config = config

        self.abstracts_per_file=config.get('abstracts_per_file', 100)
        self.mem = model_embedding_manager
+        self.tags_manager = tags_manager

        if 'from_ncbi_api' in config:
            self.retmax = self.config.get('from_ncbi_api').get('retmax',10000)
            self.debug_nb_req = self.config.get('from_ncbi_api').get('debug_nb_ncbi_request',-1)
@@ -205,9 +209,52 @@ def get_files_abstracts_embeddings(self):

        return matching_files

+    def build_ascendants_terms(self,ascendants_dict,graphs):
+        for graph in graphs:
+            g = graph['g']
+            prefix = graph['prefix']
+            query = """ SELECT ?term ?ascendant WHERE {
+                ?term rdfs:subClassOf* ?ascendant .
+                FILTER(STRSTARTS(STR(?term), '"""+ prefix + """'))
+                FILTER(STRSTARTS(STR(?ascendant), '"""+ prefix + """'))
+            } """
+            # Run the query and fill the dictionary with the results
+            results = g.query(query)
+            for row in results:
+                term = str(row.term)
+                ascendant = str(row.ascendant)
+                if term != ascendant:  # avoid adding the term itself as its own ascendant
+                    ascendants_dict[term].append(ascendant)
+
+        # we add ascendants of ascendants to avoid future requests
+        ascendants_dict_to_add = {}
+        for term in ascendants_dict:
+            listAscendants = ascendants_dict[term]
+            liste_asc = listAscendants.copy()
+
+            while liste_asc:
+                ascendant = liste_asc.pop(0)
+                if ascendant not in ascendants_dict:
+                    ascendants_dict_to_add[ascendant] = liste_asc.copy()
+
+        ascendants_dict.update(ascendants_dict_to_add)
+
+        # Display the dictionary size
+        print("update dictionary size :", len(ascendants_dict))
+        return ascendants_dict
+
    def build_dataset_abstracts_annotations(self):
        import re,os
+        import time
+
+        graphs = self.tags_manager.get_graphs_ontologies()
+        ascendants_dict = defaultdict(list)
+        debut = time.time()
+        self.build_ascendants_terms(ascendants_dict,graphs)
+        duree = time.time() - debut
+        print(f"loading terms with ancestors : {duree:.4f} seconds")

        pattern = re.compile("abstracts_\\d+.json")
        for root, dirs, files in os.walk(self.config['retention_dir']):
            for filename in files:
@@ -228,18 +275,29 @@ def build_dataset_abstracts_annotations(self):
                doi = abstract['doi']
                if doi in abstracts_annot:
                    for tag in abstracts_annot[doi]:
-                        topicalDescriptor_list.append(tag)
-                        doi_list.append(doi)

                        if 'reference_id' in abstract:
-                            reference_id_list.append(abstract['reference_id'])
+                            reference_id=abstract['reference_id']
                        else:
-                            reference_id_list.append(None)
+                            reference_id=None

                        if 'pmid' in abstract:
-                            pmid_list.append(abstract['pmid'])
+                            pmid=abstract['pmid']
                        else:
-                            pmid_list.append(None)
+                            pmid=None

+                        # the tag is the term
+                        topicalDescriptor_list.append(tag)
+                        doi_list.append(doi)
+                        reference_id_list.append(reference_id)
+                        pmid_list.append(pmid)
+
+                        # ancestors
+                        for ancestor in ascendants_dict[tag]:
+                            topicalDescriptor_list.append(ancestor)
+                            doi_list.append(doi)
+                            reference_id_list.append(reference_id)
+                            pmid_list.append(pmid)

        df = pd.DataFrame({
            'doi': doi_list,
            'topicalDescriptor': topicalDescriptor_list,
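The new build_ascendants_terms method uses a SPARQL property path (rdfs:subClassOf*) to fetch every ancestor of every term in one query per ontology graph. Below is a minimal, self-contained sketch of that pattern with rdflib; the toy ex: ontology and its prefix are hypothetical illustration data, not part of the repository:

import rdflib
from collections import defaultdict

g = rdflib.Graph()
g.parse(data="""
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix ex:   <http://example.org/onto/> .
ex:Leaf  rdfs:subClassOf ex:Organ .
ex:Organ rdfs:subClassOf ex:PlantStructure .
""", format="turtle")

prefix = "http://example.org/onto/"
query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?term ?ascendant WHERE {
    ?term rdfs:subClassOf* ?ascendant .
    FILTER(STRSTARTS(STR(?term), '""" + prefix + """'))
    FILTER(STRSTARTS(STR(?ascendant), '""" + prefix + """'))
}"""

ascendants_dict = defaultdict(list)
for row in g.query(query):
    term, ascendant = str(row.term), str(row.ascendant)
    if term != ascendant:  # the * path also yields the reflexive match
        ascendants_dict[term].append(ascendant)

# Expected mapping (ancestor order may vary):
# ex:Leaf  -> [ex:Organ, ex:PlantStructure]
# ex:Organ -> [ex:PlantStructure]
print(dict(ascendants_dict))

In build_dataset_abstracts_annotations, each (doi, tag) match is then written once for the tag itself and once per entry of ascendants_dict[tag], so the CSV gains one row per inherited ancestor carrying the same doi, pmid and reference_id.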
77 changes: 32 additions & 45 deletions llm_semantic_annotator/core.py
@@ -24,63 +24,57 @@ def setup_general_config(config_all,methode):

    return config

-def main_populate_owl_tag_embeddings(config_all):
-    """Main function to generate and store tag embeddings in a database."""
+def get_owl_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_owl_tag_embeddings')

-    # Use the config parameters here
-    print(f"Ontologies : {config['ontologies']}")
-    print(f"Nb terms to compute : {config['debug_nb_terms_by_ontology']}")

    mem = ModelEmbeddingManager(config_all)

-    OwlTagManager(config,mem).manage_tags()
+    return OwlTagManager(config,mem)

-def main_populate_gbif_taxon_tag_embeddings(config_all):
+def get_gbif_taxon_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_gbif_taxon_tag_embeddings')
    mem = ModelEmbeddingManager(config_all)
+    return TaxonTagManager(config,mem)

-    TaxonTagManager(config,mem).manage_gbif_taxon_tags()

-def main_populate_ncbi_taxon_tag_embeddings(config_all):
+def get_ncbi_taxon_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_ncbi_taxon_tag_embeddings')
    mem = ModelEmbeddingManager(config_all)
+    return TaxonTagManager(config,mem)

-    TaxonTagManager(config,mem).manage_ncbi_taxon_tags()

-def main_populate_abstract_embeddings(config_all):

+def get_abstract_manager(config_all):
    config = setup_general_config(config_all,'populate_abstract_embeddings')
    mem = ModelEmbeddingManager(config_all)
-    AbstractManager(config,mem).manage_abstracts()
+    return AbstractManager(config,mem,get_owl_tag_manager(config_all))

+def main_populate_owl_tag_embeddings(config_all):
+    """Main function to generate and store tag embeddings in a database."""
+    get_owl_tag_manager(config_all).manage_tags()
+
+def main_populate_gbif_taxon_tag_embeddings(config_all):
+    get_gbif_taxon_tag_manager(config_all).manage_gbif_taxon_tags()
+
+def main_populate_ncbi_taxon_tag_embeddings(config_all):
+    get_ncbi_taxon_tag_manager(config_all).manage_ncbi_taxon_tags()
+
+def main_populate_abstract_embeddings(config_all):
+    get_abstract_manager(config_all).manage_abstracts()

def main_compute_tag_chunk_similarities(config_all):
    """Main function to compute the similarity between all tags and chunks."""
-    config_owl = setup_general_config(config_all,'populate_owl_tag_embeddings')
-    config_abstract = setup_general_config(config_all,'populate_abstract_embeddings')

-    mem = ModelEmbeddingManager(config_all)

-    tags_pth_files = OwlTagManager(config_owl,mem).get_files_tags_embeddings()
+    tags_pth_files = get_owl_tag_manager(config_all).get_files_tags_embeddings()

    if len(tags_pth_files) == 0:
        raise FileNotFoundError("No tags embeddings found")

-    tags_taxon_pth_files = TaxonTagManager(config_owl,mem).get_files_tags_ncbi_taxon_embeddings()

-    if len(tags_taxon_pth_files) == 0:
-        warnings.warn("No tags taxon embeddings found")

+    tags_taxon_pth_files = get_ncbi_taxon_tag_manager(config_all).get_files_tags_ncbi_taxon_embeddings()
    tags_pth_files.extend(tags_taxon_pth_files)

-    abstracts_pth_files = AbstractManager(config_abstract,mem).get_files_abstracts_embeddings()
+    abstracts_pth_files = get_abstract_manager(config_all).get_files_abstracts_embeddings()

    if len(abstracts_pth_files) == 0:
        raise FileNotFoundError("No abstracts embeddings found")

+    ### Loading tags embeddings
+    ### -----------------------
+    mem = ModelEmbeddingManager(config_all)
    tag_embeddings_all = {}
    tag_embeddings = {}

@@ -132,26 +126,21 @@ def get_scores_files(retention_dir):
    return scores_files

def get_results_complete_similarities_and_tags_embedding(config_all):
-    mem = ModelEmbeddingManager(config_all)

    scores_files = []
    retention_dir = config_all['retention_dir']
+    mem = ModelEmbeddingManager(config_all)
-    config_owl = setup_general_config(config_all,'populate_owl_tag_embeddings')
-    config_abstract = setup_general_config(config_all,'populate_abstract_embeddings')

    scores_files = get_scores_files(retention_dir)

-    tags_pth_files = OwlTagManager(config_owl,mem).get_files_tags_embeddings()
+    tags_pth_files = get_owl_tag_manager(config_all).get_files_tags_embeddings()

    if len(tags_pth_files) == 0:
        raise FileNotFoundError("No tags embeddings found")

-    tags_taxon_pth_files = TaxonTagManager(config_owl,mem).get_files_tags_ncbi_taxon_embeddings()

-    if len(tags_taxon_pth_files) == 0:
-        warnings.warn("No tags taxon embeddings found")

+    tags_taxon_pth_files = get_ncbi_taxon_tag_manager(config_all).get_files_tags_ncbi_taxon_embeddings()
    tags_pth_files.extend(tags_taxon_pth_files)

-    abstracts_pth_files = AbstractManager(config_abstract,mem).get_files_abstracts_embeddings()

+    abstracts_pth_files = get_abstract_manager(config_all).get_files_abstracts_embeddings()

    if len(abstracts_pth_files) == 0:
        raise FileNotFoundError("No abstracts embeddings found")
@@ -209,6 +198,4 @@ def main_build_graph(config_all):
print("Erreur de décodage JSON")

def main_build_dataset_abstracts_annotation(config_all):
config = setup_general_config(config_all,'build_dataset_abstracts_annotation')
mem = ModelEmbeddingManager(config_all)
AbstractManager(config,mem).build_dataset_abstracts_annotations()
get_abstract_manager(config_all).build_dataset_abstracts_annotations()
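The refactoring in core.py centralizes manager construction in get_* factories, so each main_* entry point becomes a one-liner and AbstractManager receives the OwlTagManager it now requires. A sketch of how a caller might use the new functions directly; only the function names come from this commit, while the import path and the JSON loading of config/igepp.json are assumptions:

import json

from llm_semantic_annotator.core import (
    get_abstract_manager,
    main_build_dataset_abstracts_annotation,
)

# Assumption: the config file referenced by exec-sbatch-gpu.sh is plain JSON.
with open("config/igepp.json") as fh:
    config_all = json.load(fh)

# The entry point is now a thin wrapper...
main_build_dataset_abstracts_annotation(config_all)

# ...equivalent to calling the factory product directly:
get_abstract_manager(config_all).build_dataset_abstracts_annotations()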
