
Commit

fix
ofilangi committed Oct 23, 2024
1 parent 229cea5 commit 33d8857
Showing 7 changed files with 215 additions and 103 deletions.
14 changes: 3 additions & 11 deletions exec-sbatch-gpu.sh
@@ -15,15 +15,7 @@ source ./check_slurm_memory.sh
. env/bin/activate

conffile=config/igepp.json
-#export TOKENIZERS_PARALLELISM=false
-#rm -rf igepp_w*/

-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_owl_tag_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_gbif_taxon_tag_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile populate_abstract_embeddings
-check_slurm_memory
-python -m llm_semantic_annotator $conffile compute_tag_chunk_similarities

+python -m llm_semantic_annotator $conffile 1


102 changes: 70 additions & 32 deletions exec.sh
@@ -1,11 +1,61 @@
#!/bin/bash

if [ "$#" -ne 1 ]; then
echo "Usage: $0 <config_file>"
help() {
cat << EOF
Usage: $0 <config_file> <int_commande>
Commands:
1. Pseudo workflow [2,4,5,6,7]
2. Populate OWL tag embeddings
3. Populate NCBI Taxon tag embeddings
4. Populate abstract embeddings
5. Compute similarities between tags and abstract chunks
6. Display similarities information
7. Build turtle knowledge graph
8. Build dataset abstracts annotations CSV file
9. Evaluate encoder with MeSH descriptors (experimental)
Details:
2: Compute TAG embeddings for all ontologies defined in the populate_owl_tag_embeddings section
3: Compute TAG embeddings for NCBI Taxon
4: Compute ABSTRACT embeddings (title + sentences) for all abstracts in the dataset
5: Compute similarities between TAGS and ABSTRACTS
6: Display similarities information on the console
7: Generate turtle file with information {score, tag} for each DOI
8: Generate CSV file with [doi, tag, pmid, reference_id]
EOF
}

+# Check for help option
+if [[ "$1" == "-h" ]]; then
+    help
+    exit 0
+fi

+# Check for correct number of arguments
+if [ "$#" -lt 2 ]; then
+    echo "Error: Not enough arguments."
+    echo "Usage: $0 <config_file> <command_int> [options]"
+    echo "Use '$0 -h' for more information."
+    exit 1
+fi

+config_file=$1
+command=$2

+# Validate config file
+if [ ! -f "$config_file" ]; then
+    echo "Error: Config file '$config_file' does not exist."
+    exit 1
+fi

+# Validate command is an integer
+if ! [[ "$command" =~ ^[0-9]+$ ]]; then
+    echo "Error: Command must be an integer."
+    exit 1
+fi

conffile="$1"
venv_name="llm_semantic_annotator_env"

# Function to create the virtual environment if it does not exist
@@ -31,44 +81,32 @@ run_command() {

execute_command() {
    case $1 in
-        1) run_command python3 -m llm_semantic_annotator "$conffile" populate_owl_tag_embeddings ;;
-        2) run_command python3 -m llm_semantic_annotator "$conffile" populate_ncbi_taxon_tag_embeddings ;;
-        3) run_command python3 -m llm_semantic_annotator "$conffile" populate_abstract_embeddings ;;
-        4) run_command python3 -m llm_semantic_annotator "$conffile" compute_tag_chunk_similarities ;;
-        5) run_command python3 -m llm_semantic_annotator "$conffile" display_summary ;;
-        6) run_command python3 -m llm_semantic_annotator "$conffile" build_rdf_graph ;;
-        7) run_command python3 -m llm_semantic_annotator "$conffile" build_dataset_abstracts_annotations ;;
-        8) run_command python3 -m llm_semantic_annotator "$conffile" evaluate_encoder ;;
-        *) echo "Invalid option" ;;
+        2) run_command python3 -m llm_semantic_annotator "$config_file" populate_owl_tag_embeddings ;;
+        3) run_command python3 -m llm_semantic_annotator "$config_file" populate_ncbi_taxon_tag_embeddings ;;
+        4) run_command python3 -m llm_semantic_annotator "$config_file" populate_abstract_embeddings ;;
+        5) run_command python3 -m llm_semantic_annotator "$config_file" compute_tag_chunk_similarities ;;
+        6) run_command python3 -m llm_semantic_annotator "$config_file" display_summary ;;
+        7) run_command python3 -m llm_semantic_annotator "$config_file" build_rdf_graph ;;
+        8) run_command python3 -m llm_semantic_annotator "$config_file" build_dataset_abstracts_annotations ;;
+        9) run_command python3 -m llm_semantic_annotator "$config_file" evaluate_encoder ;;
+        *) echo "Invalid option" ;;
    esac
}

# Create the virtual environment if it does not exist
create_venv_if_not_exists

echo "What would you like to execute?"
echo "1. Pseudo workflow [2,4,5,6,7]"
echo "2. populate_owl_tag_embeddings"
echo "3. populate_ncbi_taxon_tag_embeddings"
echo "4. populate_abstract_embeddings"
echo "5. compute similarities between tags and chunks abstracts"
echo "6. display similarities information"
echo "7. build turtle knowledge graph"
echo "8. build dataset abstracts annotations"
echo "9. evaluate encoder with mesh descriptors (experimental)"
read -p "Enter your choice (1-9): " choice

case $choice in
case $command in
1)
run_command python3 -m llm_semantic_annotator "$conffile" populate_owl_tag_embeddings
#run_command python3 -m llm_semantic_annotator "$conffile" populate_ncbi_taxon_tag_embeddings
run_command python3 -m llm_semantic_annotator "$conffile" populate_abstract_embeddings
run_command python3 -m llm_semantic_annotator "$conffile" compute_tag_chunk_similarities
run_command python3 -m llm_semantic_annotator "$conffile" build_rdf_graph
run_command python3 -m llm_semantic_annotator "$conffile" display_summary
run_command python3 -m llm_semantic_annotator "$config_file" populate_owl_tag_embeddings
#run_command python3 -m llm_semantic_annotator "$config_file" populate_ncbi_taxon_tag_embeddings
run_command python3 -m llm_semantic_annotator "$config_file" populate_abstract_embeddings
run_command python3 -m llm_semantic_annotator "$config_file" compute_tag_chunk_similarities
run_command python3 -m llm_semantic_annotator "$config_file" build_rdf_graph
run_command python3 -m llm_semantic_annotator "$config_file" display_summary
;;
2|3|4|5|6|7|8|9)
execute_command $((choice - 1))
execute_command $command
;;
*)
echo "Invalid choice"
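With this change exec.sh is no longer interactive: the command number is taken from the arguments instead of a read prompt. For example, ./exec.sh config/igepp.json 1 runs the pseudo workflow, ./exec.sh config/igepp.json 5 maps to python3 -m llm_semantic_annotator config/igepp.json compute_tag_chunk_similarities, and ./exec.sh -h prints the help text above.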
80 changes: 69 additions & 11 deletions llm_semantic_annotator/abstract/abstract_manager.py
@@ -5,13 +5,17 @@
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd

+import rdflib
+from collections import defaultdict

class AbstractManager:
-    def __init__(self, config, model_embedding_manager):
+    def __init__(self, config,model_embedding_manager,tags_manager):
        self.config = config

        self.abstracts_per_file=config.get('abstracts_per_file', 100)
        self.mem = model_embedding_manager
+        self.tags_manager = tags_manager

        if 'from_ncbi_api' in config:
            self.retmax = self.config.get('from_ncbi_api').get('retmax',10000)
            self.debug_nb_req = self.config.get('from_ncbi_api').get('debug_nb_ncbi_request',-1)
@@ -205,9 +209,52 @@ def get_files_abstracts_embeddings(self):

        return matching_files

+    def build_ascendants_terms(self,ascendants_dict,graphs):
+        for graph in graphs:
+            g = graph['g']
+            prefix = graph['prefix']
+            query = """ SELECT ?term ?ascendant WHERE {
+                ?term rdfs:subClassOf* ?ascendant .
+                FILTER(STRSTARTS(STR(?term), '"""+ prefix + """'))
+                FILTER(STRSTARTS(STR(?ascendant), '"""+ prefix + """'))
+            } """
+            # Run the query and fill the dictionary with the results
+            results = g.query(query)
+            for row in results:
+                term = str(row.term)
+                ascendant = str(row.ascendant)
+                if term != ascendant:  # avoid adding the term itself as its own ascendant
+                    ascendants_dict[term].append(ascendant)
+
+        # we add ascendants of ascendants to avoid future requests
+        ascendants_dict_to_add = {}
+        for term in ascendants_dict:
+            listAscendants = ascendants_dict[term]
+            liste_asc = listAscendants.copy()
+
+            while liste_asc:
+                ascendant = liste_asc.pop(0)
+                if ascendant not in ascendants_dict:
+                    ascendants_dict_to_add[ascendant] = liste_asc.copy()
+
+        ascendants_dict.update(ascendants_dict_to_add)
+
+        # Display the dictionary size
+        print("update dictionary size :", len(ascendants_dict))
+        return ascendants_dict
+
    def build_dataset_abstracts_annotations(self):
        import re,os
+        import time
+
+        graphs = self.tags_manager.get_graphs_ontologies()
+        ascendants_dict = defaultdict(list)
+        debut = time.time()
+        self.build_ascendants_terms(ascendants_dict,graphs)
+        duree = time.time() - debut
+        print(f"loading terms with ancestors : {duree:.4f} seconds")

        pattern = re.compile("abstracts_\\d+.json")
        for root, dirs, files in os.walk(self.config['retention_dir']):
            for filename in files:
@@ -228,18 +275,29 @@ def build_dataset_abstracts_annotations(self):
                doi = abstract['doi']
                if doi in abstracts_annot:
                    for tag in abstracts_annot[doi]:
-                        topicalDescriptor_list.append(tag)
-                        doi_list.append(doi)

                        if 'reference_id' in abstract:
-                            reference_id_list.append(abstract['reference_id'])
+                            reference_id=abstract['reference_id']
                        else:
-                            reference_id_list.append(None)
+                            reference_id=None

                        if 'pmid' in abstract:
-                            pmid_list.append(abstract['pmid'])
+                            pmid=abstract['pmid']
                        else:
-                            pmid_list.append(None)
+                            pmid=None

+                        # the tag is the term
+                        topicalDescriptor_list.append(tag)
+                        doi_list.append(doi)
+                        reference_id_list.append(reference_id)
+                        pmid_list.append(pmid)
+
+                        # ancestors
+                        for ancestor in ascendants_dict[tag]:
+                            topicalDescriptor_list.append(ancestor)
+                            doi_list.append(doi)
+                            reference_id_list.append(reference_id)
+                            pmid_list.append(pmid)

        df = pd.DataFrame({
            'doi': doi_list,
            'topicalDescriptor': topicalDescriptor_list,
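The new build_ascendants_terms method uses a SPARQL property path (rdfs:subClassOf*) to fetch every ancestor of every term in one query per ontology graph. Below is a minimal, self-contained sketch of that pattern with rdflib; the toy ex: ontology and its prefix are hypothetical illustration data, not part of the repository:

import rdflib
from collections import defaultdict

g = rdflib.Graph()
g.parse(data="""
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix ex:   <http://example.org/onto/> .
ex:Leaf  rdfs:subClassOf ex:Organ .
ex:Organ rdfs:subClassOf ex:PlantStructure .
""", format="turtle")

prefix = "http://example.org/onto/"
query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?term ?ascendant WHERE {
    ?term rdfs:subClassOf* ?ascendant .
    FILTER(STRSTARTS(STR(?term), '""" + prefix + """'))
    FILTER(STRSTARTS(STR(?ascendant), '""" + prefix + """'))
}"""

ascendants_dict = defaultdict(list)
for row in g.query(query):
    term, ascendant = str(row.term), str(row.ascendant)
    if term != ascendant:  # the * path also yields the reflexive match
        ascendants_dict[term].append(ascendant)

# Expected mapping (ancestor order may vary):
# ex:Leaf  -> [ex:Organ, ex:PlantStructure]
# ex:Organ -> [ex:PlantStructure]
print(dict(ascendants_dict))

In build_dataset_abstracts_annotations, each (doi, tag) match is then written once for the tag itself and once per entry of ascendants_dict[tag], so the CSV gains one row per inherited ancestor carrying the same doi, pmid and reference_id.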
77 changes: 32 additions & 45 deletions llm_semantic_annotator/core.py
@@ -24,63 +24,57 @@ def setup_general_config(config_all,methode):

    return config

-def main_populate_owl_tag_embeddings(config_all):
-    """Main function to generate and store tag embeddings in a database."""
+def get_owl_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_owl_tag_embeddings')

-    # Use the config parameters here
-    print(f"Ontologies : {config['ontologies']}")
-    print(f"Nb terms to compute : {config['debug_nb_terms_by_ontology']}")

    mem = ModelEmbeddingManager(config_all)

-    OwlTagManager(config,mem).manage_tags()
+    return OwlTagManager(config,mem)

-def main_populate_gbif_taxon_tag_embeddings(config_all):
+def get_gbif_taxon_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_gbif_taxon_tag_embeddings')
    mem = ModelEmbeddingManager(config_all)
+    return TaxonTagManager(config,mem)

-    TaxonTagManager(config,mem).manage_gbif_taxon_tags()

-def main_populate_ncbi_taxon_tag_embeddings(config_all):
+def get_ncbi_taxon_tag_manager(config_all):
    config = setup_general_config(config_all,'populate_ncbi_taxon_tag_embeddings')
    mem = ModelEmbeddingManager(config_all)
+    return TaxonTagManager(config,mem)

-    TaxonTagManager(config,mem).manage_ncbi_taxon_tags()

-def main_populate_abstract_embeddings(config_all):

+def get_abstract_manager(config_all):
    config = setup_general_config(config_all,'populate_abstract_embeddings')
    mem = ModelEmbeddingManager(config_all)
-    AbstractManager(config,mem).manage_abstracts()
+    return AbstractManager(config,mem,get_owl_tag_manager(config_all))

+def main_populate_owl_tag_embeddings(config_all):
+    """Main function to generate and store tag embeddings in a database."""
+    get_owl_tag_manager(config_all).manage_tags()
+
+def main_populate_gbif_taxon_tag_embeddings(config_all):
+    get_gbif_taxon_tag_manager(config_all).manage_gbif_taxon_tags()
+
+def main_populate_ncbi_taxon_tag_embeddings(config_all):
+    get_ncbi_taxon_tag_manager(config_all).manage_ncbi_taxon_tags()
+
+def main_populate_abstract_embeddings(config_all):
+    get_abstract_manager(config_all).manage_abstracts()

def main_compute_tag_chunk_similarities(config_all):
    """Main function to compute the similarity between all tags and chunks."""
-    config_owl = setup_general_config(config_all,'populate_owl_tag_embeddings')
-    config_abstract = setup_general_config(config_all,'populate_abstract_embeddings')

-    mem = ModelEmbeddingManager(config_all)

-    tags_pth_files = OwlTagManager(config_owl,mem).get_files_tags_embeddings()
+    tags_pth_files = get_owl_tag_manager(config_all).get_files_tags_embeddings()

    if len(tags_pth_files) == 0:
        raise FileNotFoundError("No tags embeddings found")

-    tags_taxon_pth_files = TaxonTagManager(config_owl,mem).get_files_tags_ncbi_taxon_embeddings()

-    if len(tags_taxon_pth_files) == 0:
-        warnings.warn("No tags taxon embeddings found")

+    tags_taxon_pth_files = get_ncbi_taxon_tag_manager(config_all).get_files_tags_ncbi_taxon_embeddings()
    tags_pth_files.extend(tags_taxon_pth_files)

-    abstracts_pth_files = AbstractManager(config_abstract,mem).get_files_abstracts_embeddings()
+    abstracts_pth_files = get_abstract_manager(config_all).get_files_abstracts_embeddings()

    if len(abstracts_pth_files) == 0:
        raise FileNotFoundError("No abstracts embeddings found")

+    ### Loading tags embeddings
+    ### -----------------------
+    mem = ModelEmbeddingManager(config_all)
    tag_embeddings_all = {}
    tag_embeddings = {}

@@ -132,26 +126,21 @@ def get_scores_files(retention_dir):
    return scores_files

def get_results_complete_similarities_and_tags_embedding(config_all):
-    mem = ModelEmbeddingManager(config_all)

    scores_files = []
    retention_dir = config_all['retention_dir']
+    mem = ModelEmbeddingManager(config_all)
-    config_owl = setup_general_config(config_all,'populate_owl_tag_embeddings')
-    config_abstract = setup_general_config(config_all,'populate_abstract_embeddings')

    scores_files = get_scores_files(retention_dir)

-    tags_pth_files = OwlTagManager(config_owl,mem).get_files_tags_embeddings()
+    tags_pth_files = get_owl_tag_manager(config_all).get_files_tags_embeddings()

    if len(tags_pth_files) == 0:
        raise FileNotFoundError("No tags embeddings found")

-    tags_taxon_pth_files = TaxonTagManager(config_owl,mem).get_files_tags_ncbi_taxon_embeddings()

-    if len(tags_taxon_pth_files) == 0:
-        warnings.warn("No tags taxon embeddings found")

+    tags_taxon_pth_files = get_ncbi_taxon_tag_manager(config_all).get_files_tags_ncbi_taxon_embeddings()
    tags_pth_files.extend(tags_taxon_pth_files)

-    abstracts_pth_files = AbstractManager(config_abstract,mem).get_files_abstracts_embeddings()

+    abstracts_pth_files = get_abstract_manager(config_all).get_files_abstracts_embeddings()

    if len(abstracts_pth_files) == 0:
        raise FileNotFoundError("No abstracts embeddings found")
@@ -209,6 +198,4 @@ def main_build_graph(config_all):
print("Erreur de décodage JSON")

def main_build_dataset_abstracts_annotation(config_all):
config = setup_general_config(config_all,'build_dataset_abstracts_annotation')
mem = ModelEmbeddingManager(config_all)
AbstractManager(config,mem).build_dataset_abstracts_annotations()
get_abstract_manager(config_all).build_dataset_abstracts_annotations()
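The refactoring in core.py centralizes manager construction in get_* factories, so each main_* entry point becomes a one-liner and AbstractManager receives the OwlTagManager it now requires. A sketch of how a caller might use the new functions directly; only the function names come from this commit, while the import path and the JSON loading of config/igepp.json are assumptions:

import json

from llm_semantic_annotator.core import (
    get_abstract_manager,
    main_build_dataset_abstracts_annotation,
)

# Assumption: the config file referenced by exec-sbatch-gpu.sh is plain JSON.
with open("config/igepp.json") as fh:
    config_all = json.load(fh)

# The entry point is now a thin wrapper...
main_build_dataset_abstracts_annotation(config_all)

# ...equivalent to calling the factory product directly:
get_abstract_manager(config_all).build_dataset_abstracts_annotations()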
