From 768a06ffb18466c34d09106bf22c7c5e48be108a Mon Sep 17 00:00:00 2001 From: Olivier Filangi Date: Mon, 21 Oct 2024 16:41:29 +0200 Subject: [PATCH] minor change . readme --- README.md | 20 +------ config/1-article.json | 4 -- config/planteom-example.json | 58 +++++++++++++++++++ exec-1article.sh | 10 ---- exec.sh | 5 +- .../abstract/abstract_manager.py | 2 +- llm_semantic_annotator/core.py | 2 - 7 files changed, 64 insertions(+), 37 deletions(-) create mode 100644 config/planteom-example.json delete mode 100755 exec-1article.sh diff --git a/README.md b/README.md index f7d47e9..6a6dbf1 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ This approach aims to significantly enrich the metadata of scientific articles, ## Installation ```bash -python -m venv env -source env/bin/activate -pip install git+https://github.com/p2m2/encoder-ontology-match-abstract +exec.sh ``` ## Configuration @@ -103,22 +101,8 @@ check exemple on [config](./config) directory -### Execution +### Tests Execution -```bash -python -m llm_semantic_annotator config/test.json populate_owl_tag_embeddings -``` - -```bash -python -m llm_semantic_annotator config/test.json populate_ncbi_abstract_embeddings -``` -```bash -python -m llm_semantic_annotator config/igepp.json populate_ncbi_abstract_embeddings -``` - -```bash -python -m llm_semantic_annotator config/test.json compute_tag_chunk_similarities -``` ```bash python -m unittest discover diff --git a/config/1-article.json b/config/1-article.json index 0e21bb6..3f40956 100644 --- a/config/1-article.json +++ b/config/1-article.json @@ -77,10 +77,6 @@ "json_files" : [ "data/abstracts/abstracts_1.json", "data/abstracts/abstracts_2.json" - ], - "text_files" : [ - "data/abstracts/abstracts_3.txt", - "data/abstracts/abstracts_4.txt" ] } diff --git a/config/planteom-example.json b/config/planteom-example.json new file mode 100644 index 0000000..bd53417 --- /dev/null +++ b/config/planteom-example.json @@ -0,0 +1,58 @@ +{ + "encodeur" : "sentence-transformers/all-MiniLM-L6-v2", + "threshold_similarity_tag_chunk" : 0.60, + "threshold_similarity_tag" : 0.80, + "batch_size" : 32, + + "populate_owl_tag_embeddings" : { + "ontologies": { + "planteome_link" : { + "peco": { + "url": "http://purl.obolibrary.org/obo/peco.owl", + "prefix": "http://purl.obolibrary.org/obo/PECO_", + "format": "xml", + "label" : "", + "properties": [""] + }, + "po": { + "url": "http://purl.obolibrary.org/obo/po.owl", + "prefix": "http://purl.obolibrary.org/obo/PO_", + "format": "xml", + "label" : "", + "properties": [""] + }, + "pso": { + "url": "http://purl.obolibrary.org/obo/pso.owl", + "prefix": "http://purl.obolibrary.org/obo/PSO_", + "format": "xml", + "label" : "", + "properties": [""] + }, + "to": { + "url": "http://purl.obolibrary.org/obo/to.owl", + "prefix": "http://purl.obolibrary.org/obo/TO_", + "format": "xml", + "label" : "", + "properties": [""] + } + } + }, + "debug_nb_terms_by_ontology" : -1 + }, + "populate_ncbi_taxon_tag_embeddings" : { + "regex" : "(assic.*)|(arab.*)" , + "tags_per_file" : 2000 + }, + "populate_abstract_embeddings" : { + "abstracts_per_file" : 500, + "from_ncbi_api" : { + "ncbi_api_chunk_size" : 200, + "debug_nb_ncbi_request" : -1, + "retmax" : 2000, + "selected_term" : [ + "Crops%2C+Agricultural%2Fmetabolism%5BMeSH%5D" + ] + } + + } +} diff --git a/exec-1article.sh b/exec-1article.sh deleted file mode 100755 index de91d38..0000000 --- a/exec-1article.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -conffile=config/1-article.json -#rm -rf 1-article_w*/ -python -m llm_semantic_annotator $conffile populate_owl_tag_embeddings -#python -m llm_semantic_annotator $conffile populate_gbif_taxon_tag_embeddings -python -m llm_semantic_annotator $conffile populate_ncbi_taxon_tag_embeddings -python -m llm_semantic_annotator $conffile populate_abstract_embeddings -python -m llm_semantic_annotator $conffile compute_tag_chunk_similarities - diff --git a/exec.sh b/exec.sh index 70b8ea9..a7fdee5 100755 --- a/exec.sh +++ b/exec.sh @@ -46,14 +46,14 @@ execute_command() { create_venv_if_not_exists echo "What would you like to execute?" -echo "1. Full workflow" +echo "1. Pseudo workflow [2,4,5,6,7]" echo "2. populate_owl_tag_embeddings" echo "3. populate_ncbi_taxon_tag_embeddings" echo "4. populate_abstract_embeddings" echo "5. compute similarities between tags and chunks abstracts" echo "6. display similarities information" echo "7. build turtle knowledge graph" -echo "8. evaluate encoder with mesh descriptors" +echo "8. evaluate encoder with mesh descriptors (experimental)" read -p "Enter your choice (1-8): " choice case $choice in @@ -62,6 +62,7 @@ case $choice in #run_command python3 -m llm_semantic_annotator "$conffile" populate_ncbi_taxon_tag_embeddings run_command python3 -m llm_semantic_annotator "$conffile" populate_abstract_embeddings run_command python3 -m llm_semantic_annotator "$conffile" compute_tag_chunk_similarities + run_command python3 -m llm_semantic_annotator "$conffile" build_graph run_command python3 -m llm_semantic_annotator "$conffile" display_summary ;; 2|3|4|5|6|7|8) diff --git a/llm_semantic_annotator/abstract/abstract_manager.py b/llm_semantic_annotator/abstract/abstract_manager.py index 5675ba1..a46431e 100644 --- a/llm_semantic_annotator/abstract/abstract_manager.py +++ b/llm_semantic_annotator/abstract/abstract_manager.py @@ -73,7 +73,7 @@ def get_ncbi_abstracts_from_api(self): chunk = id_list[i:i+self.ncbi_api_chunk_size] ids = ",".join(chunk) fetch_url = f"{base_url}efetch.fcgi?db=pubmed&id={ids}&rettype=abstract&retmode=xml" - print(fetch_url) + fetch_response = requests.post(fetch_url) root = ET.fromstring(fetch_response.content) diff --git a/llm_semantic_annotator/core.py b/llm_semantic_annotator/core.py index 6f514a7..902dd49 100644 --- a/llm_semantic_annotator/core.py +++ b/llm_semantic_annotator/core.py @@ -80,8 +80,6 @@ def main_compute_tag_chunk_similarities(config_all): if len(abstracts_pth_files) == 0: raise FileNotFoundError("No abstracts embeddings found") - - ### Loading tags embeddings ### ----------------------- tag_embeddings_all = {}