Skip to content

Commit

Permalink
Merge pull request #1 from p2m2/tag_identifier
Browse files (browse the repository at this point in the history)
Tag identifier
  • Loading branch information
ofilangi authored Oct 21, 2024
2 parents 523e3f3 + c1111ad commit 953f779
Show file tree
Hide file tree
Showing 23 changed files with 574 additions and 162,662 deletions.
20 changes: 5 additions & 15 deletions config/1-article.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"encodeur" : "sentence-transformers/all-MiniLM-L6-v2",
- "threshold_similarity_tag_chunk" : 0.65,
+ "threshold_similarity_tag_chunk" : 0.60,
"threshold_similarity_tag" : 0.80,
"batch_size" : 32,

Expand Down Expand Up @@ -46,12 +46,12 @@
}
},
"mesh_link" : {
- "mesh": {
- "filepath" : "data/mesh/2024.ttl",
+ "mesh": {
+ "filepath" : "data/mesh/mesh_concept.nt",
"prefix": "http://id.nlm.nih.gov/mesh/",
- "format": "turtle",
+ "format": "nt",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
- "properties": ["<http://www.w3.org/2000/01/rdf-schema#comment>"]
+ "properties": ["<http://id.nlm.nih.gov/mesh/vocab#scopeNote>"]
}
},
"chemical_link" : {
Expand All @@ -73,16 +73,6 @@
},
"populate_abstract_embeddings" : {
"abstracts_per_file" : 50,
-
- "from_ncbi_api" : {
- "ncbi_api_chunk_size" : 200,
- "debug_nb_ncbi_request" : -1,
- "retmax" : 200,
- "selected_term" : [
- "chemical+AND+glucosinolate+AND+biotic" ,
- "chemical+AND+glucosinolate+AND+abiotic"
- ]
- },
"from_file" : {
"json_files" : [
"data/abstracts/abstracts_1.json",
Expand Down
33 changes: 33 additions & 0 deletions config/mesh_evaluation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"encodeur" : "sentence-transformers/all-MiniLM-L6-v2",
"threshold_similarity_tag_chunk" : 0.40,
"threshold_similarity_tag" : 0.80,
"batch_size" : 32,

"populate_owl_tag_embeddings" : {
"ontologies": {
"mesh_link" : {
"mesh_descriptor": {
"filepath" : "data/mesh/mesh_descriptor.nt",
"prefix": "http://id.nlm.nih.gov/mesh/",
"format": "nt",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
"properties": ["<http://id.nlm.nih.gov/mesh/vocab#annotation>"]
}
}
},
"debug_nb_terms_by_ontology" : -1
},
"populate_abstract_embeddings" : {
"abstracts_per_file" : 50,

"from_ncbi_api" : {
"ncbi_api_chunk_size" : 20,
"debug_nb_ncbi_request" : -1,
"retmax" : 20,
"selected_term" : [
"Crops%2C+Agricultural%2Fmetabolism%5BMeSH%5D"
]
}
}
}
17 changes: 12 additions & 5 deletions config/mesh_example.json
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
{
"encodeur" : "sentence-transformers/all-MiniLM-L6-v2",
- "threshold_similarity_tag_chunk" : 0.40,
+ "threshold_similarity_tag_chunk" : 0.60,
"threshold_similarity_tag" : 0.80,
"batch_size" : 32,

"populate_owl_tag_embeddings" : {
"ontologies": {
"mesh_link" : {
"mesh": {
- "filepath" : "data/mesh/2024.ttl",
+ "filepath" : "data/mesh/mesh_concept.nt",
"prefix": "http://id.nlm.nih.gov/mesh/",
- "format": "turtle",
+ "format": "nt",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
- "properties": ["<http://www.w3.org/2000/01/rdf-schema#comment>"]
+ "properties": ["<http://id.nlm.nih.gov/mesh/vocab#scopeNote>"]
+ },
+ "mesh_descriptor": {
+ "filepath" : "data/mesh/mesh_descriptor.nt",
+ "prefix": "http://id.nlm.nih.gov/mesh/",
+ "format": "nt",
+ "label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
+ "properties": ["<http://id.nlm.nih.gov/mesh/vocab#annotation>"]
}
}
},
Expand All @@ -26,7 +33,7 @@
"abstracts_per_file" : 50,

"from_ncbi_api" : {
- "ncbi_api_chunk_size" : 200,
+ "ncbi_api_chunk_size" : 20,
"debug_nb_ncbi_request" : -1,
"retmax" : 20,
"selected_term" : [
Expand Down
46 changes: 46 additions & 0 deletions config/simple.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"encodeur" : "sentence-transformers/all-MiniLM-L6-v2",
"threshold_similarity_tag_chunk" : 0.65,
"threshold_similarity_tag" : 0.80,
"batch_size" : 32,

"populate_owl_tag_embeddings" : {
"ontologies": {
"planteome_link" : {
"po": {
"url": "http://purl.obolibrary.org/obo/po.owl",
"prefix": "http://purl.obolibrary.org/obo/PO_",
"format": "xml",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
"properties": ["<http://purl.obolibrary.org/obo/IAO_0000115>"]
},
"to": {
"url": "http://purl.obolibrary.org/obo/to.owl",
"prefix": "http://purl.obolibrary.org/obo/TO_",
"format": "xml",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
"properties": ["<http://purl.obolibrary.org/obo/IAO_0000115>"]
}
}
},
"debug_nb_terms_by_ontology" : -1
},
"populate_ncbi_taxon_tag_embeddings" : {
"regex" : "rassica.*" ,
"tags_per_file" : 2000
},
"populate_abstract_embeddings" : {
"abstracts_per_file" : 50,
"from_file" : {
"json_files" : [
"data/abstracts/abstracts_1.json",
"data/abstracts/abstracts_2.json"
],
"text_files" : [
"data/abstracts/abstracts_3.txt",
"data/abstracts/abstracts_4.txt"
]
}

}
}
6 changes: 3 additions & 3 deletions config/sylvain.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@
},
"mesh_link" : {
"mesh": {
- "filepath" : "data/mesh/2024.ttl",
+ "filepath" : "data/mesh/mesh_concept.nt",
"prefix": "http://id.nlm.nih.gov/mesh/",
- "format": "turtle",
+ "format": "nt",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
- "properties": ["<http://www.w3.org/2000/01/rdf-schema#comment>"]
+ "properties": ["<http://id.nlm.nih.gov/mesh/vocab#scopeNote>"]
}
}
},
Expand Down
6 changes: 3 additions & 3 deletions config/test_lotus.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@
},
"mesh_link" : {
"mesh": {
- "filepath" : "data/mesh/2024.ttl",
+ "filepath" : "data/mesh/mesh_concept.nt",
"prefix": "http://id.nlm.nih.gov/mesh/",
- "format": "turtle",
+ "format": "nt",
"label" : "<http://www.w3.org/2000/01/rdf-schema#label>",
- "properties": ["<http://www.w3.org/2000/01/rdf-schema#comment>"]
+ "properties": ["<http://id.nlm.nih.gov/mesh/vocab#scopeNote>"]
}
},
"chemical_link" : {
Expand Down
Loading

0 comments on commit 953f779

Please sign in to comment.