Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tqdm to track update progress #188

Merged
merged 2 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dynamic = ["version"]

[project.optional-dependencies]
pg = ["psycopg[binary]", "requests"]
etl = ["owlready2==0.40", "rdflib", "wags-tails>=0.1.2", "fastobo"]
etl = ["owlready2==0.40", "rdflib", "wags-tails>=0.1.2", "fastobo", "tqdm"]
test = ["pytest>=6.0", "pytest-cov", "httpx"]
dev = ["pre-commit", "ruff==0.2.0", "lxml", "xmlformatter"]
docs = [
Expand Down
3 changes: 2 additions & 1 deletion src/disease/etl/do.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Get Human Disease Ontology data."""
import owlready2 as owl
from tqdm import tqdm

from disease import PREFIX_LOOKUP
from disease.etl.base import OWLBase
Expand Down Expand Up @@ -47,7 +48,7 @@ def _transform_data(self) -> None:
diseases = self._get_subclasses(
disease_uri, owl.default_world.as_rdflib_graph()
)
for uri in diseases:
for uri in tqdm(diseases, ncols=80, disable=self._silent):
disease_class = do.search(iri=uri)[0]
if disease_class.deprecated:
continue
Expand Down
10 changes: 7 additions & 3 deletions src/disease/etl/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from timeit import default_timer as timer
from typing import Collection, Dict, List, Set, Tuple

from tqdm import tqdm

from disease import logger
from disease.database.database import AbstractDatabase
from disease.schemas import SourcePriority
Expand All @@ -10,12 +12,14 @@
class Merge:
"""Manage construction of record mergers for normalization."""

def __init__(self, database: AbstractDatabase) -> None:
def __init__(self, database: AbstractDatabase, silent: bool = True) -> None:
"""Initialize Merge instance.

:param Database database: db instance to use for record retrieval and creation.
:param silent: if ``True``, suppress console output
"""
self._database = database
self._silent = silent
self._groups = [] # list of tuples: (mondo_concept_id, set_of_ids)

def create_merged_concepts(self, record_ids: Collection[str]) -> None:
Expand All @@ -29,7 +33,7 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None:
# build groups
logger.info(f"Generating record ID sets from {len(record_ids)} records")
start = timer()
for concept_id in record_ids:
for concept_id in tqdm(record_ids, ncols=80, disable=self._silent):
try:
record = self._database.get_record_by_id(concept_id)
except AttributeError:
Expand All @@ -51,7 +55,7 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None:
# build merged concepts
logger.info("Creating merged records and updating database...")
start = timer()
for record_id, group in self._groups:
for record_id, group in tqdm(self._groups, ncols=80, disable=self._silent):
try:
merged_record, merged_ids = self._generate_merged_record(group)
except AttributeError:
Expand Down
3 changes: 2 additions & 1 deletion src/disease/etl/mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import ClassVar, DefaultDict, Dict, Optional, Set, Tuple

import fastobo
from tqdm import tqdm

from disease.etl.base import Base
from disease.schemas import DataLicenseAttributes, NamespacePrefix, SourceMeta
Expand Down Expand Up @@ -186,7 +187,7 @@ def _transform_data(self) -> None:
pediatric_diseases = self._construct_dependency_set(dag, peds_neoplasm_root)

reader = fastobo.iter(str(self._data_file.absolute()))
for item in reader:
for item in tqdm(reader, ncols=80, disable=self._silent):
concept_id = str(item.id).lower()
if concept_id.upper() not in diseases:
continue
Expand Down
3 changes: 2 additions & 1 deletion src/disease/etl/ncit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Set

import owlready2 as owl
from tqdm import tqdm

from disease import logger
from disease.etl.base import OWLBase
Expand Down Expand Up @@ -49,7 +50,7 @@ def _transform_data(self) -> None:
"""Get data from file and construct object for loading."""
ncit = owl.get_ontology(self._data_file.absolute().as_uri()).load()
disease_uris = self._get_disease_classes()
for uri in disease_uris:
for uri in tqdm(disease_uris, ncols=80, disable=self._silent):
disease_class = ncit.search(iri=uri)[0]
concept_id = f"{NamespacePrefix.NCIT.value}:{disease_class.name}"
if disease_class.P108:
Expand Down
3 changes: 2 additions & 1 deletion src/disease/etl/omim.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Optional

from tqdm import tqdm
from wags_tails import CustomData, DataSource

from disease.etl.base import Base
Expand Down Expand Up @@ -62,7 +63,7 @@ def _transform_data(self) -> None:
rows = [r.rstrip() for r in rows if not r.startswith("#")]
rows = [r.split("\t") for r in rows]
rows = [r for r in rows if r[0] not in ("Asterisk", "Caret", "Plus")]
for row in rows:
for row in tqdm(rows, ncols=80, disable=self._silent):
disease = {
"concept_id": f"{NamespacePrefix.OMIM.value}:{row[1]}",
}
Expand Down
69 changes: 43 additions & 26 deletions src/disease/etl/oncotree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import json
from typing import Dict

from tqdm import tqdm

from disease import logger
from disease.etl.base import Base
from disease.schemas import NamespacePrefix, SourceMeta
Expand All @@ -27,40 +29,55 @@ def _load_meta(self) -> None:
self._database.add_source_metadata(self._src_name, metadata)

def _traverse_tree(self, disease_node: Dict) -> None:
"""Traverse JSON tree and load diseases where possible.
"""Traverse JSON tree and queue diseases for loading where possible.

:param Dict disease_node: node in tree containing info for individual
disease.
:param disease_node: node in tree containing info for individual disease.
"""
if disease_node.get("level") >= 2:
disease = {
"concept_id": f"{NamespacePrefix.ONCOTREE.value}:{disease_node['code']}",
"label": disease_node["name"],
"xrefs": [],
"associated_with": [],
}
refs = disease_node.get("externalReferences", [])
for prefix, codes in refs.items():
if prefix == "UMLS":
normed_prefix = NamespacePrefix.UMLS.value
for code in codes:
normed_id = f"{normed_prefix}:{code}"
disease["associated_with"].append(normed_id)
elif prefix == "NCI":
normed_prefix = NamespacePrefix.NCIT.value
for code in codes:
normed_id = f"{normed_prefix}:{code}"
disease["xrefs"].append(normed_id)
else:
logger.warning(f"Unrecognized prefix: {prefix}")
continue
self._load_disease(disease)
if disease_node["level"] >= 2:
self._nodes.append(
{
"code": disease_node["code"],
"name": disease_node["name"],
"externalReferences": disease_node.get("externalReferences", []),
}
)
if disease_node.get("children"):
for child in disease_node["children"].values():
self._traverse_tree(child)

def _add_disease(self, disease_node: Dict) -> None:
    """Grab data from disease node and load into DB.

    UMLS references are stored under ``associated_with`` and NCI references
    under ``xrefs``. Any other reference prefix is logged and skipped.

    :param disease_node: individual node taken from OncoTree tree. Must provide
        ``code`` and ``name``; ``externalReferences`` is optional.
    """
    disease = {
        "concept_id": f"{NamespacePrefix.ONCOTREE.value}:{disease_node['code']}",
        "label": disease_node["name"],
        "xrefs": [],
        "associated_with": [],
    }
    # A node without references may carry ``None`` or an empty list here,
    # neither of which supports ``.items()``; treat both as "no references".
    refs = disease_node.get("externalReferences") or {}
    for prefix, codes in refs.items():
        if prefix == "UMLS":
            normed_prefix = NamespacePrefix.UMLS.value
            disease["associated_with"].extend(
                f"{normed_prefix}:{code}" for code in codes
            )
        elif prefix == "NCI":
            normed_prefix = NamespacePrefix.NCIT.value
            disease["xrefs"].extend(f"{normed_prefix}:{code}" for code in codes)
        else:
            logger.warning(f"Unrecognized prefix: {prefix}")
    self._load_disease(disease)

def _transform_data(self) -> None:
    """Read the OncoTree JSON file, queue its disease nodes, and load each one.

    The tree rooted at the ``TISSUE`` key is traversed first to fill
    ``self._nodes``; the queued nodes are then loaded one at a time under a
    progress bar that is disabled when ``self._silent`` is set.
    """
    with self._data_file.open() as tree_file:
        tree = json.load(tree_file)
    # Reset the queue before traversal so it only holds nodes from this run.
    self._nodes = []
    self._traverse_tree(tree["TISSUE"])
    progress = tqdm(self._nodes, ncols=80, disable=self._silent)
    for queued_node in progress:
        self._add_disease(queued_node)
Loading