diff --git a/dynamodb_revisions/add_item_type.py b/dynamodb_revisions/add_item_type.py new file mode 100644 index 00000000..1d0e91a6 --- /dev/null +++ b/dynamodb_revisions/add_item_type.py @@ -0,0 +1,70 @@ +"""Add item_type attribute to all items.""" +import sys +from pathlib import Path +from timeit import default_timer as timer +import click +from botocore.exceptions import ClientError +import logging + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +sys.path.append(f"{PROJECT_ROOT}") + +from therapy.database import Database # noqa: E402 + + +logger = logging.getLogger('therapy') +logger.setLevel(logging.DEBUG) + +db = Database() + + +def add_item_type(label_and_type: str, concept_id: str, item_type: str): + """Add item_type to individual db item.""" + key = { + 'label_and_type': label_and_type, + 'concept_id': concept_id + } + update_expression = "set item_type=:r" + update_values = {':r': item_type} + try: + db.therapies.update_item(Key=key, + UpdateExpression=update_expression, + ExpressionAttributeValues=update_values) + except ClientError as e: + logger.error(f"boto3 client error in `database.update_record()`: " + f"{e.response['Error']['Message']}") + + +VALID_TYPES = {'identity', 'label', 'trade_name', 'rx_brand', 'alias', + 'other_id', 'xref', 'merger'} + + +def add_item_types(): + """Add item_type attribute to all items.""" + last_evaluated_key = None + while True: + if last_evaluated_key: + response = db.therapies.scan(ExclusiveStartKey=last_evaluated_key) + else: + response = db.therapies.scan() + + records = response['Items'] + for record in records: + label_and_type = record['label_and_type'] + item_type = label_and_type.split('##')[-1] + if item_type in VALID_TYPES: + add_item_type(label_and_type, record['concept_id'], item_type) + else: + logger.error(f"Couldn't parse item type for record: {record}") + + last_evaluated_key = response.get('LastEvaluatedKey') + if not last_evaluated_key: + break + + +if __name__ == '__main__': + click.echo("Adding item_types...") + start = timer() + add_item_types() + end = timer() + click.echo(f"finished adding item_types in {end - start:.5f}s.") diff --git a/setup.py b/setup.py index 4d57c821..babb1374 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ """Defines how metakb is packaged and distributed.""" from setuptools import setup -setup(version="0.2.14") +setup(version="0.2.15") diff --git a/tests/unit/data/therapies.json b/tests/unit/data/therapies.json index 6bf7feb9..efca8bab 100644 --- a/tests/unit/data/therapies.json +++ b/tests/unit/data/therapies.json @@ -1,6 +1,8 @@ [ { "label_and_type": "chembl:chembl11359##identity", + "src_name": "ChEMBL", + "item_type": "identity", "concept_id": "chembl:CHEMBL11359", "label": "CISPLATIN", "aliases": [ @@ -16,8 +18,7 @@ "Platinol-Aq" ], "approval_status": "approved", - "trade_names": ["PLATINOL", "PLATINOL-AQ", "CISPLATIN"], - "src_name": "ChEMBL" + "trade_names": ["PLATINOL", "PLATINOL-AQ", "CISPLATIN"] }, { "label_and_type": "cisplatin##label", @@ -172,7 +173,8 @@ { "label_and_type": "pms-cisplatin##trade_name", "concept_id": "drugbank:db00515", - "src_name": "DrugBank" + "src_name": "DrugBank", + "item_type": "trade_name" }, { "label_and_type": "drugbank:db00522##identity", @@ -350,7 +352,8 @@ { "label_and_type": "interferon alfacon-1##label", "concept_id": "wikidata:q15353101", - "src_name": "Wikidata" + "src_name": "Wikidata", + "item_type": "label" }, { "label_and_type": "wikidata:q191924##identity", @@ -487,7 +490,8 @@ { "label_and_type": "fda:5x5hb3vz3z##xref", "concept_id": "ncit:c74021", - "src_name": "NCIt" + "src_name": "NCIt", + "item_type": "xref" }, { "label_and_type": "chemidplus:197904-84-0##other_id", @@ -608,6 +612,7 @@ }, { "label_and_type": "chemidplus:15663-27-1##identity", + "item_type": "identity", "concept_id": "chemidplus:15663-27-1", "label": "Cisplatin", "aliases": [ @@ -623,7 +628,8 @@ { "label_and_type": "drugbank:db00515##other_id", "concept_id": "chemidplus:15663-27-1", - "src_name": "ChemIDplus" + "src_name": "ChemIDplus", + "item_type": "other_id" }, { "label_and_type": "cisplatin##label", @@ -632,6 +638,7 @@ }, { "label_and_type": "cis-diaminedichloroplatinum##alias", + "item_type": "alias", "concept_id": "chemidplus:15663-27-1", "src_name": "ChemIDplus" }, @@ -735,7 +742,8 @@ { "label_and_type": "dichlorodiammineplatinum##alias", "concept_id": "rxcui:2555", - "src_name": "RxNorm" + "src_name": "RxNorm", + "item_type": "alias" }, { "label_and_type": "cis platinum##alias", @@ -1416,6 +1424,7 @@ }, { "label_and_type": "ncit:c839##merger", + "item_type": "merger", "concept_id": "ncit:C839", "other_ids": ["chemidplus:8025-81-8"], "label": "Spiramycin", @@ -1467,7 +1476,8 @@ { "label_and_type": "rxcui:1041527##rx_brand", "concept_id": "rxcui:161", - "src_name": "RxNorm" + "src_name": "RxNorm", + "item_type": "rx_brand" }, { "label_and_type": "rxcui:218330##rx_brand", diff --git a/tests/unit/test_chemidplus_source.py b/tests/unit/test_chemidplus_source.py index 8080c063..5d37fb67 100644 --- a/tests/unit/test_chemidplus_source.py +++ b/tests/unit/test_chemidplus_source.py @@ -3,6 +3,7 @@ from therapy.schemas import Drug, MatchType import pytest from tests.conftest import compare_records +import datetime @pytest.fixture(scope='module') @@ -191,7 +192,9 @@ def test_meta(chemidplus): response = chemidplus.search('incoherent-string-of-text') assert response['source_meta_'].data_license == 'custom' assert response['source_meta_'].data_license_url == 'https://www.nlm.nih.gov/databases/download/terms_and_conditions.html' # noqa: E501 - assert response['source_meta_'].version == '20210204' + version = response['source_meta_'].version + assert len(version) == 8 + assert datetime.datetime.strptime(version, '%Y%m%d') assert response['source_meta_'].data_url == 'ftp://ftp.nlm.nih.gov/nlmdata/.chemidlease/' # noqa: E501 assert response['source_meta_'].rdp_url is None assert response['source_meta_'].data_license_attributes == { diff --git a/tests/unit/test_database.py b/tests/unit/test_database.py index 6305de09..0e01b426 100644 --- a/tests/unit/test_database.py +++ b/tests/unit/test_database.py @@ -4,6 +4,7 @@ from tests.conftest import TEST_ROOT import json import os +from boto3.dynamodb.conditions import Key @pytest.fixture(scope='module') @@ -41,3 +42,46 @@ def test_tables_created(db): existing_tables = db.dynamodb_client.list_tables()['TableNames'] assert 'therapy_concepts' in existing_tables assert 'therapy_metadata' in existing_tables + + +def test_item_type(db): + """Check that objects are tagged with item_type attribute.""" + filter_exp = Key('label_and_type').eq('chembl:chembl11359##identity') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'identity' + + filter_exp = Key('label_and_type').eq('interferon alfacon-1##label') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'label' + + filter_exp = Key('label_and_type').eq('fda:5x5hb3vz3z##xref') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'xref' + + filter_exp = Key('label_and_type').eq('drugbank:db00515##other_id') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'other_id' + + filter_exp = Key('label_and_type').eq('dichlorodiammineplatinum##alias') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'alias' + + filter_exp = Key('label_and_type').eq('rxcui:1041527##rx_brand') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'rx_brand' + + filter_exp = Key('label_and_type').eq('pms-cisplatin##trade_name') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'trade_name' + + filter_exp = Key('label_and_type').eq('ncit:c839##merger') + item = db.therapies.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'merger' diff --git a/therapy/__init__.py b/therapy/__init__.py index 2c1d475c..b67e8f64 100644 --- a/therapy/__init__.py +++ b/therapy/__init__.py @@ -10,7 +10,7 @@ logger.setLevel(logging.DEBUG) # TODO: Fix so that we don't have to change in setup.cfg -__version__ = "0.2.14" +__version__ = "0.2.15" class DownloadException(Exception): diff --git a/therapy/database.py b/therapy/database.py index d3df7035..1a5be138 100644 --- a/therapy/database.py +++ b/therapy/database.py @@ -96,8 +96,11 @@ def create_therapies_table(self, existing_tables: List): { 'AttributeName': 'src_name', 'AttributeType': 'S' + }, + { + 'AttributeName': 'item_type', + 'AttributeType': 'S' } - ], GlobalSecondaryIndexes=[ { @@ -115,6 +118,22 @@ def create_therapies_table(self, existing_tables: List): 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } + }, + { + 'IndexName': 'item_type_index', + 'KeySchema': [ + { + 'AttributeName': 'item_type', + 'KeyType': 'HASH' + } + ], + 'Projection': { + 'ProjectionType': 'KEYS_ONLY' + }, + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 10, + 'WriteCapacityUnits': 10 + } } ], ProvisionedThroughput={ @@ -227,7 +246,7 @@ def add_record(self, record: Dict, record_type="identity"): f"{e.response['Error']['Message']}") def add_ref_record(self, term: str, concept_id: str, ref_type: str): - """Add auxilliary/reference record to database. + """Add auxiliary/reference record to database. :param str term: referent term :param str concept_id: concept ID to refer to @@ -253,7 +272,7 @@ def update_record(self, concept_id: str, field: str, new_value: Any): :param str concept_id: record to update :param str field: name of field to update - :parm str new_value: new value + :param Any new_value: new value """ key = { 'label_and_type': f'{concept_id.lower()}##identity', diff --git a/therapy/etl/chembl.py b/therapy/etl/chembl.py index 442619a9..1b9f00c2 100644 --- a/therapy/etl/chembl.py +++ b/therapy/etl/chembl.py @@ -237,6 +237,7 @@ def _load_therapy(self, record, batch): """Load therapy record into DynamoDB.""" record['label_and_type'] = \ f"{record['concept_id'].lower()}##identity" + record['item_type'] = 'identity' batch.put_item(Item=record) self._added_ids.append(record['concept_id']) @@ -246,7 +247,8 @@ def _load_label(self, record, batch): 'label_and_type': f"{record['label'].lower()}##label", 'concept_id': f"{record['concept_id'].lower()}", - 'src_name': SourceName.CHEMBL.value + 'src_name': SourceName.CHEMBL.value, + 'item_type': 'label', } batch.put_item(Item=label) @@ -264,7 +266,8 @@ def _load_alias(self, record, batch): alias = { 'label_and_type': f"{alias}##alias", 'concept_id': f"{record['concept_id'].lower()}", - 'src_name': SourceName.CHEMBL.value + 'src_name': SourceName.CHEMBL.value, + 'item_type': 'alias', } batch.put_item(Item=alias) @@ -287,7 +290,8 @@ def _load_trade_name(self, record, batch): 'label_and_type': f"{trade_name}##trade_name", 'concept_id': f"{record['concept_id'].lower()}", - 'src_name': SourceName.CHEMBL.value + 'src_name': SourceName.CHEMBL.value, + 'item_type': 'trade_name' } batch.put_item(Item=trade_name) diff --git a/therapy/etl/chemidplus.py b/therapy/etl/chemidplus.py index e26e0119..6d0b5f1b 100644 --- a/therapy/etl/chemidplus.py +++ b/therapy/etl/chemidplus.py @@ -186,6 +186,7 @@ def _load_record(self, batch: BatchWriter, record: Dict): 'label_and_type': f'{record["label"].lower()}##label', 'concept_id': concept_id_ref, 'src_name': SourceName.CHEMIDPLUS.value, + 'item_type': 'label', }) else: del record['label'] @@ -204,13 +205,15 @@ def _load_record(self, batch: BatchWriter, record: Dict): batch.put_item(Item={ 'label_and_type': pk, 'concept_id': concept_id_ref, - 'src_name': SourceName.CHEMIDPLUS.value + 'src_name': SourceName.CHEMIDPLUS.value, + 'item_type': field_type, }) else: del record[field] record['src_name'] = SourceName.CHEMIDPLUS.value record['label_and_type'] = f'{concept_id_ref}##identity' + record['item_type'] = 'identity' batch.put_item(Item=record) self._added_ids.append(record['concept_id']) diff --git a/therapy/etl/drugbank.py b/therapy/etl/drugbank.py index 54774033..dab5622f 100644 --- a/therapy/etl/drugbank.py +++ b/therapy/etl/drugbank.py @@ -67,10 +67,7 @@ def perform_etl(self) -> List[str]: return self._added_ids def _download_data(self): - """Download DrugBank database XML file. - - :param PosixPath db_dir: The path to the DrugBank data directory - """ + """Download DrugBank database XML file.""" logger.info("Downloading DrugBank file...") if 'DRUGBANK_USER' in environ.keys() and \ 'DRUGBANK_PWD' in environ.keys(): @@ -204,6 +201,7 @@ def _load_therapy(self, batch, params): if not params[label_type] or len( {a.casefold() for a in params[label_type]}) > 20: del params[label_type] + params['item_type'] = 'identity' batch.put_item(Item=params) self._added_ids.append(params['concept_id']) @@ -292,7 +290,8 @@ def _load_label(self, label, concept_id, batch): 'label_and_type': f"{label.lower()}##label", 'concept_id': f"{concept_id.lower()}", - 'src_name': SourceName.DRUGBANK.value + 'src_name': SourceName.DRUGBANK.value, + 'item_type': 'label' } batch.put_item(Item=label) @@ -303,7 +302,8 @@ def _load_aliases(self, aliases, concept_id, batch): alias = { 'label_and_type': f"{alias.lower()}##alias", 'concept_id': f"{concept_id.lower()}", - 'src_name': SourceName.DRUGBANK.value + 'src_name': SourceName.DRUGBANK.value, + 'item_type': 'alias' } batch.put_item(Item=alias) @@ -315,7 +315,8 @@ def _load_trade_names(self, trade_names, concept_id, batch): trade_name = { 'label_and_type': f"{trade_name.lower()}##trade_name", 'concept_id': f"{concept_id.lower()}", - 'src_name': SourceName.DRUGBANK.value + 'src_name': SourceName.DRUGBANK.value, + 'item_type': 'trade_name', } batch.put_item(Item=trade_name) @@ -327,6 +328,7 @@ def _load_other_ids(self, other_ids, concept_id, batch): 'label_and_type': f'{other_id}##other_id', 'concept_id': concept_id, 'src_name': SourceName.DRUGBANK.value, + 'item_type': 'other_id', } batch.put_item(Item=item) @@ -338,6 +340,7 @@ def _load_xrefs(self, xrefs, concept_id, batch): 'label_and_type': f'{xref}##xref', 'concept_id': concept_id, 'src_name': SourceName.DRUGBANK.value, + 'item_type': 'xref', } batch.put_item(Item=item) diff --git a/therapy/etl/merge.py b/therapy/etl/merge.py index 535809cd..6b487616 100644 --- a/therapy/etl/merge.py +++ b/therapy/etl/merge.py @@ -184,4 +184,5 @@ def record_order(record): merged_attrs['label_and_type'] = \ f'{merged_attrs["concept_id"].lower()}##merger' + merged_attrs['item_type'] = 'merger' return merged_attrs diff --git a/therapy/etl/ncit.py b/therapy/etl/ncit.py index 9fdce3b1..84d03855 100644 --- a/therapy/etl/ncit.py +++ b/therapy/etl/ncit.py @@ -210,6 +210,7 @@ def _load_therapy(self, therapy: Therapy, batch): concept_id_lower = item['concept_id'].lower() item['label_and_type'] = f"{concept_id_lower}##identity" item['src_name'] = SourceName.NCIT.value + item['item_type'] = 'identity' for field_type, field in (('alias', 'aliases'), ('other_id', 'other_identifiers'), @@ -225,7 +226,8 @@ def _load_therapy(self, therapy: Therapy, batch): batch.put_item(Item={ 'label_and_type': pk, 'concept_id': concept_id_lower, - 'src_name': SourceName.NCIT.value + 'src_name': SourceName.NCIT.value, + 'item_type': field_type }) else: del item[field] @@ -235,7 +237,8 @@ def _load_therapy(self, therapy: Therapy, batch): batch.put_item(Item={ 'label_and_type': pk, 'concept_id': concept_id_lower, - 'src_name': SourceName.NCIT.value + 'src_name': SourceName.NCIT.value, + 'item_type': 'label' }) else: del therapy.label diff --git a/therapy/etl/rxnorm.py b/therapy/etl/rxnorm.py index 8060b2dc..b195b490 100644 --- a/therapy/etl/rxnorm.py +++ b/therapy/etl/rxnorm.py @@ -257,6 +257,7 @@ def _load_therapy(self, params, batch): self._load_label_types(params, batch) params['src_name'] = SourceName.RXNORM.value params['label_and_type'] = f"{params['concept_id'].lower()}##identity" + params['item_type'] = 'identity' try: batch.put_item(Item=params) self._added_ids.append(params['concept_id']) @@ -304,7 +305,8 @@ def _load_label_type(self, params, batch, label_type_sing, label_type_pl): t = { 'label_and_type': f"{t.lower()}##{label_type_sing}", 'concept_id': f"{params['concept_id'].lower()}", - 'src_name': SourceName.RXNORM.value + 'src_name': SourceName.RXNORM.value, + 'item_type': label_type_sing } try: batch.put_item(Item=t) @@ -385,7 +387,8 @@ def _load_brand_concepts(self, value, brands, batch): 'label_and_type': f"{brands.get(tn)}##rx_brand", 'concept_id': value['concept_id'], - 'src_name': SourceName.RXNORM.value + 'src_name': SourceName.RXNORM.value, + 'item_type': 'rx_brand' }) def _add_str_field(self, params, row, precise_ingredient, drug_forms, diff --git a/therapy/etl/wikidata.py b/therapy/etl/wikidata.py index 77a50296..1e3c373f 100644 --- a/therapy/etl/wikidata.py +++ b/therapy/etl/wikidata.py @@ -138,6 +138,7 @@ def _transform_data(self): if concept_id not in items.keys(): item = dict() item['label_and_type'] = f"{concept_id.lower()}##identity" + item['item_type'] = 'identity' item['concept_id'] = concept_id item['src_name'] = SourceName.WIKIDATA.value @@ -199,7 +200,8 @@ def _load_therapy(self, item: Dict, batch): batch.put_item(Item={ 'label_and_type': pk, 'concept_id': concept_id_lower, - 'src_name': SourceName.WIKIDATA.value + 'src_name': SourceName.WIKIDATA.value, + 'item_type': 'label' }) for field_type, field in (('alias', 'aliases'), @@ -216,7 +218,8 @@ def _load_therapy(self, item: Dict, batch): batch.put_item(Item={ 'label_and_type': pk, 'concept_id': concept_id_lower, - 'src_name': SourceName.NCIT.value + 'src_name': SourceName.NCIT.value, + 'item_type': field_type }) else: del item[field] diff --git a/therapy/query.py b/therapy/query.py index 2aebea72..4a246391 100644 --- a/therapy/query.py +++ b/therapy/query.py @@ -107,7 +107,7 @@ def _add_record(self, elif matches[src_name]['match_type'] == MatchType[match_type.upper()]: matches[src_name]['records'].append(drug) - return (response, src_name) + return response, src_name def _fetch_records(self, response: Dict[str, Dict], @@ -135,7 +135,7 @@ def _fetch_records(self, except ClientError as e: logger.error(e.response['Error']['Message']) - return (response, matched_sources) + return response, matched_sources def _fill_no_matches(self, resp: Dict[str, Dict]) -> Dict: """Fill all empty source_matches slots with NO_MATCH results. @@ -180,7 +180,7 @@ def _check_concept_id(self, (resp, src_name) = self._add_record(resp, item, MatchType.CONCEPT_ID.name) sources = sources - {src_name} - return (resp, sources) + return resp, sources def _check_match_type(self, query: str, @@ -202,7 +202,7 @@ def _check_match_type(self, (resp, matched_srcs) = self._fetch_records(resp, concept_ids, match_type) sources = sources - matched_srcs - return (resp, sources) + return resp, sources def _response_keyed(self, query: str, sources: Set[str]) -> Dict: """Return response as dict where key is source name and value