Skip to content

Commit

Permalink
Modify XPath to retrieve tables and references. (close #119)
Browse files Browse the repository at this point in the history
  • Loading branch information
nils-herrmann authored May 23, 2024
1 parent 0eb7114 commit f0bc9d2
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
22 changes: 13 additions & 9 deletions pubmed_parser/pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,15 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
return dict_out


def get_reference(reference):
    """
    Return the citation child of a reference node.

    The citation can be stored under one of three tag names. They are tried
    in this order and the first direct child that matches is returned:
    ``mixed-citation``, ``element-citation``, ``citation``.

    Parameters
    ----------
    reference: element
        A node exposing an ElementTree-style ``find(tag)`` method

    Returns
    -------
    ref: element or None
        The first matching citation child, or None when none of the three
        tags is present
    """
    # Order matters: when several citation flavours are present, the earlier
    # tag in this tuple wins.
    for tag in ("mixed-citation", "element-citation", "citation"):
        ref = reference.find(tag)
        # Compare against None explicitly: an element without children is
        # falsy in ElementTree-style APIs even though it was found.
        if ref is not None:
            return ref
    return None


def parse_pubmed_references(path):
"""
Given path to xml file, parse references articles
Expand All @@ -260,15 +269,10 @@ def parse_pubmed_references(path):
for reference in references:
ref_id = reference.attrib["id"]

if reference.find("mixed-citation") is not None:
ref = reference.find("mixed-citation")
elif reference.find("element-citation") is not None:
ref = reference.find("element-citation")
else:
ref = None

ref = get_reference(reference)
if ref is not None:
if "publication-type" in ref.attrib.keys() and ref is not None:
ref_types = ["citation-type", "publication-type"]
if any(ref_type in ref_types for ref_type in ref.attrib.keys()):
if ref.attrib.values() is not None:
journal_type = ref.attrib.values()[0]
else:
Expand Down Expand Up @@ -529,7 +533,7 @@ def parse_pubmed_table(path, return_xml=True):
pmc = dict_article_meta["pmc"]

# parse table
tables = tree.xpath(".//body.//sec.//table-wrap")
tables = tree.xpath(".//body//table-wrap")
table_dicts = list()
for table in tables:
if table.find("label") is not None:
Expand Down
12 changes: 12 additions & 0 deletions tests/test_pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,18 @@ def test_parse_pubmed_references():
assert isinstance(references[0], dict)
assert len(references) == 58, "Expected references to have length of 29"

references_9539395 = pp.parse_pubmed_references(pubmed_xml_9539395)
assert references_9539395[0].get('pmid') == '36094679'


def test_parse_pubmed_table():
    """
    Check the column headers of the first table parsed from the
    `pubmed_xml_9539395` input.
    """
    expected_header = [
        'Gene',
        'Uninfected and untreated',
        'Day 7 postinoculation',
        'PBS',
        'sACE22.v2.4-IgG1',
    ]
    parsed_tables = pp.parse_pubmed_table(pubmed_xml_9539395)
    first_table = parsed_tables[0]
    assert first_table.get('table_columns') == expected_header


def test_parse_pubmed_caption():
"""
Expand Down

0 comments on commit f0bc9d2

Please sign in to comment.