-
-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathadd_mesh_xrefs_to_doid_owl.py
139 lines (124 loc) · 4.94 KB
/
add_mesh_xrefs_to_doid_owl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""This script adds newly inferred cross-references for DOID.
These are directly added to the version controlled DOID OWL file.
"""
import csv
import obonet
from biomappings import load_mappings
# Paths below point into one developer's local checkout of the
# HumanDiseaseOntology repository; adjust them before running elsewhere.

# OWL file that the __main__ block reads and then overwrites in place
EDITABLE_OWL_PATH = "/Users/ben/src/HumanDiseaseOntology/src/ontology/doid-edit.owl"
# OBO file that the __main__ block reads to find DOID terms with MESH xrefs
OBO_PATH = "/Users/ben/src/HumanDiseaseOntology/src/ontology/HumanDO.obo"
# TSV file that the __main__ block writes with one row per added mapping
REVIEW_PATH = "/Users/ben/src/HumanDiseaseOntology/doid_mesh_review.tsv"
# Get the DOID ontology
# NOTE(review): this read happens whenever the module is imported or run, and
# `g` is reassigned from OBO_PATH in the __main__ block; this copy does not
# appear to be used by the code visible in this file -- confirm before removing.
g = obonet.read_obo(
    "https://raw.githubusercontent.com/DiseaseOntology/"
    "HumanDiseaseOntology/main/src/ontology/HumanDO.obo"
)
def add_xref(lines, node, xref):
    """Add an xref to an appropriate place in the OWL file.

    The class block for ``node`` is located via its ``# Class: obo:<ID>``
    header comment. The new ``oboInOwl:hasDbXref`` assertion is inserted among
    the block's existing xref lines at the position given by alphabetical
    order, or, if the block has no xrefs yet, right after the line that
    follows the blank line under the header (expected to be the definition).

    Parameters
    ----------
    lines :
        The lines of the OWL file (each including its trailing newline).
        The list is modified in place.
    node :
        The CURIE of the class to annotate, e.g. ``DOID:1234``.
    xref :
        The xref to add, e.g. ``MESH:D000001``.

    Returns
    -------
    :
        The same ``lines`` list, with the xref line inserted. If an identical
        xref line is already present in the class block, the list is returned
        unchanged.

    Raises
    ------
    ValueError
        If no class header for ``node`` is found in ``lines``.
    """
    node_owl = node.replace(":", "_")
    # Compare whole whitespace-separated tokens so that e.g. DOID_123 does not
    # match the header of DOID_1234 (a plain prefix match would).
    header_tokens = ["#", "Class:", f"obo:{node_owl}"]
    xref_prefix = "AnnotationAssertion(oboInOwl:hasDbXref"

    def_idx = None
    start_xref_idx = None
    seen_blank = False
    xref_entries = []
    for idx, line in enumerate(lines):
        # First, find the class with the given ID
        if def_idx is None:
            if line.split()[:3] == header_tokens:
                # Header, then a blank line, then the first line of the block
                def_idx = idx + 2
            continue
        # There is always a blank line right after the header, and also after
        # the block corresponding to the entry ends. The first blank line is
        # skipped, the second one terminates the search.
        if not line.strip():
            if seen_blank:
                break
            seen_blank = True
            continue
        # If we find some xrefs, we keep track of those
        if line.startswith(xref_prefix + " obo"):
            if start_xref_idx is None:
                start_xref_idx = idx
            xref_entries.append(line)
        # If we found any xrefs but now there is a different line, we finish
        elif start_xref_idx is not None and not line.startswith(xref_prefix):
            break

    if def_idx is None:
        raise ValueError(f"Could not find a class header for {node} in the OWL lines")

    # We now render the xref string
    xref_str = f'AnnotationAssertion(oboInOwl:hasDbXref obo:{node_owl} "{xref}"^^xsd:string)\n'
    # Nothing to do if exactly this xref is already asserted for the class
    if xref_str in xref_entries:
        return lines

    # If we never found any existing xrefs then we will put the new xref
    # after the definition
    if start_xref_idx is None:
        start_xref_idx = def_idx + 1

    # Sort xrefs alphabetically to make sure we put the new one in the
    # right place relative to the existing ones
    xref_entries.append(xref_str)
    xr_idx = sorted(xref_entries).index(xref_str)
    lines.insert(start_xref_idx + xr_idx, xref_str)
    return lines
if __name__ == "__main__":
    # There are some curations that are redundant since DOID already mapped
    # these nodes to MESH. We figure out what these are so we can avoid
    # adding them.
    doid_already_mapped = set()
    g = obonet.read_obo(OBO_PATH)
    for node, data in g.nodes(data=True):
        # Skip external entries
        if not node.startswith("DOID"):
            continue
        # Make sure we have a name
        if "name" not in data:
            continue
        # If there are already MESH mappings, we keep track of that. Only the
        # prefix before the first colon is inspected, so an xref without any
        # colon is simply ignored instead of raising an error.
        has_mesh_xref = any(
            xref.split(":", maxsplit=1)[0] == "MESH"
            for xref in data.get("xref", [])
        )
        if has_mesh_xref:
            doid_already_mapped.add(node)

    # We now load mappings curated in Biomappings
    mappings = load_mappings()
    doid_mappings = [
        (m["source identifier"], m["target identifier"], m)
        for m in mappings
        if (
            m["source prefix"] == "doid"
            and m["target prefix"] == "mesh"
            and m["source identifier"] not in doid_already_mapped
        )
    ]
    # Make sure we get and standardize the order of mappings in both directions
    doid_mappings += [
        (m["target identifier"], m["source identifier"], m)
        for m in mappings
        if (
            m["source prefix"] == "mesh"
            and m["target prefix"] == "doid"
            and m["target identifier"] not in doid_already_mapped
        )
    ]

    # Read the OWL file; the encoding is given explicitly so that non-ASCII
    # text is handled the same way regardless of the platform default
    with open(EDITABLE_OWL_PATH, encoding="utf-8") as fh:
        lines = fh.readlines()

    review_cols = [
        "source prefix",
        "source identifier",
        "source name",
        "relation",
        "target prefix",
        "target identifier",
        "target name",
        "type",
        "source",
    ]
    # The first row of the review TSV is the header
    review_rows = [review_cols]
    # Add all the xrefs to the OWL, simultaneously add xrefs to a review TSV
    for do_id, mesh_id, mapping in doid_mappings:
        lines = add_xref(lines, do_id, "MESH:" + mesh_id)
        review_rows.append([mapping[c] for c in review_cols])

    # Dump the new review TSV and OWL file. newline="" lets the csv module
    # control line endings itself, as its documentation requires.
    with open(REVIEW_PATH, "w", encoding="utf-8", newline="") as fh:
        writer = csv.writer(fh, delimiter="\t")
        writer.writerows(review_rows)
    with open(EDITABLE_OWL_PATH, "w", encoding="utf-8") as fh:
        fh.writelines(lines)