forked from sorgerlab/famplex
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_references.py
320 lines (285 loc) · 11.4 KB
/
check_references.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
from __future__ import print_function, unicode_literals
import csv
import sys
from collections import Counter
def read_csv(fh, delimiter, quotechar):
    """Read every row of an open CSV file handle into a list of lists.

    On Python 2 the delimiter and quote character are converted with
    bytes() before being handed to csv.reader, and every cell is decoded
    from UTF-8. On Python 3 the rows produced by csv.reader are returned
    unchanged.

    Parameters
    ----------
    fh : file-like object
        Open handle (or any iterable of lines) to parse.
    delimiter : str
        Field separator passed to csv.reader.
    quotechar : str
        Quote character passed to csv.reader.

    Returns
    -------
    list of list
        One inner list of cell values per parsed row.
    """
    if sys.version_info.major >= 3:
        return list(csv.reader(fh, delimiter=delimiter, quotechar=quotechar))
    # Python 2 branch: reader arguments are bytes and cells need decoding.
    byte_reader = csv.reader(fh, delimiter=bytes(delimiter),
                             quotechar=bytes(quotechar))
    return [[cell.decode('utf-8') for cell in record]
            for record in byte_reader]
def load_csv(filename):
    """Open `filename` and return its rows as parsed by read_csv.

    The file is read as comma-delimited with '"' as the quote character.
    """
    with open(filename) as csv_file:
        return read_csv(csv_file, ',', '"')
def load_grounding_map(filename):
    """Load a grounding map CSV into a dict keyed by the first column.

    Every row is first checked (via check_rows) to have 7 columns. Within
    a row, column 0 is the text key; the remaining columns are read as
    alternating key/value cells, with empty cells dropped from each side
    independently before pairing.

    Parameters
    ----------
    filename : str
        Path of the grounding map CSV file.

    Returns
    -------
    tuple
        (g_map, gm_tuples). g_map maps each text key to a db_refs dict
        (always containing 'TEXT'), or to None when the row has no
        key/value pairs. gm_tuples is a tuple with every row as a tuple,
        including rows that were rejected.
    """
    rows = load_csv(filename)
    check_rows(rows, 7, filename)
    grounding = {}
    row_tuples = []
    for row in rows:
        row_tuples.append(tuple(row))
        text = row[0]
        namespaces = [cell for cell in row[1::2] if cell != '']
        identifiers = [cell for cell in row[2::2] if cell != '']
        if len(namespaces) != len(identifiers):
            # Report the row and leave it out of the mapping entirely.
            print('ERROR: Mismatched keys and values in row %s' % str(row))
            continue
        refs = {'TEXT': text}
        refs.update(zip(namespaces, identifiers))
        # A dict holding only 'TEXT' means the row carried no groundings.
        grounding[text] = refs if len(refs) > 1 else None
    return grounding, tuple(row_tuples)
def check_file_rows(filename, row_length):
    """Report every row of the CSV file `filename` whose column count is
    not `row_length`.

    The rows are loaded with load_csv (comma-delimited, '"' quoting) and
    passed to check_rows, which prints one error line per offending row.
    """
    check_rows(load_csv(filename), row_length, filename)
def check_rows(rows, row_length, filename):
    """Print an error for each row that does not have `row_length` columns.

    Parameters
    ----------
    rows : iterable of sequences
        Parsed rows to check.
    row_length : int
        Expected number of columns in every row.
    filename : str
        Name used in the error message only.
    """
    # Line numbers in the message are 1-based.
    for line_number, row in enumerate(rows, 1):
        n_columns = len(row)
        if n_columns == row_length:
            continue
        print("ERROR: Line %d in file %s has %d columns, should be %d" %
              (line_number, filename, n_columns, row_length))
def load_entity_list(filename):
    """Load the entities CSV and return the first column of every row.

    Rows are checked (via check_rows) to have exactly one column; rows
    that fail the check are reported but not removed.
    """
    records = load_csv(filename)
    check_rows(records, 1, filename)
    return [record[0] for record in records]
def load_relationships(filename):
    """Load the relations CSV as a list of (subject, relation, object).

    Rows are checked (via check_rows) to have 5 columns. Columns 0-1 form
    the subject pair, column 2 is the relation, and columns 3-4 form the
    object pair.

    Returns
    -------
    list of tuple
        Entries shaped ((col0, col1), col2, (col3, col4)).
    """
    records = load_csv(filename)
    check_rows(records, 5, filename)
    return [((rec[0], rec[1]), rec[2], (rec[3], rec[4]))
            for rec in records]
def load_equivalences(filename):
    """Load the equivalences CSV as a list of 3-tuples.

    Rows are checked (via check_rows) to have 3 columns; each returned
    tuple holds the first three cells of a row in order.
    """
    records = load_csv(filename)
    check_rows(records, 3, filename)
    return [(rec[0], rec[1], rec[2]) for rec in records]
def update_id_prefixes(filename):
    """Return the rows of a grounding map CSV with prefixed GO, CHEBI and
    CHEMBL identifiers.

    Each row is read as a text key followed by alternating key/value
    cells. For the first occurrence of each of the keys 'GO', 'CHEBI' and
    'CHEMBL', the paired value is rewritten as 'GO:<value>',
    'CHEBI:<value>' and 'CHEMBL<value>' respectively. The file itself is
    not modified.

    Returns
    -------
    list of list
        One rebuilt row per input row: the text key followed by the
        key/value pairs that zip() could pair up.
    """
    # Applied in this order; the template is filled with the old value.
    prefix_templates = (
        ('GO', 'GO:%s'),
        ('CHEBI', 'CHEBI:%s'),
        ('CHEMBL', 'CHEMBL%s'),
    )
    rewritten_rows = []
    for row in load_csv(filename):
        text = row[0]
        namespaces = list(row[1::2])
        identifiers = list(row[2::2])
        for namespace, template in prefix_templates:
            if namespace in namespaces:
                position = namespaces.index(namespace)
                identifiers[position] = template % identifiers[position]
        new_row = [text]
        for namespace, identifier in zip(namespaces, identifiers):
            new_row.extend((namespace, identifier))
        rewritten_rows.append(new_row)
    return rewritten_rows
def pubchem_and_chebi(db_refs):
    """Report which of the PUBCHEM / CHEBI entries is missing, if exactly
    one of them is present.

    An entry counts as present when db_refs.get() returns a truthy value.

    Returns
    -------
    str or None
        'chebi_missing' when only PUBCHEM is present, 'pubchem_missing'
        when only CHEBI is present, and None when both or neither are.
    """
    has_pubchem = bool(db_refs.get('PUBCHEM'))
    has_chebi = bool(db_refs.get('CHEBI'))
    if has_pubchem == has_chebi:
        # Both present or both absent: nothing to report.
        return None
    return 'chebi_missing' if has_pubchem else 'pubchem_missing'
def check_duplicates(entries, entry_label):
    """Print an error for every entry that occurs more than once.

    A header line naming `entry_label` is printed first and a blank line
    last, whether or not duplicates exist.

    Parameters
    ----------
    entries : iterable of hashable
        Items to count.
    entry_label : str
        Label used in the header and in each error message.

    Returns
    -------
    bool
        True if at least one entry occurred more than once.
    """
    occurrences = Counter(entries)
    print("-- Checking for duplicate %s --" % entry_label)
    repeated = [entry for entry, n_seen in occurrences.items() if n_seen > 1]
    for entry in repeated:
        print("ERROR: Duplicate %s in %s." % (str(entry), entry_label))
    print()
    return bool(repeated)
if __name__ == '__main__':
    # Script entry point. Loads the resource CSV files from the current
    # working directory, runs each consistency check in turn, and exits
    # with status 1 if any check set signal_error (status 0 otherwise).
    # Lines printed as WARNING never affect the exit status.
    signal_error = False
    entities = load_entity_list('entities.csv')
    relationships = load_relationships('relations.csv')
    equivalences = load_equivalences('equivalences.csv')
    gm, gm_tuples = load_grounding_map('grounding_map.csv')
    check_file_rows('gene_prefixes.csv', 3)

    # Several checks below test membership in the entity list once per
    # grounding/relation/equivalence; a set keeps each test O(1).
    entity_ids = set(entities)

    for entries, entry_label in ((entities, 'entities'),
                                 (relationships, 'relationships'),
                                 (equivalences, 'equivalences'),
                                 (gm_tuples, 'groundings')):
        if check_duplicates(entries, entry_label):
            signal_error = True

    print("-- Checking for undeclared FamPlex IDs in grounding map --")
    # Look through grounding map and find all instances with an FPLX db key.
    # The entities_missing_* / *_id_missing lists in this script are filled
    # in but not read again here.
    entities_missing_gm = []
    for text, db_refs in gm.items():
        if db_refs is None or 'FPLX' not in db_refs:
            continue
        db_id = db_refs['FPLX']
        if db_id not in entity_ids:
            entities_missing_gm.append(db_id)
            print("ERROR: ID %s referenced in grounding map "
                  "is not in entities list." % db_id)
            signal_error = True

    print()
    print("-- Checking for CHEBI/PUBCHEM IDs--")
    chebi_id_missing = []
    pubchem_id_missing = []
    for text, db_refs in gm.items():
        if db_refs is None:
            continue
        p_and_c = pubchem_and_chebi(db_refs)
        if p_and_c == 'chebi_missing':
            chebi_id_missing.append(db_refs['PUBCHEM'])
            print("WARNING: %s has PUBCHEM ID but no CHEBI ID." % text)
        if p_and_c == 'pubchem_missing':
            pubchem_id_missing.append(db_refs['CHEBI'])
            print("WARNING: %s has CHEBI ID but no PUBCHEM ID." % text)

    print()
    print("-- Checking for undeclared FamPlex IDs in relationships file --")
    # Check the relationships for consistency with entities
    entities_missing_rel = []
    for subj, rel, obj in relationships:
        for term_ns, term_id in (subj, obj):
            if term_ns == 'FPLX' and term_id not in entity_ids:
                entities_missing_rel.append(term_id)
                print("ERROR: ID %s referenced in relations "
                      "is not in entities list." % term_id)
                signal_error = True

    print()
    print("-- Checking for valid namespaces in relations --")
    for ix, (subj, rel, obj) in enumerate(relationships):
        for term in (subj, obj):
            term_ns = term[0]
            if term_ns not in ('FPLX', 'HGNC', 'UP'):
                print("ERROR: row %d: Invalid namespace in relations.csv: %s" %
                      (ix+1, term_ns))
                signal_error = True

    # This check requires the indra package; a failed import is reported
    # and counted as an error.
    try:
        from indra.databases import hgnc_client
        print()
        print("-- Checking for invalid HGNC IDs in relationships file --")
        for subj, rel, obj in relationships:
            for term_ns, term_id in (subj, obj):
                if term_ns == 'HGNC':
                    hgnc_id = hgnc_client.get_hgnc_id(term_id)
                    if not hgnc_id:
                        print("ERROR: ID %s referenced in relations is "
                              "not a valid HGNC ID." % term_id)
                        signal_error = True
    except ImportError as e:
        print('HGNC check could not be performed because of import error')
        print(e)
        signal_error = True

    # This check requires the indra package
    try:
        from indra.databases import hgnc_client
        print()
        print("-- Checking for invalid HGNC IDs in grounding map --")
        for text, db_refs in gm.items():
            if db_refs is None or 'HGNC' not in db_refs:
                continue
            db_id = db_refs['HGNC']
            hgnc_id = hgnc_client.get_hgnc_id(db_id)
            if not hgnc_id:
                print("ERROR: ID %s in grounding map is "
                      "not a valid HGNC ID." % db_id)
                signal_error = True
    except ImportError as e:
        # The exception must be bound here: a name bound by an earlier
        # "except ... as e" is cleared when that clause ends on Python 3,
        # so printing it without rebinding raises NameError.
        print('HGNC check could not be performed because of import error')
        print(e)
        signal_error = True

    # This check requires a ChEBI resource file to be available. You
    # can obtain it from here: ftp://ftp.ebi.ac.uk/pub/databases/
    # chebi/Flat_file_tab_delimited/compounds.tsv.gz
    # If the file cannot be opened the check is skipped silently.
    # NOTE(review): invalid ChEBI IDs are printed as ERROR but do not set
    # signal_error, unlike the HGNC checks - confirm this is intended.
    try:
        with open('chebi_compounds.tsv', 'rt') as fh:
            # The third tab-separated column is compared against the CHEBI
            # values in the grounding map; a set makes each lookup O(1).
            chebi_ids = {lin.split('\t')[2] for lin in fh}
        print()
        print("-- Checking for invalid ChEBI IDs in grounding map --")
        for text, db_refs in gm.items():
            if db_refs is None or 'CHEBI' not in db_refs:
                continue
            db_id = db_refs['CHEBI']
            if db_id not in chebi_ids:
                print("ERROR: ID %s in grounding map is "
                      "not a valid CHEBI ID." % db_id)
    except IOError:
        pass

    print()
    print("-- Checking for FamPlexes whose relationships are undefined --")
    # Collect every FPLX ID that appears as subject or object once, instead
    # of rescanning all relationships for each entity.
    fplx_ids_in_relations = set()
    for subj, rel, obj in relationships:
        for term_ns, term_id in (subj, obj):
            if term_ns == 'FPLX':
                fplx_ids_in_relations.add(term_id)
    rel_missing_entities = []
    for ent in entities:
        if ent not in fplx_ids_in_relations:
            rel_missing_entities.append(ent)
            print("WARNING: ID %s has no known relations." % ent)

    print()
    print("-- Checking for non-existent FamPlexes in equivalences --")
    entities_missing_eq = []
    for eq_ns, eq_id, be_id in equivalences:
        if be_id not in entity_ids:
            signal_error = True
            entities_missing_eq.append(be_id)
            print("ERROR: ID %s referenced in equivalences "
                  "is not in entities list." % be_id)

    print()
    print("-- Checking for duplicate equivalences --")
    # Duplicates already set signal_error in the check_duplicates loop
    # above; this section only lists them again.
    equiv_counter = Counter(equivalences)
    duplicate_eq = [item for item, count in equiv_counter.items()
                    if count > 1]
    if duplicate_eq:
        print("ERROR: Duplicate equivalences found:")
        for dup in duplicate_eq:
            print(dup)

    # This check requires the requests package to be installed; without it
    # the check is skipped silently. It issues one HTTP request per
    # PUBCHEM entry in the grounding map.
    # NOTE(review): invalid PUBCHEM IDs do not set signal_error - confirm.
    try:
        import requests
        import logging
        logging.getLogger('requests').setLevel(logging.CRITICAL)
        logging.getLogger('urllib3').setLevel(logging.CRITICAL)
        print()
        print("-- Checking for invalid PUBCHEM CIDs in grounding map --")
        pubchem_url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/'
                       'cid/%s/description/XML')
        for text, db_refs in gm.items():
            if db_refs is None or 'PUBCHEM' not in db_refs:
                continue
            db_id = db_refs['PUBCHEM']
            res = requests.get(pubchem_url % db_id)
            if res.status_code != 200:
                print("ERROR: ID %s in grounding map is "
                      "not a valid PUBCHEM ID." % db_id)
    except ImportError:
        pass

    if signal_error:
        sys.exit(1)
    sys.exit(0)