Skip to content

Commit

Permalink
Add preprocess data
Browse files Browse the repository at this point in the history
  • Loading branch information
LuChang-CS committed May 5, 2021
1 parent 9259bcd commit 2d6b12d
Show file tree
Hide file tree
Showing 7 changed files with 847 additions and 0 deletions.
203 changes: 203 additions & 0 deletions data/icd9.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
001-139
001-009
010-018
020-027
030-041
042-042
045-049
050-059
060-066
070-079
080-088
090-099
100-104
110-118
120-129
130-136
137-139
140-239
140-149
150-159
160-165
170-176
179-189
190-199
200-209
210-229
230-234
235-238
239-239
240-279
240-246
249-259
260-269
270-279
280-289
280
281
282
283
284
285
286
287
288
289
290-319
290-294
295-299
300-316
317-319
320-389
320-327
330-337
338-338
339-339
340-349
350-359
360-379
380-389
390-459
390-392
393-398
401-405
410-414
415-417
420-429
430-438
440-449
451-459
460-519
460-466
470-478
480-488
490-496
500-508
510-519
520-579
520-529
530-539
540-543
550-553
555-558
560-569
570-579
580-629
580-589
590-599
600-608
610-612
614-616
617-629
630-679
630-639
640-649
650-659
660-669
670-677
678-679
680-709
680-686
690-698
700-709
710-739
710-719
720-724
725-729
730-739
740-759
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760-779
760-763
764-779
780-799
780-789
790-796
797-799
800-999
800-804
805-809
810-819
820-829
830-839
840-848
850-854
860-869
870-879
880-887
890-897
900-904
905-909
910-919
920-924
925-929
930-939
940-949
950-957
958-959
960-979
980-989
990-995
996-999
V01-V91
V01-V09
V10-V19
V20-V29
V30-V39
V40-V49
V50-V59
V60-V69
V70-V82
V83-V84
V85-V85
V86-V86
V87-V87
V88-V88
V89-V89
V90-V90
V91-V91
E000-E999
E000-E000
E001-E030
E800-E807
E810-E819
E820-E825
E826-E829
E830-E838
E840-E845
E846-E849
E850-E858
E860-E869
E870-E876
E878-E879
E880-E888
E890-E899
E900-E909
E910-E915
E916-E928
E929-E929
E930-E949
E950-E959
E960-E969
E970-E979
E980-E989
E990-E999
Empty file added preprocess/__init__.py
Empty file.
120 changes: 120 additions & 0 deletions preprocess/auxiliary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import os

import numpy as np


def parse_icd9_range(range_: str) -> (str, str, int, int):
    """Parse one ICD-9 range line into its prefix, number format and bounds.

    Accepts either a range (``'001-009'``, ``'V01-V09'``, ``'E800-E807'``) or
    a single three-digit category (``'280'``, ``'V85'``). Surrounding
    whitespace is ignored.

    Returns:
        ``(prefix, format_, start, end)`` where ``prefix`` is ``'V'``, ``'E'``
        or ``''``; ``format_`` is the ``%``-format that zero-pads the numeric
        part (``'%02d'`` for V codes, ``'%03d'`` otherwise); and
        ``start``/``end`` are the numeric bounds. ``end`` is inclusive, so a
        single category yields ``start == end``.

    Raises:
        ValueError: if the line is empty or a bound is not numeric.
    """
    bounds = range_.strip().split('-')
    first = bounds[0]
    if not first:
        raise ValueError('empty ICD-9 range: %r' % (range_,))

    if first[0] == 'V':
        prefix, format_ = 'V', '%02d'
    elif first[0] == 'E':
        prefix, format_ = 'E', '%03d'
    else:
        prefix, format_ = '', '%03d'

    # Both bounds carry the same one-letter prefix (or none); drop it before
    # converting the numeric part.
    skip = len(prefix)
    start = int(first[skip:])
    # A line without '-' names exactly one category. The bounds are inclusive,
    # so the end of a single-category range is the category itself.
    end = int(bounds[1][skip:]) if len(bounds) > 1 else start
    return prefix, format_, start, end


def generate_code_levels(path, code_map: dict) -> np.ndarray:
    """Assign every code in ``code_map`` a four-level hierarchy index.

    Reads ``icd9.txt`` from ``path``. Lines that start with a space are
    second-level ranges and are expanded into three-digit categories; every
    other non-blank line marks the start of a new first-level chapter. Only
    categories that actually occur in ``code_map`` (the part of each code
    before the ``'.'``) consume level indices, so all indices are dense and
    start at 1.

    Args:
        path: directory containing ``icd9.txt``.
        code_map: mapping from code string to integer row id. The result has
            ``len(code_map) + 1`` rows, so ids are presumably
            ``1..len(code_map)`` -- confirm against the caller.

    Returns:
        Integer array of shape ``(len(code_map) + 1, 4)``. Row ``cid`` holds
        ``[level1, level2, level3, level4]`` for the code with that id. Codes
        whose three-digit category is not covered by ``icd9.txt`` are printed
        and get ``[0, 0, 0, 0]``.
    """
    print('generating code levels ...')
    three_level_code_set = {code.split('.')[0] for code in code_map}
    icd9_path = os.path.join(path, 'icd9.txt')
    three_level_dict = dict()
    level1, level2, level3 = 1, 1, 1
    # Set once the current chapter has contributed at least one category, so
    # that chapters with no codes present do not consume a level-1 index.
    level1_can_add = False
    # The context manager closes the file even if parsing a line fails.
    with open(icd9_path, 'r', encoding='utf-8') as icd9_file:
        for line in icd9_file:
            range_ = line.rstrip()
            if not range_:
                # Blank (or whitespace-only) lines carry no range.
                continue
            if range_[0] == ' ':
                prefix, format_, start, end = parse_icd9_range(range_)
                level2_used = False
                for i in range(start, end + 1):  # end is inclusive
                    code = prefix + format_ % i
                    if code in three_level_code_set:
                        three_level_dict[code] = [level1, level2, level3]
                        level3 += 1
                        level1_can_add = True
                        level2_used = True
                if level2_used:
                    level2 += 1
            elif level1_can_add:
                level1 += 1
                level1_can_add = False

    # The fourth level is the full code itself: one fresh index per code.
    level4 = 1
    code_level = dict()
    for code in code_map:
        three_level_code = code.split('.')[0]
        if three_level_code in three_level_dict:
            code_level[code] = three_level_dict[three_level_code] + [level4]
            level4 += 1
        else:
            # Report categories missing from icd9.txt; they map to all zeros.
            print(three_level_code)
            code_level[code] = [0, 0, 0, 0]

    code_level_matrix = np.zeros((len(code_map) + 1, 4), dtype=int)
    for code, cid in code_map.items():
        code_level_matrix[cid] = code_level[code]

    return code_level_matrix


def generate_patient_code_adjacent(code_x: np.ndarray, code_num: int) -> np.ndarray:
    """Build a binary patient-by-code indicator matrix.

    Each entry of ``code_x`` is an integer array of code ids. Every id that
    is strictly positive marks the matching column of that patient's row
    with 1; non-positive entries are ignored.

    Returns:
        Integer array of shape ``(len(code_x), code_num + 1)``.
    """
    print('generating patient code adjacent matrix ...')
    adjacency = np.zeros((len(code_x), code_num + 1), dtype=int)
    # Iterating a 2-D array yields row views, so writes land in ``adjacency``.
    for row, patient_codes in zip(adjacency, code_x):
        present = patient_codes[patient_codes > 0]
        row[present] = 1
    return adjacency


def generate_code_code_adjacent(code_num: int, code_level_matrix: np.ndarray) -> np.ndarray:
    """Score every ordered pair of distinct codes by their shared hierarchy depth.

    For codes ``i != j`` (both in ``1..code_num``) the four level indices are
    compared from the deepest (column 3) to the shallowest (column 0). The
    first column where the two rows agree gives the shared depth (1-4, or 0
    when no column agrees), and the stored value is that depth plus one.
    Row 0, column 0 and the diagonal stay 0.

    Returns:
        Integer array of shape ``(code_num + 1, code_num + 1)``.
    """
    print('generating code code adjacent matrix ...')
    size = code_num + 1
    adjacency = np.zeros((size, size), dtype=int)
    depths = (4, 3, 2, 1)  # deepest level first
    for a in range(1, size):
        print('\r\t%d / %d' % (a, size), end='')
        levels_a = code_level_matrix[a]
        for b in range(1, size):
            if a == b:
                continue
            levels_b = code_level_matrix[b]
            shared = next(
                (d for d in depths if levels_a[d - 1] == levels_b[d - 1]),
                0,
            )
            adjacency[a, b] = shared + 1
    print('\r\t%d / %d' % (size, size))
    return adjacency


def co_occur(pids: np.ndarray,
             patient_admission: dict,
             admission_codes_encoded: dict,
             code_num: int) -> np.ndarray:
    """Build a symmetric binary co-occurrence matrix over code ids.

    For every patient in ``pids``, each admission except the last one is
    visited (presumably the last admission is held out as the prediction
    target -- confirm against the caller). Every unordered pair of codes
    recorded in the same admission sets both ``x[a, b]`` and ``x[b, a]``
    to 1.

    Args:
        pids: patient ids to process; keys of ``patient_admission``.
        patient_admission: maps a patient id to a sequence of admission
            dicts, each with an ``'admission_id'`` key.
        admission_codes_encoded: maps an admission id to its sequence of
            integer code ids.
        code_num: number of codes; the matrix has ``code_num + 1`` rows and
            columns.

    Returns:
        Float array of shape ``(code_num + 1, code_num + 1)`` holding 0/1.
    """
    from itertools import combinations

    print('calculating co-occurrence ...')
    total = len(pids)
    x = np.zeros((code_num + 1, code_num + 1), dtype=float)
    for i, pid in enumerate(pids):
        print('\r\t%d / %d' % (i + 1, total), end='')
        for admission in patient_admission[pid][:-1]:
            codes = admission_codes_encoded[admission['admission_id']]
            # combinations yields each position pair (m < n) exactly once.
            for c_i, c_j in combinations(codes, 2):
                x[c_i, c_j] = 1
                x[c_j, c_i] = 1
    print('\r\t%d / %d' % (total, total))
    return x
Loading

0 comments on commit 2d6b12d

Please sign in to comment.