DocRefParser.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
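#
# This Glue job script parses DocumentReference resources exported from Amazon HealthLake,
# extracts the aws-cm (Comprehend Medical) inferred ICD-10 extensions, flattens them into
# one row per detected code, and uploads the result to S3 as tab-delimited CSV files under
# the ParsedDocRef/ prefix. The job expects two arguments, --bucket and --prefix, pointing
# at the exported DocumentReference files.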
import sys
import json
import csv
import pandas as pd
import boto3
from awsglue.utils import getResolvedOptions
from io import StringIO

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
files = []
args = getResolvedOptions(sys.argv,
                          ['bucket', 'prefix'])


## list every DocumentReference export object under the given S3 prefix and download each one
def get_docref_files():
    response = s3_client.list_objects_v2(
        Bucket=args['bucket'],
        Prefix=args['prefix']
    )
    print(response)
    for file in response['Contents']:
        print(file['Key'])
        download_docref_files(file['Key'])


## download a single export object into /tmp and remember its file name for parsing below
def download_docref_files(file):
    prefix = file
    file = prefix.split("/")[-1]
    files.append(file)
    with open("/tmp/" + file, 'wb') as data:
        s3_client.download_fileobj(args['bucket'], prefix, data)


get_docref_files()
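
## The loop below walks the nested FHIR extension tree implied by the URL checks, roughly
## (illustrative sketch of the expected shape, not copied from an actual export):
##   extension: [ { url: .../aws-cm/, extension: [
##       { url: .../infer-icd10/, extension: [
##           { url: .../aws-cm-icd10-entity, extension: [
##               { url: ...-entity-id, valueInteger }, { url: ...-entity-score, valueDecimal },
##               { url: ...-entity-ConceptList, extension: [
##                   { extension: [ Concept-Code, Concept-Score, Concept-Description ] }, ... ] } ] } ] } ] } ]
## Each detected ICD-10 concept becomes one row in the output CSV.
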
for file in files:
    print("/tmp/" + file)
    with open("/tmp/" + file, 'rb') as fin:
        fin = fin.readlines()
    # print(fin)
    row_id = 0
    output = []
    ## each line of the export file is one DocumentReference resource in newline-delimited JSON
    for line in fin:
        data = json.loads(line)
        record_id = data['id']
        patient_id = data['subject']['reference']
        encounter_id = data['context']['encounter'][0]['reference']
        ## get the top-level FHIR extension element
        try:
            data = data['extension']
        except KeyError:
            continue
        ## narrow to the aws-cm extension group
        for item in data:
            if item['url'] == 'http://healthlake.amazonaws.com/aws-cm/':
                data = item['extension']
                break
        ## get the Colossus inferred ICD-10 extension element
        for item in data:
            if item['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/':
                data = item['extension']
                break
        ## process each of the inferred Colossus ICD-10 entities
        for item in data:
            if item['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity':
                entity_score = None
                entity_id = None
                for entity in item['extension']:
                    ## store the entity id
                    if entity['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-id':
                        entity_id = entity['valueInteger']
                        continue
                    ## store the entity score
                    if entity['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-score':
                        entity_score = entity['valueDecimal']
                        continue
                    if entity['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-ConceptList':
                        code_id = 0
                        ## capture each of the detected codes and scores
                        for concept in entity['extension']:
                            code_value = None
                            code_score = None
                            code_description = None
                            for code in concept['extension']:
                                if code['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-Concept-Code':
                                    code_value = code['valueString']
                                    continue
                                if code['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-Concept-Score':
                                    code_score = code['valueDecimal']
                                    continue
                                if code['url'] == 'http://healthlake.amazonaws.com/aws-cm/infer-icd10/aws-cm-icd10-entity-Concept-Description':
                                    code_description = code['valueString']
                                    continue
                            ## strip the '.' from the ICD-10 code value
                            code_value = code_value.replace('.', '')
                            ## entity_score needs to be assigned before the code scores/values are processed
                            output.append({
                                'row_id': row_id,
                                'record_id': record_id,
                                'encounter_id': encounter_id,
                                'patient_id': patient_id,
                                'entity_id': entity_id,
                                'entity_score': entity_score,
                                'code_id': code_id,
                                'code_score': code_score,
                                'code_value': code_value,
                                'code_description': code_description
                            })
                            code_id += 1
        row_id += 1
    ## write the flattened rows for this file as tab-delimited CSV and upload the result to S3
    df = pd.DataFrame(output)
    file = file.split(".")[0]
    df.to_csv("/tmp/" + file + ".csv", index=False, sep='\t', quoting=csv.QUOTE_NONE, escapechar=' ')
    s3.Bucket(args['bucket']).upload_file('/tmp/' + file + '.csv', 'ParsedDocRef/' + file + ".csv")
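
## Example invocation (illustrative only; the job name, bucket, and prefix values are placeholders):
##   aws glue start-job-run --job-name DocRefParser \
##       --arguments '{"--bucket":"my-healthlake-export-bucket","--prefix":"DocumentReference/"}'
## Each downloaded export file produces s3://<bucket>/ParsedDocRef/<file name without extension>.csv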