# module-2-setup.yaml
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# Template header: CloudFormation format version and a human-readable summary.
AWSTemplateFormatVersion: "2010-09-09"
# Folded block scalar: the body must be indented deeper than the key to be
# parsed as its value. `>-` joins the wrapped lines with single spaces and
# strips the trailing newline, so the description is one clean sentence.
Description: >-
  This template sets up Glue resources and Athena tables to run queries over
  HealthLake exported data.
# Stack inputs. Every parameter is a plain String with a default value.
# NOTE(review): the defaults "bucket-name" and "prefix" look like placeholders;
# confirm real values are supplied at deploy time.
Parameters:
  # ---- Location of the exported HealthLake data ----

  # Name given to the crawler resource HealthLakeCrawler.
  CrawlerName:
    Type: String
    Default: "healthlake-export-crawler"
    Description: "Name of Glue crawler to crawl over HealthLake exported data"

  # Bucket used for the crawler S3 targets, the job script location and the
  # S3 statements of the Glue role policy.
  HealthLakeExportS3Bucket:
    Type: String
    Default: "bucket-name"
    Description: "S3 bucket where HealthLake data has been exported"

  # The templates that consume this value append "/" themselves, hence the
  # "without ending slash" requirement in the description.
  HealthLakeExportS3Prefix:
    Type: String
    Default: "prefix"
    Description: "S3 prefix where HealthLake data has been exported without ending slash"

  # ---- Glue Data Catalog ----

  # Name of the Glue database (HealthLakeDatabase) the crawlers write into.
  DBName:
    Type: String
    Default: "healthlakedb"
    Description: "Database name"

  # ---- Names of the remaining Glue / IAM resources ----

  # Name given to the Glue job HealthLakeDocRefJob.
  GlueDocRefParserJobName:
    Type: String
    Default: "DocRefParserJob"
    Description: "Glue job name to parse DocumentReference resource"

  # RoleName of the IAM role assumed by the Glue service.
  HealthLakeGlueRoleName:
    Type: String
    Default: "healthlake-workshop-glue-role"
    Description: "Glue Role"

  # Name of the ON_DEMAND trigger that starts the export crawler.
  CrawlerTriggerName:
    Type: String
    Default: "healthlake-trigger"
    Description: "Crawler Trigger"

  # Name given to the crawler resource DocRefCrawler.
  DocRefCrawlerName:
    Type: String
    Default: "DocRefCrawler"
    Description: "DocRefCrawler name"

  # Name of the Glue workflow that both triggers are attached to.
  HealthLakeGlueWorkflowName:
    Type: String
    Default: "healthlake-post-export"
    Description: "HealthLakeGlueWorkflow name"

  # Name of the CONDITIONAL trigger that starts the DocumentReference job.
  HealthLakeDocRefJobTriggerName:
    Type: String
    Default: "HealthLakeDocRefJobTrigger"
    Description: "HealthLakeDocRefJobTrigger name"
# Glue catalog database, two crawlers, the IAM role they run as, a Python
# shell job, and a workflow with two triggers that chains crawler -> job.
Resources:
  # Glue Data Catalog database that holds the tables produced by the crawlers.
  HealthLakeDatabase:
    Type: AWS::Glue::Database
    Properties:
      CatalogId: !Ref AWS::AccountId
      DatabaseInput:
        Name: !Ref DBName
        Description: "AWS Glue container to hold metadata tables for the HealthLake exported data crawler"

  # Crawls the raw HealthLake export under s3://<bucket>/<prefix>/.
  HealthLakeCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Name: !Ref CrawlerName
      Role: !GetAtt HealthLakeGlueRole.Arn
      Description: AWS Glue crawler to crawl HealthLake data
      # Ref on the database resource resolves to the database name and, unlike
      # referencing the DBName parameter, makes CloudFormation create the
      # database before this crawler (and delete it after).
      DatabaseName: !Ref HealthLakeDatabase
      Targets:
        S3Targets:
          - Path: !Sub "s3://${HealthLakeExportS3Bucket}/${HealthLakeExportS3Prefix}/"
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "LOG"
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'

  # Crawls the parsed DocumentReference output under s3://<bucket>/ParsedDocRef/.
  # NOTE(review): no trigger in this template starts this crawler; presumably
  # it is run manually after the parser job finishes - confirm.
  DocRefCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Name: !Ref DocRefCrawlerName
      Role: !GetAtt HealthLakeGlueRole.Arn
      Description: AWS Glue crawler to crawl parsed Document Reference data
      # Same reasoning as HealthLakeCrawler: depend on the database resource.
      DatabaseName: !Ref HealthLakeDatabase
      Targets:
        S3Targets:
          - Path: !Sub "s3://${HealthLakeExportS3Bucket}/ParsedDocRef/"
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "LOG"
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'

  # IAM role assumed by the Glue service for both crawlers and the job.
  # Because RoleName is set explicitly, the stack must be deployed with the
  # CAPABILITY_NAMED_IAM capability.
  HealthLakeGlueRole:
    Type: "AWS::IAM::Role"
    Properties:
      RoleName: !Ref HealthLakeGlueRoleName
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: "Allow"
            Principal:
              Service:
                - "glue.amazonaws.com"
            Action:
              - "sts:AssumeRole"
      Path: "/"
      ManagedPolicyArns:
        # ${AWS::Partition} instead of a hard-coded "aws" keeps the ARN valid
        # in non-standard partitions (e.g. aws-us-gov, aws-cn).
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole"
      Policies:
        - PolicyName: "healthlake-workshop-glue-policy"
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Metrics and logging for crawler / job runs.
              - Effect: "Allow"
                Action:
                  - "cloudwatch:PutMetricData"
                  - "logs:CreateLogStream"
                  - "logs:PutLogEvents"
                  - "logs:CreateLogGroup"
                  - "logs:DescribeLogStreams"
                Resource: "*"
              # Bucket-level action: only meaningful on the bucket ARN.
              - Effect: "Allow"
                Action:
                  - "s3:ListBucket"
                Resource:
                  - !Sub "arn:${AWS::Partition}:s3:::${HealthLakeExportS3Bucket}"
              # Object-level actions: only meaningful on object ARNs.
              - Effect: "Allow"
                Action:
                  - "s3:GetObject"
                  - "s3:PutObject"
                Resource:
                  - !Sub "arn:${AWS::Partition}:s3:::${HealthLakeExportS3Bucket}/*"
              # NOTE(review): grants Decrypt on every KMS key in this account;
              # consider narrowing to the key that encrypts the export bucket.
              - Effect: "Allow"
                Action:
                  - "kms:Decrypt"
                Resource:
                  - !Sub "arn:${AWS::Partition}:kms:*:${AWS::AccountId}:key/*"

  # Python shell job that runs DocRefParser.py from the root of the export
  # bucket, passing the bucket and the DocumentReference prefix as arguments.
  HealthLakeDocRefJob:
    Type: AWS::Glue::Job
    Properties:
      Name: !Ref GlueDocRefParserJobName
      # Role ARN, matching how both crawlers reference the same role.
      Role: !GetAtt HealthLakeGlueRole.Arn
      Description: Glue job to parse DocumentReference resource
      Command:
        Name: pythonshell
        PythonVersion: "3"
        ScriptLocation: !Sub "s3://${HealthLakeExportS3Bucket}/DocRefParser.py"
      DefaultArguments:
        "--bucket": !Ref HealthLakeExportS3Bucket
        "--prefix": !Sub "${HealthLakeExportS3Prefix}/DocumentReference/"
      MaxCapacity: 1

  # Conditional trigger: once HealthLakeCrawler reports SUCCEEDED inside the
  # workflow, start the DocumentReference parser job.
  HealthLakeDocRefJobTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Name: !Ref HealthLakeDocRefJobTriggerName
      Type: CONDITIONAL
      Description: Glue job trigger
      WorkflowName: !Ref HealthLakeGlueWorkflow
      # Canonical lowercase boolean; activates the trigger as soon as it exists.
      StartOnCreation: true
      Actions:
        - JobName: !Ref HealthLakeDocRefJob
      Predicate:
        Conditions:
          - LogicalOperator: EQUALS
            CrawlerName: !Ref HealthLakeCrawler
            CrawlState: SUCCEEDED

  # Workflow that groups the two triggers below/above into one pipeline.
  HealthLakeGlueWorkflow:
    Type: AWS::Glue::Workflow
    Properties:
      Name: !Ref HealthLakeGlueWorkflowName
      Description: Workflow for orchestrating crawling HealthLake data and parsing DocumentReference resource.

  # On-demand entry point of the workflow: starts HealthLakeCrawler.
  CrawlerTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Name: !Ref CrawlerTriggerName
      Type: ON_DEMAND
      Description: Start crawler for exported HealthLake data
      WorkflowName: !Ref HealthLakeGlueWorkflow
      Actions:
        - CrawlerName: !Ref HealthLakeCrawler