-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdocker-compose.yml
176 lines (175 loc) · 5.3 KB
/
docker-compose.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# By default, we disable mounting the current directory under /modyn_host. However, this might be helpful for local development.
# For the trainer server, you additionally might want to enable the runtime and deployment option to enable the GPU in the container.
# For the storage, you probably want to mount some volume containing the datasets.
# For the selector, you might want to mount a volume on a fast local storage device to store the trigger training sets.
# For storage-db and metadata-db, you might want to increase the amount of available /dev/shm and store the postgres files on a fast local disk.
services:
  # PostgreSQL instance for metadata. Runs with the config file mounted from
  # ./conf/metadata_postgresql.conf and keeps its data in ./metadata-postgres-data.
  metadata-db:
    image: postgres:16.4-alpine
    restart: always
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
      POSTGRES_HOST_AUTH_METHOD: md5
    command: postgres -c config_file=/etc/postgresql/postgresql.conf
    # shm_size: 24gb
    volumes:
      - ./metadata-postgres-data:/var/lib/postgresql/data
      - ./conf/metadata_postgresql.conf:/etc/postgresql/postgresql.conf
      # pg_hba.conf is staged in /tmp; init_pg_hba.sh is placed in the image's
      # init directory (presumably it installs the staged file - verify in the script).
      - ./conf/pg_hba.conf:/tmp/pg_hba.conf
      - ./conf/init_pg_hba.sh:/docker-entrypoint-initdb.d/init_pg_hba.sh
    # Dependants that use `condition: service_healthy` wait for this check to pass.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 20

  # PostgreSQL instance for the storage component. Same setup as metadata-db,
  # but with ./conf/storage_postgresql.conf and ./storage-postgres-data.
  storage-db:
    image: postgres:16.4-alpine
    restart: always
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
      POSTGRES_HOST_AUTH_METHOD: md5
    command: postgres -c config_file=/etc/postgresql/postgresql.conf
    # shm_size: 8gb
    volumes:
      - ./storage-postgres-data:/var/lib/postgresql/data
      - ./conf/storage_postgresql.conf:/etc/postgresql/postgresql.conf
      - ./conf/pg_hba.conf:/tmp/pg_hba.conf
      - ./conf/init_pg_hba.sh:/docker-entrypoint-initdb.d/init_pg_hba.sh
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 20

  # Storage component; started only once storage-db reports healthy.
  storage:
    restart: on-failure
    depends_on:
      storage-db:
        condition: service_healthy
    build:
      context: .
      dockerfile: docker/Storage/Dockerfile
    volumes:
      - storage-data:/app/storage
      # - /mnt/datasets:/datasets
      # - .:/modyn_host

  # Metadata processor; started only once metadata-db reports healthy.
  metadata_processor:
    restart: on-failure
    depends_on:
      metadata-db:
        condition: service_healthy
    build:
      context: .
      dockerfile: docker/MetadataProcessor/Dockerfile

  # Model storage; started only once metadata-db reports healthy.
  model_storage:
    restart: on-failure
    depends_on:
      metadata-db:
        condition: service_healthy
    build:
      context: .
      dockerfile: docker/Model_Storage/Dockerfile
    volumes:
      - model_storage-data:/tmp/models

  # Evaluator; started after model_storage and storage (start order only,
  # no health condition).
  evaluator:
    restart: on-failure
    depends_on:
      - model_storage
      - storage
    build:
      context: .
      dockerfile: docker/Evaluator/Dockerfile

  # Trainer server; started after storage, selector, model_storage and
  # metadata-db (start order only, no health condition).
  trainer_server:
    restart: on-failure
    depends_on:
      - storage
      - selector
      - model_storage
      - metadata-db
    build:
      context: .
      dockerfile: docker/Trainer_Server/Dockerfile
    volumes:
      - downsampling-data:/tmp/offline_dataset  # Remove when using a fast other storage directory (line below)
      # - /mnt/ssd/offline_dataset:/tmp/offline_dataset
      # - .:/modyn_host
    # shm_size: 4gb
    # CUDASTART (ignore/delete if enabling cuda manually)
    # runtime: nvidia
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu, utility, compute]
    # CUDAEND (ignore/delete if enabling cuda manually)

  # Selector; started only once metadata-db reports healthy.
  selector:
    restart: on-failure
    depends_on:
      metadata-db:
        condition: service_healthy
    build:
      context: .
      dockerfile: docker/Selector/Dockerfile
    volumes:
      - selector-localstorage-data:/tmp/local_storage  # Remove when using a fast other storage directory (lines below)
      - selector-data:/tmp/trigger_samples  # Remove when using a fast other storage directory (lines below)
      # - /mnt/ssd/local_storage:/tmp/local_storage
      # - /mnt/ssd/trigger_samples:/tmp/trigger_samples
      # - .:/modyn_host
    # shm_size: 4gb

  # Supervisor; started after the listed services. Container port 50063 is
  # published on host port 3000.
  supervisor:
    restart: on-failure
    depends_on:
      - storage
      - metadata-db
      - storage-db
      - selector
      - metadata_processor
      - trainer_server
      - evaluator
    build:
      context: .
      dockerfile: docker/Supervisor/Dockerfile
    volumes:
      - supervisor-evaluation-data:/tmp/evaluation_results
      # - .:/modyn_host
    container_name: supervisor
    ports:
      - "3000:50063"

  # Test container; waits for every service to be started and for both
  # databases to be healthy. Mounts the same storage-data volume as `storage`.
  tests:
    depends_on:
      supervisor:
        condition: service_started
      storage:
        condition: service_started
      selector:
        condition: service_started
      metadata_processor:
        condition: service_started
      model_storage:
        condition: service_started
      evaluator:
        condition: service_started
      trainer_server:
        condition: service_started
      metadata-db:
        condition: service_healthy
      storage-db:
        condition: service_healthy
    build:
      context: .
      dockerfile: docker/Tests/Dockerfile
    volumes:
      - storage-data:/app/storage
# Named volumes referenced by the services in this file. Each entry is left
# empty, so no per-volume options are configured here.
volumes:
  storage-data:
  selector-data:
  selector-localstorage-data:
  downsampling-data:
  model_storage-data:
  supervisor-evaluation-data: