-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
75 lines (64 loc) · 3.07 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
include ./local-ops.mk
# Rebuild the project's virtual environment from scratch: delete any existing
# .venv, create a new one with python3.11, upgrade pip, then install the
# packages listed in ./requirements.txt.
# Declared phony so a file or directory named "venv-setup" cannot make the
# target look up to date.
.PHONY: venv-setup
venv-setup:
	rm -rf .venv
	python3.11 -m venv .venv
	.venv/bin/python -m pip install --upgrade pip
	.venv/bin/python -m pip install -r ./requirements.txt
# Write a template sub.env holding a SUB_ID placeholder for the user to fill in.
# The redirection truncates the file, so an existing sub.env is overwritten.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: sub-init
sub-init:
	echo "SUB_ID=<enter subscription name>" > sub.env
# Run ./setup/create-resources.sh. Besides setting up the infrastructure, this
# also converts the blob stores to ADLS Gen 2.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: infra
infra:
	./setup/create-resources.sh
# Run ./common/cluster.py with the virtual environment's interpreter
# (presumably provisions the compute cluster; see the script for details).
# Declared phony because the target name is a command, not a file it produces.
.PHONY: create-cluster
create-cluster:
	.venv/bin/python ./common/cluster.py
# Run ./common/env.py once per environment, passing the environment name and
# the conda file that describes it.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: create-env
create-env:
	.venv/bin/python ./common/env.py --name "pdf-split-env" --conda_file "./config/pdf-env.yml"
	.venv/bin/python ./common/env.py --name "form-rec-env" --conda_file "./config/form-rec-env.yml"
	.venv/bin/python ./common/env.py --name "parallel-env" --conda_file "./config/parallel-env.yml"
	.venv/bin/python ./common/env.py --name "blob-env" --conda_file "./config/blob.yml"
# Blob container names, looked up in variables.env.
# The definitions are recursive (`=`) on purpose: the shell pipeline only runs
# when a recipe actually expands one of these variables, so targets that do not
# need them never touch variables.env.

# read_blob_setting: for every line of variables.env containing $(1), take the
# second "="-separated field and pass it through xargs, which trims the
# surrounding whitespace.
read_blob_setting = $(shell cat variables.env | grep "$(1)" | cut -d "=" -f 2 | xargs)

pdf_blob   = $(call read_blob_setting,BLOB_CONTAINER_PDF)
image_blob = $(call read_blob_setting,BLOB_CONTAINER_IMAGES)
text_blob  = $(call read_blob_setting,BLOB_CONTAINER_TXT)
final_blob = $(call read_blob_setting,BLOB_CONTAINER_FINAL)
# Run ./common/datastore.py once per blob container, pairing each container
# name read from variables.env with a fixed datastore name and description.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: create-datastores
create-datastores:
	.venv/bin/python ./common/datastore.py --container_name $(pdf_blob) \
		--datastore_name "pdfinputfiles" \
		--datastore_desc "PDF input files"
	.venv/bin/python ./common/datastore.py --container_name $(image_blob) \
		--datastore_name "pdfimages" \
		--datastore_desc "Page level PDF files"
	.venv/bin/python ./common/datastore.py --container_name $(text_blob) \
		--datastore_name "textfiles" \
		--datastore_desc "Page level text files"
	.venv/bin/python ./common/datastore.py --container_name $(final_blob) \
		--datastore_name "textfilesfinal" \
		--datastore_desc "Final text files"
# Datastore URIs handed to the job and pipeline targets below. The double
# quotes are part of each value and are consumed by the shell in the recipes.
primary_datastore="azureml://datastores/pdfinputfiles/paths/"
intermediate_pdf_datastore="azureml://datastores/pdfimages/paths/"
intermediate_txt_datastore="azureml://datastores/textfiles/paths/"
output_datastore="azureml://datastores/textfilesfinal/paths/"

# single-job and seq-pipeline reference $(intermediate_datastore), which is not
# assigned anywhere in the visible part of this Makefile, so their flags would
# receive no value. `?=` supplies a fallback without overriding a definition
# that already exists (local-ops.mk, the environment or the command line).
# NOTE(review): the page-image datastore is assumed to be the intended
# intermediate location (seq-pipeline goes PDF -> images first) - confirm.
intermediate_datastore ?= $(intermediate_pdf_datastore)

## Single step experiment
# Run ./single-step-job/main.py, reading from $(primary_datastore) and writing
# to $(intermediate_datastore).
# The first recipe line aborts with a clear message when
# $(intermediate_datastore) is empty; otherwise --output_datastore would be
# passed without a value. When the variable is set the line expands to nothing.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: single-job
single-job:
	$(if $(strip $(intermediate_datastore)),,$(error intermediate_datastore is not set; define it in the Makefile or pass intermediate_datastore=... on the command line))
	.venv/bin/python ./single-step-job/main.py --input_datastore $(primary_datastore) \
		--output_datastore $(intermediate_datastore)
# Sequential pipeline: PDF -> images, and then form recognizer on those images.
# The first recipe line aborts with a clear message when
# $(intermediate_datastore) is empty; otherwise --intermediate_datastore would
# be passed without a value. When the variable is set the line expands to
# nothing.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: seq-pipeline
seq-pipeline:
	$(if $(strip $(intermediate_datastore)),,$(error intermediate_datastore is not set; define it in the Makefile or pass intermediate_datastore=... on the command line))
	.venv/bin/python ./pipeline-sequential/main.py \
		--input_datastore $(primary_datastore) \
		--intermediate_datastore $(intermediate_datastore) \
		--output_datastore $(output_datastore)
# Similar to the sequential pipeline, but run in parallel, with separate
# intermediate datastores for the page-level PDFs and the page-level text.
# Note that before running this, you should insert the Form Recognizer key
# into the second component.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: par-pipeline
par-pipeline:
	.venv/bin/python ./pipeline-parallel/main.py \
		--input_datastore $(primary_datastore) \
		--intermediate_pdf_datastore $(intermediate_pdf_datastore) \
		--intermediate_txt_datastore $(intermediate_txt_datastore) \
		--output_datastore $(output_datastore)
# Helpers for committing and syncing the local branch.
# Both values are recursive (`=`), so git and date are only invoked when a
# recipe expands them.

# Short name of the branch HEAD currently points to.
branch = $(shell git symbolic-ref --short HEAD)

# Timestamp in date's '+%F_%H:%M:%S' format, used in the commit message.
now = $(shell date '+%F_%H:%M:%S' )
# Stage everything under the current directory, commit it with a timestamped
# message, and push the current branch to origin (setting it as upstream).
# The commands are chained with && so a failing step stops the rest.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: git-push
git-push:
	git add . && git commit -m "Changes as of $(now)" && git push -u origin $(branch)
# Pull the current branch from origin.
# Declared phony because the target name is a command, not a file it produces.
.PHONY: git-pull
git-pull:
	git pull origin $(branch)