From 5aef43910abe72c1150afb750cb956aa9684adca Mon Sep 17 00:00:00 2001
From: Derek A Dombek <50093944+derekadombek@users.noreply.github.com>
Date: Wed, 7 Aug 2024 08:40:30 -0600
Subject: [PATCH] OCR CI build artifact (#153)

* init for [IDWA-OCR-72] Install OCR into an executable

* edit readme with the build command

* add refs to form_filled to use this without args

* lint

* lint

* lint

* build/upload artifact

* pip install pyinstaller

* pip install pyinstaller

* rm dev

* put build in dependencies

* add requirements.txt and use pyinstaller cli

* rm working directory

* point to main and not dir main

* try dist/ and --onefile

* try -windowed

* dist/main

* upgrade upload action

* macos-latest

* try using assets from tests

* try using assets from tests

* revert back to dup assets

* CLI for better handling of arguments

* lint

* lint

* rm args with pyinstaller because of new cli

* use ^ with version

* install docopt with gh action job

* rm unused assets in the ocr dir

* docs

* docs

* upload bin for each os

* matrix exp

* matrix exp

* matrix exp

* zip

* zip

* check path

* check path

* check path

* whoops

* try gzexe

* &&

* wip

* try building release

* fix the needs:

* try building release

* try building release

* try building release

* try building release

* add checkout

* change from action

* add another checkout

* add paths

* try using workflow_call

* try using workflow_call

* wip

* using download action

* using download action

* try that

* token

* try for loop

* dont use matrix

* token ref

* try with workspace

* try with workspace

* try with workspace

* whoops

* working dir

* working dir

* add --repo

* think i got it

* try encoding with jq

* try encoding with jq

* github.repository

* full url

* matrix again

* see what dir we're in

* path to artifacts

* put everything in first job with create

* put everything in first job with create

* upload all in dir

* write all

* try dif action

* try dif action

* try with content

* upgrade action

* new output

* upgrade upload and download versions

* just path for download

* ls

* ls

* add to upload

* try full workflow

* forgot to switch needs job

* again

* fix file names

* fix file names

* change release title name

* missed diffs

* clean-up

* try changing ref

* try changing ref

* try changing ref

* try changing ref

* try changing ref

* try changing ref

* try changing ref

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* create and see if upload asset chooses it

* use ref at checkout

* ls

* use ref at checkout

* try new upload action

* fix script

* create tag again

* full workflow

* full workflow

* full workflow

* cleaned up and made as workflow_dispatch

---------

Co-authored-by: Derek Dombek <derek.a.dombek.com>
---
 .github/workflows/build-ocr.yml   | 53 +++++++++++++++++++++++++++++++
 .github/workflows/release-ocr.yml | 38 ++++++++--------------
 OCR/README.md                     |  4 +++
 OCR/ocr/pyinstaller.py            | 23 ++++++++++++++
 OCR/poetry.lock                   |  7 ++--
 OCR/pyproject.toml                |  1 +
 requirements.txt                  |  7 ++++
 7 files changed, 106 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/build-ocr.yml
 create mode 100644 OCR/ocr/pyinstaller.py
 create mode 100644 requirements.txt

diff --git a/.github/workflows/build-ocr.yml b/.github/workflows/build-ocr.yml
new file mode 100644
index 00000000..ce30992a
--- /dev/null
+++ b/.github/workflows/build-ocr.yml
@@ -0,0 +1,53 @@
+name: Build & Upload OCR Binaries
+on:
+  workflow_call:
+    outputs:
+      output-file:
+        description: "File name of the artifact reported by the build job"
+        value: ${{ jobs.build.outputs.output_artifacts }}
+  workflow_dispatch:
+      
+jobs:
+  build:  # one matrix leg per OS; each leg builds a one-file binary with PyInstaller and uploads it
+    strategy:
+      matrix:
+        include:
+          - os: macos-latest
+            name: macos
+            cmd: >
+              pyinstaller -F -w -n main-macos ./OCR/ocr/main.py &&
+              cd dist/ &&
+              zip -r9 main-macos main-macos
+            out_file: main-macos.zip  # file under dist/ that the upload step publishes
+          - os: windows-latest
+            name: windows
+            cmd: pyinstaller -F -w -n main-windows ./OCR/ocr/main.py
+            out_file: main-windows.exe  # uploaded as-is, no zip step on Windows
+          - os: ubuntu-latest
+            name: ubuntu
+            cmd: >
+              pyinstaller -F -w -n main-ubuntu ./OCR/ocr/main.py &&
+              cd dist/ &&
+              zip -r9 main-ubuntu main-ubuntu
+            out_file: main-ubuntu.zip  # file under dist/ that the upload step publishes
+    runs-on: ${{ matrix.os }}
+    outputs:
+      output_artifacts: ${{ matrix.out_file }}  # all matrix legs write this one output; the last leg to set it wins
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt pyinstaller
+          pip install docopt
+      - name: Build binaries for all OS's
+        run: ${{ matrix.cmd }}
+      - name: Upload Artifacts To Workflow
+        uses: actions/upload-artifact@v4
+        id: artifacts
+        with:
+          name: main-${{ matrix.name }}  # artifact name, distinct per OS
+          path: ./dist/${{ matrix.out_file }}
diff --git a/.github/workflows/release-ocr.yml b/.github/workflows/release-ocr.yml
index 420857fc..f25b4419 100644
--- a/.github/workflows/release-ocr.yml
+++ b/.github/workflows/release-ocr.yml
@@ -1,47 +1,38 @@
 name: Release MDE-OCR artifacts
+run-name: Release MDE-OCR artifacts - by @${{ github.actor }}
 on:
-    # workflow_dispatch:
-    #     inputs:
-    #         tag:
-    #             description: 'target environment'
-    #             required: true
-    push:
-        branches:
-          - idwa-ocr-ci-for-executable
-        paths:
-          - .github/workflows/release-ocr.yml
-          - .github/workflows/build-ocr.yml
-          - OCR/**
-        # tags:
-        #     - 'v*'
+    workflow_dispatch:
+        inputs:
+            tag:
+                description: 'Version tag for new release'
+                required: true
 jobs:
   create-release:
     name: Create Release
-    
     runs-on: [ubuntu-latest]
     permissions:
         contents: write
     steps:
     - uses: actions/checkout@v4
     - name: Create tag
-      uses: actions/github-script@v5
+      uses: actions/github-script@v7
       with:
         script: |
             github.rest.git.createRef({
                 owner: context.repo.owner,
                 repo: context.repo.repo,
-                ref: 'refs/tags/1.0.0',
+                ref: 'refs/tags/' + context.payload.inputs.tag,  // read from the dispatch payload at runtime, not templated into the script
                 sha: context.sha
             })
     - name: Create release
       id: create_release
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        tag: ${{ github.ref_name }}
+        tag: ${{ github.event.inputs.tag }}
       run: |
         gh release create "$tag" \
             --repo="$GITHUB_REPOSITORY" \
-            --title="MDE-OCR ${tag#v}" \
+            --title="MDE-OCR ${tag}" \
             --generate-notes
     - name: Output Release URL File
       run: echo "${{ steps.create_release.outputs.upload_url }}" > release_url.txt
@@ -62,9 +53,8 @@ jobs:
           with:
             path: artifacts
             merge-multiple: true
-        - name: Upload release binaries
-          uses: alexellis/upload-assets@0.4.1
-          env:
-            GITHUB_TOKEN: ${{ github.token }}
+        - name: Release Upload Assets
+          uses: jaywcjlove/github-action-upload-assets@main
           with:
-            asset_paths: '["./artifacts/*"]'
\ No newline at end of file
+            tag: ${{ github.event.inputs.tag }}
+            asset-path: '["./artifacts/*"]'
diff --git a/OCR/README.md b/OCR/README.md
index acbb0537..f7277d12 100644
--- a/OCR/README.md
+++ b/OCR/README.md
@@ -29,6 +29,10 @@ Run main, hoping to convert this to a cli at some point
 poetry run main
 ```
 
+To build the OCR service into an executable artifact
+```shell
+poetry run build
+```
 
 Adding new dependencies
 ```shell
diff --git a/OCR/ocr/pyinstaller.py b/OCR/ocr/pyinstaller.py
new file mode 100644
index 00000000..64bab811
--- /dev/null
+++ b/OCR/ocr/pyinstaller.py
@@ -0,0 +1,23 @@
+import PyInstaller.__main__
+from pathlib import Path
+
+HERE = Path(__file__).parent.absolute()
+path_to_main = str(HERE / "main.py")
+
+
+# Packages the OCR entry point (main.py next to this file) into a single executable via PyInstaller.
+# The command line works too: `pyinstaller ./OCR/ocr/main.py -F -w` does the same as the function below.
+# To bundle asset files, uncomment and adapt the `--add-data` examples below.
+def install():
+    PyInstaller.__main__.run(
+        [
+            path_to_main,
+            "--onefile",
+            "--windowed",
+            # --add-data entries use the form SOURCE:DESTINATION
+            # "--add-data=ocr/assets/form_filled.png:assets/",
+            # "--add-data=ocr/assets/form_segmention_template.png:assets/",
+            # "--add-data=ocr/assets/labels.json:assets/",
+            # other pyinstaller options...
+        ]
+    )
diff --git a/OCR/poetry.lock b/OCR/poetry.lock
index 8c0f6a43..39059386 100644
--- a/OCR/poetry.lock
+++ b/OCR/poetry.lock
@@ -246,13 +246,13 @@ tqdm = ["tqdm"]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.23.4"
+version = "0.23.5"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.23.4-py3-none-any.whl", hash = "sha256:3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037"},
-    {file = "huggingface_hub-0.23.4.tar.gz", hash = "sha256:35d99016433900e44ae7efe1c209164a5a81dbbcd53a52f99c281dcd7ce22431"},
+    {file = "huggingface_hub-0.23.5-py3-none-any.whl", hash = "sha256:d7a7d337615e11a45cc14a0ce5a605db6b038dc24af42866f731684825226e90"},
+    {file = "huggingface_hub-0.23.5.tar.gz", hash = "sha256:67a9caba79b71235be3752852ca27da86bd54311d2424ca8afdb8dda056edf98"},
 ]
 
 [package.dependencies]
@@ -1158,6 +1158,7 @@ python-versions = ">=3.8"
 files = [
     {file = "PyMuPDFb-1.24.6-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:21e3ed890f736def68b9a031122ae1fb854d5cb9a53aa144b6e2ca3092416a6b"},
     {file = "PyMuPDFb-1.24.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:8704d2dfadc9448ce184597d8b0f9c30143e379ac948a517f9c4db7c0c71ed51"},
+    {file = "PyMuPDFb-1.24.6-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01662584d5cfa7a91f77585f13fc23a12291cfd76a57e0a28dd5a56bf521cb2c"},
     {file = "PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1f7657353529ae3f88575c83ee49eac9adea311a034b9c97248a65cee7df0e5"},
     {file = "PyMuPDFb-1.24.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:cebc2cedb870d1e1168e2f502eb06f05938f6df69103b0853a2b329611ec19a7"},
     {file = "PyMuPDFb-1.24.6-py3-none-win32.whl", hash = "sha256:ac4b865cd1e239db04674f85e02844a0e405f8255ee7a74dfee0d86aad0d3576"},
diff --git a/OCR/pyproject.toml b/OCR/pyproject.toml
index 939325ef..7c374173 100644
--- a/OCR/pyproject.toml
+++ b/OCR/pyproject.toml
@@ -34,6 +34,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 main = "ocr.main:main"
+build = "ocr.pyinstaller:install"
 
 [tool.ruff]
 line-length = 118
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..2fba91a4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+numpy==1.26.4
+opencv-python==4.9.0.80
+python-dotenv==1.0.1
+Pillow>=10.3.0
+torch==1.13.1
+docopt==0.6.2
+git+https://github.com/huggingface/transformers.git
\ No newline at end of file