From d4896cfdb6af4047b6b1b9626b2963675b10cf7a Mon Sep 17 00:00:00 2001
From: Nick Stogner
Date: Mon, 15 Jan 2024 21:37:17 -0500
Subject: [PATCH] General housekeeping (#54)

* Update readme

* Change test names / directories

* Update development.md guide

NOTE: No actual changes to testing code were made. Changes to workflow files
are mostly for readability of files and also of the checks as displayed on PRs.
---
 .../{docker-push.yml => build-push.yml}       | 43 ++++++---
 .github/workflows/integration-tests.yml       | 23 -----
 .github/workflows/load-tests.yml              | 25 ++++--
 .github/workflows/system-test-kind.yml        | 34 -------
 .github/workflows/tests.yml                   | 52 +++++++++++
 Makefile                                      | 13 ++-
 README.md                                     | 90 +++++++++----------
 docs/backends.md                              |  8 +-
 docs/development.md                           | 56 +++++++++---
 ..._bench_test.go => endpoints_bench_test.go} |  0
 tests/{system-test-kind.sh => e2e/test.sh}    |  0
 tests/{ => e2e}/test_openai_embedding.py      |  0
 12 files changed, 198 insertions(+), 146 deletions(-)
 rename .github/workflows/{docker-push.yml => build-push.yml} (51%)
 delete mode 100644 .github/workflows/integration-tests.yml
 delete mode 100644 .github/workflows/system-test-kind.yml
 create mode 100644 .github/workflows/tests.yml
 rename pkg/endpoints/{endponts_bench_test.go => endpoints_bench_test.go} (100%)
 rename tests/{system-test-kind.sh => e2e/test.sh} (100%)
 rename tests/{ => e2e}/test_openai_embedding.py (100%)

diff --git a/.github/workflows/docker-push.yml b/.github/workflows/build-push.yml
similarity index 51%
rename from .github/workflows/docker-push.yml
rename to .github/workflows/build-push.yml
index 5bf09b94..83e66465 100644
--- a/.github/workflows/docker-push.yml
+++ b/.github/workflows/build-push.yml
@@ -1,4 +1,4 @@
-name: Create and publish a Docker image
+name: Build and Push
 
 # Configures this workflow to run every time a change is pushed to the branch called `release`.
 on:
@@ -8,44 +8,59 @@ on:
     tags:
       - "v*.*.*"
     paths-ignore:
-      - 'README.md'
+      - "README.md"
   pull_request:
 
-# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
+# Defines two custom environment variables for the workflow.
+# These are used for the Container registry domain, and a name
+# for the Docker image that this workflow builds.
 env:
   REGISTRY: ghcr.io
   IMAGE_NAME: substratusai/lingo
 
-# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
+# There is a single job in this workflow.
+# It's configured to run on the latest available version of Ubuntu.
 jobs:
-  build-and-push-image:
+  container:
     runs-on: ubuntu-latest
+
     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
     permissions:
       contents: read
       packages: write
-    #
+
     steps:
      - name: Checkout repository
        uses: actions/checkout@v4
-      # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
-      - if: github.event_name == 'push'
-        name: Log in to the Container registry
+
+      # Uses the `docker/login-action` action to log in to the Container registry
+      # using the account and password that will publish the packages. Once published, the
+      # packages are scoped to the account defined here.
+      - name: Log in to the Container registry
+        if: github.event_name == 'push'
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
+
+      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about)
+      # to extract tags and labels that will be applied to the specified image. The `id` "meta"
+      # allows the output of this step to be referenced in a subsequent step. The `images` value
+      # provides the base name for the tags and labels.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-      # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
-      # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
-      # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
-      - name: Build and push Docker image
+
+      # This step uses the `docker/build-push-action` action to build the image, based on your
+      # repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
+      # It uses the `context` parameter to define the build's context as the set of files located
+      # in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)"
+      # in the README of the `docker/build-push-action` repository. It uses the `tags` and `labels`
+      # parameters to tag and label the image with the output from the "meta" step.
+      - name: Build and push container image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
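A quick way to sanity-check the renamed workflow end to end is to pull the image it publishes and run it via the chart. This is a sketch, not part of the patch: the `main` tag is an assumption based on `docker/metadata-action`'s default branch tagging, and the Helm flags mirror docs/development.md below.

```bash
# Pull the image the workflow publishes. The registry and image name come
# from the env block above; the "main" tag assumes metadata-action's
# default tag for pushes to the default branch.
docker pull ghcr.io/substratusai/lingo:main

# Or run the freshly pushed image via the Helm chart, as in docs/development.md.
helm upgrade --install lingo substratusai/lingo \
  --set image.tag=main \
  --set image.pullPolicy=Always
```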
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
deleted file mode 100644
index 4d94880b..00000000
--- a/.github/workflows/integration-tests.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Integration Tests
-run-name: Integration Tests by @${{ github.actor }}
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-
-jobs:
-  install:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-      - name: Setup Go
-        uses: actions/setup-go@v4
-        with:
-          go-version: '>=1.21.0'
-      - name: Run race tests
-        run: make test-race
-      - name: Run integration tests
-        run: make test-integration
diff --git a/.github/workflows/load-tests.yml b/.github/workflows/load-tests.yml
index c80e7d6d..8741d8a6 100644
--- a/.github/workflows/load-tests.yml
+++ b/.github/workflows/load-tests.yml
@@ -4,34 +4,43 @@
 on: workflow_dispatch
 
 jobs:
-  perf-tests:
+  k6:
    runs-on: ubuntu-latest
+
    permissions:
      contents: "read"
      id-token: "write"
+
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
-      - id: "auth"
+
+      - name: Authenticate with GCP
        uses: "google-github-actions/auth@v1"
        with:
          workload_identity_provider: "projects/819220466562/locations/global/workloadIdentityPools/github/providers/my-repo"
          service_account: "github-actions@substratus-dev.iam.gserviceaccount.com"
-      - name: "Set up Cloud SDK"
+
+      - name: Set up cloud SDK
        uses: "google-github-actions/setup-gcloud@v1"
-      - id: "get-credentials"
+
+      - name: Get credentials
        uses: "google-github-actions/get-gke-credentials@v1"
        with:
          cluster_name: "lingo-dev"
          location: "us-central1"
-      - id: "get-pods"
+
+      - name: Get pods
        run: "kubectl get pods"
-      - id: "install-skaffold"
+
+      - name: Install skaffold
        run: |
          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
          chmod +x skaffold
          sudo mv skaffold /usr/local/bin
-      - id: "configure-docker"
+
+      - name: Configure docker
        run: gcloud auth configure-docker -q us-central1-docker.pkg.dev
-      - id: "run-tests"
+
+      - name: Run tests
        run: "./tests/load/test.sh"
diff --git a/.github/workflows/system-test-kind.yml b/.github/workflows/system-test-kind.yml
deleted file mode 100644
index 2bba5dd4..00000000
--- a/.github/workflows/system-test-kind.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: System Tests for kind
-run-name: System Tests for kind by @${{ github.actor }}
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-
-jobs:
-  install:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Install Kind
-        run: |
-          # For AMD64 / x86_64
-          [ $(uname -m) = x86_64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
-          # For ARM64
-          [ $(uname -m) = aarch64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-arm64
-          chmod +x ./kind
-          sudo mv ./kind /usr/local/bin/kind
-
-      - name: Install Skaffold
-        run: |
-          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
-          chmod +x skaffold
-          sudo mv skaffold /usr/local/bin
-
-      - name: Run system test kind
-        run: bash tests/system-test-kind.sh
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..9025d2cb
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,52 @@
+name: Tests
+run-name: Run tests by @${{ github.actor }}
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  unit-and-integration:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Setup Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: ">=1.21.0"
+
+      - name: Run unit tests
+        run: make test-unit
+
+      - name: Run integration tests
+        run: make test-integration
+
+  e2e:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Install kind
+        run: |
+          # For AMD64 / x86_64
+          [ $(uname -m) = x86_64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
+          # For ARM64
+          [ $(uname -m) = aarch64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-arm64
+          chmod +x ./kind
+          sudo mv ./kind /usr/local/bin/kind
+
+      - name: Install skaffold
+        run: |
+          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
+          chmod +x skaffold
+          sudo mv skaffold /usr/local/bin
+
+      - name: Run e2e tests
+        run: make test-e2e
diff --git a/Makefile b/Makefile
index ffe6eace..1826ea5b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,20 @@
 ENVTEST_K8S_VERSION = 1.27.1
 
 .PHONY: test
-test: test-unit
-
-.PHONY: test-all
-test-all: test-race test-integration
+test: test-unit test-integration test-e2e
 
 .PHONY: test-unit
 test-unit:
-	go test -mod=readonly ./pkg/...
-
-.PHONY: test-race
-test-race:
 	go test -mod=readonly -race ./pkg/...
 
 .PHONY: test-integration
 test-integration: envtest
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./tests/integration -v
 
+.PHONY: test-e2e
+test-e2e:
+	./tests/e2e/test.sh
+
 ## Location to install dependencies to
 LOCALBIN ?= $(shell pwd)/bin
 $(LOCALBIN):
diff --git a/README.md b/README.md
index 770ca0b8..8c8f2954 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,41 @@
-# Lingo - K8s LLM Proxy + Scaler
+# Lingo
 
-Lingo is an OpenAI compatible LLM proxy and autoscaler for K8s
+Lingo is a lightweight, scale-from-zero ML model proxy that runs on Kubernetes. Lingo allows you to run text-completion and embedding servers in your own project without changing any of your OpenAI client code.
 
-![lingo demo](lingo.gif)
-
-🚀 Serve popular OSS LLM models in minutes on CPUs or GPUs
-🧮 Serve Embedding Model servers
-⚖️ Automatically scale up and down, all the way to 0
-🪄 Built-in proxy that batches requests while scaling magic happens
-🛠️ Easy to install, No complex dependencies such as Istio or Knative
-☁️ Provide a unified API across clouds for serving LLMs
-
-Support the project by adding a star! ❤️
+🚀 Serve OSS LLMs on CPUs or GPUs
+✅️ Compatible with the OpenAI API
+⚖️ Scale from zero, autoscale based on load
+… Queue requests to avoid overloading models
+🛠️ Zero dependencies (no Istio, Knative, etc.)
+⦿ Namespaced - no cluster privileges needed
 
-Join us on Discord: discord-invite
-## Quickstart (Any K8s, Kind, GKE, EKS etc)
-Add the Helm repo:
+Support the project by adding a star! ⭐️
+
+![lingo demo](lingo.gif)
+
+## Quickstart
+
+This quickstart will walk through installing Lingo and demonstrate how it scales models from zero. This should work on any Kubernetes cluster (GKE, EKS, AKS, Kind).
+
+Start by adding and updating the Substratus Helm repo.
+
 ```bash
 helm repo add substratusai https://substratusai.github.io/helm
 helm repo update
 ```
-Install the Lingo controller and proxy:
+
+Install Lingo.
+
 ```bash
 helm install lingo substratusai/lingo
 ```
-Deploy an embedding model:
+
+Deploy an embedding model (runs on CPUs).
+
 ```bash
 helm upgrade --install stapi-minilm-l6-v2 substratusai/stapi -f - << EOF
 model: all-MiniLM-L6-v2
@@ -40,7 +45,8 @@ deploymentAnnotations:
 EOF
 ```
 
-Deploy a LLM (mistral-7b-instruct) using vLLM:
+Deploy the Mistral 7B Instruct LLM using vLLM (GPUs are required).
+
 ```bash
 helm upgrade --install mistral-7b-instruct substratusai/vllm -f - << EOF
 model: mistralai/Mistral-7B-Instruct-v0.1
@@ -54,21 +60,23 @@ deploymentAnnotations:
   lingo.substratus.ai/max-replicas: "3" # needs to be string
 EOF
 ```
-Notice how the deployment has 0 replicas. That's fine because Lingo
-will automatically scale the embedding model server from 0 to 1
-once there is an incoming HTTP request.
 
-By default, the proxy is only accessible within the Kubernetes cluster. To access it from your local machine, set up a port forward:
+All model deployments currently have 0 replicas. Lingo will scale the Deployment in response to the first HTTP request.
+
+By default, the proxy is only accessible within the Kubernetes cluster. To access it from your local machine, set up a port forward.
+
 ```bash
 kubectl port-forward svc/lingo 8080:80
 ```
 
-In a separate terminal watch the pods:
+In a separate terminal, watch the Pods.
+
 ```bash
 watch kubectl get pods
 ```
 
-Get embeddings by using the OpenAI compatible HTTP API:
+Get embeddings by using the OpenAI compatible HTTP API.
+
 ```bash
 curl http://localhost:8080/v1/embeddings \
   -H "Content-Type: application/json" \
@@ -77,35 +85,27 @@ curl http://localhost:8080/v1/embeddings \
     "model": "text-embedding-ada-002"
   }'
 ```
-You should see a stapi pod being created on the fly that
-will serve the request. The beautiful thing about Lingo
-is that it holds your request in the proxy while the
-stapi pod is being created, once it's ready to serve, Lingo
-send the request to the stapi pod. The end-user does not
-see any errors and gets the response to their request.
-
-Similarly, send a request to the mistral-7b-instruct model that
-was deployed:
+
+You should see a model Pod being created on the fly that
+will serve the request. The first request will wait for this Pod to become ready.
+
+If you deployed the Mistral 7B LLM, try sending it a request as well.
+
 ```bash
 curl http://localhost:8080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{"model": "mistral-7b-instruct-v0.1", "prompt": "[INST]Who was the first president of the United States?[/INST]", "max_tokens": 40}'
 ```
-The first request to an LLM takes longer because
-those models require a GPU and require additional time
-to download the model.
-
-What else would you like to see? Join our Discord and ask directly.
 
-## Roadmap
+The first request to an LLM takes longer because of the size of the model. Subsequent requests should be much quicker.
 
-* HA for the proxy controller
-* Response Request Caching
-* Model caching to speed up auto scaling for LLMs
-* Authentication
-* Multi cluster serving
+Check out [substratus.ai](https://www.substratus.ai) to learn more about the managed hybrid-SaaS offering. Substratus allows you to run Lingo in your cloud account, while benefiting from extensive cluster performance addons that can dramatically reduce startup times and boost throughput.
 
 ## Creators
-Feel free to contact any of us:
+
+Let us know about features you are interested in seeing or reach out with questions. [Visit our Discord channel](https://discord.gg/JeXhcmjZVm) to join the discussion!
+
+Or just reach out on LinkedIn if you want to connect:
+
 * [Nick Stogner](https://www.linkedin.com/in/nstogner/)
 * [Sam Stoelinga](https://www.linkedin.com/in/samstoelinga/)
diff --git a/docs/backends.md b/docs/backends.md
index 580b020e..29b68b95 100644
--- a/docs/backends.md
+++ b/docs/backends.md
@@ -2,10 +2,14 @@
 
 Lingo backends are expected to serve models via an OpenAI-compatible API.
 
+## Routing
+
 Lingo will select a backend based on the `X-Model` header or the `.model` field in the JSON request body.
 
 ## Deployments
 
+Lingo manages the replicas of Kubernetes Deployments.
+
 Annotations:
 
 | Annotation | Required | Default | Description |
@@ -16,6 +20,8 @@ Annotations:
 
 ## Services
 
-* Lingo will forward traffic to a backend Service with the same name as the relevant Deployment.
+Lingo will keep track of Pods associated with Kubernetes Services tied to backend Deployments.
+
+* Lingo will forward traffic to a backend Service with the same name as the annotated Deployment.
 * If one port exists, lingo will send traffic to it.
 * If more than one port exists, lingo will send traffic to the port named `http`.
diff --git a/docs/development.md b/docs/development.md
index 0191bc82..708491ec 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -1,8 +1,33 @@
 # Development
 
+## Testing
+
+Run all tests (takes a while).
+```sh
+make test
+```
+
+*OR*
+
+Run specific tests.
+
+```bash
+make test-unit
+make test-integration
+make test-e2e
+```
+
+## Local Deployment
+
+Create a local cluster.
+
 ```sh
 kind create cluster
+```
 
+Install a scaled-to-zero embeddings backend.
+
+```sh
 # Install STAPI
 helm repo add substratusai https://substratusai.github.io/helm
 helm repo update
@@ -12,11 +37,27 @@ replicaCount: 0
 deploymentAnnotations:
   lingo.substratus.ai/models: text-embedding-ada-002
 EOF
+```
+
+Deploy Lingo from source.
 
-# Deploy
+```sh
 skaffold dev
+```
+
+*OR*
+
+Deploy Lingo from the main branch.
+
+```bash
+helm upgrade --install lingo substratusai/lingo \
+  --set image.tag=main \
+  --set image.pullPolicy=Always
+```
+
+Send test requests.
+
+```sh
 # In another terminal...
 kubectl port-forward svc/lingo 8080:80
 # In another terminal...
@@ -31,22 +72,11 @@
 curl http://localhost:8080/delay/10 \
   -H "Content-Type: application/json" \
   -d '{
     "input": "Your text string goes here",
     "model": "text-embedding-ada-002"
   }'
 
-# Get embeddings using OpenAI compatible API endpoint
+# Get embeddings using OpenAI compatible API endpoint.
 curl http://localhost:8080/v1/embeddings \
   -H "Content-Type: application/json" \
   -d '{
     "input": "Your text string goes here",
     "model": "text-embedding-ada-002"
   }'
-
-# Install vLLM with facebook opt 125
-
-
 ```
-
-Installing the latest development release (main branch):
-```bash
-helm upgrade --install lingo substratusai/lingo \
-  --set image.tag=main \
-  --set image.pullPolicy=Always
-```
\ No newline at end of file
diff --git a/pkg/endpoints/endponts_bench_test.go b/pkg/endpoints/endpoints_bench_test.go
similarity index 100%
rename from pkg/endpoints/endponts_bench_test.go
rename to pkg/endpoints/endpoints_bench_test.go
diff --git a/tests/system-test-kind.sh b/tests/e2e/test.sh
similarity index 100%
rename from tests/system-test-kind.sh
rename to tests/e2e/test.sh
diff --git a/tests/test_openai_embedding.py b/tests/e2e/test_openai_embedding.py
similarity index 100%
rename from tests/test_openai_embedding.py
rename to tests/e2e/test_openai_embedding.py
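The docs/backends.md changes above describe routing and Deployment annotations only in prose. The following is a minimal sketch of exercising both by hand, assuming a port-forwarded proxy as in the README; the Deployment and model name `my-model` is a placeholder, and the annotation keys come from docs/backends.md and the README examples.

```bash
# Register an existing backend Deployment with Lingo so it manages the
# replica count ("my-model" is a hypothetical Deployment/model name).
kubectl annotate deployment my-model \
  lingo.substratus.ai/models=my-model \
  lingo.substratus.ai/max-replicas=3

# Lingo routes on either the X-Model header or the .model field in the
# JSON body (per docs/backends.md); both are set here since most backends
# also validate the body.
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -H "X-Model: my-model" \
  -d '{"model": "my-model", "prompt": "Hello", "max_tokens": 10}'
```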