From d4896cfdb6af4047b6b1b9626b2963675b10cf7a Mon Sep 17 00:00:00 2001
From: Nick Stogner
Date: Mon, 15 Jan 2024 21:37:17 -0500
Subject: [PATCH] General housekeeping (#54)

* Update readme

* Change test names / directories

* Update development.md guide

NOTE: No actual changes to testing code were made. Changes to workflow files
are mostly for readability of files and also of the checks as displayed on PRs.
---
 .../{docker-push.yml => build-push.yml}       | 43 ++++++---
 .github/workflows/integration-tests.yml       | 23 -----
 .github/workflows/load-tests.yml              | 25 ++++--
 .github/workflows/system-test-kind.yml        | 34 -------
 .github/workflows/tests.yml                   | 52 +++++++++++
 Makefile                                      | 13 ++-
 README.md                                     | 90 +++++++++----------
 docs/backends.md                              |  8 +-
 docs/development.md                           | 56 +++++++++---
 ..._bench_test.go => endpoints_bench_test.go} |  0
 tests/{system-test-kind.sh => e2e/test.sh}    |  0
 tests/{ => e2e}/test_openai_embedding.py      |  0
 12 files changed, 198 insertions(+), 146 deletions(-)
 rename .github/workflows/{docker-push.yml => build-push.yml} (51%)
 delete mode 100644 .github/workflows/integration-tests.yml
 delete mode 100644 .github/workflows/system-test-kind.yml
 create mode 100644 .github/workflows/tests.yml
 rename pkg/endpoints/{endponts_bench_test.go => endpoints_bench_test.go} (100%)
 rename tests/{system-test-kind.sh => e2e/test.sh} (100%)
 rename tests/{ => e2e}/test_openai_embedding.py (100%)

diff --git a/.github/workflows/docker-push.yml b/.github/workflows/build-push.yml
similarity index 51%
rename from .github/workflows/docker-push.yml
rename to .github/workflows/build-push.yml
index 5bf09b94..83e66465 100644
--- a/.github/workflows/docker-push.yml
+++ b/.github/workflows/build-push.yml
@@ -1,4 +1,4 @@
-name: Create and publish a Docker image
+name: Build and Push
 
 # Configures this workflow to run every time a change is pushed to the branch called `release`.
 on:
@@ -8,44 +8,59 @@ on:
     tags:
       - "v*.*.*"
     paths-ignore:
-      - 'README.md'
+      - "README.md"
   pull_request:
 
-# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
+# Defines two custom environment variables for the workflow.
+# These are used for the Container registry domain, and a name
+# for the Docker image that this workflow builds.
 env:
   REGISTRY: ghcr.io
   IMAGE_NAME: substratusai/lingo
 
-# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
+# There is a single job in this workflow.
+# It's configured to run on the latest available version of Ubuntu.
 jobs:
-  build-and-push-image:
+  container:
     runs-on: ubuntu-latest
+
     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
     permissions:
       contents: read
       packages: write
-    #
+
     steps:
      - name: Checkout repository
        uses: actions/checkout@v4
-      # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
-      - if: github.event_name == 'push'
-        name: Log in to the Container registry
+
+      # Uses the `docker/login-action` action to log in to the Container registry
+      # using the account and password that will publish the packages. Once published, the
+      # packages are scoped to the account defined here.
+      - name: Log in to the Container registry
+        if: github.event_name == 'push'
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
+
+      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about)
+      # to extract tags and labels that will be applied to the specified image. The `id` "meta"
+      # allows the output of this step to be referenced in a subsequent step. The `images` value
+      # provides the base name for the tags and labels.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-      # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
-      # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
-      # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
-      - name: Build and push Docker image
+
+      # This step uses the `docker/build-push-action` action to build the image, based on your
+      # repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
+      # It uses the `context` parameter to define the build's context as the set of files located
+      # in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)"
+      # in the README of the `docker/build-push-action` repository. It uses the `tags` and `labels`
+      # parameters to tag and label the image with the output from the "meta" step.
+      - name: Build and push container image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
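A quick way to sanity-check the renamed workflow end to end is to pull the image it publishes and run it via the chart. This is a sketch, not part of the patch: the `main` tag is an assumption based on `docker/metadata-action`'s default branch tagging, and the Helm flags mirror docs/development.md below.

```bash
# Pull the image the workflow publishes. The registry and image name come
# from the env block above; the "main" tag assumes metadata-action's
# default tag for pushes to the default branch.
docker pull ghcr.io/substratusai/lingo:main

# Or run the freshly pushed image via the Helm chart, as in docs/development.md.
helm upgrade --install lingo substratusai/lingo \
  --set image.tag=main \
  --set image.pullPolicy=Always
```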
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
deleted file mode 100644
index 4d94880b..00000000
--- a/.github/workflows/integration-tests.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Integration Tests
-run-name: Integration Tests by @${{ github.actor }}
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-
-jobs:
-  install:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-      - name: Setup Go
-        uses: actions/setup-go@v4
-        with:
-          go-version: '>=1.21.0'
-      - name: Run race tests
-        run: make test-race
-      - name: Run integration tests
-        run: make test-integration
diff --git a/.github/workflows/load-tests.yml b/.github/workflows/load-tests.yml
index c80e7d6d..8741d8a6 100644
--- a/.github/workflows/load-tests.yml
+++ b/.github/workflows/load-tests.yml
@@ -4,34 +4,43 @@
 on: workflow_dispatch
 
 jobs:
-  perf-tests:
+  k6:
    runs-on: ubuntu-latest
+
    permissions:
      contents: "read"
      id-token: "write"
+
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
-      - id: "auth"
+
+      - name: Authenticate with GCP
        uses: "google-github-actions/auth@v1"
        with:
          workload_identity_provider: "projects/819220466562/locations/global/workloadIdentityPools/github/providers/my-repo"
          service_account: "github-actions@substratus-dev.iam.gserviceaccount.com"
-      - name: "Set up Cloud SDK"
+
+      - name: Set up cloud SDK
        uses: "google-github-actions/setup-gcloud@v1"
-      - id: "get-credentials"
+
+      - name: Get credentials
        uses: "google-github-actions/get-gke-credentials@v1"
        with:
          cluster_name: "lingo-dev"
          location: "us-central1"
-      - id: "get-pods"
+
+      - name: Get pods
        run: "kubectl get pods"
-      - id: "install-skaffold"
+
+      - name: Install skaffold
        run: |
          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
          chmod +x skaffold
          sudo mv skaffold /usr/local/bin
-      - id: "configure-docker"
+
+      - name: Configure docker
        run: gcloud auth configure-docker -q us-central1-docker.pkg.dev
-      - id: "run-tests"
+
+      - name: Run tests
        run: "./tests/load/test.sh"
diff --git a/.github/workflows/system-test-kind.yml b/.github/workflows/system-test-kind.yml
deleted file mode 100644
index 2bba5dd4..00000000
--- a/.github/workflows/system-test-kind.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: System Tests for kind
-run-name: System Tests for kind by @${{ github.actor }}
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-
-jobs:
-  install:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Install Kind
-        run: |
-          # For AMD64 / x86_64
-          [ $(uname -m) = x86_64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
-          # For ARM64
-          [ $(uname -m) = aarch64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-arm64
-          chmod +x ./kind
-          sudo mv ./kind /usr/local/bin/kind
-
-      - name: Install Skaffold
-        run: |
-          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
-          chmod +x skaffold
-          sudo mv skaffold /usr/local/bin
-
-      - name: Run system test kind
-        run: bash tests/system-test-kind.sh
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..9025d2cb
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,52 @@
+name: Tests
+run-name: Run tests by @${{ github.actor }}
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  unit-and-integration:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Setup Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: ">=1.21.0"
+
+      - name: Run unit tests
+        run: make test-unit
+
+      - name: Run integration tests
+        run: make test-integration
+
+  e2e:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Install kind
+        run: |
+          # For AMD64 / x86_64
+          [ $(uname -m) = x86_64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
+          # For ARM64
+          [ $(uname -m) = aarch64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-arm64
+          chmod +x ./kind
+          sudo mv ./kind /usr/local/bin/kind
+
+      - name: Install skaffold
+        run: |
+          curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
+          chmod +x skaffold
+          sudo mv skaffold /usr/local/bin
+
+      - name: Run e2e tests
+        run: make test-e2e
diff --git a/Makefile b/Makefile
index ffe6eace..1826ea5b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,20 @@
 ENVTEST_K8S_VERSION = 1.27.1
 
 .PHONY: test
-test: test-unit
-
-.PHONY: test-all
-test-all: test-race test-integration
+test: test-unit test-integration test-e2e
 
 .PHONY: test-unit
 test-unit:
-	go test -mod=readonly ./pkg/...
-
-.PHONY: test-race
-test-race:
 	go test -mod=readonly -race ./pkg/...
 
 .PHONY: test-integration
 test-integration: envtest
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./tests/integration -v
 
+.PHONY: test-e2e
+test-e2e:
+	./tests/e2e/test.sh
+
 ## Location to install dependencies to
 LOCALBIN ?= $(shell pwd)/bin
 $(LOCALBIN):
diff --git a/README.md b/README.md
index 770ca0b8..8c8f2954 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,41 @@
-# Lingo - K8s LLM Proxy + Scaler
+# Lingo
 
-Lingo is an OpenAI compatible LLM proxy and autoscaler for K8s
+Lingo is a lightweight, scale-from-zero ML model proxy that runs on Kubernetes. Lingo allows you to run text-completion and embedding servers in your own project without changing any of your OpenAI client code.
 
-![lingo demo](lingo.gif)
-
-🚀 Serve popular OSS LLM models in minutes on CPUs or GPUs
-🧮 Serve Embedding Model servers
-⚖️ Automatically scale up and down, all the way to 0
-🪄 Built-in proxy that batches requests while scaling magic happens
-🛠️ Easy to install, No complex dependencies such as Istio or Knative
-☁️ Provide a unified API across clouds for serving LLMs
-
-Support the project by adding a star! ❤️
+🚀 Serve OSS LLMs on CPUs or GPUs
+✅️ Compatible with the OpenAI API
+⚖️ Scale from zero, autoscale based on load
+… Queue requests to avoid overloading models
+🛠️ Zero dependencies (no Istio, Knative, etc.)
+⦿ Namespaced - no cluster privileges needed
 
-Join us on Discord: discord-invite
-## Quickstart (Any K8s, Kind, GKE, EKS etc)
-Add the Helm repo:
+Support the project by adding a star! ⭐️
+
+![lingo demo](lingo.gif)
+
+## Quickstart
+
+This quickstart will walk through installing Lingo and demonstrate how it scales models from zero. This should work on any Kubernetes cluster (GKE, EKS, AKS, Kind).
+
+Start by adding and updating the Substratus Helm repo.
+
 ```bash
 helm repo add substratusai https://substratusai.github.io/helm
 helm repo update
 ```
-Install the Lingo controller and proxy:
+
+Install Lingo.
+
 ```bash
 helm install lingo substratusai/lingo
 ```
-Deploy an embedding model:
+
+Deploy an embedding model (runs on CPUs).
+
 ```bash
 helm upgrade --install stapi-minilm-l6-v2 substratusai/stapi -f - << EOF
 model: all-MiniLM-L6-v2
@@ -40,7 +45,8 @@ deploymentAnnotations:
 EOF
 ```
 
-Deploy a LLM (mistral-7b-instruct) using vLLM:
+Deploy the Mistral 7B Instruct LLM using vLLM (GPUs are required).
+
 ```bash
 helm upgrade --install mistral-7b-instruct substratusai/vllm -f - << EOF
 model: mistralai/Mistral-7B-Instruct-v0.1
@@ -54,21 +60,23 @@ deploymentAnnotations:
   lingo.substratus.ai/max-replicas: "3" # needs to be string
 EOF
 ```
-Notice how the deployment has 0 replicas. That's fine because Lingo
-will automatically scale the embedding model server from 0 to 1
-once there is an incoming HTTP request.
 
-By default, the proxy is only accessible within the Kubernetes cluster. To access it from your local machine, set up a port forward:
+All model deployments currently have 0 replicas. Lingo will scale the Deployment in response to the first HTTP request.
+
+By default, the proxy is only accessible within the Kubernetes cluster. To access it from your local machine, set up a port forward.
+
 ```bash
 kubectl port-forward svc/lingo 8080:80
 ```
 
-In a separate terminal watch the pods:
+In a separate terminal, watch the Pods.
+
 ```bash
 watch kubectl get pods
 ```
 
-Get embeddings by using the OpenAI compatible HTTP API:
+Get embeddings by using the OpenAI compatible HTTP API.
+
 ```bash
 curl http://localhost:8080/v1/embeddings \
   -H "Content-Type: application/json" \
@@ -77,35 +85,27 @@ curl http://localhost:8080/v1/embeddings \
     "model": "text-embedding-ada-002"
   }'
 ```
-You should see a stapi pod being created on the fly that
-will serve the request. The beautiful thing about Lingo
-is that it holds your request in the proxy while the
-stapi pod is being created, once it's ready to serve, Lingo
-send the request to the stapi pod. The end-user does not
-see any errors and gets the response to their request.
-
-Similarly, send a request to the mistral-7b-instruct model that
-was deployed:
+
+You should see a model Pod being created on the fly that
+will serve the request. The first request will wait for this Pod to become ready.
+
+If you deployed the Mistral 7B LLM, try sending it a request as well.
+
 ```bash
 curl http://localhost:8080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{"model": "mistral-7b-instruct-v0.1", "prompt": "[INST]Who was the first president of the United States?[/INST]", "max_tokens": 40}'
 ```
-The first request to an LLM takes longer because
-those models require a GPU and require additional time
-to download the model.
-
-What else would you like to see? Join our Discord and ask directly.
 
-## Roadmap
+The first request to an LLM takes longer because of the size of the model. Subsequent requests should be much quicker.
 
-* HA for the proxy controller
-* Response Request Caching
-* Model caching to speed up auto scaling for LLMs
-* Authentication
-* Multi cluster serving
+Check out [substratus.ai](https://www.substratus.ai) to learn more about the managed hybrid-SaaS offering. Substratus allows you to run Lingo in your cloud account, while benefiting from extensive cluster performance addons that can dramatically reduce startup times and boost throughput.
 
 ## Creators
-Feel free to contact any of us:
+
+Let us know about features you are interested in seeing or reach out with questions. [Visit our Discord channel](https://discord.gg/JeXhcmjZVm) to join the discussion!
+
+Or just reach out on LinkedIn if you want to connect:
+
 * [Nick Stogner](https://www.linkedin.com/in/nstogner/)
 * [Sam Stoelinga](https://www.linkedin.com/in/samstoelinga/)
diff --git a/docs/backends.md b/docs/backends.md
index 580b020e..29b68b95 100644
--- a/docs/backends.md
+++ b/docs/backends.md
@@ -2,10 +2,14 @@
 
 Lingo backends are expected to serve models via an OpenAI-compatible API.
 
+## Routing
+
 Lingo will select a backend based on the `X-Model` header or the `.model` field in the JSON request body.
 
 ## Deployments
 
+Lingo manages the replicas of Kubernetes Deployments.
+
 Annotations:
 
 | Annotation | Required | Default | Description |
@@ -16,6 +20,8 @@ Annotations:
 
 ## Services
 
-* Lingo will forward traffic to a backend Service with the same name as the relevant Deployment.
+Lingo will keep track of Pods associated with Kubernetes Services tied to backend Deployments.
+
+* Lingo will forward traffic to a backend Service with the same name as the annotated Deployment.
 * If one port exists, lingo will send traffic to it.
 * If more than one port exists, lingo will send traffic to the port named `http`.
diff --git a/docs/development.md b/docs/development.md
index 0191bc82..708491ec 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -1,8 +1,33 @@
 # Development
 
+## Testing
+
+Run all tests (takes a while).
+```sh
+make test
+```
+
+*OR*
+
+Run specific tests.
+
+```bash
+make test-unit
+make test-integration
+make test-e2e
+```
+
+## Local Deployment
+
+Create a local cluster.
+
 ```sh
 kind create cluster
+```
 
+Install a scaled-to-zero embeddings backend.
+
+```sh
 # Install STAPI
 helm repo add substratusai https://substratusai.github.io/helm
 helm repo update
@@ -12,11 +37,27 @@ replicaCount: 0
 deploymentAnnotations:
   lingo.substratus.ai/models: text-embedding-ada-002
 EOF
+```
+
+Deploy Lingo from source.
 
-# Deploy
+```sh
 skaffold dev
+```
+
+*OR*
+
+Deploy Lingo from the main branch.
+
+```bash
+helm upgrade --install lingo substratusai/lingo \
+  --set image.tag=main \
+  --set image.pullPolicy=Always
+```
+
+Send test requests.
+
+```sh
 # In another terminal...
 kubectl port-forward svc/lingo 8080:80
 # In another terminal...
@@ -31,22 +72,11 @@
 curl http://localhost:8080/delay/10 \
   -H "Content-Type: application/json" \
   -d '{
     "input": "Your text string goes here",
     "model": "text-embedding-ada-002"
   }'
 
-# Get embeddings using OpenAI compatible API endpoint
+# Get embeddings using OpenAI compatible API endpoint.
 curl http://localhost:8080/v1/embeddings \
   -H "Content-Type: application/json" \
   -d '{
     "input": "Your text string goes here",
     "model": "text-embedding-ada-002"
   }'
-
-# Install vLLM with facebook opt 125
-
-
 ```
-
-Installing the latest development release (main branch):
-```bash
-helm upgrade --install lingo substratusai/lingo \
-  --set image.tag=main \
-  --set image.pullPolicy=Always
-```
\ No newline at end of file
diff --git a/pkg/endpoints/endponts_bench_test.go b/pkg/endpoints/endpoints_bench_test.go
similarity index 100%
rename from pkg/endpoints/endponts_bench_test.go
rename to pkg/endpoints/endpoints_bench_test.go
diff --git a/tests/system-test-kind.sh b/tests/e2e/test.sh
similarity index 100%
rename from tests/system-test-kind.sh
rename to tests/e2e/test.sh
diff --git a/tests/test_openai_embedding.py b/tests/e2e/test_openai_embedding.py
similarity index 100%
rename from tests/test_openai_embedding.py
rename to tests/e2e/test_openai_embedding.py
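The docs/backends.md changes above describe routing and Deployment annotations only in prose. The following is a minimal sketch of exercising both by hand, assuming a port-forwarded proxy as in the README; the Deployment and model name `my-model` is a placeholder, and the annotation keys come from docs/backends.md and the README examples.

```bash
# Register an existing backend Deployment with Lingo so it manages the
# replica count ("my-model" is a hypothetical Deployment/model name).
kubectl annotate deployment my-model \
  lingo.substratus.ai/models=my-model \
  lingo.substratus.ai/max-replicas=3

# Lingo routes on either the X-Model header or the .model field in the
# JSON body (per docs/backends.md); both are set here since most backends
# also validate the body.
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -H "X-Model: my-model" \
  -d '{"model": "my-model", "prompt": "Hello", "max_tokens": 10}'
```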