Skip to content

Commit

Permalink
Shared filesystem caching (#272)
Browse files Browse the repository at this point in the history
* Implements the shared filesystem cache described in the proposal
[here](https://github.com/substratusai/kubeai/blob/main/proposals/model-storage.md)
* Add support for shared-filesystem caching
* Manually tested with dynamically provisioned and pre-provisioned GCP
Filestore instance (see [GCP
doc](https://cloud.google.com/filestore/docs/csi-driver))
* Add integration test for cache logic
* Add e2e test that leverages a kind hostpath volume
* Update docs
* Add huggingface-loader image build
* Add validation to make sure cacheProfile is not used outside of
`hf://` urls.
* Add validation to make `url` and `cacheProfile` immutable
* Add integration test for validation logic
* Add tests validating that FasterWhisper engine works with cache.

Scope creep:
* Added additional field validation (replicas, minReplicas, maxReplicas)

Followup later (Issues submitted):
* Add test validating that Infinity engine works with cache.
* Submit a follow-up Issue to update the huggingface model loader image
to the one built by the new GitHub Actions workflow
  • Loading branch information
nstogner authored Oct 18, 2024
1 parent 7fdbc7d commit 5e8f2c5
Show file tree
Hide file tree
Showing 52 changed files with 2,307 additions and 680 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/build-push-huggingface-model-loader.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Build and Push huggingface-model-loader Docker image

# Build on every PR; build *and push* only on pushes to main or version tags.
on:
  push:
    branches:
      - main
    tags:
      - "v*.*.*"
    paths-ignore:
      - '**/README.md'
  pull_request:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: substratusai/huggingface-model-loader

jobs:
  huggingface-model-loader:
    runs-on: ubuntu-latest
    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # QEMU + Buildx enable the multi-platform (amd64/arm64) build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      # Registry logins only happen on push events — PRs build without pushing,
      # so no credentials are needed (or exposed) for fork PRs.
      - name: Log in to the Container registry
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to docker.io
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # Derives image tags/labels from the git ref (branch, tag, SHA) for both
      # the ghcr.io image and the docker.io image.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
            ${{ env.IMAGE_NAME }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: ./components/huggingface-model-loader
          platforms: linux/amd64,linux/arm64
          # push is false on pull_request — build-only verification.
          push: ${{ github.event_name == 'push' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # GitHub Actions cache backend for Docker layer caching.
          cache-from: type=gha
          cache-to: type=gha,mode=max
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build and Push Docker image
name: Build and Push kubeai Docker image
on:
push:
branches:
Expand Down
38 changes: 35 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ jobs:
- name: Run integration tests
run: make test-integration

e2e:
e2e-general:
runs-on: ubuntu-latest
# NOTE: Uncomment if we start getting limited on number of concurrent jobs
# (due to rapid pushes, etc).
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
testcase: ["quickstart", "openai-python-client", "faster-whisper", "autoscaler-restart"]
testcase: ["quickstart", "openai-python-client", "autoscaler-restart", "cache-shared-filesystem"]
steps:
- name: Checkout code
uses: actions/checkout@v2
Expand All @@ -48,4 +48,36 @@ jobs:
run: kind create cluster

- name: Run the e2e testcase
run: make test-e2e-${{ matrix.testcase }}
run: make test-e2e-${{ matrix.testcase }}

e2e-engines:
runs-on: ubuntu-latest
# NOTE: Uncomment if we start getting limited on number of concurrent jobs
# (due to rapid pushes, etc).
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
engine: ["FasterWhisper"] # "VLLM", "Infinity", "OLlama"
# Run each test case with and without caching.
cacheProfile: ["", "e2e-test-kind-pv"]
steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Install kind
run: |
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
- name: Install helm
run: |
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 get_helm.sh
./get_helm.sh
- name: Start kind cluster
run: kind create cluster

- name: Run the e2e testcase
run: make test-e2e-engine ENGINE=${{ matrix.engine }} CACHE_PROFILE=${{ matrix.cacheProfile }}
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,21 @@ test-integration: fmt vet envtest
test-e2e-quickstart: skaffold
./test/e2e/run.sh quickstart

.PHONY: test-e2e-faster-whisper
test-e2e-faster-whisper: skaffold
./test/e2e/run.sh faster-whisper --profile kubeai-only

.PHONY: test-e2e-openai-python-client
test-e2e-openai-python-client: skaffold
./test/e2e/run.sh openai-python-client --profile kubeai-only
./test/e2e/run.sh openai-python-client --profile e2e-test-default

.PHONY: test-e2e-autoscaler-restart
test-e2e-autoscaler-restart: skaffold
./test/e2e/run.sh autoscaler-restart --profile kubeai-only-rapid-scaling
./test/e2e/run.sh autoscaler-restart --profile e2e-test-autoscaler-restart

.PHONY: test-e2e-cache-shared-filesystem
test-e2e-cache-shared-filesystem: skaffold
./test/e2e/run.sh cache-shared-filesystem --profile e2e-test-default

.PHONY: test-e2e-engine
test-e2e-engine: skaffold
CACHE_PROFILE=$(CACHE_PROFILE) ./test/e2e/run.sh engine-$(ENGINE) --profile e2e-test-default

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
Expand Down
6 changes: 6 additions & 0 deletions api/v1/constants.go → api/v1/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,10 @@ const (
// Use in conjunction with --allow-pod-address-override for development purposes.
ModelPodIPAnnotation = "model-pod-ip"
ModelPodPortAnnotation = "model-pod-port"

ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction"
)

func PVCModelAnnotation(modelName string) string {
return "models.kubeai.org/" + modelName
}
16 changes: 16 additions & 0 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@ import (
)

// ModelSpec defines the desired state of Model.
// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\")", message="cacheProfile is only supported with a huggingface url (\"hf://...\") at the moment."
// +kubebuilder:validation:XValidation:rule="!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas", message="minReplicas should be less than or equal to maxReplicas."
type ModelSpec struct {
// URL of the model to be served.
// Currently only the following formats are supported:
// For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
// For OLlama engine: "ollama://<model>
// +kubebuilder:validation:Required
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="url is immutable."
// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\")", message="url must start with \"hf://\" or \"ollama://\" and not be empty."
URL string `json:"url"`

// Features that the model supports.
Expand All @@ -34,6 +39,7 @@ type ModelSpec struct {

// Engine to be used for the server process.
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper;Infinity
// +kubebuilder:validation:Required
Engine string `json:"engine"`

// ResourceProfile required to serve the model.
Expand All @@ -42,6 +48,11 @@ type ModelSpec struct {
// Must be a valid ResourceProfile defined in the system config.
ResourceProfile string `json:"resourceProfile,omitempty"`

// CacheProfile to be used for caching model artifacts.
// Must be a valid CacheProfile defined in the system config.
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="cacheProfile is immutable."
CacheProfile string `json:"cacheProfile,omitempty"`

// Image to be used for the server process.
// Will be set from ResourceProfile + Engine if not specified.
Image string `json:"image,omitempty"`
Expand Down Expand Up @@ -110,13 +121,18 @@ const (
// ModelStatus defines the observed state of Model.
type ModelStatus struct {
Replicas ModelStatusReplicas `json:"replicas,omitempty"`
Cache *ModelStatusCache `json:"cache,omitempty"`
}

type ModelStatusReplicas struct {
All int32 `json:"all"`
Ready int32 `json:"ready"`
}

type ModelStatusCache struct {
Loaded bool `json:"loaded"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
Expand Down
22 changes: 21 additions & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions charts/kubeai/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ data:
huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
resourceProfiles:
{{- .Values.resourceProfiles | toYaml | nindent 6 }}
cacheProfiles:
{{- .Values.cacheProfiles | toYaml | nindent 6 }}
modelServers:
{{- .Values.modelServers | toYaml | nindent 6 }}
modelLoaders:
{{- .Values.modelLoaders | toYaml | nindent 6 }}
modelRollouts:
{{- .Values.modelRollouts | toYaml | nindent 6 }}
modelServerPods:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ spec:
AutoscalingDisabled will stop the controller from managing the replicas
for the Model. When disabled, metrics will not be collected on server Pods.
type: boolean
cacheProfile:
description: |-
CacheProfile to be used for caching model artifacts.
Must be a valid CacheProfile defined in the system config.
type: string
x-kubernetes-validations:
- message: cacheProfile is immutable.
rule: self == oldSelf
engine:
description: Engine to be used for the server process.
enum:
Expand Down Expand Up @@ -134,16 +142,34 @@ spec:
For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
For OLlama engine: "ollama://<model>
type: string
x-kubernetes-validations:
- message: url is immutable.
rule: self == oldSelf
- message: url must start with "hf://" or "ollama://" and not be empty.
rule: self.startsWith("hf://") || self.startsWith("ollama://")
required:
- engine
- features
- scaleDownDelaySeconds
- targetRequests
- url
type: object
x-kubernetes-validations:
- message: cacheProfile is only supported with a huggingface url ("hf://...")
at the moment.
rule: '!has(self.cacheProfile) || self.url.startsWith("hf://")'
- message: minReplicas should be less than or equal to maxReplicas.
rule: '!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas'
status:
description: ModelStatus defines the observed state of Model.
properties:
cache:
properties:
loaded:
type: boolean
required:
- loaded
type: object
replicas:
properties:
all:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,32 @@ rules:
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- "batch"
resources:
- jobs
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
Expand Down
8 changes: 8 additions & 0 deletions charts/kubeai/values-gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,11 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"

cacheProfiles:
standard-filestore:
sharedFilesystem:
storageClassName: "standard-rwx"
premium-filestore:
sharedFilesystem:
storageClassName: "premium-rwx"
7 changes: 7 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ modelServers:
images:
default: "michaelf34/infinity:latest"

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"

modelServerPods:
# Security Context for the model pods
# Needed for OpenShift
Expand Down Expand Up @@ -99,6 +104,8 @@ resourceProfiles:
value: "present"
effect: "NoSchedule"

cacheProfiles: {}

modelAutoscaling:
# Interval that the autoscaler will scrape model server metrics.
# and calculate the desired number of replicas.
Expand Down
Loading

0 comments on commit 5e8f2c5

Please sign in to comment.