Skip to content

Commit

Permalink
Shared filesystem caching (#272)
Browse files Browse the repository at this point in the history
* Implements the shared filesystem cache described in the proposal
[here](https://github.com/substratusai/kubeai/blob/main/proposals/model-storage.md)
* Add support for shared-filesystem caching
* Manually tested with dynamically provisioned and pre-provisioned GCP
Filestore instance (see [GCP
doc](https://cloud.google.com/filestore/docs/csi-driver))
* Add integration test for cache logic
* Add e2e test that leverages a kind hostpath volume
* Update docs
* Add huggingface-loader image build
* Add validation to make sure cacheProfile is not used outside of
`hf://` urls.
* Add validation to make `url` and `cacheProfile` immutable
* Add integration test for validation logic
* Add tests validating that FasterWhisper engine works with cache.

Scope creep:
* Added additional field validation (replicas, minReplicas, maxReplicas)

Followup later (Issues submitted):
* Add test validating that Infinity engine works with cache.
* Submit a follow-up Issue to update the huggingface model loader image
to the one built by the new GitHub Actions workflow
  • Loading branch information
nstogner authored Oct 18, 2024
1 parent 7fdbc7d commit 5e8f2c5
Show file tree
Hide file tree
Showing 52 changed files with 2,307 additions and 680 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/build-push-huggingface-model-loader.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Build and Push huggingface-model-loader Docker image

# Build on every PR; build *and push* only on pushes to main or version tags.
on:
  push:
    branches:
      - main
    tags:
      - "v*.*.*"
    paths-ignore:
      - '**/README.md'
  pull_request:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: substratusai/huggingface-model-loader

jobs:
  huggingface-model-loader:
    runs-on: ubuntu-latest
    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # QEMU + Buildx enable the multi-platform (amd64/arm64) build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      # Registry logins only happen on push events — PRs build without pushing,
      # so no credentials are needed (or exposed) for fork PRs.
      - name: Log in to the Container registry
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to docker.io
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # Derives image tags/labels from the git ref (branch, tag, SHA) for both
      # the ghcr.io image and the docker.io image.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
            ${{ env.IMAGE_NAME }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: ./components/huggingface-model-loader
          platforms: linux/amd64,linux/arm64
          # push is false on pull_request — build-only verification.
          push: ${{ github.event_name == 'push' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # GitHub Actions cache backend for Docker layer caching.
          cache-from: type=gha
          cache-to: type=gha,mode=max
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build and Push Docker image
name: Build and Push kubeai Docker image
on:
push:
branches:
Expand Down
38 changes: 35 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ jobs:
- name: Run integration tests
run: make test-integration

e2e:
e2e-general:
runs-on: ubuntu-latest
# NOTE: Uncomment if we start getting limited on number of concurrent jobs
# (due to rapid pushes, etc).
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
testcase: ["quickstart", "openai-python-client", "faster-whisper", "autoscaler-restart"]
testcase: ["quickstart", "openai-python-client", "autoscaler-restart", "cache-shared-filesystem"]
steps:
- name: Checkout code
uses: actions/checkout@v2
Expand All @@ -48,4 +48,36 @@ jobs:
run: kind create cluster

- name: Run the e2e testcase
run: make test-e2e-${{ matrix.testcase }}
run: make test-e2e-${{ matrix.testcase }}

e2e-engines:
runs-on: ubuntu-latest
# NOTE: Uncomment if we start getting limited on number of concurrent jobs
# (due to rapid pushes, etc).
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
engine: ["FasterWhisper"] # "VLLM", "Infinity", "OLlama"
# Run each test case with and without caching.
cacheProfile: ["", "e2e-test-kind-pv"]
steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Install kind
run: |
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
- name: Install helm
run: |
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 get_helm.sh
./get_helm.sh
- name: Start kind cluster
run: kind create cluster

- name: Run the e2e testcase
run: make test-e2e-engine ENGINE=${{ matrix.engine }} CACHE_PROFILE=${{ matrix.cacheProfile }}
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,21 @@ test-integration: fmt vet envtest
test-e2e-quickstart: skaffold
./test/e2e/run.sh quickstart

.PHONY: test-e2e-faster-whisper
test-e2e-faster-whisper: skaffold
./test/e2e/run.sh faster-whisper --profile kubeai-only

.PHONY: test-e2e-openai-python-client
test-e2e-openai-python-client: skaffold
./test/e2e/run.sh openai-python-client --profile kubeai-only
./test/e2e/run.sh openai-python-client --profile e2e-test-default

.PHONY: test-e2e-autoscaler-restart
test-e2e-autoscaler-restart: skaffold
./test/e2e/run.sh autoscaler-restart --profile kubeai-only-rapid-scaling
./test/e2e/run.sh autoscaler-restart --profile e2e-test-autoscaler-restart

.PHONY: test-e2e-cache-shared-filesystem
test-e2e-cache-shared-filesystem: skaffold
./test/e2e/run.sh cache-shared-filesystem --profile e2e-test-default

.PHONY: test-e2e-engine
test-e2e-engine: skaffold
CACHE_PROFILE=$(CACHE_PROFILE) ./test/e2e/run.sh engine-$(ENGINE) --profile e2e-test-default

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
Expand Down
6 changes: 6 additions & 0 deletions api/v1/constants.go → api/v1/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,10 @@ const (
// Use in conjunction with --allow-pod-address-override for development purposes.
ModelPodIPAnnotation = "model-pod-ip"
ModelPodPortAnnotation = "model-pod-port"

ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction"
)

func PVCModelAnnotation(modelName string) string {
return "models.kubeai.org/" + modelName
}
16 changes: 16 additions & 0 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@ import (
)

// ModelSpec defines the desired state of Model.
// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\")", message="cacheProfile is only supported with a huggingface url (\"hf://...\") at the moment."
// +kubebuilder:validation:XValidation:rule="!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas", message="minReplicas should be less than or equal to maxReplicas."
type ModelSpec struct {
// URL of the model to be served.
// Currently only the following formats are supported:
// For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
// For OLlama engine: "ollama://<model>
// +kubebuilder:validation:Required
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="url is immutable."
// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\")", message="url must start with \"hf://\" or \"ollama://\" and not be empty."
URL string `json:"url"`

// Features that the model supports.
Expand All @@ -34,6 +39,7 @@ type ModelSpec struct {

// Engine to be used for the server process.
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper;Infinity
// +kubebuilder:validation:Required
Engine string `json:"engine"`

// ResourceProfile required to serve the model.
Expand All @@ -42,6 +48,11 @@ type ModelSpec struct {
// Must be a valid ResourceProfile defined in the system config.
ResourceProfile string `json:"resourceProfile,omitempty"`

// CacheProfile to be used for caching model artifacts.
// Must be a valid CacheProfile defined in the system config.
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="cacheProfile is immutable."
CacheProfile string `json:"cacheProfile,omitempty"`

// Image to be used for the server process.
// Will be set from ResourceProfile + Engine if not specified.
Image string `json:"image,omitempty"`
Expand Down Expand Up @@ -110,13 +121,18 @@ const (
// ModelStatus defines the observed state of Model.
type ModelStatus struct {
Replicas ModelStatusReplicas `json:"replicas,omitempty"`
Cache *ModelStatusCache `json:"cache,omitempty"`
}

type ModelStatusReplicas struct {
All int32 `json:"all"`
Ready int32 `json:"ready"`
}

type ModelStatusCache struct {
Loaded bool `json:"loaded"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
Expand Down
22 changes: 21 additions & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions charts/kubeai/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ data:
huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
resourceProfiles:
{{- .Values.resourceProfiles | toYaml | nindent 6 }}
cacheProfiles:
{{- .Values.cacheProfiles | toYaml | nindent 6 }}
modelServers:
{{- .Values.modelServers | toYaml | nindent 6 }}
modelLoaders:
{{- .Values.modelLoaders | toYaml | nindent 6 }}
modelRollouts:
{{- .Values.modelRollouts | toYaml | nindent 6 }}
modelServerPods:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ spec:
AutoscalingDisabled will stop the controller from managing the replicas
for the Model. When disabled, metrics will not be collected on server Pods.
type: boolean
cacheProfile:
description: |-
CacheProfile to be used for caching model artifacts.
Must be a valid CacheProfile defined in the system config.
type: string
x-kubernetes-validations:
- message: cacheProfile is immutable.
rule: self == oldSelf
engine:
description: Engine to be used for the server process.
enum:
Expand Down Expand Up @@ -134,16 +142,34 @@ spec:
For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
For OLlama engine: "ollama://<model>
type: string
x-kubernetes-validations:
- message: url is immutable.
rule: self == oldSelf
- message: url must start with "hf://" or "ollama://" and not be empty.
rule: self.startsWith("hf://") || self.startsWith("ollama://")
required:
- engine
- features
- scaleDownDelaySeconds
- targetRequests
- url
type: object
x-kubernetes-validations:
- message: cacheProfile is only supported with a huggingface url ("hf://...")
at the moment.
rule: '!has(self.cacheProfile) || self.url.startsWith("hf://")'
- message: minReplicas should be less than or equal to maxReplicas.
rule: '!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas'
status:
description: ModelStatus defines the observed state of Model.
properties:
cache:
properties:
loaded:
type: boolean
required:
- loaded
type: object
replicas:
properties:
all:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,32 @@ rules:
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- "batch"
resources:
- jobs
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
Expand Down
8 changes: 8 additions & 0 deletions charts/kubeai/values-gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,11 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"

cacheProfiles:
standard-filestore:
sharedFilesystem:
storageClassName: "standard-rwx"
premium-filestore:
sharedFilesystem:
storageClassName: "premium-rwx"
7 changes: 7 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ modelServers:
images:
default: "michaelf34/infinity:latest"

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"

modelServerPods:
# Security Context for the model pods
# Needed for OpenShift
Expand Down Expand Up @@ -99,6 +104,8 @@ resourceProfiles:
value: "present"
effect: "NoSchedule"

cacheProfiles: {}

modelAutoscaling:
# Interval that the autoscaler will scrape model server metrics.
# and calculate the desired number of replicas.
Expand Down
Loading

0 comments on commit 5e8f2c5

Please sign in to comment.