LoRA Adapters for vLLM & support for s3, gs, oss for pulling adapters…

… and models (to cache) from buckets (#304) * Add `.spec.adapters` * Support requesting adapters using the pattern: `{"model": "<model>_<adapter>", ... }` * Load LoRA adapters into running vLLM containers * Support updating LoRA adapters without needing to restart vLLM * Rewrite `.model` to use adapter in chat request body when proxying to vLLM * Add adapters to model list * Add support for `s3://`, `gs://`, `oss://` urls (for adapters and cache loading) * Add new cloud credentials to support new urls * Update docs * Update Model validation NOTE: * Was unable to test `oss://` urls... Had issues opening acct. FOLLOWUP: * Need to add adapter e2e tests (have not found a small enough model with adapters for use in kind cluster) * Need to update chart values.yaml to include GH-actions-built image for model loader after merge!!! Fixes #132, #303
substratusai · Nov 24, 2024 · b12f811 · b12f811
1 parent 499a6ed
commit b12f811
Show file tree

Hide file tree

Showing 81 changed files with 2,168 additions and 391 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,5 +1,11 @@
 # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
 # Ignore build and test binaries.
 bin/
+charts/
+components/
 docs/
-manifests/
+examples/
+manifests/
+proposals/
+test/
+tmp/
diff --git a/...s/build-push-huggingface-model-loader.yml → ...hub/workflows/build-push-model-loader.yml b/...s/build-push-huggingface-model-loader.yml → ...hub/workflows/build-push-model-loader.yml
@@ -1,4 +1,4 @@
-name: Build and Push huggingface-model-loader Docker image
+name: Build and Push kubeai-model-loader Docker image
 on:
   push:
     branches:
@@ -12,10 +12,10 @@ on:
 # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
 env:
   REGISTRY: ghcr.io
-  IMAGE_NAME: substratusai/huggingface-model-loader
+  IMAGE_NAME: substratusai/kubeai-model-loader
 
 jobs:
-  huggingface-model-loader:
+  kubeai-model-loader:
     runs-on: ubuntu-latest
     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
     permissions:
@@ -52,7 +52,7 @@ jobs:
       - name: Build and push Docker image
         uses: docker/build-push-action@v6
         with:
-          context: ./components/huggingface-model-loader
+          context: ./components/model-loader
           platforms: linux/amd64,linux/arm64
           push: ${{ github.event_name == 'push' }}
           tags: ${{ steps.meta.outputs.tags }}

diff --git a/api/v1/metadata.go b/api/v1/metadata.go
@@ -21,3 +21,11 @@ const (
 func PVCModelAnnotation(modelName string) string {
 	return "models.kubeai.org/" + modelName
 }
+
+const (
+	// PodAdapterLabelPrefix is the prefix shared by every label key returned
+	// by PodAdapterLabel.
+	PodAdapterLabelPrefix = "adapter.kubeai.org/"
+)
+
+// PodAdapterLabel returns the label key for the given adapter ID, formed by
+// appending adapterID to PodAdapterLabelPrefix.
+// NOTE(review): adapterID is not validated here; presumably callers pass a
+// value that is valid as the name part of a label key - confirm at call sites.
+func PodAdapterLabel(adapterID string) string {
+	return PodAdapterLabelPrefix + adapterID
+}
diff --git a/api/v1/model_types.go b/api/v1/model_types.go
@@ -21,18 +21,34 @@ import (
 )
 
 // ModelSpec defines the desired state of Model.
-// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\")", message="cacheProfile is only supported with a huggingface url (\"hf://...\") at the moment."
+// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\") || self.url.startsWith(\"s3://\") || self.url.startsWith(\"gs://\") || self.url.startsWith(\"oss://\")", message="cacheProfile is only supported with urls of format \"hf://...\", \"s3://...\", \"gs://...\", or \"oss://...\" at the moment."
+// +kubebuilder:validation:XValidation:rule="!self.url.startsWith(\"s3://\") || has(self.cacheProfile)", message="urls of format \"s3://...\" only supported when using a cacheProfile"
+// +kubebuilder:validation:XValidation:rule="!self.url.startsWith(\"gs://\") || has(self.cacheProfile)", message="urls of format \"gs://...\" only supported when using a cacheProfile"
+// +kubebuilder:validation:XValidation:rule="!self.url.startsWith(\"oss://\") || has(self.cacheProfile)", message="urls of format \"oss://...\" only supported when using a cacheProfile"
 // +kubebuilder:validation:XValidation:rule="!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas", message="minReplicas should be less than or equal to maxReplicas."
+// +kubebuilder:validation:XValidation:rule="!has(self.adapters) || self.engine == \"VLLM\"", message="adapters only supported with VLLM engine."
 type ModelSpec struct {
 	// URL of the model to be served.
-	// Currently only the following formats are supported:
-	// For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
-	// For OLlama engine: "ollama://<model>
+	// Currently the following formats are supported:
+	//
+	// For VLLM, FasterWhisper, Infinity engines:
+	//
+	// "hf://<repo>/<model>"
+	// "gs://<bucket>/<path>" (only with cacheProfile)
+	// "oss://<bucket>/<path>" (only with cacheProfile)
+	// "s3://<bucket>/<path>" (only with cacheProfile)
+	//
+	// For OLlama engine:
+	//
+	// "ollama://<model>"
+	//
 	// +kubebuilder:validation:Required
 	// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="url is immutable."
-	// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\")", message="url must start with \"hf://\" or \"ollama://\" and not be empty."
+	// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\") || self.startsWith(\"s3://\") || self.startsWith(\"gs://\") || self.startsWith(\"oss://\")", message="url must start with \"hf://\", \"ollama://\", \"s3://\", \"gs://\", or \"oss://\" and not be empty."
 	URL string `json:"url"`
 
+	Adapters []Adapter `json:"adapters,omitempty"`
+
 	// Features that the model supports.
 	// Dictates the APIs that are available for the model.
 	Features []ModelFeature `json:"features"`
@@ -118,6 +134,16 @@ const (
 	InfinityEngine      = "Infinity"
 )
 
+// Adapter is an entry of ModelSpec.Adapters: a named adapter and the URL it
+// is located at.
+type Adapter struct {
+	// Name must be a lowercase string with no spaces.
+	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:Pattern=^[a-z0-9-]+$
+	// +kubebuilder:validation:MaxLength=63
+	Name string `json:"name"`
+	// URL of the adapter. It must start with "hf://", "s3://", "gs://", or "oss://".
+	// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"s3://\") || self.startsWith(\"gs://\") || self.startsWith(\"oss://\")", message="adapter url must start with \"hf://\", \"s3://\", \"gs://\", or \"oss://\"."
+	URL string `json:"url"`
+}
+
 // ModelStatus defines the observed state of Model.
 type ModelStatus struct {
 	Replicas ModelStatusReplicas `json:"replicas,omitempty"`
@@ -133,11 +159,14 @@ type ModelStatusCache struct {
 	Loaded bool `json:"loaded"`
 }
 
+// NOTE: Model name length should be limited to allow for the model name to be used in
+// the names of the resources created by the controller.
+
+// Model resources define the ML models that will be served by KubeAI.
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
-
-// Model resources define the ML models that will be served by KubeAI.
+// +kubebuilder:validation:XValidation:rule="size(self.metadata.name) <= 40", message="name must not exceed 40 characters."
 type Model struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`

diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
diff --git a/charts/kubeai/templates/_helpers.tpl b/charts/kubeai/templates/_helpers.tpl
@@ -72,6 +72,60 @@ Create the name of the service account to use for model pods
 {{- end }}
 {{- end }}
 
+{{/*
+Create the name of the alibaba secret to use.
+When secrets.alibaba.create is true: use secrets.alibaba.name if it is set,
+otherwise the "kubeai.fullname" output followed by "-alibaba".
+When secrets.alibaba.create is false: secrets.alibaba.name is required and
+rendering fails if it is not set.
+*/}}
+{{- define "kubeai.alibabaSecretName" -}}
+{{- if .Values.secrets.alibaba.create -}}
+{{- if .Values.secrets.alibaba.name -}}
+{{- .Values.secrets.alibaba.name -}}
+{{- else }}
+{{- (include "kubeai.fullname" .)}}-alibaba
+{{- end}}
+{{- else }}
+{{- if not .Values.secrets.alibaba.name -}}
+{{ fail "if secrets.alibaba.create is false, secrets.alibaba.name is required" }}
+{{- end }}
+{{- .Values.secrets.alibaba.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the aws secret to use.
+When secrets.aws.create is true: use secrets.aws.name if it is set,
+otherwise the "kubeai.fullname" output followed by "-aws".
+When secrets.aws.create is false: secrets.aws.name is required and
+rendering fails if it is not set.
+*/}}
+{{- define "kubeai.awsSecretName" -}}
+{{- if .Values.secrets.aws.create -}}
+{{- if .Values.secrets.aws.name -}}
+{{- .Values.secrets.aws.name -}}
+{{- else }}
+{{- (include "kubeai.fullname" .)}}-aws
+{{- end}}
+{{- else }}
+{{- if not .Values.secrets.aws.name -}}
+{{ fail "if secrets.aws.create is false, secrets.aws.name is required" }}
+{{- end }}
+{{- .Values.secrets.aws.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the gcp secret to use.
+When secrets.gcp.create is true: use secrets.gcp.name if it is set,
+otherwise the "kubeai.fullname" output followed by "-gcp".
+When secrets.gcp.create is false: secrets.gcp.name is required and
+rendering fails if it is not set.
+*/}}
+{{- define "kubeai.gcpSecretName" -}}
+{{- if .Values.secrets.gcp.create -}}
+{{- if .Values.secrets.gcp.name -}}
+{{- .Values.secrets.gcp.name -}}
+{{- else }}
+{{- (include "kubeai.fullname" .)}}-gcp
+{{- end}}
+{{- else }}
+{{- if not .Values.secrets.gcp.name -}}
+{{ fail "if secrets.gcp.create is false, secrets.gcp.name is required" }}
+{{- end }}
+{{- .Values.secrets.gcp.name }}
+{{- end }}
+{{- end }}
+
 {{/*
 Create the name of the huggingface secret to use
 */}}

diff --git a/charts/kubeai/templates/aws-secret.yaml b/charts/kubeai/templates/aws-secret.yaml
@@ -0,0 +1,11 @@
+{{/*
+AWS credentials Secret.
+Rendered only when secrets.aws.create is set AND both secrets.aws.accessKeyId
+and secrets.aws.secretAccessKey are non-empty; otherwise nothing is emitted.
+The Secret name comes from the "kubeai.awsSecretName" helper and both values
+are base64-encoded into .data.
+*/}}
+{{- if and .Values.secrets.aws.create (and (not (empty .Values.secrets.aws.accessKeyId)) (not (empty .Values.secrets.aws.secretAccessKey))) }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "kubeai.awsSecretName" . }}
+  labels:
+    {{- include "kubeai.labels" . | nindent 4 }}
+data:
+  accessKeyId: {{ .Values.secrets.aws.accessKeyId | b64enc }}
+  secretAccessKey: {{ .Values.secrets.aws.secretAccessKey | b64enc }}
+{{- end }}
diff --git a/charts/kubeai/templates/configmap.yaml b/charts/kubeai/templates/configmap.yaml
@@ -7,15 +7,18 @@ metadata:
 data:
   system.yaml: |
     secretNames:
+      alibaba: {{ include "kubeai.alibabaSecretName" . }}
+      aws: {{ include "kubeai.awsSecretName" . }}
+      gcp: {{ include "kubeai.gcpSecretName" . }}
       huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
     resourceProfiles:
       {{- .Values.resourceProfiles | toYaml | nindent 6 }}
     cacheProfiles:
       {{- .Values.cacheProfiles | toYaml | nindent 6 }}
     modelServers:
       {{- .Values.modelServers | toYaml | nindent 6 }}
-    modelLoaders:
-      {{- .Values.modelLoaders | toYaml | nindent 6 }}
+    modelLoading:
+      {{- .Values.modelLoading | toYaml | nindent 6 }}
     modelRollouts:
       {{- .Values.modelRollouts | toYaml | nindent 6 }}
     modelServerPods:

diff --git a/charts/kubeai/templates/crds/kubeai.org_models.yaml b/charts/kubeai/templates/crds/kubeai.org_models.yaml
@@ -39,6 +39,26 @@ spec:
           spec:
             description: ModelSpec defines the desired state of Model.
             properties:
+              adapters:
+                items:
+                  properties:
+                    name:
+                      description: Name must be a lowercase string with no spaces.
+                      maxLength: 63
+                      pattern: ^[a-z0-9-]+$
+                      type: string
+                    url:
+                      type: string
+                      x-kubernetes-validations:
+                      - message: adapter url must start with "hf://", "s3://", "gs://",
+                          or "oss://".
+                        rule: self.startsWith("hf://") || self.startsWith("s3://")
+                          || self.startsWith("gs://") || self.startsWith("oss://")
+                  required:
+                  - name
+                  - url
+                  type: object
+                type: array
               args:
                 description: Args to be added to the server process.
                 items:
@@ -138,15 +158,30 @@ spec:
               url:
                 description: |-
                   URL of the model to be served.
-                  Currently only the following formats are supported:
-                  For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
-                  For OLlama engine: "ollama://<model>
+                  Currently the following formats are supported:
+
+
+                  For VLLM, FasterWhisper, Infinity engines:
+
+
+                  "hf://<repo>/<model>"
+                  "gs://<bucket>/<path>" (only with cacheProfile)
+                  "oss://<bucket>/<path>" (only with cacheProfile)
+                  "s3://<bucket>/<path>" (only with cacheProfile)
+
+
+                  For OLlama engine:
+
+
+                  "ollama://<model>"
                 type: string
                 x-kubernetes-validations:
                 - message: url is immutable.
                   rule: self == oldSelf
-                - message: url must start with "hf://" or "ollama://" and not be empty.
-                  rule: self.startsWith("hf://") || self.startsWith("ollama://")
+                - message: url must start with "hf://", "ollama://", "s3://", "gs://",
+                    or "oss://" and not be empty.
+                  rule: self.startsWith("hf://") || self.startsWith("ollama://") ||
+                    self.startsWith("s3://") || self.startsWith("gs://") || self.startsWith("oss://")
             required:
             - engine
             - features
@@ -155,11 +190,20 @@ spec:
             - url
             type: object
             x-kubernetes-validations:
-            - message: cacheProfile is only supported with a huggingface url ("hf://...")
-                at the moment.
-              rule: '!has(self.cacheProfile) || self.url.startsWith("hf://")'
+            - message: cacheProfile is only supported with urls of format "hf://...",
+                "s3://...", "gs://...", or "oss://..." at the moment.
+              rule: '!has(self.cacheProfile) || self.url.startsWith("hf://") || self.url.startsWith("s3://")
+                || self.url.startsWith("gs://") || self.url.startsWith("oss://")'
+            - message: urls of format "s3://..." only supported when using a cacheProfile
+              rule: '!self.url.startsWith("s3://") || has(self.cacheProfile)'
+            - message: urls of format "gs://..." only supported when using a cacheProfile
+              rule: '!self.url.startsWith("gs://") || has(self.cacheProfile)'
+            - message: urls of format "oss://..." only supported when using a cacheProfile
+              rule: '!self.url.startsWith("oss://") || has(self.cacheProfile)'
             - message: minReplicas should be less than or equal to maxReplicas.
               rule: '!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas'
+            - message: adapters only supported with VLLM engine.
+              rule: '!has(self.adapters) || self.engine == "VLLM"'
           status:
             description: ModelStatus defines the observed state of Model.
             properties:
@@ -184,6 +228,9 @@ spec:
                 type: object
             type: object
         type: object
+        x-kubernetes-validations:
+        - message: name must not exceed 40 characters.
+          rule: size(self.metadata.name) <= 40
     served: true
     storage: true
     subresources:

diff --git a/charts/kubeai/templates/secret.yaml → .../kubeai/templates/huggingface-secret.yaml b/charts/kubeai/templates/secret.yaml → .../kubeai/templates/huggingface-secret.yaml
diff --git a/charts/kubeai/templates/role.yaml b/charts/kubeai/templates/role.yaml
@@ -57,6 +57,12 @@ rules:
   - get
   - patch
   - update
+# Allow exec into pods. The exec subresource is only reachable via POST
+# (authorized as "create") or GET/WebSocket (authorized as "get"), so grant
+# exactly those verbs instead of a wildcard.
+- apiGroups:
+  - ""
+  resources:
+  - pods/exec
+  verbs:
+  - create
+  - get
 - apiGroups:
   - kubeai.org
   resources: