add local models for coding

substratusai · Dec 3, 2024 · 71e0962 · 71e0962
1 parent 679d9e2
commit 71e0962
Showing 1 changed file with 29 additions and 1 deletion.
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
@@ -187,7 +187,7 @@ catalog:
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
   llama-3.1-70b-instruct-fp8-gh200:
-    enabled: true
+    enabled: false
     features: [TextGeneration]
     url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
     engine: VLLM
@@ -270,6 +270,34 @@ catalog:
     engine: VLLM
     resourceProfile: nvidia-gpu-l4:1
   # Qwen #
+  qwen2.5-coder-1.5b-cpu:  # catalog entry: qwen2.5-coder 1.5b tag, OLlama engine, cpu resource profile
+    enabled: true  # NOTE(review): on by default, while the qwen2-500m-cpu entry below is off — confirm this is intended
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5-coder:1.5b"  # presumably <scheme>://<model>:<tag>, resolved by the engine — TODO confirm
+    engine: OLlama
+    minReplicas: 1  # presumably keeps one replica running even when idle — verify against the chart's consumer
+    resourceProfile: cpu:1  # presumably <profile-name>:<multiple> — TODO confirm
+  qwen2.5-coder-1.5b-rtx4070-8gb:  # catalog entry: Qwen2.5-Coder-1.5B-Instruct, VLLM engine, rtx4070-8gb profile; off by default
+    enabled: false
+    features: ["TextGeneration"]
+    url: "hf://Qwen/Qwen2.5-Coder-1.5B-Instruct"  # presumably a Hugging Face repo reference — TODO confirm
+    engine: VLLM
+    env:  # presumably environment variables passed to the engine container — TODO confirm
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:  # presumably extra command-line flags passed to the engine — TODO confirm
+    - --max-model-len=2048  # NOTE(review): small limits look chosen to fit the 8gb card named in the key — confirm
+    - --max-num-seqs=16
+    - --quantization=fp8
+    - --kv-cache-dtype=fp8  # NOTE(review): confirm the fp8 KV cache depends on the FLASHINFER backend set in env above
+    minReplicas: 1  # only relevant once enabled is switched to true (assumed) — verify
+    resourceProfile: nvidia-gpu-rtx4070-8gb:1  # presumably <profile-name>:<multiple> — TODO confirm
+  qwen2.5-7b-cpu:  # catalog entry: qwen2.5 7b tag, OLlama engine, cpu resource profile
+    enabled: true  # NOTE(review): on by default, while the qwen2-500m-cpu entry below is off — confirm this is intended
+    features: ["TextGeneration"]
+    url: "ollama://qwen2.5:7b"  # NOTE(review): general qwen2.5 tag, not a qwen2.5-coder tag like the entries above — confirm
+    engine: OLlama
+    minReplicas: 1  # presumably keeps one replica running even when idle — verify against the chart's consumer
+    resourceProfile: cpu:2  # presumably <profile-name>:<multiple>; twice the multiple of the 1.5b cpu entry — TODO confirm
   qwen2-500m-cpu:
     enabled: false
     features: ["TextGeneration"]