Skip to content

Commit

Permalink
add local models for coding
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 committed Dec 3, 2024
1 parent 679d9e2 commit 71e0962
Showing 1 changed file with 29 additions and 1 deletion.
30 changes: 29 additions & 1 deletion charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ catalog:
resourceProfile: nvidia-gpu-h100:8
targetRequests: 500
llama-3.1-70b-instruct-fp8-gh200:
enabled: true
enabled: false
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
engine: VLLM
Expand Down Expand Up @@ -270,6 +270,34 @@ catalog:
engine: VLLM
resourceProfile: nvidia-gpu-l4:1
# Qwen #
# Qwen2.5-Coder 1.5B, referenced through an ollama:// URL and served by the
# OLlama engine on the cpu:1 resource profile. Enabled in this values file.
qwen2.5-coder-1.5b-cpu:
  enabled: true
  features: ["TextGeneration"]
  url: "ollama://qwen2.5-coder:1.5b"
  engine: OLlama
  # NOTE(review): presumably keeps at least one replica running so the model
  # never scales to zero; confirm against the chart's autoscaling docs.
  minReplicas: 1
  resourceProfile: cpu:1
# Qwen2.5-Coder 1.5B Instruct, referenced through an hf:// URL and served by
# the VLLM engine on a single nvidia-gpu-rtx4070-8gb resource profile.
# Disabled in this values file (enabled: false).
qwen2.5-coder-1.5b-rtx4070-8gb:
  enabled: false
  features: ["TextGeneration"]
  url: "hf://Qwen/Qwen2.5-Coder-1.5B-Instruct"
  engine: VLLM
  env:
    # Selects the FlashInfer attention backend in vLLM.
    VLLM_ATTENTION_BACKEND: FLASHINFER
  # Extra command-line flags passed to the serving engine.
  # NOTE(review): the short context, low sequence cap and fp8 settings
  # presumably exist to fit the 8 GB card named in the resource profile;
  # confirm before raising any of them.
  args:
    - --max-model-len=2048
    - --max-num-seqs=16
    - --quantization=fp8
    - --kv-cache-dtype=fp8
  # NOTE(review): presumably keeps at least one replica running so the model
  # never scales to zero; confirm against the chart's autoscaling docs.
  minReplicas: 1
  resourceProfile: nvidia-gpu-rtx4070-8gb:1
# Qwen2.5 7B, referenced through an ollama:// URL and served by the OLlama
# engine on the cpu:2 resource profile. Enabled in this values file.
qwen2.5-7b-cpu:
  enabled: true
  features: ["TextGeneration"]
  url: "ollama://qwen2.5:7b"
  engine: OLlama
  # NOTE(review): presumably keeps at least one replica running so the model
  # never scales to zero; confirm against the chart's autoscaling docs.
  minReplicas: 1
  resourceProfile: cpu:2
qwen2-500m-cpu:
enabled: false
features: ["TextGeneration"]
Expand Down

0 comments on commit 71e0962

Please sign in to comment.