add model for 70b on single h100
samos123 committed Jan 1, 2025
1 parent 92ac45f commit d46bf77
Showing 2 changed files with 29 additions and 0 deletions.
12 changes: 12 additions & 0 deletions charts/models/values.yaml
@@ -139,6 +139,18 @@ catalog:
- --disable-log-requests
resourceProfile: nvidia-gpu-h100:2
targetRequests: 500
llama-3.1-70b-instruct-fp8-1xh100:
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
engine: VLLM
args:
- --enable-prefix-caching
- --max-model-len=16384
- --max-num-batched-tokens=16384
- --gpu-memory-utilization=0.95
- --disable-log-requests
- --kv-cache-dtype=fp8
resourceProfile: nvidia-gpu-h100:1
llama-3.1-70b-instruct-fp8-l4:
enabled: false
features: [TextGeneration]
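Why this fits on one GPU: as a rough back-of-the-envelope estimate (not part of the commit itself), the FP8-quantized 70B checkpoint needs about 70B parameters × 1 byte ≈ 70 GB for weights. With --gpu-memory-utilization=0.95, vLLM budgets roughly 0.95 × 80 GB ≈ 76 GB of an H100's memory, leaving on the order of 6 GB for the KV cache; --kv-cache-dtype=fp8 halves the per-token cache footprint relative to fp16, which is what makes the 16384-token context length practical on a single card.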
17 changes: 17 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
@@ -0,0 +1,17 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-70b-instruct-fp8-1xh100
spec:
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
engine: VLLM
args:
- --enable-prefix-caching
- --max-model-len=16384
- --max-num-batched-tokens=16384
- --gpu-memory-utilization=0.95
- --disable-log-requests
- --kv-cache-dtype=fp8
resourceProfile: nvidia-gpu-h100:1

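For completeness, a minimal smoke test of the new model. This is a sketch assuming a standard KubeAI install where the kubeai Service exposes the OpenAI-compatible API on port 80; the service name, port, and prompt are assumptions, not part of this commit:

# Apply the standalone manifest (or install via the Helm chart values above).
kubectl apply -f manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml

# Forward the KubeAI service locally (run in a separate terminal;
# assumes the default service name "kubeai" on port 80).
kubectl port-forward svc/kubeai 8000:80

# Send a chat completion through the OpenAI-compatible endpoint.
# The first request may be slow if the model is still scaling up.
curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.1-70b-instruct-fp8-1xh100",
    "messages": [{"role": "user", "content": "Say hello."}]
  }'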