add model for 70b on single h100
samos123 committed Jan 1, 2025
1 parent 92ac45f commit d46bf77
Showing 2 changed files with 29 additions and 0 deletions.
12 changes: 12 additions & 0 deletions charts/models/values.yaml
@@ -139,6 +139,18 @@ catalog:
- --disable-log-requests
resourceProfile: nvidia-gpu-h100:2
targetRequests: 500
llama-3.1-70b-instruct-fp8-1xh100:
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
engine: VLLM
args:
- --enable-prefix-caching
- --max-model-len=16384
- --max-num-batched-tokens=16384
- --gpu-memory-utilization=0.95
- --disable-log-requests
- --kv-cache-dtype=fp8
resourceProfile: nvidia-gpu-h100:1
llama-3.1-70b-instruct-fp8-l4:
enabled: false
features: [TextGeneration]
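Why this fits on one GPU: as a rough back-of-the-envelope estimate (not part of the commit itself), the FP8-quantized 70B checkpoint needs about 70B parameters × 1 byte ≈ 70 GB for weights. With --gpu-memory-utilization=0.95, vLLM budgets roughly 0.95 × 80 GB ≈ 76 GB of an H100's memory, leaving on the order of 6 GB for the KV cache; --kv-cache-dtype=fp8 halves the per-token cache footprint relative to fp16, which is what makes the 16384-token context length practical on a single card.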
17 changes: 17 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
@@ -0,0 +1,17 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-70b-instruct-fp8-1xh100
spec:
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
engine: VLLM
args:
- --enable-prefix-caching
- --max-model-len=16384
- --max-num-batched-tokens=16384
- --gpu-memory-utilization=0.95
- --disable-log-requests
- --kv-cache-dtype=fp8
resourceProfile: nvidia-gpu-h100:1

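For completeness, a minimal smoke test of the new model. This is a sketch assuming a standard KubeAI install where the kubeai Service exposes the OpenAI-compatible API on port 80; the service name, port, and prompt are assumptions, not part of this commit:

# Apply the standalone manifest (or install via the Helm chart values above).
kubectl apply -f manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml

# Forward the KubeAI service locally (run in a separate terminal;
# assumes the default service name "kubeai" on port 80).
kubectl port-forward svc/kubeai 8000:80

# Send a chat completion through the OpenAI-compatible endpoint.
# The first request may be slow if the model is still scaling up.
curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.1-70b-instruct-fp8-1xh100",
    "messages": [{"role": "user", "content": "Say hello."}]
  }'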