Update vLLM images for GPU and TPU to v0.6.4.post1 (#310)

Note: the new GH200 image is not available yet, so the GH200 image entry will be updated later.
substratusai · Nov 21, 2024 · 8e0a494 · 8e0a494
1 parent 57d8d06
commit 8e0a494
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 3 deletions.
diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
@@ -17,10 +17,10 @@ modelServers:
       # The key is the image name (referenced from resourceProfiles) and the value is the image.
       # The "default" image should always be specified.
       # "default" is used when no imageName is specified or if a specific image is not found.
-      default: "vllm/vllm-openai:v0.6.3.post1"
+      default: "vllm/vllm-openai:v0.6.4.post1"
       cpu: "substratusai/vllm:v0.6.3.post1-cpu"
-      google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
-      nvidia-gpu: "vllm/vllm-openai:v0.6.3.post1"
+      google-tpu: "substratusai/vllm:v0.6.4.post1-tpu"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.4.post1"
       # TODO (samos123) switch to the official image when it is available.
       # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
       # Source: https://github.com/drikster80/vllm/tree/gh200-docker

diff --git a/test/e2e/run.sh b/test/e2e/run.sh
@@ -44,6 +44,10 @@ error_handler() {
     kubectl get events
     echo "--- Models ---"
     kubectl get crds models.kubeai.org && kubectl get models -oyaml
+    echo "--- Model Logs ---"
+    kubectl logs -l app=model --tail -1
+    echo "--- Describe Model Pods ---"
+    kubectl describe pod -l app=model
     echo "!!! FAIL !!!"
     exit 1
 }