From c4117ba9a6c64d53a71e7fe024dfcf9ca2980bc7 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Sat, 28 Dec 2024 15:30:06 -0800
Subject: [PATCH 1/5] wip scenario 70b 8 replicas prefix hash

---
 .../README.md | 128 ++++++++++++++++++
 .../base-request.json | 6 +
 .../least-load-vs-prefix-hash-70b-8r/k6.json | 15 ++
 .../model.yaml | 18 +++
 .../least-load-vs-prefix-hash-70b-8r/pod.yaml | 19 +++
 5 files changed, 186 insertions(+)
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
new file mode 100644
index 00000000..aaa309d2
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
# Results

Under specific conditions:

* Restricted GPU memory
* Low `max_tokens` to be generated
* Chat threads with decently long user messages

Prefix hashing was shown to yield a `34%` decrease in average time per token.

`712.11ms (LeastLoad) --> 469.34ms (PrefixHash)`

## Steps taken

```bash
cd ./benchmarks/chat
make data
export IMG=us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
docker build -t $IMG . && docker push $IMG

kubectl apply -f ./scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
kubectl apply -f ./scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml

# Run 2x (to ensure both cases start with a preloaded cache)
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash-70b-8r make run

kubectl patch model llama-3.1-70b-instruct-fp8-h100 --type='merge' -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash-70b-8r make run
```

## Next Steps

* Rerun with increased replicas (e.g. 10 instead of 2)

## Benchmark Output

### LeastLoad

```
          /\      Grafana   /‾‾/
     /\  /  \     |\  __   /  /
    /  \/    \    | |/ /  /   ‾‾\
   /          \   |   (  |  (‾)  |
  / __________ \  |_|\_\  \_____/

     execution: local
        script: ./k6.js
        output: -

     scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s) + + + ✓ Post status is 200 + + checks.........................: 100.00% 7341 out of 7341 + data_received..................: 4.7 MB 7.9 kB/s + data_sent......................: 25 MB 42 kB/s + http_req_blocked...............: avg=161.4µs min=2.83µs med=5.8µs max=16.67ms p(90)=8.06µs p(95)=10.19µs + http_req_connecting............: avg=55.73µs min=0s med=0s max=8.41ms p(90)=0s p(95)=0s + http_req_duration..............: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + { expected_response:true }...: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + ✓ http_req_failed................: 0.00% 0 out of 7341 + http_req_receiving.............: avg=84.64µs min=29.4µs med=74.05µs max=732.69µs p(90)=129.94µs p(95)=154.19µs + http_req_sending...............: avg=68µs min=12.1µs med=32.3µs max=1.38ms p(90)=144.04µs p(95)=173.19µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=6.31s min=165.04ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + http_reqs......................: 7341 12.422953/s + input_tokens...................: 4990223 8444.803735/s + iteration_duration.............: avg=46.39s min=6.73s med=41.26s max=4m13s p(90)=1m8s p(95)=1m28s + iterations.....................: 1000 1.69227/s + new_tokens.....................: 68062 115.179268/s + time_per_token.................: avg=712.11ms min=39.56ms med=703.28ms max=2.69s p(90)=928.58ms p(95)=1.09s + tokens.........................: 5058285 8559.983003/s + vus............................: 1 min=0 max=80 + vus_max........................: 80 min=21 max=80 + + +running (09m50.9s), 00/80 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 80 VUs 09m50.9s/10m0s 1000/1000 shared iters +``` + +### PrefixHash + +``` + /\ Grafana /‾‾/ + /\ / \ |\ __ / / + / \/ \ | |/ / / ‾‾\ + / \ | ( | (‾) | + / __________ \ |_|\_\ \_____/ + + execution: local + script: ./k6.js + output: - + + scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. 
graceful stop):
           * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 7341 out of 7341
     data_received..................: 4.7 MB 12 kB/s
     data_sent......................: 25 MB 65 kB/s
     http_req_blocked...............: avg=268.24µs min=2.94µs med=5.76µs max=28.19ms p(90)=8.17µs p(95)=10.41µs
     http_req_connecting............: avg=136.33µs min=0s med=0s max=17.7ms p(90)=0s p(95)=0s
     http_req_duration..............: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
       { expected_response:true }...: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
     ✓ http_req_failed................: 0.00% 0 out of 7341
     http_req_receiving.............: avg=81.81µs min=28.68µs med=72.08µs max=786.09µs p(90)=125.04µs p(95)=148.6µs
     http_req_sending...............: avg=63.61µs min=11.85µs med=31.65µs max=1.59ms p(90)=136.85µs p(95)=161.88µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=4.08s min=151.81ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
     http_reqs......................: 7341 19.230625/s
     input_tokens...................: 4990576 13073.409349/s
     iteration_duration.............: avg=29.98s min=2.37s med=20.29s max=2m53s p(90)=1m1s p(95)=1m18s
     iterations.....................: 1000 2.619619/s
     new_tokens.....................: 68218 178.705191/s
     time_per_token.................: avg=469.34ms min=44.2ms med=257.72ms max=3.86s p(90)=1s p(95)=1.1s
     tokens.........................: 5058794 13252.11454/s
     vus............................: 3 min=0 max=80
     vus_max........................: 80 min=19 max=80


running (06m21.7s), 00/80 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 80 VUs 06m21.7s/10m0s 1000/1000 shared iters
```

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
new file mode 100644
index 00000000..9550d606
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
{
    "model": "llama-3.1-70b-instruct-fp8-h100",
    "max_tokens": 10,
    "temperature": 0,
    "messages": []
}
\ No newline at end of file
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
new file mode 100644
index 00000000..4b687779
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
{
    "thresholds": {
        "http_req_failed": [
            "rate==0"
        ]
    },
    "scenarios": {
        "chat": {
            "executor": "shared-iterations",
            "vus": 320,
            "iterations": 1000,
            "maxDuration": "600s"
        }
    }
}
\ No newline at end of file
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
new file mode 100644
index 00000000..9ff87269
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-70b-instruct-fp8-h100
spec:
  features: [TextGeneration]
  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
  engine: VLLM
  args:
    - --enable-prefix-caching
    - --max-model-len=16384
    - --max-num-batched-tokens=16384
    - --gpu-memory-utilization=0.95
    - --disable-log-requests
    - --kv-cache-dtype=fp8
  resourceProfile: nvidia-gpu-h100:1
  minReplicas: 8
  maxReplicas: 8
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
new file mode 100644
index 00000000..872c9706
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
apiVersion: v1
kind: Pod
metadata:
  name: chat-benchmark
spec:
  restartPolicy: Never
  containers:
  - name: bench
    image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
    command: ["sleep", "infinity"]
    resources:
      requests:
        cpu: 6
        ephemeral-storage: 10Gi
        memory: 24Gi
      limits:
        cpu: 6
        ephemeral-storage: 10Gi
        memory: 24Gi
\ No newline at end of file

From 771cad39ce099974e8eaf78d7cb855c97b41a0a1 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Tue, 31 Dec 2024 19:06:50 -0800
Subject: [PATCH 2/5] update with results

---
 .../README.md | 215 +++++++++++-------
 .../least-load-vs-prefix-hash-70b-8r/pod.yaml | 2 +-
 2 files changed, 138 insertions(+), 79 deletions(-)

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
index aaa309d2..bc9fede1 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
## Steps taken

```bash
export SCENARIO=least-load-vs-prefix-hash-70b-8r
export PROJECT_ID=$(gcloud config get-value project)
export IMG=us-central1-docker.pkg.dev/$PROJECT_ID/default/kubeai-benchmark-chat:v0.0.2

cd ./benchmarks/chat
make data
gcloud builds submit . -t $IMG
# docker build -t $IMG . && docker push $IMG

kubectl apply -f ./scenarios/$SCENARIO/model.yaml
envsubst < ./scenarios/$SCENARIO/pod.yaml | kubectl apply -f -

# Had to manually copy the file for some reason
# TODO fix Dockerfile to ensure it gets added
kubectl cp data/message-threads.json chat-benchmark:/work/data/

# Run 2x (to ensure both cases start with a preloaded cache)
# kubectl exec -it chat-benchmark -- SCENARIO=$SCENARIO make run
kubectl exec -it chat-benchmark -- bash -c "SCENARIO=$SCENARIO make run"

kubectl patch model llama-3.1-70b-instruct-fp8-h100 --type='merge' -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
kubectl exec -it chat-benchmark -- bash -c "SCENARIO=$SCENARIO make run"
```
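Before kicking off the PrefixHash run, it can help to confirm the patch actually landed; a quick check along these lines (not part of the original runbook, but it uses only the field set by the `kubectl patch` above):

```bash
# Confirm the load balancing strategy on the Model resource.
kubectl get model llama-3.1-70b-instruct-fp8-h100 \
  -o jsonpath='{.spec.loadBalancing.strategy}'
# Expected output: PrefixHash
```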

## Benchmark Output

### LeastLoad - single replica

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. graceful stop):
           * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 6094 out of 6094
     data_received..................: 3.9 MB 6.2 kB/s
     data_sent......................: 20 MB 32 kB/s
     dropped_iterations.............: 23 0.036508/s
     http_req_blocked...............: avg=1.52ms min=1.72µs med=4.52µs max=47.12ms p(90)=7.64µs p(95)=14.47ms
     http_req_connecting............: avg=79.02µs min=0s med=0s max=13.96ms p(90)=0s p(95)=119.84µs
     http_req_duration..............: avg=32.48s min=6.25s med=37.74s max=50.64s p(90)=43.38s p(95)=45.81s
       { expected_response:true }...: avg=32.48s min=6.25s med=37.74s max=50.64s p(90)=43.38s p(95)=45.81s
     ✓ http_req_failed................: 0.00% 0 out of 6094
     http_req_receiving.............: avg=75.82µs min=19.9µs med=68.09µs max=2.04ms p(90)=115.16µs p(95)=134.82µs
     http_req_sending...............: avg=103.99µs min=8.22µs med=27.04µs max=33.92ms p(90)=126.5µs p(95)=186.9µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=32.48s min=6.25s med=37.73s max=50.64s p(90)=43.38s p(95)=45.81s
     http_reqs......................: 6094 9.672953/s
     input_tokens...................: 3859568 6126.258596/s
     iteration_duration.............: avg=3m49s min=1m30s med=3m23s max=10m17s p(90)=5m41s p(95)=6m36s
     iterations.....................: 728 1.155548/s
     new_tokens.....................: 56340 89.42799/s
     time_per_token.................: avg=4.03s min=625.66ms med=3.87s max=22.72s p(90)=5s p(95)=11.69s
     tokens.........................: 3915908 6215.686586/s
     vus............................: 252 min=0 max=320
     vus_max........................: 320 min=25 max=320


running (10m30.0s), 000/320 VUs, 728 complete and 249 interrupted iterations
chat ✗ [==========================>-----------] 320 VUs 10m30.0s/10m0s 0728/1000 shared iters
```

### LeastLoad - 8 replicas 1st run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s) ✓ Post status is 200 checks.........................: 100.00% 7341 out of 7341 - data_received..................: 4.7 MB 7.9 kB/s - data_sent......................: 25 MB 42 kB/s - http_req_blocked...............: avg=161.4µs min=2.83µs med=5.8µs max=16.67ms p(90)=8.06µs p(95)=10.19µs - http_req_connecting............: avg=55.73µs min=0s med=0s max=8.41ms p(90)=0s p(95)=0s - http_req_duration..............: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s - { expected_response:true }...: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + data_received..................: 4.7 MB 47 kB/s + data_sent......................: 25 MB 250 kB/s + http_req_blocked...............: avg=280.95µs min=1.57µs med=4.13µs max=28.71ms p(90)=6.86µs p(95)=32.09µs + http_req_connecting............: avg=55.16µs min=0s med=0s max=19.59ms p(90)=0s p(95)=0s + http_req_duration..............: avg=3.67s min=112.34ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s + { expected_response:true }...: avg=3.67s min=112.34ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s ✓ http_req_failed................: 0.00% 0 out of 7341 - http_req_receiving.............: avg=84.64µs min=29.4µs med=74.05µs max=732.69µs p(90)=129.94µs p(95)=154.19µs - http_req_sending...............: avg=68µs min=12.1µs med=32.3µs max=1.38ms p(90)=144.04µs p(95)=173.19µs - http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s - http_req_waiting...............: avg=6.31s min=165.04ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s - http_reqs......................: 7341 12.422953/s - input_tokens...................: 4990223 8444.803735/s - iteration_duration.............: avg=46.39s min=6.73s med=41.26s max=4m13s p(90)=1m8s p(95)=1m28s - iterations.....................: 1000 1.69227/s - new_tokens.....................: 68062 115.179268/s - time_per_token.................: avg=712.11ms min=39.56ms med=703.28ms max=2.69s p(90)=928.58ms p(95)=1.09s - tokens.........................: 5058285 8559.983003/s - vus............................: 1 min=0 max=80 - vus_max........................: 80 min=21 max=80 - - -running (09m50.9s), 00/80 VUs, 1000 complete and 0 interrupted iterations -chat ✓ [======================================] 80 VUs 09m50.9s/10m0s 1000/1000 shared iters + http_req_receiving.............: avg=75.3µs min=18.48µs med=62.57µs max=2.87ms p(90)=118.19µs p(95)=139.71µs + http_req_sending...............: avg=100.92µs min=8.74µs med=29.1µs max=24.35ms p(90)=129.08µs p(95)=164.54µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=3.67s min=112.2ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s + http_reqs......................: 7341 73.808399/s + input_tokens...................: 4990165 50172.468256/s + iteration_duration.............: avg=26.96s min=6.17s med=24.73s max=1m30s p(90)=41.36s p(95)=48.91s + iterations.....................: 1000 10.05427/s + new_tokens.....................: 67808 681.759967/s + time_per_token.................: avg=419.15ms min=34.84ms med=397.78ms max=2.37s p(90)=662.6ms p(95)=781.79ms + tokens.........................: 5057973 50854.228224/s + vus............................: 1 min=0 max=320 + vus_max........................: 320 min=22 max=320 + + +running (01m39.5s), 000/320 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 320 VUs 01m39.5s/10m0s 
1000/1000 shared iters
```

### LeastLoad - 8 replicas 2nd run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. graceful stop):
           * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 7341 out of 7341
     data_received..................: 4.7 MB 49 kB/s
     data_sent......................: 25 MB 259 kB/s
     http_req_blocked...............: avg=856.57µs min=1.6µs med=4.23µs max=33.05ms p(90)=7.16µs p(95)=32.24µs
     http_req_connecting............: avg=107.71µs min=0s med=0s max=28.11ms p(90)=0s p(95)=0s
     http_req_duration..............: avg=3.54s min=131.17ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
       { expected_response:true }...: avg=3.54s min=131.17ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
     ✓ http_req_failed................: 0.00% 0 out of 7341
     http_req_receiving.............: avg=76.78µs min=20.42µs med=63.93µs max=3.16ms p(90)=119.07µs p(95)=138.94µs
     http_req_sending...............: avg=153.18µs min=8.93µs med=29.5µs max=14.71ms p(90)=129.95µs p(95)=173.11µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=3.54s min=130.82ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
     http_reqs......................: 7341 76.270469/s
     input_tokens...................: 4990249 51846.973437/s
     iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s
     iterations.....................: 1000 10.389657/s
     new_tokens.....................: 67790 704.314821/s
     time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms
     tokens.........................: 5058039 52551.288258/s
     vus............................: 1 min=0 max=320
     vus_max........................: 320 min=19 max=320


running (01m36.2s), 000/320 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 320 VUs 01m36.2s/10m0s 1000/1000 shared iters
```

### PrefixHash - 3rd run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s) ✓ Post status is 200 checks.........................: 100.00% 7341 out of 7341 - data_received..................: 4.7 MB 12 kB/s - data_sent......................: 25 MB 65 kB/s - http_req_blocked...............: avg=268.24µs min=2.94µs med=5.76µs max=28.19ms p(90)=8.17µs p(95)=10.41µs - http_req_connecting............: avg=136.33µs min=0s med=0s max=17.7ms p(90)=0s p(95)=0s - http_req_duration..............: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s - { expected_response:true }...: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s + data_received..................: 4.7 MB 55 kB/s + data_sent......................: 25 MB 288 kB/s + http_req_blocked...............: avg=833.58µs min=1.61µs med=4.34µs max=41.24ms p(90)=10.84µs p(95)=35.22µs + http_req_connecting............: avg=243.25µs min=0s med=0s max=23.94ms p(90)=0s p(95)=0s + http_req_duration..............: avg=3.13s min=83.91ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s + { expected_response:true }...: avg=3.13s min=83.91ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s ✓ http_req_failed................: 0.00% 0 out of 7341 - http_req_receiving.............: avg=81.81µs min=28.68µs med=72.08µs max=786.09µs p(90)=125.04µs p(95)=148.6µs - http_req_sending...............: avg=63.61µs min=11.85µs med=31.65µs max=1.59ms p(90)=136.85µs p(95)=161.88µs - http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s - http_req_waiting...............: avg=4.08s min=151.81ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s - http_reqs......................: 7341 19.230625/s - input_tokens...................: 4990576 13073.409349/s - iteration_duration.............: avg=29.98s min=2.37s med=20.29s max=2m53s p(90)=1m1s p(95)=1m18s - iterations.....................: 1000 2.619619/s - new_tokens.....................: 68218 178.705191/s - time_per_token.................: avg=469.34ms min=44.2ms med=257.72ms max=3.86s p(90)=1s p(95)=1.1s - tokens.........................: 5058794 13252.11454/s - vus............................: 3 min=0 max=80 - vus_max........................: 80 min=19 max=80 - - -running (06m21.7s), 00/80 VUs, 1000 complete and 0 interrupted iterations -chat ✓ [======================================] 80 VUs 06m21.7s/10m0s 1000/1000 shared iters + http_req_receiving.............: avg=75.62µs min=19.77µs med=71.23µs max=1.99ms p(90)=118.68µs p(95)=138.44µs + http_req_sending...............: avg=135.04µs min=7.79µs med=30.48µs max=15.02ms p(90)=137.44µs p(95)=181.62µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=3.13s min=83.79ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s + http_reqs......................: 7341 85.023164/s + input_tokens...................: 4989621 57789.588176/s + iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s + iterations.....................: 1000 11.581959/s + new_tokens.....................: 67718 784.307131/s + time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms + tokens.........................: 5057339 58573.895307/s + vus............................: 1 min=0 max=320 + vus_max........................: 320 min=21 max=320 + + +running (01m26.3s), 000/320 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 320 VUs 
01m26.3s/10m0s 1000/1000 shared iters
```

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
index 872c9706..6660c1ec 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
@@ -6,7 +6,7 @@ spec:
   restartPolicy: Never
   containers:
   - name: bench
-    image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
+    image: $IMG
     command: ["sleep", "infinity"]
     resources:
       requests:

From 92ac45fd0c619dec8f96143a28be075d35471e1d Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Tue, 31 Dec 2024 19:22:36 -0800
Subject: [PATCH 3/5] update results

---
 .../README.md | 30 +++++++++++++++----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
index bc9fede1..814ea466 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
# Prefix Hash Benchmark - Llama 3.1 70B with 8 replicas

Under specific conditions:

* Set `max_tokens` to 10 to understand the performance impact when significant time is spent on input.
* Chat threads with decently long user messages

Summary of how Prefix Hashing affects performance:
* `11%` decrease in average time per token: `405.39ms (LeastLoad) --> 361.07ms (PrefixHash)`
* input_tokens: `51846.973437/s (LeastLoad) --> 57789.588176/s (PrefixHash)`. An increase of `11%` in throughput of input tokens.

Least Load results:
```
 input_tokens...................: 4990249 51846.973437/s
 iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s
 iterations.....................: 1000 10.389657/s
 new_tokens.....................: 67790 704.314821/s
 time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms
```

Prefix Hashing results:
```
 input_tokens...................: 4989621 57789.588176/s
 iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s
 iterations.....................: 1000 11.581959/s
 new_tokens.....................: 67718 784.307131/s
 time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms
```
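The percentages above follow from the quoted averages; a quick sanity check (plain `awk`, not part of the benchmark itself):

```bash
awk 'BEGIN {
  ll = 405.39; ph = 361.07                     # time_per_token avg in ms (LeastLoad 2nd run vs PrefixHash)
  printf "time_per_token decrease: %.1f%%\n", (ll - ph) / ll * 100
  ll_in = 51846.973437; ph_in = 57789.588176   # input_tokens per second
  printf "input token throughput increase: %.1f%%\n", (ph_in - ll_in) / ll_in * 100
}'
# => time_per_token decrease: 10.9%
# => input token throughput increase: 11.5%
```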
-`712.11ms (LeastLoad) --> 469.34ms (PrefixHash)` +Least Load results: +``` + input_tokens...................: 4990249 51846.973437/s + iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s + iterations.....................: 1000 10.389657/s + new_tokens.....................: 67790 704.314821/s + time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms +``` + + +Prefix Hashing results: +``` + input_tokens...................: 4989621 57789.588176/s + iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s + iterations.....................: 1000 11.581959/s + new_tokens.....................: 67718 784.307131/s + time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms +``` ## Steps taken @@ -38,7 +56,7 @@ kubectl exec -it chat-benchmark -- SCENARIO=$SCENARIO make run ``` -## Benchmark Output +## Benchmark Outputs ### LeastLoad - single replica From d46bf77b6498254d875b9e1de350a4c1f4871988 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Tue, 31 Dec 2024 19:51:29 -0800 Subject: [PATCH 4/5] add model for 70b on single h100 --- charts/models/values.yaml | 12 ++++++++++++ .../llama-3.1-70b-instruct-fp8-1xh100.yaml | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml diff --git a/charts/models/values.yaml b/charts/models/values.yaml index 7d7f2714..45e9d73e 100644 --- a/charts/models/values.yaml +++ b/charts/models/values.yaml @@ -139,6 +139,18 @@ catalog: - --disable-log-requests resourceProfile: nvidia-gpu-h100:2 targetRequests: 500 + llama-3.1-70b-instruct-fp8-1xh100: + features: [TextGeneration] + url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 + engine: VLLM + args: + - --enable-prefix-caching + - --max-model-len=16384 + - --max-num-batched-token=16384 + - --gpu-memory-utilization=0.95 + - --disable-log-requests + - --kv-cache-dtype=fp8 + resourceProfile: nvidia-gpu-h100:1 llama-3.1-70b-instruct-fp8-l4: enabled: false features: [TextGeneration] diff --git a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml b/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml new file mode 100644 index 00000000..55006519 --- /dev/null +++ b/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml @@ -0,0 +1,17 @@ +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: llama-3.1-70b-instruct-fp8-1xh100 +spec: + features: [TextGeneration] + url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 + engine: VLLM + args: + - --enable-prefix-caching + - --max-model-len=16384 + - --max-num-batched-token=16384 + - --gpu-memory-utilization=0.95 + - --disable-log-requests + - --kv-cache-dtype=fp8 + resourceProfile: nvidia-gpu-h100:1 From 55ebdef7062f4d1554d563f6023d2cf06a2f95a5 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 3 Jan 2025 14:18:38 -0800 Subject: [PATCH 5/5] rename --- charts/models/values.yaml | 2 +- ...t-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename manifests/models/{llama-3.1-70b-instruct-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} (90%) diff --git a/charts/models/values.yaml b/charts/models/values.yaml index 45e9d73e..f98255ca 100644 --- a/charts/models/values.yaml +++ b/charts/models/values.yaml @@ -139,7 +139,7 @@ catalog: - --disable-log-requests resourceProfile: 

From 55ebdef7062f4d1554d563f6023d2cf06a2f95a5 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Fri, 3 Jan 2025 14:18:38 -0800
Subject: [PATCH 5/5] rename

---
 charts/models/values.yaml | 2 +-
 ...t-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename manifests/models/{llama-3.1-70b-instruct-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} (90%)

diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 45e9d73e..f98255ca 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
      - --disable-log-requests
    resourceProfile: nvidia-gpu-h100:2
    targetRequests: 500
-  llama-3.1-70b-instruct-fp8-1xh100:
+  llama-3.1-70b-instruct-fp8-1-h100:
    features: [TextGeneration]
    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
    engine: VLLM
    args:
diff --git a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml b/manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
similarity index 90%
rename from manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
rename to manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
index 55006519..f21d2ea8 100644
--- a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
+++ b/manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
 apiVersion: kubeai.org/v1
 kind: Model
 metadata:
-  name: llama-3.1-70b-instruct-fp8-1xh100
+  name: llama-3.1-70b-instruct-fp8-1-h100
 spec:
   features: [TextGeneration]
   url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
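After the rename, the manifest can be smoke-tested directly; a sketch, assuming KubeAI and its `Model` CRD are installed and a node matching the `nvidia-gpu-h100` resource profile is available:

```bash
# Apply the renamed manifest and confirm the Model resource exists.
kubectl apply -f manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
kubectl get model llama-3.1-70b-instruct-fp8-1-h100

# The same load balancing switch used in the benchmark above can then be applied.
kubectl patch model llama-3.1-70b-instruct-fp8-1-h100 --type='merge' \
  -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
```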