From c4117ba9a6c64d53a71e7fe024dfcf9ca2980bc7 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Sat, 28 Dec 2024 15:30:06 -0800
Subject: [PATCH 1/5] wip scenario 70b 8 replicas prefix hash

---
 .../README.md | 128 ++++++++++++++++++
 .../base-request.json | 6 +
 .../least-load-vs-prefix-hash-70b-8r/k6.json | 15 ++
 .../model.yaml | 18 +++
 .../least-load-vs-prefix-hash-70b-8r/pod.yaml | 19 +++
 5 files changed, 186 insertions(+)
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
 create mode 100644 benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
new file mode 100644
index 00000000..aaa309d2
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
# Results

Under specific conditions:

* Restricted GPU memory
* Low `max_tokens` to be generated
* Chat threads with decently long user messages

Prefix hashing was shown to yield a `34%` decrease in average time per token.

`712.11ms (LeastLoad) --> 469.34ms (PrefixHash)`

## Steps taken

```bash
cd ./benchmarks/chat
make data
export IMG=us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
docker build -t $IMG . && docker push $IMG

kubectl apply -f ./scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
kubectl apply -f ./scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml

# Run 2x (to ensure both cases start with a preloaded cache)
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash-70b-8r make run

kubectl patch model llama-3.1-70b-instruct-fp8-h100 --type='merge' -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash-70b-8r make run
```

## Next Steps

* Rerun with increased replicas (e.g. 10 instead of 2)

## Benchmark Output

### LeastLoad

```
          /\      Grafana   /‾‾/
     /\  /  \     |\  __   /  /
    /  \/    \    | |/ /  /   ‾‾\
   /          \   |   (  |  (‾)  |
  / __________ \  |_|\_\  \_____/

     execution: local
        script: ./k6.js
        output: -

     scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s) + + + ✓ Post status is 200 + + checks.........................: 100.00% 7341 out of 7341 + data_received..................: 4.7 MB 7.9 kB/s + data_sent......................: 25 MB 42 kB/s + http_req_blocked...............: avg=161.4µs min=2.83µs med=5.8µs max=16.67ms p(90)=8.06µs p(95)=10.19µs + http_req_connecting............: avg=55.73µs min=0s med=0s max=8.41ms p(90)=0s p(95)=0s + http_req_duration..............: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + { expected_response:true }...: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + ✓ http_req_failed................: 0.00% 0 out of 7341 + http_req_receiving.............: avg=84.64µs min=29.4µs med=74.05µs max=732.69µs p(90)=129.94µs p(95)=154.19µs + http_req_sending...............: avg=68µs min=12.1µs med=32.3µs max=1.38ms p(90)=144.04µs p(95)=173.19µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=6.31s min=165.04ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + http_reqs......................: 7341 12.422953/s + input_tokens...................: 4990223 8444.803735/s + iteration_duration.............: avg=46.39s min=6.73s med=41.26s max=4m13s p(90)=1m8s p(95)=1m28s + iterations.....................: 1000 1.69227/s + new_tokens.....................: 68062 115.179268/s + time_per_token.................: avg=712.11ms min=39.56ms med=703.28ms max=2.69s p(90)=928.58ms p(95)=1.09s + tokens.........................: 5058285 8559.983003/s + vus............................: 1 min=0 max=80 + vus_max........................: 80 min=21 max=80 + + +running (09m50.9s), 00/80 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 80 VUs 09m50.9s/10m0s 1000/1000 shared iters +``` + +### PrefixHash + +``` + /\ Grafana /‾‾/ + /\ / \ |\ __ / / + / \/ \ | |/ / / ‾‾\ + / \ | ( | (‾) | + / __________ \ |_|\_\ \_____/ + + execution: local + script: ./k6.js + output: - + + scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. 
graceful stop):
           * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 7341 out of 7341
     data_received..................: 4.7 MB 12 kB/s
     data_sent......................: 25 MB 65 kB/s
     http_req_blocked...............: avg=268.24µs min=2.94µs med=5.76µs max=28.19ms p(90)=8.17µs p(95)=10.41µs
     http_req_connecting............: avg=136.33µs min=0s med=0s max=17.7ms p(90)=0s p(95)=0s
     http_req_duration..............: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
       { expected_response:true }...: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
     ✓ http_req_failed................: 0.00% 0 out of 7341
     http_req_receiving.............: avg=81.81µs min=28.68µs med=72.08µs max=786.09µs p(90)=125.04µs p(95)=148.6µs
     http_req_sending...............: avg=63.61µs min=11.85µs med=31.65µs max=1.59ms p(90)=136.85µs p(95)=161.88µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=4.08s min=151.81ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s
     http_reqs......................: 7341 19.230625/s
     input_tokens...................: 4990576 13073.409349/s
     iteration_duration.............: avg=29.98s min=2.37s med=20.29s max=2m53s p(90)=1m1s p(95)=1m18s
     iterations.....................: 1000 2.619619/s
     new_tokens.....................: 68218 178.705191/s
     time_per_token.................: avg=469.34ms min=44.2ms med=257.72ms max=3.86s p(90)=1s p(95)=1.1s
     tokens.........................: 5058794 13252.11454/s
     vus............................: 3 min=0 max=80
     vus_max........................: 80 min=19 max=80


running (06m21.7s), 00/80 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 80 VUs 06m21.7s/10m0s 1000/1000 shared iters
```

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
new file mode 100644
index 00000000..9550d606
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json
{
    "model": "llama-3.1-70b-instruct-fp8-h100",
    "max_tokens": 10,
    "temperature": 0,
    "messages": []
}
\ No newline at end of file
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
new file mode 100644
index 00000000..4b687779
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json
{
    "thresholds": {
        "http_req_failed": [
            "rate==0"
        ]
    },
    "scenarios": {
        "chat": {
            "executor": "shared-iterations",
            "vus": 320,
            "iterations": 1000,
            "maxDuration": "600s"
        }
    }
}
\ No newline at end of file
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
new file mode 100644
index 00000000..9ff87269
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-70b-instruct-fp8-h100
spec:
  features: [TextGeneration]
  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
  engine: VLLM
  args:
    - --enable-prefix-caching
    - --max-model-len=16384
    - --max-num-batched-tokens=16384
    - --gpu-memory-utilization=0.95
    - --disable-log-requests
    - --kv-cache-dtype=fp8
  resourceProfile: nvidia-gpu-h100:1
  minReplicas: 8
  maxReplicas: 8
diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
new file mode 100644
index 00000000..872c9706
--- /dev/null
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
apiVersion: v1
kind: Pod
metadata:
  name: chat-benchmark
spec:
  restartPolicy: Never
  containers:
  - name: bench
    image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
    command: ["sleep", "infinity"]
    resources:
      requests:
        cpu: 6
        ephemeral-storage: 10Gi
        memory: 24Gi
      limits:
        cpu: 6
        ephemeral-storage: 10Gi
        memory: 24Gi
\ No newline at end of file

From 771cad39ce099974e8eaf78d7cb855c97b41a0a1 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Tue, 31 Dec 2024 19:06:50 -0800
Subject: [PATCH 2/5] update with results

---
 .../README.md | 215 +++++++++++-------
 .../least-load-vs-prefix-hash-70b-8r/pod.yaml | 2 +-
 2 files changed, 138 insertions(+), 79 deletions(-)

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
index aaa309d2..bc9fede1 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
## Steps taken

```bash
export SCENARIO=least-load-vs-prefix-hash-70b-8r
export PROJECT_ID=$(gcloud config get-value project)
export IMG=us-central1-docker.pkg.dev/$PROJECT_ID/default/kubeai-benchmark-chat:v0.0.2

cd ./benchmarks/chat
make data
gcloud builds submit . -t $IMG
# docker build -t $IMG . && docker push $IMG

kubectl apply -f ./scenarios/$SCENARIO/model.yaml
envsubst < ./scenarios/$SCENARIO/pod.yaml | kubectl apply -f -

# Had to manually copy the file for some reason
# TODO fix Dockerfile to ensure it gets added
kubectl cp data/message-threads.json chat-benchmark:/work/data/

# Run 2x (to ensure both cases start with a preloaded cache)
# kubectl exec -it chat-benchmark -- SCENARIO=$SCENARIO make run
kubectl exec -it chat-benchmark -- bash -c "SCENARIO=$SCENARIO make run"

kubectl patch model llama-3.1-70b-instruct-fp8-h100 --type='merge' -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
kubectl exec -it chat-benchmark -- bash -c "SCENARIO=$SCENARIO make run"
```
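Before kicking off the PrefixHash run, it can help to confirm the patch actually landed; a quick check along these lines (not part of the original runbook, but it uses only the field set by the `kubectl patch` above):

```bash
# Confirm the load balancing strategy on the Model resource.
kubectl get model llama-3.1-70b-instruct-fp8-h100 \
  -o jsonpath='{.spec.loadBalancing.strategy}'
# Expected output: PrefixHash
```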

## Benchmark Output

### LeastLoad - single replica

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. graceful stop):
           * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 6094 out of 6094
     data_received..................: 3.9 MB 6.2 kB/s
     data_sent......................: 20 MB 32 kB/s
     dropped_iterations.............: 23 0.036508/s
     http_req_blocked...............: avg=1.52ms min=1.72µs med=4.52µs max=47.12ms p(90)=7.64µs p(95)=14.47ms
     http_req_connecting............: avg=79.02µs min=0s med=0s max=13.96ms p(90)=0s p(95)=119.84µs
     http_req_duration..............: avg=32.48s min=6.25s med=37.74s max=50.64s p(90)=43.38s p(95)=45.81s
       { expected_response:true }...: avg=32.48s min=6.25s med=37.74s max=50.64s p(90)=43.38s p(95)=45.81s
     ✓ http_req_failed................: 0.00% 0 out of 6094
     http_req_receiving.............: avg=75.82µs min=19.9µs med=68.09µs max=2.04ms p(90)=115.16µs p(95)=134.82µs
     http_req_sending...............: avg=103.99µs min=8.22µs med=27.04µs max=33.92ms p(90)=126.5µs p(95)=186.9µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=32.48s min=6.25s med=37.73s max=50.64s p(90)=43.38s p(95)=45.81s
     http_reqs......................: 6094 9.672953/s
     input_tokens...................: 3859568 6126.258596/s
     iteration_duration.............: avg=3m49s min=1m30s med=3m23s max=10m17s p(90)=5m41s p(95)=6m36s
     iterations.....................: 728 1.155548/s
     new_tokens.....................: 56340 89.42799/s
     time_per_token.................: avg=4.03s min=625.66ms med=3.87s max=22.72s p(90)=5s p(95)=11.69s
     tokens.........................: 3915908 6215.686586/s
     vus............................: 252 min=0 max=320
     vus_max........................: 320 min=25 max=320


running (10m30.0s), 000/320 VUs, 728 complete and 249 interrupted iterations
chat ✗ [==========================>-----------] 320 VUs 10m30.0s/10m0s 0728/1000 shared iters
```

### LeastLoad - 8 replicas 1st run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s) ✓ Post status is 200 checks.........................: 100.00% 7341 out of 7341 - data_received..................: 4.7 MB 7.9 kB/s - data_sent......................: 25 MB 42 kB/s - http_req_blocked...............: avg=161.4µs min=2.83µs med=5.8µs max=16.67ms p(90)=8.06µs p(95)=10.19µs - http_req_connecting............: avg=55.73µs min=0s med=0s max=8.41ms p(90)=0s p(95)=0s - http_req_duration..............: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s - { expected_response:true }...: avg=6.31s min=165.25ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s + data_received..................: 4.7 MB 47 kB/s + data_sent......................: 25 MB 250 kB/s + http_req_blocked...............: avg=280.95µs min=1.57µs med=4.13µs max=28.71ms p(90)=6.86µs p(95)=32.09µs + http_req_connecting............: avg=55.16µs min=0s med=0s max=19.59ms p(90)=0s p(95)=0s + http_req_duration..............: avg=3.67s min=112.34ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s + { expected_response:true }...: avg=3.67s min=112.34ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s ✓ http_req_failed................: 0.00% 0 out of 7341 - http_req_receiving.............: avg=84.64µs min=29.4µs med=74.05µs max=732.69µs p(90)=129.94µs p(95)=154.19µs - http_req_sending...............: avg=68µs min=12.1µs med=32.3µs max=1.38ms p(90)=144.04µs p(95)=173.19µs - http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s - http_req_waiting...............: avg=6.31s min=165.04ms med=6.66s max=11.65s p(90)=8.55s p(95)=9.07s - http_reqs......................: 7341 12.422953/s - input_tokens...................: 4990223 8444.803735/s - iteration_duration.............: avg=46.39s min=6.73s med=41.26s max=4m13s p(90)=1m8s p(95)=1m28s - iterations.....................: 1000 1.69227/s - new_tokens.....................: 68062 115.179268/s - time_per_token.................: avg=712.11ms min=39.56ms med=703.28ms max=2.69s p(90)=928.58ms p(95)=1.09s - tokens.........................: 5058285 8559.983003/s - vus............................: 1 min=0 max=80 - vus_max........................: 80 min=21 max=80 - - -running (09m50.9s), 00/80 VUs, 1000 complete and 0 interrupted iterations -chat ✓ [======================================] 80 VUs 09m50.9s/10m0s 1000/1000 shared iters + http_req_receiving.............: avg=75.3µs min=18.48µs med=62.57µs max=2.87ms p(90)=118.19µs p(95)=139.71µs + http_req_sending...............: avg=100.92µs min=8.74µs med=29.1µs max=24.35ms p(90)=129.08µs p(95)=164.54µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=3.67s min=112.2ms med=3.65s max=8.58s p(90)=6.09s p(95)=6.56s + http_reqs......................: 7341 73.808399/s + input_tokens...................: 4990165 50172.468256/s + iteration_duration.............: avg=26.96s min=6.17s med=24.73s max=1m30s p(90)=41.36s p(95)=48.91s + iterations.....................: 1000 10.05427/s + new_tokens.....................: 67808 681.759967/s + time_per_token.................: avg=419.15ms min=34.84ms med=397.78ms max=2.37s p(90)=662.6ms p(95)=781.79ms + tokens.........................: 5057973 50854.228224/s + vus............................: 1 min=0 max=320 + vus_max........................: 320 min=22 max=320 + + +running (01m39.5s), 000/320 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 320 VUs 01m39.5s/10m0s 
1000/1000 shared iters
```

### LeastLoad - 8 replicas 2nd run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. graceful stop):
           * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s)


     ✓ Post status is 200

     checks.........................: 100.00% 7341 out of 7341
     data_received..................: 4.7 MB 49 kB/s
     data_sent......................: 25 MB 259 kB/s
     http_req_blocked...............: avg=856.57µs min=1.6µs med=4.23µs max=33.05ms p(90)=7.16µs p(95)=32.24µs
     http_req_connecting............: avg=107.71µs min=0s med=0s max=28.11ms p(90)=0s p(95)=0s
     http_req_duration..............: avg=3.54s min=131.17ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
       { expected_response:true }...: avg=3.54s min=131.17ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
     ✓ http_req_failed................: 0.00% 0 out of 7341
     http_req_receiving.............: avg=76.78µs min=20.42µs med=63.93µs max=3.16ms p(90)=119.07µs p(95)=138.94µs
     http_req_sending...............: avg=153.18µs min=8.93µs med=29.5µs max=14.71ms p(90)=129.95µs p(95)=173.11µs
     http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s
     http_req_waiting...............: avg=3.54s min=130.82ms med=3.53s max=9.66s p(90)=5.95s p(95)=6.53s
     http_reqs......................: 7341 76.270469/s
     input_tokens...................: 4990249 51846.973437/s
     iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s
     iterations.....................: 1000 10.389657/s
     new_tokens.....................: 67790 704.314821/s
     time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms
     tokens.........................: 5058039 52551.288258/s
     vus............................: 1 min=0 max=320
     vus_max........................: 320 min=19 max=320


running (01m36.2s), 000/320 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 320 VUs 01m36.2s/10m0s 1000/1000 shared iters
```

### PrefixHash - 3rd run

```
     scenarios: (100.00%) 1 scenario, 320 max VUs, 10m30s max duration (incl. 
graceful stop): + * chat: 1000 iterations shared among 320 VUs (maxDuration: 10m0s, gracefulStop: 30s) ✓ Post status is 200 checks.........................: 100.00% 7341 out of 7341 - data_received..................: 4.7 MB 12 kB/s - data_sent......................: 25 MB 65 kB/s - http_req_blocked...............: avg=268.24µs min=2.94µs med=5.76µs max=28.19ms p(90)=8.17µs p(95)=10.41µs - http_req_connecting............: avg=136.33µs min=0s med=0s max=17.7ms p(90)=0s p(95)=0s - http_req_duration..............: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s - { expected_response:true }...: avg=4.08s min=151.9ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s + data_received..................: 4.7 MB 55 kB/s + data_sent......................: 25 MB 288 kB/s + http_req_blocked...............: avg=833.58µs min=1.61µs med=4.34µs max=41.24ms p(90)=10.84µs p(95)=35.22µs + http_req_connecting............: avg=243.25µs min=0s med=0s max=23.94ms p(90)=0s p(95)=0s + http_req_duration..............: avg=3.13s min=83.91ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s + { expected_response:true }...: avg=3.13s min=83.91ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s ✓ http_req_failed................: 0.00% 0 out of 7341 - http_req_receiving.............: avg=81.81µs min=28.68µs med=72.08µs max=786.09µs p(90)=125.04µs p(95)=148.6µs - http_req_sending...............: avg=63.61µs min=11.85µs med=31.65µs max=1.59ms p(90)=136.85µs p(95)=161.88µs - http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s - http_req_waiting...............: avg=4.08s min=151.81ms med=2.45s max=12.32s p(90)=9.63s p(95)=10.26s - http_reqs......................: 7341 19.230625/s - input_tokens...................: 4990576 13073.409349/s - iteration_duration.............: avg=29.98s min=2.37s med=20.29s max=2m53s p(90)=1m1s p(95)=1m18s - iterations.....................: 1000 2.619619/s - new_tokens.....................: 68218 178.705191/s - time_per_token.................: avg=469.34ms min=44.2ms med=257.72ms max=3.86s p(90)=1s p(95)=1.1s - tokens.........................: 5058794 13252.11454/s - vus............................: 3 min=0 max=80 - vus_max........................: 80 min=19 max=80 - - -running (06m21.7s), 00/80 VUs, 1000 complete and 0 interrupted iterations -chat ✓ [======================================] 80 VUs 06m21.7s/10m0s 1000/1000 shared iters + http_req_receiving.............: avg=75.62µs min=19.77µs med=71.23µs max=1.99ms p(90)=118.68µs p(95)=138.44µs + http_req_sending...............: avg=135.04µs min=7.79µs med=30.48µs max=15.02ms p(90)=137.44µs p(95)=181.62µs + http_req_tls_handshaking.......: avg=0s min=0s med=0s max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=3.13s min=83.79ms med=2.22s max=10.71s p(90)=6.67s p(95)=7.33s + http_reqs......................: 7341 85.023164/s + input_tokens...................: 4989621 57789.588176/s + iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s + iterations.....................: 1000 11.581959/s + new_tokens.....................: 67718 784.307131/s + time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms + tokens.........................: 5057339 58573.895307/s + vus............................: 1 min=0 max=320 + vus_max........................: 320 min=21 max=320 + + +running (01m26.3s), 000/320 VUs, 1000 complete and 0 interrupted iterations +chat ✓ [======================================] 320 VUs 
01m26.3s/10m0s 1000/1000 shared iters
```

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
index 872c9706..6660c1ec 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml
@@ -6,7 +6,7 @@ spec:
   restartPolicy: Never
   containers:
   - name: bench
-    image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
+    image: $IMG
     command: ["sleep", "infinity"]
     resources:
       requests:

From 92ac45fd0c619dec8f96143a28be075d35471e1d Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Tue, 31 Dec 2024 19:22:36 -0800
Subject: [PATCH 3/5] update results

---
 .../README.md | 30 +++++++++++++++----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
index bc9fede1..814ea466 100644
--- a/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
+++ b/benchmarks/chat/scenarios/least-load-vs-prefix-hash-70b-8r/README.md
# Prefix Hash Benchmark - Llama 3.1 70B with 8 replicas

Under specific conditions:

* Set `max_tokens` to 10 to understand the performance impact when significant time is spent on input.
* Chat threads with decently long user messages

Summary of how Prefix Hashing affects performance:
* `11%` decrease in average time per token: `405.39ms (LeastLoad) --> 361.07ms (PrefixHash)`
* input_tokens: `51846.973437/s (LeastLoad) --> 57789.588176/s (PrefixHash)`. An increase of `11%` in throughput of input tokens.

Least Load results:
```
 input_tokens...................: 4990249 51846.973437/s
 iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s
 iterations.....................: 1000 10.389657/s
 new_tokens.....................: 67790 704.314821/s
 time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms
```

Prefix Hashing results:
```
 input_tokens...................: 4989621 57789.588176/s
 iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s
 iterations.....................: 1000 11.581959/s
 new_tokens.....................: 67718 784.307131/s
 time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms
```
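The percentages above follow from the quoted averages; a quick sanity check (plain `awk`, not part of the benchmark itself):

```bash
awk 'BEGIN {
  ll = 405.39; ph = 361.07                     # time_per_token avg in ms (LeastLoad 2nd run vs PrefixHash)
  printf "time_per_token decrease: %.1f%%\n", (ll - ph) / ll * 100
  ll_in = 51846.973437; ph_in = 57789.588176   # input_tokens per second
  printf "input token throughput increase: %.1f%%\n", (ph_in - ll_in) / ll_in * 100
}'
# => time_per_token decrease: 10.9%
# => input token throughput increase: 11.5%
```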
-`712.11ms (LeastLoad) --> 469.34ms (PrefixHash)` +Least Load results: +``` + input_tokens...................: 4990249 51846.973437/s + iteration_duration.............: avg=26.06s min=3.61s med=24.15s max=1m25s p(90)=39.9s p(95)=48.14s + iterations.....................: 1000 10.389657/s + new_tokens.....................: 67790 704.314821/s + time_per_token.................: avg=405.39ms min=34.22ms med=384.49ms max=2.2s p(90)=650.92ms p(95)=749.72ms +``` + + +Prefix Hashing results: +``` + input_tokens...................: 4989621 57789.588176/s + iteration_duration.............: avg=23.03s min=1.71s med=22.05s max=1m20s p(90)=41.36s p(95)=49.67s + iterations.....................: 1000 11.581959/s + new_tokens.....................: 67718 784.307131/s + time_per_token.................: avg=361.07ms min=35.86ms med=235.35ms max=2.78s p(90)=723.57ms p(95)=827ms +``` ## Steps taken @@ -38,7 +56,7 @@ kubectl exec -it chat-benchmark -- SCENARIO=$SCENARIO make run ``` -## Benchmark Output +## Benchmark Outputs ### LeastLoad - single replica From d46bf77b6498254d875b9e1de350a4c1f4871988 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Tue, 31 Dec 2024 19:51:29 -0800 Subject: [PATCH 4/5] add model for 70b on single h100 --- charts/models/values.yaml | 12 ++++++++++++ .../llama-3.1-70b-instruct-fp8-1xh100.yaml | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml diff --git a/charts/models/values.yaml b/charts/models/values.yaml index 7d7f2714..45e9d73e 100644 --- a/charts/models/values.yaml +++ b/charts/models/values.yaml @@ -139,6 +139,18 @@ catalog: - --disable-log-requests resourceProfile: nvidia-gpu-h100:2 targetRequests: 500 + llama-3.1-70b-instruct-fp8-1xh100: + features: [TextGeneration] + url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 + engine: VLLM + args: + - --enable-prefix-caching + - --max-model-len=16384 + - --max-num-batched-token=16384 + - --gpu-memory-utilization=0.95 + - --disable-log-requests + - --kv-cache-dtype=fp8 + resourceProfile: nvidia-gpu-h100:1 llama-3.1-70b-instruct-fp8-l4: enabled: false features: [TextGeneration] diff --git a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml b/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml new file mode 100644 index 00000000..55006519 --- /dev/null +++ b/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml @@ -0,0 +1,17 @@ +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: llama-3.1-70b-instruct-fp8-1xh100 +spec: + features: [TextGeneration] + url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 + engine: VLLM + args: + - --enable-prefix-caching + - --max-model-len=16384 + - --max-num-batched-token=16384 + - --gpu-memory-utilization=0.95 + - --disable-log-requests + - --kv-cache-dtype=fp8 + resourceProfile: nvidia-gpu-h100:1 From 55ebdef7062f4d1554d563f6023d2cf06a2f95a5 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 3 Jan 2025 14:18:38 -0800 Subject: [PATCH 5/5] rename --- charts/models/values.yaml | 2 +- ...t-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename manifests/models/{llama-3.1-70b-instruct-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} (90%) diff --git a/charts/models/values.yaml b/charts/models/values.yaml index 45e9d73e..f98255ca 100644 --- a/charts/models/values.yaml +++ b/charts/models/values.yaml @@ -139,7 +139,7 @@ catalog: - --disable-log-requests resourceProfile: 

From 55ebdef7062f4d1554d563f6023d2cf06a2f95a5 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Fri, 3 Jan 2025 14:18:38 -0800
Subject: [PATCH 5/5] rename

---
 charts/models/values.yaml | 2 +-
 ...t-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename manifests/models/{llama-3.1-70b-instruct-fp8-1xh100.yaml => llama-3.1-70b-instruct-fp8-1-h100.yaml} (90%)

diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 45e9d73e..f98255ca 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
      - --disable-log-requests
    resourceProfile: nvidia-gpu-h100:2
    targetRequests: 500
-  llama-3.1-70b-instruct-fp8-1xh100:
+  llama-3.1-70b-instruct-fp8-1-h100:
    features: [TextGeneration]
    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
    engine: VLLM
    args:
diff --git a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml b/manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
similarity index 90%
rename from manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
rename to manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
index 55006519..f21d2ea8 100644
--- a/manifests/models/llama-3.1-70b-instruct-fp8-1xh100.yaml
+++ b/manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
 apiVersion: kubeai.org/v1
 kind: Model
 metadata:
-  name: llama-3.1-70b-instruct-fp8-1xh100
+  name: llama-3.1-70b-instruct-fp8-1-h100
 spec:
   features: [TextGeneration]
   url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
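After the rename, the manifest can be smoke-tested directly; a sketch, assuming KubeAI and its `Model` CRD are installed and a node matching the `nvidia-gpu-h100` resource profile is available:

```bash
# Apply the renamed manifest and confirm the Model resource exists.
kubectl apply -f manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml
kubectl get model llama-3.1-70b-instruct-fp8-1-h100

# The same load balancing switch used in the benchmark above can then be applied.
kubectl patch model llama-3.1-70b-instruct-fp8-1-h100 --type='merge' \
  -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
```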