vLLM benchmark
v1
1. Exec into the container
# Open an interactive bash shell inside the vLLM pod in namespace ns-vllm.
# The inner `kubectl get` resolves the pod by its app.kubernetes.io/instance=vllm label;
# items[0] pins the lookup to a single pod name so exactly one pod argument is passed
# even when several pods match the selector, and the quotes keep it one word.
kubectl exec -n ns-vllm -it \
  "pods/$(kubectl get pods -n ns-vllm -l app.kubernetes.io/instance=vllm -o jsonpath='{.items[0].metadata.name}')" \
  -- /bin/bash
2. Run suite
- Completion presets
- Embedding presets
Setup (completion presets; writes bench-configs.yaml)
#!/bin/bash
# curl -L 'https://github.com/mixa3607/mixa3607.github.com/raw/refs/heads/master/docs/wiki/AMD_GFX906/vllm/benchmark/bench.compreltion-presets.sh' | bash
#
# Writes the completion benchmark presets to ./bench-configs.yaml
# (overwriting any existing file of that name).
#
# File layout:
#   baseArgs   - arguments appended to every benchmark invocation
#   benchmarks - list of { workload, args } entries, run in order
#
# Workloads:
#   tg - random dataset, input-len 0, 256 output tokens, with a random prefix
#        of 16 / 4096 / 16384 / 32768 / 65536 tokens at concurrency 1..32
#   pp - random dataset, 1 output token, input-len 1024..65536 at concurrency 1
#
# The YAML is emitted from a quoted heredoc so nothing inside it is expanded
# by the shell, and it is indented so that the folded scalars (`>`) and the
# list entries nest correctly when parsed.
set -euo pipefail

cat > bench-configs.yaml <<'EOF'
baseArgs: >
  --ignore-eos --num-warmups 1
benchmarks:
  ########### TG
  ## prefix-len 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16 --random-output-len 256
      --num-prompts 10 --max-concurrency=1
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16 --random-output-len 256
      --num-prompts=30 --max-concurrency=4
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16 --random-output-len 256
      --num-prompts 60 --max-concurrency 8
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16 --random-output-len 256
      --num-prompts 90 --max-concurrency 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16 --random-output-len 256
      --num-prompts 120 --max-concurrency 32
  ## prefix-len 4096
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 4096 --random-output-len 256
      --num-prompts 10 --max-concurrency=1
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 4096 --random-output-len 256
      --num-prompts=30 --max-concurrency=4
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 4096 --random-output-len 256
      --num-prompts 60 --max-concurrency 8
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 4096 --random-output-len 256
      --num-prompts 90 --max-concurrency 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 4096 --random-output-len 256
      --num-prompts 120 --max-concurrency 32
  ## prefix-len 16384
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16384 --random-output-len 256
      --num-prompts 10 --max-concurrency=1
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16384 --random-output-len 256
      --num-prompts=30 --max-concurrency=4
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16384 --random-output-len 256
      --num-prompts 60 --max-concurrency 8
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16384 --random-output-len 256
      --num-prompts 90 --max-concurrency 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 16384 --random-output-len 256
      --num-prompts 120 --max-concurrency 32
  ## prefix-len 32768
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 32768 --random-output-len 256
      --num-prompts 10 --max-concurrency=1
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 32768 --random-output-len 256
      --num-prompts=30 --max-concurrency=4
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 32768 --random-output-len 256
      --num-prompts 60 --max-concurrency 8
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 32768 --random-output-len 256
      --num-prompts 90 --max-concurrency 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 32768 --random-output-len 256
      --num-prompts 120 --max-concurrency 32
  ## prefix-len 65536
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 65536 --random-output-len 256
      --num-prompts 10 --max-concurrency=1
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 65536 --random-output-len 256
      --num-prompts=30 --max-concurrency=4
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 65536 --random-output-len 256
      --num-prompts 60 --max-concurrency 8
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 65536 --random-output-len 256
      --num-prompts 90 --max-concurrency 16
  - workload: tg
    args: >
      --dataset-name random
      --random-input-len 0 --random-prefix-len 65536 --random-output-len 256
      --num-prompts 120 --max-concurrency 32
  ########### PP
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 1024 --random-prefix-len 0 --random-output-len 1
      --num-prompts 30 --max-concurrency 1
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 4096 --random-prefix-len 0 --random-output-len 1
      --num-prompts 20 --max-concurrency 1
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 8192 --random-prefix-len 0 --random-output-len 1
      --num-prompts 10 --max-concurrency 1
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 16384 --random-prefix-len 0 --random-output-len 1
      --num-prompts 8 --max-concurrency 1
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 32768 --random-prefix-len 0 --random-output-len 1
      --num-prompts=6 --max-concurrency 1
  - workload: pp
    args: >
      --dataset-name random
      --random-input-len 65536 --random-prefix-len 0 --random-output-len 1
      --num-prompts 4 --max-concurrency 1
EOF
Setup (embedding presets; writes bench-configs.yaml)
#!/bin/bash
# curl -L 'https://github.com/mixa3607/mixa3607.github.com/raw/refs/heads/master/docs/wiki/AMD_GFX906/vllm/benchmark/bench.embedding-presets.sh' | bash
#
# Writes the embedding benchmark presets to ./bench-configs.yaml
# (overwriting any existing file of that name).
#
# File layout:
#   baseArgs   - arguments appended to every benchmark invocation
#   benchmarks - list of { workload, args } entries, run in order
#
# Two groups of "embed" workloads on the random dataset: --random-input 2048
# and --random-input 256, each at concurrency 1, 4, 8, 16 and 32.
#
# The YAML is emitted from a quoted heredoc so nothing inside it is expanded
# by the shell, and it is indented so that the folded scalars (`>`) and the
# list entries nest correctly when parsed.
set -euo pipefail

cat > bench-configs.yaml <<'EOF'
baseArgs: >
  --num-warmups 4
benchmarks:
  ########### EMBED 2048
  - workload: embed
    args: >
      --dataset-name random
      --random-input 2048
      --num-prompts 32 --max-concurrency 1
  - workload: embed
    args: >
      --dataset-name random
      --random-input 2048
      --num-prompts 64 --max-concurrency 4
  - workload: embed
    args: >
      --dataset-name random
      --random-input 2048
      --num-prompts 128 --max-concurrency 8
  - workload: embed
    args: >
      --dataset-name random
      --random-input 2048
      --num-prompts 256 --max-concurrency 16
  - workload: embed
    args: >
      --dataset-name random
      --random-input 2048
      --num-prompts 384 --max-concurrency 32
  ########### EMBED 256
  - workload: embed
    args: >
      --dataset-name random
      --random-input 256
      --num-prompts 64 --max-concurrency 1
  - workload: embed
    args: >
      --dataset-name random
      --random-input 256
      --num-prompts 128 --max-concurrency 4
  - workload: embed
    args: >
      --dataset-name random
      --random-input 256
      --num-prompts 256 --max-concurrency 8
  - workload: embed
    args: >
      --dataset-name random
      --random-input 256
      --num-prompts 384 --max-concurrency 16
  - workload: embed
    args: >
      --dataset-name random
      --random-input 256
      --num-prompts 512 --max-concurrency 32
EOF
Run
#!/bin/bash
# curl -L 'https://github.com/mixa3607/mixa3607.github.com/raw/refs/heads/master/docs/wiki/AMD_GFX906/vllm/benchmark/bench.run.sh' | bash
#
# Runs every benchmark listed in ./bench-configs.yaml against a running vLLM
# server, one `vllm bench serve` invocation per entry.
#
# Environment:
#   VLLM_INST_HOST  server host (default 127.0.0.1)
#   VLLM_INST_PORT  server port (default 8000)
#   IMAGE_NAME      optional; recorded as metadata.image in each result
#
# Requires curl, yq and the vllm CLI on PATH.

# Make `producer | yq` pipelines report a failure of either side.
set -o pipefail

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

[[ -r bench-configs.yaml ]] \
  || die "bench-configs.yaml not found; run one of the preset setup scripts first"

# Assign first and export afterwards: `export VAR="$(cmd)"` would hide a
# failing cmd behind the exit status of export.
CONFIGS="$(cat bench-configs.yaml)" || die "cannot read bench-configs.yaml"
export CONFIGS
export VLLM_INST_HOST="${VLLM_INST_HOST:-127.0.0.1}"
export VLLM_INST_PORT="${VLLM_INST_PORT:-8000}"

# -f: treat HTTP errors as failures instead of parsing the error body as a
# model list; -sS: no progress meter, but still print real errors.
MODELS="$(curl -fsS "http://${VLLM_INST_HOST}:${VLLM_INST_PORT}/v1/models")" \
  || die "cannot query http://${VLLM_INST_HOST}:${VLLM_INST_PORT}/v1/models"
export MODELS

# The first model reported by the server is the one that gets benchmarked.
MODEL_ROOT="$(printf '%s\n' "$MODELS" | yq '.data[0].root' -r)" \
  || die "cannot read .data[0].root from the model list"
MODEL_NAME="$(printf '%s\n' "$MODELS" | yq '.data[0].id' -r)" \
  || die "cannot read .data[0].id from the model list"
export MODEL_ROOT MODEL_NAME

# Count the entries once instead of re-running yq on every loop iteration.
benchmark_count="$(printf '%s\n' "$CONFIGS" | yq '.benchmarks | length')" \
  || die "cannot parse bench-configs.yaml"
[[ "$benchmark_count" =~ ^[0-9]+$ ]] \
  || die "unexpected benchmark count: '$benchmark_count'"

for (( i = 0; i < benchmark_count; i++ )); do
  BENCHMARK_NAME="$(printf '%s\n' "$CONFIGS" | yq ".benchmarks[$i].workload" -r)"
  # Per-benchmark args followed by the shared baseArgs, flattened to one line.
  BENCHMARK_ARGS="$(printf '%s\n' "$CONFIGS" | yq ".benchmarks[$i].args + \" \" + .baseArgs" -r | tr '\n' ' ')"
  # Split on whitespace into an array: same word splitting as the unquoted
  # expansion, but without pathname expansion of the arguments.
  read -r -a bench_args <<<"$BENCHMARK_ARGS"

  echo "Run $BENCHMARK_NAME ($BENCHMARK_ARGS)"
  # --port keeps the benchmark on the same endpoint the model list came from.
  # A failed benchmark is reported and the remaining ones still run.
  vllm bench serve "${bench_args[@]}" \
    --host "$VLLM_INST_HOST" --port "$VLLM_INST_PORT" \
    --model "$MODEL_ROOT" --served-model-name "$MODEL_NAME" \
    --save-detailed --save-result \
    --metadata metadata.workload="$BENCHMARK_NAME" metadata.image="${IMAGE_NAME:-}" \
    || printf 'warning: benchmark %s (#%d) failed\n' "$BENCHMARK_NAME" "$i" >&2
  echo ""
done
3. Copy results to local fs (optional)
# Stream the result *.json files from /app/vllm/ in the pod as a gzip-compressed
# tar archive over stdout and unpack them into ./results/ on the local machine.
# tar -C needs the target directory to exist, so create it first.
mkdir -p results
# items[0] pins the lookup to a single pod name so exactly one pod argument is
# passed even when several pods match the selector.
kubectl exec -n ns-vllm \
  "pods/$(kubectl get pods -n ns-vllm -l app.kubernetes.io/instance=vllm -o jsonpath='{.items[0].metadata.name}')" \
  -- bash -c 'cd /app/vllm/ && tar -zcvf - *.json' \
  | tar -zxvf - -C results/
4. Convert results to table (optional)
# Clone the repository and run this from its vllm/benchmark directory:
# https://github.com/mixa3607/ML-gfx906/tree/master/vllm/benchmark
# Runs the ResultsConverter project's gen-table command with ./results/ as input.
dotnet run \
  --project ./ResultsConverter/ResultsConverter/ResultsConverter.csproj \
  -- gen-table -i ./results/