Skip to main content

Run llama.cpp

Docker

See the upstream Docker guides: https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md (llama.cpp) and https://github.com/ROCm/vllm/blob/main/docs/deployment/docker.md (ROCm vLLM).

Kubernetes with Helm

values.yaml
# Helm values for the llama.cpp (gfx906 / ROCm) deployment.
# Nesting matters: every indented key below belongs to the mapping or
# list item directly above it.

# container image to run
image:
  registry: docker.io
  repository: mixa3607/llama.cpp-gfx906
  tag: full-b8356-rocm-7.2.0

# limit grace period (optional)
terminationGracePeriodSeconds: 15

# uncomment when you need more time for initial model downloading
#startupProbe:
#  enabled: false
#livenessProbe:
#  enabled: false
#readinessProbe:
#  enabled: false

# enable ingress
ingress:
  enabled: true
  hostname: llamacpp.arkprojects.space
  extraTls:
    - hosts:
        - "llamacpp.arkprojects.space"
      secretName: arkprojects.space-cert-prod

# scrape metrics
# (annotation values must be strings, hence the quotes around "true" and "8080")
podAnnotations:
  prometheus.io/scrape: "true"
  prometheus.io/path: /metrics
  prometheus.io/port: "8080"

# node selector and tolerations
nodeSelector:
  feature.node.kubernetes.io/amd-gpu: "true"
tolerations:
  - key: "feature.node.kubernetes.io/amd-gpu"
    operator: "Equal"
    value: "true"
    effect: "NoSchedule"

# resources ¯\_(ツ)_/¯
resources:
  limits:
    memory: 16G
  requests:
    cpu: 2
    memory: 8G

# persistence
persistence:
  enabled: true
  storageClass: truenas-trash4-iscsi
  size: 192G

# use direct mounts OR k8s amd gpu operator
# each extraVolumeMounts entry refers to the extraVolumes entry with the same name
extraVolumeMounts:
  - name: dev-kfd
    mountPath: /dev/kfd
  - name: dev-dri
    mountPath: /dev/dri
  - name: shm
    mountPath: /dev/shm
extraVolumes:
  - name: dev-kfd
    hostPath:
      path: /dev/kfd
  - name: dev-dri
    hostPath:
      path: /dev/dri
  - name: shm
    emptyDir:
      medium: Memory
      sizeLimit: 32G

# required when using direct mounts instead of the GPU operator
containerSecurityContext:
  capabilities:
    add:
      - SYS_PTRACE
  seccompProfile:
    type: Unconfined

# extra environment variables for the container
# (env values must be strings, hence the quotes around numbers)
extraEnvVars:
  - name: LLAMA_ARG_N_PARALLEL
    value: "1"
  - name: LLAMA_ARG_THREADS_HTTP
    value: "-1"
  # ... and other args

helmfile.yaml
# Helmfile that installs the llamacpp chart from an OCI registry.

# chart repositories; `oci: true` marks the URL as an OCI registry
repositories:
  - url: ghcr.io/mixa3607/charts
    name: mixa3607-charts
    oci: true

# defaults applied to every release below
helmDefaults:
  createNamespace: true
  cleanupOnFail: true
  atomic: false
  historyMax: 2

releases:
  - name: llama-cpp
    # <repository name>/<chart name>
    chart: mixa3607-charts/llamacpp
    version: 0.1.2
    namespace: ns-vllm
    values:
      # NOTE(review): path is relative to this helmfile and uses the `.yml`
      # extension; make sure it matches where the values file is actually
      # saved (the snippet is titled `values.yaml`) — confirm.
      - ./llama.cpp/values.yml
Install chart
helmfile apply