Run llama.cpp

Docker

See the llama.cpp Docker documentation (https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md) and the ROCm vLLM Docker deployment guide (https://github.com/ROCm/vllm/blob/main/docs/deployment/docker.md).

Kubernetes with Helm

values.yaml
# Container image to deploy.
image:
  registry: docker.io
  repository: mixa3607/llama.cpp-gfx906
  tag: full-b8356-rocm-7.2.0

# Optional: shorten the pod termination grace period (seconds).
terminationGracePeriodSeconds: 15

# Uncomment to turn the probes off when the initial model download
# needs more time than the probes allow.
#startupProbe:
#  enabled: false
#livenessProbe:
#  enabled: false
#readinessProbe:
#  enabled: false

# Expose the service through an ingress with TLS.
ingress:
  enabled: true
  hostname: llamacpp.arkprojects.space
  extraTls:
    - secretName: arkprojects.space-cert-prod
      hosts:
        - llamacpp.arkprojects.space

# Annotation-based Prometheus scraping of /metrics on port 8080.
# Values are quoted because annotation values must be strings.
podAnnotations:
  prometheus.io/scrape: 'true'
  prometheus.io/port: '8080'
  prometheus.io/path: /metrics

# Schedule only onto nodes labelled as having an AMD GPU, and tolerate
# the matching NoSchedule taint.
nodeSelector:
  feature.node.kubernetes.io/amd-gpu: 'true'
tolerations:
  - key: feature.node.kubernetes.io/amd-gpu
    operator: Equal
    value: 'true'
    effect: NoSchedule

# CPU / memory sizing: example values, tune for your model.
resources:
  requests:
    cpu: 2
    memory: 8G
  limits:
    memory: 16G

# Persistent volume settings.
persistence:
  enabled: true
  storageClass: truenas-trash4-iscsi
  size: 192G
# GPU access: either mount the host devices directly (as below) OR use the
# Kubernetes AMD GPU operator.
extraVolumeMounts:
  - name: dev-kfd
    mountPath: /dev/kfd
  - name: dev-dri
    mountPath: /dev/dri
  - name: shm
    mountPath: /dev/shm
extraVolumes:
  - name: dev-kfd
    hostPath:
      path: /dev/kfd
  - name: dev-dri
    hostPath:
      path: /dev/dri
  # Memory-backed /dev/shm.
  - name: shm
    emptyDir:
      medium: Memory
      # NOTE(review): memory-backed emptyDir usage counts against the
      # container's memory; 32G is larger than the 16G set in
      # resources.limits.memory. Confirm the sizing is intentional.
      sizeLimit: 32G

# Required when using direct mounts instead of the GPU operator.
containerSecurityContext:
  capabilities:
    add:
      - SYS_PTRACE
  seccompProfile:
    type: Unconfined

# Extra environment variables for the container. llama.cpp reads
# LLAMA_ARG_* variables as equivalents of its command-line arguments.
# Values are quoted because environment variable values must be strings.
extraEnvVars:
  - name: LLAMA_ARG_N_PARALLEL
    value: "1"
  - name: LLAMA_ARG_THREADS_HTTP
    value: "-1"
  # ... and other args

helmfile.yaml
# OCI chart repository that hosts the llamacpp chart.
repositories:
  - name: mixa3607-charts
    url: ghcr.io/mixa3607/charts
    oci: true

# Defaults applied to every release in this helmfile.
helmDefaults:
  createNamespace: true
  cleanupOnFail: true
  atomic: false
  historyMax: 2

releases:
  - name: llama-cpp
    chart: mixa3607-charts/llamacpp
    # Quoted so the chart version is always read as a string.
    version: "0.1.2"
    # NOTE(review): the namespace name mentions vLLM although this release
    # deploys llama.cpp. Confirm it is intentional.
    namespace: ns-vllm
    values:
      # The values.yaml shown above; the path is relative to this helmfile.
      - ./values.yaml

Install chart
helmfile apply

Docker

Kubernetes with Helm

Docker

Kubernetes with Helm