Skip to main content

Perf tuning

Force PCIe speed

# Fetch the PCIe speed helper script into the current directory.
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page under the script's name; chmod only runs when the download succeeded,
# so a failed fetch never leaves an executable non-script behind.
curl -fL https://github.com/corundum/corundum/raw/refs/heads/master/fpga/lib/pcie/scripts/pcie_set_speed.sh -o pcie_set_speed.sh &&
  chmod +x pcie_set_speed.sh

# PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Device 14a0 (rev 01)
# Addresses of the bridge entries matching the description above.
AMDGPU_DEVICES=('17:00.0' '1a:00.0' '31:00.0' '4b:00.0')

# Run the helper once per address, passing 4 as its speed argument.
for bridge_addr in "${AMDGPU_DEVICES[@]}"; do
  sudo ./pcie_set_speed.sh "$bridge_addr" 4
done

Overclock / PowerLimit

upp - https://github.com/sibradzic/upp

# PP parameters
# Writes mi50-oc.yaml.tpl, the template that apply-preset.sh renders with
# envsubst. The heredoc delimiter is quoted so the $..._MAX placeholders are
# stored literally instead of being expanded by this shell.
#
# Section keys (those with an empty value) own the keys indented beneath
# them, two spaces per level; with every key in column 0 there is no way to
# tell where a section such as smcPPTable ends.
# NOTE(review): the nesting below is reconstructed from the order of the
# keys - compare it with the output of `upp dump` for the card before
# writing anything to hardware.
#
# The leading and trailing blank lines are kept so the file has the same
# line count as the one the previous echo-based version produced.
cat > mi50-oc.yaml.tpl <<'EOF'

SmallPowerLimit1: $TDP_MAX
SmallPowerLimit2: $TDP_MAX
BoostPowerLimit: $TDP_MAX
PowerSavingClockTable:
  PowerSavingClockMax:
    PowerSavingClockMax 0: $GPU_MAX
smcPPTable:
  SocketPowerLimitAc0: $TDP_MAX
  SocketPowerLimitDc: $TDP_MAX
  TdcLimitGfx: $TDC_MAX
  FreqTableGfx:
    FreqTableGfx 8: $GPU_MAX
  FreqTableUclk:
    FreqTableUclk 2: $MEM_MAX
    FreqTableUclk 3: $MEM_MAX
  DcModeMaxFreq:
    DcModeMaxFreq 0: $GPU_MAX

EOF

# Stock values for 016.004.000.064.016969
# Writes preset-stock.sh: four exported limits, framed by one blank line
# before and after, then marks the file executable.
printf '%s\n' \
  '' \
  'export MEM_MAX=1000' \
  'export GPU_MAX=1725' \
  'export TDP_MAX=225' \
  'export TDC_MAX=330' \
  '' > preset-stock.sh
chmod +x preset-stock.sh

# Overclock preset; the author reports it worked on all four of their cards.
# Writes preset-oc.sh: four exported limits, framed by one blank line before
# and after, then marks the file executable.
printf '%s\n' \
  '' \
  'export MEM_MAX=1200' \
  'export GPU_MAX=1900' \
  'export TDP_MAX=300' \
  'export TDC_MAX=330' \
  '' > preset-oc.sh
chmod +x preset-oc.sh

# Apply preset script
# Generates apply-preset.sh (usage: apply-preset.sh PRESET PCI_ADDRESS).
# The heredoc delimiter is quoted so the script text is written verbatim.
# Compared with a bare source/envsubst/upp sequence, the generated script
# refuses to touch the device unless both arguments are given, the preset
# defines every template variable, and the pp_table path actually exists.
cat > apply-preset.sh <<'EOF'
#!/bin/sh
# Usage: apply-preset.sh PRESET PCI_ADDRESS
#   PRESET       selects ./preset-PRESET.sh, which must set MEM_MAX,
#                GPU_MAX, TDP_MAX and TDC_MAX
#   PCI_ADDRESS  device directory name under /sys/bus/pci/devices, with or
#                without the leading "0000:"
# Renders mi50-oc.yaml from mi50-oc.yaml.tpl and hands it to
# `upp ... undump -w` for the device's pp_table. Preset and template are
# looked up relative to the current directory.
set -eu

die() {
  printf 'apply-preset: %s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 2 ]; then
  printf 'Usage: %s PRESET PCI_ADDRESS\n' "${0##*/}" >&2
  exit 2
fi

PRESET="./preset-$1.sh"
AMDGPU_DEVICE="$2"

[ -r "$PRESET" ] || die "preset not found: $PRESET"
. "$PRESET"

# A placeholder without a value would be rendered as an empty field, so stop
# here unless the preset defined all four.
: "${MEM_MAX:?not set by $PRESET}"
: "${GPU_MAX:?not set by $PRESET}"
: "${TDP_MAX:?not set by $PRESET}"
: "${TDC_MAX:?not set by $PRESET}"
# envsubst only sees exported variables.
export MEM_MAX GPU_MAX TDP_MAX TDC_MAX

# Accept the address with or without the "0000:" prefix.
if [ ! -e "/sys/bus/pci/devices/$AMDGPU_DEVICE" ]; then
  AMDGPU_DEVICE="0000:$AMDGPU_DEVICE"
fi
PP_TABLE="/sys/bus/pci/devices/$AMDGPU_DEVICE/pp_table"
[ -e "$PP_TABLE" ] || die "no pp_table found at $PP_TABLE"

FILE=mi50-oc.yaml
[ -r "$FILE.tpl" ] || die "template not found: $FILE.tpl"
for tool in envsubst upp; do
  command -v "$tool" >/dev/null 2>&1 || die "$tool not found in PATH"
done

# Substitute only the four preset variables; any other $NAME that appears in
# the template is left untouched.
envsubst '$MEM_MAX $GPU_MAX $TDP_MAX $TDC_MAX' < "$FILE.tpl" > "$FILE"
upp -p "$PP_TABLE" undump -d "$FILE" -w
EOF
chmod +x apply-preset.sh

# Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Vega 20 [Radeon Pro VII/Radeon Instinct MI50 32GB] (rev 01)
# Addresses of the display-controller entries matching the description above.
AMDGPU_DEVICES=('19:00.0' '1c:00.0' '33:00.0' '4d:00.0')

# Apply the "oc" preset to each address in turn.
for gpu_addr in "${AMDGPU_DEVICES[@]}"; do
  sudo ./apply-preset.sh oc "$gpu_addr"
done

How to check for errors?

$ sudo rocm-smi --showrasinfo
========================= ROCm System Management Interface =========================
===================================== RAS Info =====================================

GPU[0]: RAS INFO
Block Status Correctable Error Uncorrectable Error
UMC ENABLED 0 0
SDMA ENABLED 0 0
GFX ENABLED 0 0
MMHUB ENABLED 0 0
ATHUB ENABLED
PCIE_BIF ENABLED 0 0
HDP ENABLED 0 0
XGMI_WAFL DISABLED
DF ENABLED
SMN ENABLED
SEM ENABLED
MP0 ENABLED
MP1 ENABLED
FUSE ENABLED
____________________________________________________________________________________

GPU[1]: RAS INFO
...

Lowering smcPPTable/TdcLimitGfx from 350 to 150 reduced the hotspot temperature by about 10 degrees, with almost no drop in performance in vLLM.

temperatures

Results

MEM_MAX=1000; GPU_MAX=1725; TDP_MAX=225; TDC_MAX=330

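| model | size | params | backend | ngl | n_ubatch | sm | fa | test | t/s |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 | 430.10 ± 0.09 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 | 32.43 ± 0.02 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 @ d16384 | 358.33 ± 13.54 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 @ d16384 | 29.74 ± 1.54 |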

MEM_MAX=1150; GPU_MAX=1850; TDP_MAX=300; TDC_MAX=330

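| model | size | params | backend | ngl | n_ubatch | sm | fa | test | t/s |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 | 457.45 ± 0.12 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 | 34.05 ± 2.00 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 @ d16384 | 389.94 ± 1.68 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 @ d16384 | 30.82 ± 1.05 |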

MEM_MAX=1150; GPU_MAX=1850; TDP_MAX=180; TDC_MAX=330

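| model | size | params | backend | ngl | n_ubatch | sm | fa | test | t/s |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 | 441.30 ± 1.18 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 | 33.85 ± 0.14 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 @ d16384 | 372.79 ± 2.24 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 @ d16384 | 31.97 ± 0.21 |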

MEM_MAX=1150; GPU_MAX=1850; TDP_MAX=140; TDC_MAX=330

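| model | size | params | backend | ngl | n_ubatch | sm | fa | test | t/s |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 | 415.37 ± 0.51 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 | 32.40 ± 0.13 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | pp2048 @ d16384 | 350.42 ± 1.33 |
| gemma4 31B Q8_0 | 30.38 GiB | 30.70 B | ROCm | 99 | 2048 | tensor | 1 | tg256 @ d16384 | 30.71 ± 0.18 |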