
Commit f9591b3

[HUD] [SGLang benchmarking] Adding bigger models for NVIDIA and implementing AMD Support for SGLang using docker (#80)
* try with docker setup * added rocm tests * add runner * fixes * fix * fixes and more tests * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * try diff approach * try diff approach * clean and add llama70b * added gpt-oss and deepseek * fix * fix * fix * fix * fix * fix
1 parent d67c812 commit f9591b3

File tree

5 files changed: 441 additions, 56 deletions


.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 6 additions & 4 deletions
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 
-import os
-import json
 import glob
+import json
 import logging
-from logging import warning
+import os
 from argparse import Action, ArgumentParser, Namespace
-from typing import Any, Dict, Optional, List
+from logging import warning
+from typing import Any, Dict, List, Optional
 
 
 logging.basicConfig(level=logging.INFO)
@@ -45,6 +45,7 @@
     "linux.aws.h100.8": "cuda",
     "linux.dgx.b200": "cuda",
     "linux.dgx.b200.8": "cuda",
+    "linux.rocm.gpu.gfx942.1": "rocm",
     "linux.rocm.gpu.gfx942.2": "rocm",
     "linux.rocm.gpu.gfx942.4": "rocm",
     "linux.rocm.gpu.gfx942.8": "rocm",
@@ -79,6 +80,7 @@
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
+        "linux.rocm.gpu.gfx942", # TODO: Fail on ROCm
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [

.github/scripts/run-sglang-performance-benchmarks.sh

Lines changed: 128 additions & 9 deletions
@@ -40,6 +40,106 @@ ensure_sharegpt_downloaded() {
   fi
 }
 
+build_vllm_from_source_for_rocm() {
+  echo "Starting vLLM build for ROCm..."
+
+  # Validate ROCm installation
+  if ! command -v rocminfo &> /dev/null; then
+    echo "Error: rocminfo not found. Please ensure ROCm is properly installed."
+    exit 1
+  fi
+
+  if [ ! -d "/opt/rocm" ]; then
+    echo "Error: ROCm installation directory /opt/rocm not found."
+    exit 1
+  fi
+
+  extra_index="${PYTORCH_ROCM_INDEX_URL:-https://download.pytorch.org/whl/rocm6.3}"
+
+  # Tooling & base deps for building
+  uv pip install --upgrade pip
+  uv pip install cmake ninja packaging typing_extensions pybind11 wheel
+
+  # Install ROCm PyTorch that matches the container ROCm
+  uv pip uninstall torch || true
+  uv pip uninstall torchvision || true
+  uv pip uninstall torchaudio || true
+  uv pip install --no-cache-dir --pre torch torchvision torchaudio --index-url "${extra_index}"
+
+  # Install Triton flash attention for ROCm
+  echo "Installing Triton flash attention for ROCm..."
+  uv pip uninstall triton || true
+  if ! git clone https://github.com/OpenAI/triton.git; then
+    echo "Error: Failed to clone Triton repository"
+    exit 1
+  fi
+  cd triton
+  if ! git checkout e5be006; then
+    echo "Error: Failed to checkout Triton commit e5be006"
+    exit 1
+  fi
+  cd python
+  if ! uv pip install .; then
+    echo "Error: Failed to install Triton"
+    exit 1
+  fi
+  cd ../..
+  rm -rf triton
+
+  # Clone vLLM source
+  rm -rf vllm
+  git clone https://github.com/vllm-project/vllm.git
+  cd vllm
+
+  # Build & install AMD SMI
+  uv pip install /opt/rocm/share/amd_smi
+
+  # Install additional dependencies
+  uv pip install --upgrade numba \
+    scipy \
+    huggingface-hub[cli,hf_transfer] \
+    setuptools_scm
+  uv pip install "numpy<2"
+
+  # Install ROCm-specific Python requirements from the repo
+  if [ -f requirements/rocm.txt ]; then
+    uv pip install -r requirements/rocm.txt
+  fi
+
+  # Detect GPU architecture dynamically
+  gpu_arch=$(rocminfo | grep gfx | head -1 | awk '{print $2}' || echo "gfx90a")
+  echo "Detected GPU architecture: $gpu_arch"
+
+  # Set ROCm environment variables
+  export VLLM_TARGET_DEVICE=rocm
+  export PYTORCH_ROCM_ARCH="$gpu_arch"
+  export ROCM_HOME="/opt/rocm"
+  export HIP_PLATFORM="amd"
+  export PATH="$ROCM_HOME/bin:$PATH"
+  export LD_LIBRARY_PATH="$ROCM_HOME/lib:$LD_LIBRARY_PATH"
+
+  # Additional ROCm stability settings
+  export PYTORCH_HIP_ALLOC_CONF="expandable_segments:True"
+  export HIP_VISIBLE_DEVICES="0"
+  export AMD_LOG_LEVEL=1 # Reduce AMD driver logging
+
+  # Build & install vLLM into this venv
+  echo "Building vLLM for ROCm with architecture: $gpu_arch"
+  if ! python3 setup.py develop; then
+    echo "Error: Failed to build vLLM from source"
+    exit 1
+  fi
+
+  # Verify vLLM installation
+  echo "Verifying vLLM installation..."
+  if ! python3 -c "import vllm; print(f'vLLM version: {vllm.__version__}')"; then
+    echo "Error: vLLM installation verification failed"
+    exit 1
+  fi
+
+  echo "vLLM build completed successfully!"
+  cd ..
+}
 
 run_serving_tests() {
   # run serving tests using `sglang.bench_serving` command
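Note on the architecture probe in the function above: rocminfo lists GPU agents with lines such as "Name: gfx942", so the grep/awk pipeline keeps field 2 of the first gfx match and falls back to gfx90a. A standalone sketch of the same probe (the echoed value is illustrative, not taken from this commit):

# Sketch: reproduce the arch probe from build_vllm_from_source_for_rocm on a ROCm host.
gpu_arch=$(rocminfo | grep gfx | head -1 | awk '{print $2}' || echo "gfx90a")
echo "PYTORCH_ROCM_ARCH would be set to: ${gpu_arch}"  # e.g. gfx942 on the gfx942 runners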
@@ -74,12 +174,11 @@ run_serving_tests() {
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
-    # Extract only specific SGLang server parameters
+    # Extract special parameters that need mapping or special handling
     model_path=$(echo "$server_params" | jq -r '.model_path // .model')
-    context_length=$(echo "$server_params" | jq -r '.context_length // 4096')
+    tp=$(echo "$server_params" | jq -r '.tp // .tensor_parallel_size // 1')
 
     # check if there is enough resources to run the test
-    tp=$(echo "$server_params" | jq -r '.tp // 1')
     if [ "$ON_CPU" == "1" ]; then
       if [[ $numa_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -95,13 +194,28 @@ run_serving_tests() {
     # check if server model and client model is aligned
     server_model="$model_path"
     client_model=$(echo "$client_params" | jq -r '.model // .model_path')
-    if [[ $server_model != "$client_model" ]]; then
+    if [[ $server_model != "$client_model" ]] && [[ $server_model != *"gpt-oss"* ]]; then
       echo "Server model and client model must be the same. Skip testcase $test_name."
       continue
     fi
 
-    server_command="python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
-
+    # Remove the special parameters that we'll handle manually
+    server_params_filtered=$(echo "$server_params" | jq 'del(.model, .model_path, .tensor_parallel_size, .tp)')
+
+    # Use the json2args utility to convert the filtered params to command line arguments
+    server_args=$(json2args "$server_params_filtered")
+
+    # Build the server command with manually mapped parameters and auto-parsed ones
+    server_command="python3 -m sglang.launch_server --model-path $model_path --tp $tp $server_args"
+
+    # Model-specific environment variables (command-line flags can be added to JSON directly)
+    if [[ "${DEVICE_NAME:-}" == "rocm" ]]; then
+      # GPT-OSS models on ROCm - set environment variables
+      if [[ "$model_path" == *"gpt-oss"* ]]; then
+        echo "Detected GPT-OSS model on ROCm, setting compatibility environment variables"
+        export SGLANG_USE_AITER=0
+      fi
+    fi
     # run the server
     echo "Running test case $test_name"
     echo "Server command: $server_command"
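The json2args helper used above is defined elsewhere in this script and is not part of this diff; the sketch below shows the jq-style conversion it is assumed to perform, turning the filtered server-params JSON into --flag value pairs (the sample keys are illustrative):

# Assumed shape of a json2args-style helper (sketch only):
json2args_sketch() {
  # e.g. {"mem_fraction_static": 0.8, "trust_remote_code": true}
  #  ->  --mem-fraction-static 0.8 --trust-remote-code true
  echo "$1" | jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
}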
@@ -119,14 +233,17 @@
       continue
     fi
 
-    # Create a new uv environment for vllm client (once per test case)
     echo "Creating new uv environment for vllm client..."
     uv venv vllm_client_env
 
-    # Activate the environment and install vllm
     echo "Installing vllm in the new environment..."
     source vllm_client_env/bin/activate
-    pip install vllm
+
+    if [[ "${DEVICE_NAME:-}" == "rocm" ]]; then
+      build_vllm_from_source_for_rocm
+    else
+      uv pip install vllm
+    fi
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -192,6 +309,8 @@ main() {
   check_hf_token
   install_dependencies
 
+  pip install uv
+
   # get the current IP address, required by SGLang bench commands
   export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn off the reporting of the status of each request, to clean up the terminal output
.github/workflows/sglang-benchmark.yml

Lines changed: 74 additions & 28 deletions
@@ -21,7 +21,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100
+        default: h100,b200,rocm
   pull_request:
     paths:
       - .github/workflows/sglang-benchmark.yml
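With the default runner list widened to h100,b200,rocm, a manual dispatch can still target a subset. For example, a ROCm-only run via the GitHub CLI could look like this (a sketch; the runners and models inputs exist in this workflow, the chosen model is just an example):

gh workflow run sglang-benchmark.yml -f runners=rocm -f models=google/gemma-3-4b-it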
@@ -52,7 +52,7 @@ jobs:
        shell: bash
        env:
          MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
        run: |
          set -eux
 
@@ -98,8 +98,6 @@ jobs:
          python-version: '3.12'
          cache: 'pip'
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v6
 
      - name: Check if the device is supported
        shell: bash
@@ -173,33 +171,53 @@ jobs:
        with:
          registry-type: public
 
-      - name: Install SGLang
-        working-directory: sglang-benchmarks
+      - name: Select SGLang Docker image
+        working-directory: sglang-benchmarks/sglang
        shell: bash
        run: |
          set -eux
-          uv venv sgl_server_env
 
-          # Install SGLang from source
-          uv pip install -p sgl_server_env -e "$(pwd)/sglang/python[all]" boto3 psutil gitpython sentencepiece
+          # Determine image suffix based on device
+          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+            if [[ "${DEVICE_TYPE}" == *"B200"* ]]; then
+              IMAGE_SUFFIX="-cu128-b200"
+            else
+              IMAGE_SUFFIX=""
+            fi
+          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            IMAGE_SUFFIX="-rocm630-mi30x"
+          else
+            echo "SGLang benchmarks require either CUDA or ROCm devices."
+            exit 1
+          fi
 
-          # Verify installations
-          echo "$(pwd)/sgl_server_env/bin" >> $GITHUB_PATH
+          # Find the newest tag with available Docker image
+          SELECTED_TAG=""
+          for tag in $(git for-each-ref --sort=-creatordate --format '%(refname:short)' refs/tags); do
+            candidate_image="lmsysorg/sglang:${tag}${IMAGE_SUFFIX}"
+            echo "Checking: $candidate_image"
+
+            if docker manifest inspect "$candidate_image" >/dev/null 2>&1; then
+              SELECTED_TAG="$tag"
+              DOCKER_IMAGE="$candidate_image"
+              HEAD_SHA=$(git rev-list -n 1 "$tag")
+              echo "Found available image: $candidate_image"
+              break
+            fi
+          done
+
+          # Fallback to latest if no tagged version found
+          if [[ -z "$SELECTED_TAG" ]]; then
+            echo "No tagged images found, using latest"
+            DOCKER_IMAGE="lmsysorg/sglang:latest${IMAGE_SUFFIX}"
+            HEAD_SHA=$(git rev-parse HEAD)
+            SELECTED_TAG="latest"
+          fi
 
-      - name: Install NVCC #TODO: Use docker image (nvidia/cuda:12.8.1-devel-ubuntu22.04) instead of locally specifying the variables
-        if: env.DEVICE_NAME == 'cuda'
-        shell: bash
-        run: |
-          set -eux
-          sudo apt-get update
-          sudo apt-get install -y wget gnupg
-          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends cuda-toolkit-12-8
-          sudo ln -s /usr/local/cuda-12.8 /usr/local/cuda || true
-          echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
-          echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
+          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> "$GITHUB_ENV"
+          echo "HEAD_SHA=$HEAD_SHA" >> "$GITHUB_ENV"
+          echo "LATEST_TAG=$SELECTED_TAG" >> "$GITHUB_ENV"
+          echo "Using: $DOCKER_IMAGE (tag: $SELECTED_TAG)"
 
      - name: Setup benchmark tests
        env:
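The tag scan above depends on docker manifest inspect, which queries the registry without pulling the image and exits non-zero when a tag/suffix combination is not published. A minimal standalone check of the same kind (image tag chosen for illustration):

# Sketch: probe the registry for a ROCm-suffixed SGLang image without pulling it.
if docker manifest inspect "lmsysorg/sglang:latest-rocm630-mi30x" >/dev/null 2>&1; then
  echo "image is available"
else
  echo "image not published for this suffix"
fi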
@@ -222,13 +240,39 @@ jobs:
          find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
      - name: Run SGLang benchmark
-        working-directory: sglang-benchmarks/benchmarks
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
        run: |
          set -eux
-          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e HF_TOKEN \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=32g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+
+          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
+
+      - name: Authenticate with AWS
+        # AWS CUDA runners already have access to the bucket via its runner IAM role
+        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
 
      - name: Upload the benchmark results
        if: always()
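GPU_FLAG is expected to be set by earlier runner-setup steps and is not defined in this diff. A hedged sketch of the values it commonly expands to (an assumption, not taken from this commit):

# Sketch only: typical Docker GPU flags; the real value comes from outside this step.
if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  GPU_FLAG="--device=/dev/kfd --device=/dev/dri"   # ROCm device nodes
else
  GPU_FLAG="--gpus all"                            # NVIDIA Container Toolkit
fi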
@@ -258,9 +302,11 @@ jobs:
          fi
 
          python3 .github/scripts/upload_benchmark_results.py \
-            --repo sglang-benchmarks/sglang \
+            --repo-name sgl-project/sglang \
            --benchmark-name "SGLang benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
+            --head-sha "${HEAD_SHA}" \
+            --head-branch main \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${SANITIZED_MODELS}"