
Commit f9591b3

[HUD] [SGLang benchmarking] Adding bigger models for NVIDIA and implementing AMD Support for SGLang using docker (#80)
* try with docker setup * added rocm tests * add runner * fixes * fix * fixes and more tests * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * try diff approach * try diff approach * clean and add llama70b * added gpt-oss and deepseek * fix * fix * fix * fix * fix * fix
1 parent d67c812 commit f9591b3

File tree

5 files changed: 441 additions, 56 deletions


.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 6 additions & 4 deletions
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 
-import os
-import json
 import glob
+import json
 import logging
-from logging import warning
+import os
 from argparse import Action, ArgumentParser, Namespace
-from typing import Any, Dict, Optional, List
+from logging import warning
+from typing import Any, Dict, List, Optional
 
 
 logging.basicConfig(level=logging.INFO)
@@ -45,6 +45,7 @@
     "linux.aws.h100.8": "cuda",
     "linux.dgx.b200": "cuda",
     "linux.dgx.b200.8": "cuda",
+    "linux.rocm.gpu.gfx942.1": "rocm",
     "linux.rocm.gpu.gfx942.2": "rocm",
     "linux.rocm.gpu.gfx942.4": "rocm",
     "linux.rocm.gpu.gfx942.8": "rocm",
@@ -79,6 +80,7 @@
     ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
+        "linux.rocm.gpu.gfx942", # TODO: Fail on ROCm
     ],
     # Run some bigger models on B200 to share the load
     "Qwen/Qwen3-30B-A3B": [

.github/scripts/run-sglang-performance-benchmarks.sh

Lines changed: 128 additions & 9 deletions
@@ -40,6 +40,106 @@ ensure_sharegpt_downloaded() {
   fi
 }
 
+build_vllm_from_source_for_rocm() {
+  echo "Starting vLLM build for ROCm..."
+
+  # Validate ROCm installation
+  if ! command -v rocminfo &> /dev/null; then
+    echo "Error: rocminfo not found. Please ensure ROCm is properly installed."
+    exit 1
+  fi
+
+  if [ ! -d "/opt/rocm" ]; then
+    echo "Error: ROCm installation directory /opt/rocm not found."
+    exit 1
+  fi
+
+  extra_index="${PYTORCH_ROCM_INDEX_URL:-https://download.pytorch.org/whl/rocm6.3}"
+
+  # Tooling & base deps for building
+  uv pip install --upgrade pip
+  uv pip install cmake ninja packaging typing_extensions pybind11 wheel
+
+  # Install ROCm PyTorch that matches the container ROCm
+  uv pip uninstall torch || true
+  uv pip uninstall torchvision || true
+  uv pip uninstall torchaudio || true
+  uv pip install --no-cache-dir --pre torch torchvision torchaudio --index-url "${extra_index}"
+
+  # Install Triton flash attention for ROCm
+  echo "Installing Triton flash attention for ROCm..."
+  uv pip uninstall triton || true
+  if ! git clone https://github.com/OpenAI/triton.git; then
+    echo "Error: Failed to clone Triton repository"
+    exit 1
+  fi
+  cd triton
+  if ! git checkout e5be006; then
+    echo "Error: Failed to checkout Triton commit e5be006"
+    exit 1
+  fi
+  cd python
+  if ! uv pip install .; then
+    echo "Error: Failed to install Triton"
+    exit 1
+  fi
+  cd ../..
+  rm -rf triton
+
+  # Clone vLLM source
+  rm -rf vllm
+  git clone https://github.com/vllm-project/vllm.git
+  cd vllm
+
+  # Build & install AMD SMI
+  uv pip install /opt/rocm/share/amd_smi
+
+  # Install additional dependencies
+  uv pip install --upgrade numba \
+    scipy \
+    huggingface-hub[cli,hf_transfer] \
+    setuptools_scm
+  uv pip install "numpy<2"
+
+  # Install ROCm-specific Python requirements from the repo
+  if [ -f requirements/rocm.txt ]; then
+    uv pip install -r requirements/rocm.txt
+  fi
+
+  # Detect GPU architecture dynamically
+  gpu_arch=$(rocminfo | grep gfx | head -1 | awk '{print $2}' || echo "gfx90a")
+  echo "Detected GPU architecture: $gpu_arch"
+
+  # Set ROCm environment variables
+  export VLLM_TARGET_DEVICE=rocm
+  export PYTORCH_ROCM_ARCH="$gpu_arch"
+  export ROCM_HOME="/opt/rocm"
+  export HIP_PLATFORM="amd"
+  export PATH="$ROCM_HOME/bin:$PATH"
+  export LD_LIBRARY_PATH="$ROCM_HOME/lib:$LD_LIBRARY_PATH"
+
+  # Additional ROCm stability settings
+  export PYTORCH_HIP_ALLOC_CONF="expandable_segments:True"
+  export HIP_VISIBLE_DEVICES="0"
+  export AMD_LOG_LEVEL=1 # Reduce AMD driver logging
+
+  # Build & install vLLM into this venv
+  echo "Building vLLM for ROCm with architecture: $gpu_arch"
+  if ! python3 setup.py develop; then
+    echo "Error: Failed to build vLLM from source"
+    exit 1
+  fi
+
+  # Verify vLLM installation
+  echo "Verifying vLLM installation..."
+  if ! python3 -c "import vllm; print(f'vLLM version: {vllm.__version__}')"; then
+    echo "Error: vLLM installation verification failed"
+    exit 1
+  fi
+
+  echo "vLLM build completed successfully!"
+  cd ..
+}
 
 run_serving_tests() {
   # run serving tests using `sglang.bench_serving` command
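Note on the architecture probe in the function above: rocminfo lists GPU agents with lines such as "Name: gfx942", so the grep/awk pipeline keeps field 2 of the first gfx match and falls back to gfx90a. A standalone sketch of the same probe (the echoed value is illustrative, not taken from this commit):

# Sketch: reproduce the arch probe from build_vllm_from_source_for_rocm on a ROCm host.
gpu_arch=$(rocminfo | grep gfx | head -1 | awk '{print $2}' || echo "gfx90a")
echo "PYTORCH_ROCM_ARCH would be set to: ${gpu_arch}"  # e.g. gfx942 on the gfx942 runners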
@@ -74,12 +174,11 @@ run_serving_tests() {
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
-    # Extract only specific SGLang server parameters
+    # Extract special parameters that need mapping or special handling
     model_path=$(echo "$server_params" | jq -r '.model_path // .model')
-    context_length=$(echo "$server_params" | jq -r '.context_length // 4096')
+    tp=$(echo "$server_params" | jq -r '.tp // .tensor_parallel_size // 1')
 
     # check if there is enough resources to run the test
-    tp=$(echo "$server_params" | jq -r '.tp // 1')
     if [ "$ON_CPU" == "1" ]; then
       if [[ $numa_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -95,13 +194,28 @@ run_serving_tests() {
     # check if server model and client model is aligned
     server_model="$model_path"
     client_model=$(echo "$client_params" | jq -r '.model // .model_path')
-    if [[ $server_model != "$client_model" ]]; then
+    if [[ $server_model != "$client_model" ]] && [[ $server_model != *"gpt-oss"* ]]; then
       echo "Server model and client model must be the same. Skip testcase $test_name."
       continue
     fi
 
-    server_command="python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
-
+    # Remove the special parameters that we'll handle manually
+    server_params_filtered=$(echo "$server_params" | jq 'del(.model, .model_path, .tensor_parallel_size, .tp)')
+
+    # Use the json2args utility to convert the filtered params to command line arguments
+    server_args=$(json2args "$server_params_filtered")
+
+    # Build the server command with manually mapped parameters and auto-parsed ones
+    server_command="python3 -m sglang.launch_server --model-path $model_path --tp $tp $server_args"
+
+    # Model-specific environment variables (command-line flags can be added to JSON directly)
+    if [[ "${DEVICE_NAME:-}" == "rocm" ]]; then
+      # GPT-OSS models on ROCm - set environment variables
+      if [[ "$model_path" == *"gpt-oss"* ]]; then
+        echo "Detected GPT-OSS model on ROCm, setting compatibility environment variables"
+        export SGLANG_USE_AITER=0
+      fi
+    fi
     # run the server
     echo "Running test case $test_name"
     echo "Server command: $server_command"
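The json2args helper used above is defined elsewhere in this script and is not part of this diff; the sketch below shows the jq-style conversion it is assumed to perform, turning the filtered server-params JSON into --flag value pairs (the sample keys are illustrative):

# Assumed shape of a json2args-style helper (sketch only):
json2args_sketch() {
  # e.g. {"mem_fraction_static": 0.8, "trust_remote_code": true}
  #  ->  --mem-fraction-static 0.8 --trust-remote-code true
  echo "$1" | jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
}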
@@ -119,14 +233,17 @@
       continue
     fi
 
-    # Create a new uv environment for vllm client (once per test case)
     echo "Creating new uv environment for vllm client..."
     uv venv vllm_client_env
 
-    # Activate the environment and install vllm
     echo "Installing vllm in the new environment..."
     source vllm_client_env/bin/activate
-    pip install vllm
+
+    if [[ "${DEVICE_NAME:-}" == "rocm" ]]; then
+      build_vllm_from_source_for_rocm
+    else
+      uv pip install vllm
+    fi
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -192,6 +309,8 @@ main() {
   check_hf_token
   install_dependencies
 
+  pip install uv
+
   # get the current IP address, required by SGLang bench commands
   export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn off the reporting of the status of each request, to clean up the terminal output
.github/workflows/sglang-benchmark.yml

Lines changed: 74 additions & 28 deletions
@@ -21,7 +21,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100
+        default: h100,b200,rocm
   pull_request:
     paths:
       - .github/workflows/sglang-benchmark.yml
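With the default runner list widened to h100,b200,rocm, a manual dispatch can still target a subset. For example, a ROCm-only run via the GitHub CLI could look like this (a sketch; the runners and models inputs exist in this workflow, the chosen model is just an example):

gh workflow run sglang-benchmark.yml -f runners=rocm -f models=google/gemma-3-4b-it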
@@ -52,7 +52,7 @@ jobs:
        shell: bash
        env:
          MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
        run: |
          set -eux
 
@@ -98,8 +98,6 @@ jobs:
          python-version: '3.12'
          cache: 'pip'
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v6
 
      - name: Check if the device is supported
        shell: bash
@@ -173,33 +171,53 @@ jobs:
        with:
          registry-type: public
 
-      - name: Install SGLang
-        working-directory: sglang-benchmarks
+      - name: Select SGLang Docker image
+        working-directory: sglang-benchmarks/sglang
        shell: bash
        run: |
          set -eux
-          uv venv sgl_server_env
 
-          # Install SGLang from source
-          uv pip install -p sgl_server_env -e "$(pwd)/sglang/python[all]" boto3 psutil gitpython sentencepiece
+          # Determine image suffix based on device
+          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+            if [[ "${DEVICE_TYPE}" == *"B200"* ]]; then
+              IMAGE_SUFFIX="-cu128-b200"
+            else
+              IMAGE_SUFFIX=""
+            fi
+          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            IMAGE_SUFFIX="-rocm630-mi30x"
+          else
+            echo "SGLang benchmarks require either CUDA or ROCm devices."
+            exit 1
+          fi
 
-          # Verify installations
-          echo "$(pwd)/sgl_server_env/bin" >> $GITHUB_PATH
+          # Find the newest tag with available Docker image
+          SELECTED_TAG=""
+          for tag in $(git for-each-ref --sort=-creatordate --format '%(refname:short)' refs/tags); do
+            candidate_image="lmsysorg/sglang:${tag}${IMAGE_SUFFIX}"
+            echo "Checking: $candidate_image"
+
+            if docker manifest inspect "$candidate_image" >/dev/null 2>&1; then
+              SELECTED_TAG="$tag"
+              DOCKER_IMAGE="$candidate_image"
+              HEAD_SHA=$(git rev-list -n 1 "$tag")
+              echo "Found available image: $candidate_image"
+              break
+            fi
+          done
+
+          # Fallback to latest if no tagged version found
+          if [[ -z "$SELECTED_TAG" ]]; then
+            echo "No tagged images found, using latest"
+            DOCKER_IMAGE="lmsysorg/sglang:latest${IMAGE_SUFFIX}"
+            HEAD_SHA=$(git rev-parse HEAD)
+            SELECTED_TAG="latest"
+          fi
 
-      - name: Install NVCC #TODO: Use docker image (nvidia/cuda:12.8.1-devel-ubuntu22.04) instead of locally specifying the variables
-        if: env.DEVICE_NAME == 'cuda'
-        shell: bash
-        run: |
-          set -eux
-          sudo apt-get update
-          sudo apt-get install -y wget gnupg
-          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends cuda-toolkit-12-8
-          sudo ln -s /usr/local/cuda-12.8 /usr/local/cuda || true
-          echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
-          echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
+          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> "$GITHUB_ENV"
+          echo "HEAD_SHA=$HEAD_SHA" >> "$GITHUB_ENV"
+          echo "LATEST_TAG=$SELECTED_TAG" >> "$GITHUB_ENV"
+          echo "Using: $DOCKER_IMAGE (tag: $SELECTED_TAG)"
 
      - name: Setup benchmark tests
        env:
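The tag scan above depends on docker manifest inspect, which queries the registry without pulling the image and exits non-zero when a tag/suffix combination is not published. A minimal standalone check of the same kind (image tag chosen for illustration):

# Sketch: probe the registry for a ROCm-suffixed SGLang image without pulling it.
if docker manifest inspect "lmsysorg/sglang:latest-rocm630-mi30x" >/dev/null 2>&1; then
  echo "image is available"
else
  echo "image not published for this suffix"
fi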
@@ -222,13 +240,39 @@ jobs:
          find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
      - name: Run SGLang benchmark
-        working-directory: sglang-benchmarks/benchmarks
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
        run: |
          set -eux
-          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e HF_TOKEN \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=32g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+
+          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
+
+      - name: Authenticate with AWS
+        # AWS CUDA runners already have access to the bucket via its runner IAM role
+        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
 
      - name: Upload the benchmark results
        if: always()
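GPU_FLAG is expected to be set by earlier runner-setup steps and is not defined in this diff. A hedged sketch of the values it commonly expands to (an assumption, not taken from this commit):

# Sketch only: typical Docker GPU flags; the real value comes from outside this step.
if [[ "${DEVICE_NAME}" == "rocm" ]]; then
  GPU_FLAG="--device=/dev/kfd --device=/dev/dri"   # ROCm device nodes
else
  GPU_FLAG="--gpus all"                            # NVIDIA Container Toolkit
fi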
@@ -258,9 +302,11 @@ jobs:
          fi
 
          python3 .github/scripts/upload_benchmark_results.py \
-            --repo sglang-benchmarks/sglang \
+            --repo-name sgl-project/sglang \
            --benchmark-name "SGLang benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
+            --head-sha "${HEAD_SHA}" \
+            --head-branch main \
            --device-name "${DEVICE_NAME}" \
            --device-type "${SANITIZED_DEVICE_TYPE}" \
            --model "${SANITIZED_MODELS}"