Commit 963053c

Add DeepSeek-V3 and DeepSeek-R1 on B200 (#78)
* Add DeepSeek-V3 and DeepSeek-R1 on B200
* Comment
* Format
* Minor bug
* Fix another bug
* Skip gemma3 on ROCm
* A bit more tweak
* [no ci] Benchmark DeepSeek is a bit slow

Signed-off-by: Huy Do <huydhn@gmail.com>
1 parent 644dd73 commit 963053c

File tree (5 files changed: +155 -6 lines changed)

- .github/scripts/generate_vllm_benchmark_matrix.py
- .github/workflows/vllm-benchmark.yml
- vllm-benchmarks/benchmarks/cuda/latency-tests.json
- vllm-benchmarks/benchmarks/cuda/serving-tests.json
- vllm-benchmarks/benchmarks/cuda/throughput-tests.json

.github/scripts/generate_vllm_benchmark_matrix.py
Lines changed: 66 additions & 3 deletions

@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -29,8 +29,6 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
-        # TODO (huydhn): Enable this when Intel's runners are ready
-        # "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -62,6 +60,65 @@
     ]
 )
 
+# Model and runner skip logic, for example, just need to run DeepSeek on b200
+# and not h100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# Figure out how to set this in the JSON benchmark configuration instead
+PLATFORM_SKIPS = {
+    # Already been covered in both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ],
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
+    ],
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ],
+    # Deepseek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+}
+# Lower case all the model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}
 
 
 class ValidateDir(Action):
     def __call__(
@@ -198,6 +255,12 @@ def generate_benchmark_matrix(
         if not found_runner and not use_all_runners:
             continue
 
+        # Check the skip logic
+        if model in PLATFORM_SKIPS and any(
+            [r in runner for r in PLATFORM_SKIPS[model]]
+        ):
+            continue
+
         benchmark_matrix["include"].append(
             {
                 "runner": runner,

.github/workflows/vllm-benchmark.yml
Lines changed: 3 additions & 3 deletions

@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux
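
Two effects of this change are worth noting: the new cron expression reduces
the scheduled frequency to minute 0 of every hour divisible by 6 (00:00,
06:00, 12:00, and 18:00 UTC), and dropping the 'h100' fallback for RUNNERS
presumably lets scheduled runs fan out to the matrix script's full runner
list instead of pinning to H100. A one-liner to sanity-check the fire hours:

print([h for h in range(24) if h % 6 == 0])  # -> [0, 6, 12, 18]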

vllm-benchmarks/benchmarks/cuda/latency-tests.json
Lines changed: 22 additions & 0 deletions

@@ -72,5 +72,27 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
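
Each "parameters" block is ultimately flattened into CLI flags for vLLM's
benchmark_latency.py by the benchmark harness. A rough sketch of that
conversion, under the assumption that the runner maps snake_case keys to
--kebab-case flags and treats empty-string values as bare switches:

import json

def to_cli_args(parameters: dict) -> list[str]:
    args = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if value == "":  # bare switch, e.g. "disable_log_stats": ""
            args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args

with open("latency-tests.json") as f:
    for test in json.load(f):
        if "deepseek" in test["test_name"]:
            print(test["test_name"], " ".join(to_cli_args(test["parameters"])))
# latency_deepseek_v3_tp8 --model deepseek-ai/DeepSeek-V3.1
#   --tensor-parallel-size 8 --load-format dummy --num-iters-warmup 5 ...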

vllm-benchmarks/benchmarks/cuda/serving-tests.json
Lines changed: 40 additions & 0 deletions

@@ -451,5 +451,45 @@
       "random_input_len": 5250,
       "random_output_len": 8250
     }
+  },
+  {
+    "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
+  },
+  {
+    "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
   }
 ]
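
The qps_list drives a client-side sweep over request rates, with the JSON
string "inf" conventionally meaning unthrottled submission. A hedged sketch
of how such a sweep might be iterated (run_client is a hypothetical stand-in
for the real benchmark client invocation):

import math

def run_client(model: str, request_rate: float) -> None:
    # One benchmark pass at a fixed request rate
    label = "inf" if math.isinf(request_rate) else int(request_rate)
    print(f"benchmarking {model} at qps={label}")

for qps in [1, 4, 16, "inf"]:
    rate = math.inf if qps == "inf" else float(qps)
    run_client("deepseek-ai/DeepSeek-R1", rate)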

vllm-benchmarks/benchmarks/cuda/throughput-tests.json
Lines changed: 24 additions & 0 deletions

@@ -79,5 +79,29 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]
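
All three DeepSeek suites set "load_format": "dummy", which tells vLLM to
random-initialize weights instead of downloading the real multi-hundred-GB
checkpoints; that is fine here because these tests time the engine rather
than check output quality. A minimal offline sketch of the same setting
(assumes a local vLLM install with 8 GPUs; outputs are garbage by design):

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3.1",
    tensor_parallel_size=8,
    load_format="dummy",   # random weights, no checkpoint download
    max_model_len=8192,
)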
