Commit 963053c

Add DeepSeek-V3 and DeepSeek-R1 on B200 (#78)
* Add DeepSeek-V3 and DeepSeek-R1 on B200
* Comment
* Format
* Minor bug
* Fix another bug
* Skip gemma3 on ROCm
* A bit more tweak
* [no ci] Benchmark DeepSeek is a bit slow

Signed-off-by: Huy Do <huydhn@gmail.com>
1 parent 644dd73 commit 963053c

File tree (5 files changed: +155 -6 lines changed)

- .github/scripts/generate_vllm_benchmark_matrix.py
- .github/workflows/vllm-benchmark.yml
- vllm-benchmarks/benchmarks/cuda/latency-tests.json
- vllm-benchmarks/benchmarks/cuda/serving-tests.json
- vllm-benchmarks/benchmarks/cuda/throughput-tests.json

.github/scripts/generate_vllm_benchmark_matrix.py
Lines changed: 66 additions & 3 deletions

@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -29,8 +29,6 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
-        # TODO (huydhn): Enable this when Intel's runners are ready
-        # "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -62,6 +60,65 @@
     ]
 )
 
+# Model and runner skip logic, for example, just need to run DeepSeek on b200
+# and not h100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# Figure out how to set this in the JSON benchmark configuration instead
+PLATFORM_SKIPS = {
+    # Already been covered in both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ],
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
+    ],
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ],
+    # Deepseek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+}
+# Lower case all the model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}
 
 
 class ValidateDir(Action):
     def __call__(
@@ -198,6 +255,12 @@ def generate_benchmark_matrix(
         if not found_runner and not use_all_runners:
             continue
 
+        # Check the skip logic
+        if model in PLATFORM_SKIPS and any(
+            [r in runner for r in PLATFORM_SKIPS[model]]
+        ):
+            continue
+
         benchmark_matrix["include"].append(
             {
                 "runner": runner,

.github/workflows/vllm-benchmark.yml
Lines changed: 3 additions & 3 deletions

@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux
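
Two effects of this change are worth noting: the new cron expression reduces
the scheduled frequency to minute 0 of every hour divisible by 6 (00:00,
06:00, 12:00, and 18:00 UTC), and dropping the 'h100' fallback for RUNNERS
presumably lets scheduled runs fan out to the matrix script's full runner
list instead of pinning to H100. A one-liner to sanity-check the fire hours:

print([h for h in range(24) if h % 6 == 0])  # -> [0, 6, 12, 18]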

vllm-benchmarks/benchmarks/cuda/latency-tests.json
Lines changed: 22 additions & 0 deletions

@@ -72,5 +72,27 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
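
Each "parameters" block is ultimately flattened into CLI flags for vLLM's
benchmark_latency.py by the benchmark harness. A rough sketch of that
conversion, under the assumption that the runner maps snake_case keys to
--kebab-case flags and treats empty-string values as bare switches:

import json

def to_cli_args(parameters: dict) -> list[str]:
    args = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if value == "":  # bare switch, e.g. "disable_log_stats": ""
            args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args

with open("latency-tests.json") as f:
    for test in json.load(f):
        if "deepseek" in test["test_name"]:
            print(test["test_name"], " ".join(to_cli_args(test["parameters"])))
# latency_deepseek_v3_tp8 --model deepseek-ai/DeepSeek-V3.1
#   --tensor-parallel-size 8 --load-format dummy --num-iters-warmup 5 ...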

vllm-benchmarks/benchmarks/cuda/serving-tests.json
Lines changed: 40 additions & 0 deletions

@@ -451,5 +451,45 @@
       "random_input_len": 5250,
       "random_output_len": 8250
     }
+  },
+  {
+    "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
+  },
+  {
+    "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
   }
 ]
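
The qps_list drives a client-side sweep over request rates, with the JSON
string "inf" conventionally meaning unthrottled submission. A hedged sketch
of how such a sweep might be iterated (run_client is a hypothetical stand-in
for the real benchmark client invocation):

import math

def run_client(model: str, request_rate: float) -> None:
    # One benchmark pass at a fixed request rate
    label = "inf" if math.isinf(request_rate) else int(request_rate)
    print(f"benchmarking {model} at qps={label}")

for qps in [1, 4, 16, "inf"]:
    rate = math.inf if qps == "inf" else float(qps)
    run_client("deepseek-ai/DeepSeek-R1", rate)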

vllm-benchmarks/benchmarks/cuda/throughput-tests.json
Lines changed: 24 additions & 0 deletions

@@ -79,5 +79,29 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]
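
All three DeepSeek suites set "load_format": "dummy", which tells vLLM to
random-initialize weights instead of downloading the real multi-hundred-GB
checkpoints; that is fine here because these tests time the engine rather
than check output quality. A minimal offline sketch of the same setting
(assumes a local vLLM install with 8 GPUs; outputs are garbage by design):

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3.1",
    tensor_parallel_size=8,
    load_format="dummy",   # random weights, no checkpoint download
    max_model_len=8192,
)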
