
Commit 774075f

[Benchmark] Adding "facebook/opt-125m" model in the benchmarking tests for both vLLM and SGLang (#76)
* Added facebook model in the benchmark tests
* fixing model length issue
* update the name of the parameter for sglang
* remove not needed tests for sglang
* updated serving tests for sglang
* install nvcc
* fix bug
* fix bug
* fix bug
* fix bug
* fix
* fix
* fix
1 parent 963053c commit 774075f

File tree (8 files changed, +167 additions, -25 deletions)

.github/workflows/sglang-benchmark.yml
sglang-benchmarks/benchmarks/cuda/serving-tests.json
vllm-benchmarks/benchmarks/cuda/latency-tests.json
vllm-benchmarks/benchmarks/cuda/serving-tests.json
vllm-benchmarks/benchmarks/cuda/throughput-tests.json
vllm-benchmarks/benchmarks/rocm/latency-tests.json
vllm-benchmarks/benchmarks/rocm/serving-tests.json
vllm-benchmarks/benchmarks/rocm/throughput-tests.json


.github/workflows/sglang-benchmark.yml

Lines changed: 15 additions & 0 deletions
@@ -186,6 +186,21 @@ jobs:
           # Verify installations
           echo "$(pwd)/sgl_server_env/bin" >> $GITHUB_PATH
 
+      - name: Install NVCC
+        if: env.DEVICE_NAME == 'cuda'
+        shell: bash
+        run: |
+          set -eux
+          sudo apt-get update
+          sudo apt-get install -y wget gnupg
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends cuda-toolkit-12-8
+          sudo ln -s /usr/local/cuda-12.8 /usr/local/cuda || true
+          echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
+          echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
+
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}
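
For local debugging of this workflow change, here is a minimal Python sketch (not part of the commit) that checks the same things the new step sets up, namely that nvcc is on PATH and CUDA_HOME is exported; the file name and the check itself are assumptions for illustration:

# A minimal sanity check, assuming the "Install NVCC" step above has run on a CUDA
# runner; it only verifies that nvcc is reachable and CUDA_HOME is exported.
import os
import shutil
import subprocess

def check_cuda_toolchain() -> None:
    nvcc = shutil.which("nvcc")
    if nvcc is None:
        raise RuntimeError("nvcc not found on PATH; did the 'Install NVCC' step run?")
    # Print the compiler banner so the benchmark logs record the toolkit version.
    subprocess.run([nvcc, "--version"], check=True)
    print("CUDA_HOME =", os.environ.get("CUDA_HOME"))

if __name__ == "__main__":
    check_cuda_toolchain()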

sglang-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 24 additions & 25 deletions
@@ -19,19 +19,19 @@
         }
     },
     {
-        "test_name": "serving_qwen3_30b_a3b_tp8_random_in1k_out2k",
+        "test_name": "serving_gemma_3_27b_it_tp8_random_in1k_out2k",
         "qps_list": [10],
         "server_parameters": {
-            "model": "Qwen/Qwen3-30B-A3B",
+            "model": "google/gemma-3-27b-it",
             "tensor_parallel_size": 8,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "context_length": 8192
         },
         "client_parameters": {
-            "model": "Qwen/Qwen3-30B-A3B",
+            "model": "google/gemma-3-27b-it",
             "backend": "vllm",
             "dataset_name": "random",
             "num_prompts": 200,
@@ -40,19 +40,19 @@
         }
     },
     {
-        "test_name": "serving_gemma_3_27b_it_tp8_random_in1k_out2k",
+        "test_name": "serving_gemma_3_4b_it_tp1_random_in1k_out2k",
         "qps_list": [10],
         "server_parameters": {
-            "model": "google/gemma-3-27b-it",
-            "tensor_parallel_size": 8,
+            "model": "google/gemma-3-4b-it",
+            "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "context_length": 8192
         },
         "client_parameters": {
-            "model": "google/gemma-3-27b-it",
+            "model": "google/gemma-3-4b-it",
             "backend": "vllm",
             "dataset_name": "random",
             "num_prompts": 200,
@@ -61,45 +61,44 @@
         }
     },
     {
-        "test_name": "serving_gemma_3_4b_it_tp1_random_in1k_out2k",
-        "qps_list": [10],
+        "test_name": "serving_opt125m_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "google/gemma-3-4b-it",
+            "model": "facebook/opt-125m",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "context_length": 2048
         },
         "client_parameters": {
-            "model": "google/gemma-3-4b-it",
+            "model": "facebook/opt-125m",
             "backend": "vllm",
-            "dataset_name": "random",
-            "num_prompts": 200,
-            "random_input_len": 1024,
-            "random_output_len": 2048
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
         }
     },
     {
-        "test_name": "serving_qwen3_8b_tp1_random_in1k_out2k",
-        "qps_list": [10],
+        "test_name": "serving_opt125m_tp1_random_in750_out75",
+        "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "Qwen/Qwen3-8B",
+            "model": "facebook/opt-125m",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
            "disable_log_requests": "",
             "load_format": "dummy",
-            "max_model_len": 8192
+            "context_length": 2048
         },
         "client_parameters": {
-            "model": "Qwen/Qwen3-8B",
+            "model": "facebook/opt-125m",
             "backend": "vllm",
             "dataset_name": "random",
             "num_prompts": 200,
-            "random_input_len": 1024,
-            "random_output_len": 2048
+            "random_input_len": 750,
+            "random_output_len": 75
         }
     }
 ]
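
To see what the parameter rename above amounts to (SGLang takes context_length where vLLM takes max_model_len), here is an illustrative Python sketch, not from the repo, that flattens a server_parameters block into CLI-style flags; the underscore-to-dash mapping and the helper name are assumptions:

# Illustrative only: flatten a server_parameters block from serving-tests.json
# into CLI-style flags. The real launch logic lives in the repo's benchmark scripts.
import json

def to_cli_flags(params: dict) -> list[str]:
    flags = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            flags.append(flag)                # bare switch, e.g. --disable-log-stats
        else:
            flags.extend([flag, str(value)])
    return flags

with open("sglang-benchmarks/benchmarks/cuda/serving-tests.json") as f:
    tests = json.load(f)

test = next(t for t in tests if t["test_name"] == "serving_opt125m_tp1_sharegpt")
print(to_cli_flags(test["server_parameters"]))
# Expected to include '--context-length', '2048' -- the SGLang-specific name this
# commit switches to, while the vLLM configs below keep 'max_model_len'.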

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 11 additions & 0 deletions
@@ -73,6 +73,17 @@
             "max_model_len": 8192
         }
     },
+    {
+        "test_name": "latency_opt125m_tp1",
+        "parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 2048
+        }
+    },
     {
         "test_name": "latency_deepseek_v3_tp8",
         "parameters": {

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 41 additions & 0 deletions
@@ -453,6 +453,47 @@
         }
     },
     {
+        "test_name": "serving_opt125m_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy",
+            "max_model_len": 2048
+        },
+        "client_parameters": {
+            "model": "facebook/opt-125m",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_opt125m_tp1_random_in750_out75",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy",
+            "max_model_len": 2048
+        },
+        "client_parameters": {
+            "model": "facebook/opt-125m",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "num_prompts": 200,
+            "random_input_len": 750,
+            "random_output_len": 75
+        }
+    },
+    {
         "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 12 additions & 0 deletions
@@ -80,6 +80,18 @@
             "max_model_len": 8192
         }
     },
+    {
+        "test_name": "throughput_opt125m_tp1",
+        "parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm",
+            "max_model_len": 2048
+        }
+    },
     {
         "test_name": "throughput_deepseek_v3_tp8",
         "parameters": {

vllm-benchmarks/benchmarks/rocm/latency-tests.json

Lines changed: 11 additions & 0 deletions
@@ -50,5 +50,16 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "latency_opt125m_tp1",
+        "parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 2048
+        }
     }
 ]

vllm-benchmarks/benchmarks/rocm/serving-tests.json

Lines changed: 41 additions & 0 deletions
@@ -410,5 +410,46 @@
             "random_input_len": 1024,
             "random_output_len": 2048
         }
+    },
+    {
+        "test_name": "serving_opt125m_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy",
+            "max_model_len": 2048
+        },
+        "client_parameters": {
+            "model": "facebook/opt-125m",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_opt125m_tp1_random_in750_out75",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy",
+            "max_model_len": 2048
+        },
+        "client_parameters": {
+            "model": "facebook/opt-125m",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "num_prompts": 200,
+            "random_input_len": 750,
+            "random_output_len": 75
+        }
     }
 ]

vllm-benchmarks/benchmarks/rocm/throughput-tests.json

Lines changed: 12 additions & 0 deletions
@@ -55,5 +55,17 @@
             "backend": "vllm",
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "throughput_opt125m_tp1",
+        "parameters": {
+            "model": "facebook/opt-125m",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm",
+            "max_model_len": 2048
+        }
     }
 ]
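
After edits like the ones above, a quick consistency check could confirm that the vLLM serving configs keep "max_model_len" while the SGLang configs use the renamed "context_length". The following Python sketch is not part of the commit; the glob paths assume the repository layout shown in the file tree:

# A consistency check over the edited serving configs, assuming the layout in the
# file tree: vLLM uses "max_model_len", SGLang uses the renamed "context_length".
import glob
import json

def server_param_keys(path: str) -> set:
    keys = set()
    with open(path) as f:
        for test in json.load(f):
            keys |= set(test.get("server_parameters", {}))
    return keys

for path in glob.glob("vllm-benchmarks/benchmarks/*/serving-tests.json"):
    assert "context_length" not in server_param_keys(path), path

for path in glob.glob("sglang-benchmarks/benchmarks/*/serving-tests.json"):
    assert "max_model_len" not in server_param_keys(path), path

print("model-length parameter names look consistent per engine")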
