Skip to content

Commit 4f5ace1

Browse files
authored
Change CPU test JSON files for AWS GNR r8i.24xlarge runner (#95)
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
1 parent 4bde5d3 commit 4f5ace1

File tree

3 files changed

+210
-64
lines changed

3 files changed

+210
-64
lines changed

vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,24 @@
11
[
22
{
3-
"test_name": "latency_llama8B_tp1",
3+
"test_name": "latency_llama8B_tp2",
44
"environment_variables": {
5+
"VLLM_RPC_TIMEOUT": 100000,
56
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
7+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
8+
"VLLM_CPU_SGL_KERNEL": 1,
69
"VLLM_CPU_KVCACHE_SPACE": 40
710
},
811
"parameters": {
9-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10-
"tensor_parallel_size": 1,
11-
"load_format": "dummy",
12-
"num_iters_warmup": 5,
13-
"num_iters": 15
14-
}
15-
},
16-
{
17-
"test_name": "latency_llama8B_tp4",
18-
"environment_variables": {
19-
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
20-
"VLLM_CPU_KVCACHE_SPACE": 40
21-
},
22-
"parameters": {
23-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
24-
"tensor_parallel_size": 4,
25-
"load_format": "dummy",
12+
"model": "meta-llama/Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 2,
14+
"dtype": "bfloat16",
15+
"distributed_executor_backend": "mp",
16+
"block_size": 128,
17+
"trust_remote_code": "",
18+
"disable_log_stats": "",
19+
"enforce_eager": "",
20+
"max_num_batched_tokens": 2048,
21+
"max_num_seqs": 256,
2622
"num_iters_warmup": 5,
2723
"num_iters": 15
2824
}

vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json

Lines changed: 182 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,120 +2,275 @@
22
{
33
"test_name": "serving_llama8B_tp1_sharegpt",
44
"qps_list": [1, 4, 16, "inf"],
5+
"max_concurrency_list": [32],
56
"server_environment_variables": {
67
"VLLM_RPC_TIMEOUT": 100000,
78
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
89
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
10+
"VLLM_CPU_SGL_KERNEL": 1,
911
"VLLM_CPU_KVCACHE_SPACE": 40
1012
},
1113
"server_parameters": {
12-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
14+
"model": "meta-llama/Llama-3.1-8B-Instruct",
1315
"tensor_parallel_size": 1,
14-
"device": "cpu",
1516
"dtype": "bfloat16",
1617
"distributed_executor_backend": "mp",
1718
"block_size": 128,
1819
"trust_remote_code": "",
1920
"disable_log_stats": "",
20-
"disable_log_requests": "",
21+
"enforce_eager": "",
22+
"max_num_batched_tokens": 2048,
23+
"max_num_seqs": 256,
2124
"load_format": "dummy"
2225
},
2326
"client_parameters": {
24-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
27+
"model": "meta-llama/Llama-3.1-8B-Instruct",
2528
"backend": "vllm",
2629
"dataset_name": "sharegpt",
2730
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28-
"num_prompts": 200
31+
"num_prompts": 32
2932
}
3033
},
3134
{
3235
"test_name": "serving_llama8B_tp2_sharegpt",
3336
"qps_list": [1, 4, 16, "inf"],
37+
"max_concurrency_list": [32],
3438
"server_environment_variables": {
3539
"VLLM_RPC_TIMEOUT": 100000,
3640
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
3741
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
42+
"VLLM_CPU_SGL_KERNEL": 1,
3843
"VLLM_CPU_KVCACHE_SPACE": 40
3944
},
4045
"server_parameters": {
41-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
46+
"model": "meta-llama/Llama-3.1-8B-Instruct",
4247
"tensor_parallel_size": 2,
43-
"device": "cpu",
4448
"dtype": "bfloat16",
4549
"distributed_executor_backend": "mp",
4650
"block_size": 128,
4751
"trust_remote_code": "",
4852
"disable_log_stats": "",
49-
"disable_log_requests": "",
53+
"enforce_eager": "",
54+
"max_num_batched_tokens": 2048,
55+
"max_num_seqs": 256,
5056
"load_format": "dummy"
5157
},
5258
"client_parameters": {
53-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
59+
"model": "meta-llama/Llama-3.1-8B-Instruct",
5460
"backend": "vllm",
5561
"dataset_name": "sharegpt",
5662
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
57-
"num_prompts": 200
63+
"num_prompts": 32
5864
}
5965
},
6066
{
61-
"test_name": "serving_llama8B_tp4_sharegpt",
67+
"test_name": "serving_llama8B_tp1_random_128_128",
6268
"qps_list": [1, 4, 16, "inf"],
69+
"max_concurrency_list": [32],
6370
"server_environment_variables": {
6471
"VLLM_RPC_TIMEOUT": 100000,
6572
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6673
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
74+
"VLLM_CPU_SGL_KERNEL": 1,
6775
"VLLM_CPU_KVCACHE_SPACE": 40
6876
},
6977
"server_parameters": {
70-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
71-
"tensor_parallel_size": 4,
72-
"device": "cpu",
78+
"model": "meta-llama/Llama-3.1-8B-Instruct",
79+
"tensor_parallel_size": 1,
7380
"dtype": "bfloat16",
7481
"distributed_executor_backend": "mp",
7582
"block_size": 128,
7683
"trust_remote_code": "",
84+
"enable_chunked_prefill": "",
7785
"disable_log_stats": "",
78-
"disable_log_requests": "",
86+
"enforce_eager": "",
87+
"max_num_batched_tokens": 2048,
88+
"max_num_seqs": 256,
7989
"load_format": "dummy"
8090
},
8191
"client_parameters": {
82-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
92+
"model": "meta-llama/Llama-3.1-8B-Instruct",
8393
"backend": "vllm",
84-
"dataset_name": "sharegpt",
85-
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
86-
"num_prompts": 200
94+
"dataset_name": "random",
95+
"random-input-len": 128,
96+
"random-output-len": 128,
97+
"ignore-eos": "",
98+
"num_prompts": 32
99+
}
100+
},
101+
{
102+
"test_name": "serving_llama8B_tp2_random_128_128",
103+
"qps_list": [1, 4, 16, "inf"],
104+
"max_concurrency_list": [32],
105+
"server_environment_variables": {
106+
"VLLM_RPC_TIMEOUT": 100000,
107+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
108+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
109+
"VLLM_CPU_SGL_KERNEL": 1,
110+
"VLLM_CPU_KVCACHE_SPACE": 40
111+
},
112+
"server_parameters": {
113+
"model": "meta-llama/Llama-3.1-8B-Instruct",
114+
"tensor_parallel_size": 2,
115+
"dtype": "bfloat16",
116+
"distributed_executor_backend": "mp",
117+
"block_size": 128,
118+
"trust_remote_code": "",
119+
"enable_chunked_prefill": "",
120+
"disable_log_stats": "",
121+
"enforce_eager": "",
122+
"max_num_batched_tokens": 2048,
123+
"max_num_seqs": 256,
124+
"load_format": "dummy"
125+
},
126+
"client_parameters": {
127+
"model": "meta-llama/Llama-3.1-8B-Instruct",
128+
"backend": "vllm",
129+
"dataset_name": "random",
130+
"random-input-len": 128,
131+
"random-output-len": 128,
132+
"ignore-eos": "",
133+
"num_prompts": 32
134+
}
135+
},
136+
{
137+
"test_name": "serving_llama8B_tp1_random_128_2048",
138+
"qps_list": [1, 4, 16, "inf"],
139+
"max_concurrency_list": [32],
140+
"server_environment_variables": {
141+
"VLLM_RPC_TIMEOUT": 100000,
142+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
143+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
144+
"VLLM_CPU_SGL_KERNEL": 1,
145+
"VLLM_CPU_KVCACHE_SPACE": 40
146+
},
147+
"server_parameters": {
148+
"model": "meta-llama/Llama-3.1-8B-Instruct",
149+
"tensor_parallel_size": 1,
150+
"dtype": "bfloat16",
151+
"distributed_executor_backend": "mp",
152+
"block_size": 128,
153+
"trust_remote_code": "",
154+
"enable_chunked_prefill": "",
155+
"disable_log_stats": "",
156+
"enforce_eager": "",
157+
"max_num_batched_tokens": 2048,
158+
"max_num_seqs": 256,
159+
"load_format": "dummy"
160+
},
161+
"client_parameters": {
162+
"model": "meta-llama/Llama-3.1-8B-Instruct",
163+
"backend": "vllm",
164+
"dataset_name": "random",
165+
"random-input-len": 128,
166+
"random-output-len": 2048,
167+
"ignore-eos": "",
168+
"num_prompts": 32
169+
}
170+
},
171+
{
172+
"test_name": "serving_llama8B_tp2_random_128_2048",
173+
"qps_list": [1, 4, 16, "inf"],
174+
"max_concurrency_list": [32],
175+
"server_environment_variables": {
176+
"VLLM_RPC_TIMEOUT": 100000,
177+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
178+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
179+
"VLLM_CPU_SGL_KERNEL": 1,
180+
"VLLM_CPU_KVCACHE_SPACE": 40
181+
},
182+
"server_parameters": {
183+
"model": "meta-llama/Llama-3.1-8B-Instruct",
184+
"tensor_parallel_size": 2,
185+
"dtype": "bfloat16",
186+
"distributed_executor_backend": "mp",
187+
"block_size": 128,
188+
"trust_remote_code": "",
189+
"enable_chunked_prefill": "",
190+
"disable_log_stats": "",
191+
"enforce_eager": "",
192+
"max_num_batched_tokens": 2048,
193+
"max_num_seqs": 256,
194+
"load_format": "dummy"
195+
},
196+
"client_parameters": {
197+
"model": "meta-llama/Llama-3.1-8B-Instruct",
198+
"backend": "vllm",
199+
"dataset_name": "random",
200+
"random-input-len": 128,
201+
"random-output-len": 2048,
202+
"ignore-eos": "",
203+
"num_prompts": 32
87204
}
88205
},
89206
{
90-
"test_name": "serving_llama8B_tp4_random_1024_128",
207+
"test_name": "serving_llama8B_tp1_random_2048_128",
91208
"qps_list": [1, 4, 16, "inf"],
209+
"max_concurrency_list": [32],
92210
"server_environment_variables": {
93211
"VLLM_RPC_TIMEOUT": 100000,
94212
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
95213
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
214+
"VLLM_CPU_SGL_KERNEL": 1,
96215
"VLLM_CPU_KVCACHE_SPACE": 40
97216
},
98217
"server_parameters": {
99-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
100-
"tensor_parallel_size": 4,
101-
"device": "cpu",
218+
"model": "meta-llama/Llama-3.1-8B-Instruct",
219+
"tensor_parallel_size": 1,
220+
"dtype": "bfloat16",
221+
"distributed_executor_backend": "mp",
222+
"block_size": 128,
223+
"trust_remote_code": "",
224+
"enable_chunked_prefill": "",
225+
"disable_log_stats": "",
226+
"enforce_eager": "",
227+
"max_num_batched_tokens": 2048,
228+
"max_num_seqs": 256,
229+
"load_format": "dummy"
230+
},
231+
"client_parameters": {
232+
"model": "meta-llama/Llama-3.1-8B-Instruct",
233+
"backend": "vllm",
234+
"dataset_name": "random",
235+
"random-input-len": 2048,
236+
"random-output-len": 128,
237+
"ignore-eos": "",
238+
"num_prompts": 32
239+
}
240+
},
241+
{
242+
"test_name": "serving_llama8B_tp2_random_2048_128",
243+
"qps_list": [1, 4, 16, "inf"],
244+
"max_concurrency_list": [32],
245+
"server_environment_variables": {
246+
"VLLM_RPC_TIMEOUT": 100000,
247+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
248+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
249+
"VLLM_CPU_SGL_KERNEL": 1,
250+
"VLLM_CPU_KVCACHE_SPACE": 40
251+
},
252+
"server_parameters": {
253+
"model": "meta-llama/Llama-3.1-8B-Instruct",
254+
"tensor_parallel_size": 2,
102255
"dtype": "bfloat16",
103256
"distributed_executor_backend": "mp",
104257
"block_size": 128,
105258
"trust_remote_code": "",
106259
"enable_chunked_prefill": "",
107260
"disable_log_stats": "",
108-
"disable_log_requests": "",
261+
"enforce_eager": "",
262+
"max_num_batched_tokens": 2048,
263+
"max_num_seqs": 256,
109264
"load_format": "dummy"
110265
},
111266
"client_parameters": {
112-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
267+
"model": "meta-llama/Llama-3.1-8B-Instruct",
113268
"backend": "vllm",
114269
"dataset_name": "random",
115-
"random-input-len": 1024,
270+
"random-input-len": 2048,
116271
"random-output-len": 128,
117272
"ignore-eos": "",
118-
"num_prompts": 100
273+
"num_prompts": 32
119274
}
120275
}
121276
]

vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,24 @@
11
[
22
{
3-
"test_name": "throughput_llama8B_tp1",
3+
"test_name": "throughput_llama8B_tp2",
44
"environment_variables": {
5+
"VLLM_RPC_TIMEOUT": 100000,
56
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
7+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
8+
"VLLM_CPU_SGL_KERNEL": 1,
69
"VLLM_CPU_KVCACHE_SPACE": 40
710
},
811
"parameters": {
9-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10-
"tensor_parallel_size": 1,
11-
"load_format": "dummy",
12-
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
13-
"num_prompts": 200,
14-
"backend": "vllm"
15-
}
16-
},
17-
{
18-
"test_name": "throughput_llama8B_tp4",
19-
"environment_variables": {
20-
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
21-
"VLLM_CPU_KVCACHE_SPACE": 40
22-
},
23-
"parameters": {
24-
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25-
"tensor_parallel_size": 4,
26-
"load_format": "dummy",
12+
"model": "meta-llama/Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 2,
14+
"dtype": "bfloat16",
15+
"distributed_executor_backend": "mp",
16+
"block_size": 128,
17+
"trust_remote_code": "",
18+
"disable_log_stats": "",
19+
"enforce_eager": "",
20+
"max_num_batched_tokens": 2048,
21+
"max_num_seqs": 256,
2722
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
2823
"num_prompts": 200,
2924
"backend": "vllm"

0 commit comments

Comments
 (0)