
Commit de7bec9

Add Linux Aarch64 G3 runners to vLLM bms (#107)

1 parent 8dcdcda · commit de7bec9

File tree

7 files changed: +244 −12 lines changed


.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 4 additions & 2 deletions
@@ -19,6 +19,7 @@
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.24xl.gnr",
+        "linux.arm64.m7g.4xlarge",
         "linux.dgx.b200",
         "linux.hpu.gaudi3.8",
     ],
@@ -59,6 +60,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
+    "linux.arm64.m7g.4xlarge": "cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }
@@ -229,8 +231,8 @@ def generate_benchmark_matrix(
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
-    """
+    model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes - Intel
+    or CPUs - ARM)"""
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
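
For context, a minimal sketch of how a runner list plus a runner-to-device mapping like the ones extended above can drive runner selection by keyword (e.g. "m7g"). The names RUNNERS, RUNNER_TO_DEVICE, and select_runners are illustrative, not the script's actual internals:

# Illustrative sketch only: mirrors the shape of the mappings touched in this
# diff; the real generate_vllm_benchmark_matrix.py may differ.
RUNNERS = [
    "linux.24xl.spr-metal",
    "linux.24xl.gnr",
    "linux.arm64.m7g.4xlarge",  # new Graviton (arm64) runner
]
RUNNER_TO_DEVICE = {
    "linux.24xl.spr-metal": "cpu",
    "linux.24xl.gnr": "cpu",
    "linux.arm64.m7g.4xlarge": "cpu",  # arm64 still maps to a "cpu" device here
}

def select_runners(keywords):
    """Keep runners whose name contains any requested keyword, e.g. "m7g"."""
    if not keywords:
        return RUNNERS
    return [r for r in RUNNERS if any(k in r for k in keywords)]

print(select_runners(["spr", "m7g"]))
# ['linux.24xl.spr-metal', 'linux.arm64.m7g.4xlarge']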

.github/scripts/test_generate_vllm_benchmark_matrix.py

Lines changed: 30 additions & 2 deletions
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr"]
+    runners = ["h100", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -234,6 +254,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr"]
+    runners = ["rocm", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -365,6 +389,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"

.github/workflows/vllm-benchmark.yml

Lines changed: 24 additions & 8 deletions
@@ -25,7 +25,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100,rocm,spr,gnr,b200,gaudi3
+        default: h100,rocm,spr,gnr,m7g,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -112,8 +112,17 @@ jobs:
           elif command -v hl-smi; then
             DEVICE_NAME=hpu
             hl-smi
-          else
-            DEVICE_NAME=cpu
+          else
+            arch=$(uname -m)
+
+            case "$arch" in
+              aarch64|arm64)
+                DEVICE_NAME=arm64-cpu
+                ;;
+              *)
+                DEVICE_NAME=cpu
+                ;;
+            esac
             lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -132,6 +141,8 @@ jobs:
             DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))"
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi
           echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
@@ -172,6 +183,8 @@ jobs:
             DOCKER_IMAGE_SUFFIX=-hpu
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-arm64-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
           echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -284,11 +297,13 @@ jobs:
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
+          ON_ARM64_CPU=0
+          ON_CPU=0
+
+          case "$DEVICE_NAME" in
+            cpu) ON_CPU=1 ;;
+            arm64-cpu) ON_ARM64_CPU=1 ;;
+          esac
 
           container_name=$(docker run \
             ${GPU_FLAG:-} \
@@ -301,6 +316,7 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
+            -e ON_ARM64_CPU="${ON_ARM64_CPU}" \
             --ipc=host \
             --tty \
             --detach \
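
The device-detection fallback now keys off uname -m so arm64 hosts get their own device name. A minimal Python equivalent of that case statement (a sketch for illustration, not part of the workflow):

import platform

# Mirrors the shell `case "$arch" in aarch64|arm64) ...` branch added above.
def cpu_device_name() -> str:
    arch = platform.machine().lower()
    if arch in ("aarch64", "arm64"):
        return "arm64-cpu"  # new device name for Graviton/arm64 runners
    return "cpu"            # x86 and everything else keeps the old name

print(cpu_device_name())  # prints "arm64-cpu" on an m7g (aarch64) host

Downstream, the same name selects the -arm64-cpu Docker image suffix and sets ON_ARM64_CPU=1 inside the benchmark container.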

LICENSE

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@ MIT License
 
 Copyright (c) Facebook, Inc. and its affiliates.
 
+All contributions by Arm:
+Copyright (c) 2025 Arm Limited and/or its affiliates
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
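
Each entry's "parameters" block is what the benchmark harness turns into CLI flags for the latency run. A hedged sketch of that translation; the snake_case-to---kebab-case rule is an assumption inferred from the key names, not code taken from this repo:

# Sketch: turn a config entry's "parameters" into CLI flags. The real harness
# lives in vLLM's benchmark suite; this only illustrates the convention.
entry = {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "tensor_parallel_size": 1,
        "load_format": "dummy",
        "num_iters_warmup": 5,
        "num_iters": 15,
    },
}

flags = []
for key, value in entry["parameters"].items():
    flag = "--" + key.replace("_", "-")
    if value == "":          # bare switches like "trust_remote_code": ""
        flags.append(flag)
    else:
        flags.extend([flag, str(value)])

print(" ".join(flags))
# --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 1 ...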
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  }
]
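
Per the docstring change in generate_vllm_benchmark_matrix.py above, the matrix generator reads these files only for the model name and tensor-parallel size. A small sketch of that extraction; "serving-tests.json" is a placeholder path, since this capture does not show the new files' names:

import json

# Sketch: pull (model, tensor_parallel_size) pairs out of a benchmark config,
# the two fields the matrix generator says it consumes.
with open("serving-tests.json") as f:  # placeholder file name
    tests = json.load(f)

for test in tests:
    # Serving configs nest these under "server_parameters"; latency and
    # throughput configs use a flat "parameters" block.
    params = test.get("server_parameters") or test.get("parameters", {})
    print(test["test_name"], params["model"], params["tensor_parallel_size"])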
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
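
All three new configs carry an "environment_variables" block. A sketch of how a harness would plausibly inject those into the benchmark process (the child command here is a stand-in, and values must be stringified before entering the environment):

import os
import subprocess
import sys

env_vars = {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40}
env = {**os.environ, **{k: str(v) for k, v in env_vars.items()}}

# Placeholder child process: just prove the variables are visible to it.
subprocess.run(
    [sys.executable, "-c", "import os; print(os.environ['VLLM_CPU_KVCACHE_SPACE'])"],
    env=env,
    check=True,
)  # prints: 40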
