
Commit de7bec9

Add Linux Aarch64 G3 runners to vLLM bms (#107)

1 parent 8dcdcda · commit de7bec9

File tree

7 files changed: +244 −12 lines changed


.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 4 additions & 2 deletions
@@ -19,6 +19,7 @@
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.24xl.gnr",
+        "linux.arm64.m7g.4xlarge",
         "linux.dgx.b200",
         "linux.hpu.gaudi3.8",
     ],
@@ -59,6 +60,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
+    "linux.arm64.m7g.4xlarge": "cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }
@@ -229,8 +231,8 @@ def generate_benchmark_matrix(
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
-    """
+    model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes - Intel
+    or CPUs - ARM)"""
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
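
For context, a minimal sketch of how a runner list plus a runner-to-device mapping like the ones extended above can drive runner selection by keyword (e.g. "m7g"). The names RUNNERS, RUNNER_TO_DEVICE, and select_runners are illustrative, not the script's actual internals:

# Illustrative sketch only: mirrors the shape of the mappings touched in this
# diff; the real generate_vllm_benchmark_matrix.py may differ.
RUNNERS = [
    "linux.24xl.spr-metal",
    "linux.24xl.gnr",
    "linux.arm64.m7g.4xlarge",  # new Graviton (arm64) runner
]
RUNNER_TO_DEVICE = {
    "linux.24xl.spr-metal": "cpu",
    "linux.24xl.gnr": "cpu",
    "linux.arm64.m7g.4xlarge": "cpu",  # arm64 still maps to a "cpu" device here
}

def select_runners(keywords):
    """Keep runners whose name contains any requested keyword, e.g. "m7g"."""
    if not keywords:
        return RUNNERS
    return [r for r in RUNNERS if any(k in r for k in keywords)]

print(select_runners(["spr", "m7g"]))
# ['linux.24xl.spr-metal', 'linux.arm64.m7g.4xlarge']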

.github/scripts/test_generate_vllm_benchmark_matrix.py

Lines changed: 30 additions & 2 deletions
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr"]
+    runners = ["h100", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -234,6 +254,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr"]
+    runners = ["rocm", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -365,6 +389,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"

.github/workflows/vllm-benchmark.yml

Lines changed: 24 additions & 8 deletions
@@ -25,7 +25,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100,rocm,spr,gnr,b200,gaudi3
+        default: h100,rocm,spr,gnr,m7g,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -112,8 +112,17 @@ jobs:
           elif command -v hl-smi; then
             DEVICE_NAME=hpu
             hl-smi
-          else
-            DEVICE_NAME=cpu
+          else
+            arch=$(uname -m)
+
+            case "$arch" in
+              aarch64|arm64)
+                DEVICE_NAME=arm64-cpu
+                ;;
+              *)
+                DEVICE_NAME=cpu
+                ;;
+            esac
             lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -132,6 +141,8 @@ jobs:
             DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))"
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi
           echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
@@ -172,6 +183,8 @@ jobs:
             DOCKER_IMAGE_SUFFIX=-hpu
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-arm64-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
           echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -284,11 +297,13 @@ jobs:
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
+          ON_ARM64_CPU=0
+          ON_CPU=0
+
+          case "$DEVICE_NAME" in
+            cpu) ON_CPU=1 ;;
+            arm64-cpu) ON_ARM64_CPU=1 ;;
+          esac
 
           container_name=$(docker run \
             ${GPU_FLAG:-} \
@@ -301,6 +316,7 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
+            -e ON_ARM64_CPU="${ON_ARM64_CPU}" \
             --ipc=host \
             --tty \
             --detach \
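
The device-detection fallback now keys off uname -m so arm64 hosts get their own device name. A minimal Python equivalent of that case statement (a sketch for illustration, not part of the workflow):

import platform

# Mirrors the shell `case "$arch" in aarch64|arm64) ...` branch added above.
def cpu_device_name() -> str:
    arch = platform.machine().lower()
    if arch in ("aarch64", "arm64"):
        return "arm64-cpu"  # new device name for Graviton/arm64 runners
    return "cpu"            # x86 and everything else keeps the old name

print(cpu_device_name())  # prints "arm64-cpu" on an m7g (aarch64) host

Downstream, the same name selects the -arm64-cpu Docker image suffix and sets ON_ARM64_CPU=1 inside the benchmark container.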

LICENSE

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@ MIT License
 
 Copyright (c) Facebook, Inc. and its affiliates.
 
+All contributions by Arm:
+Copyright (c) 2025 Arm Limited and/or its affiliates
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
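
Each entry's "parameters" block is what the benchmark harness turns into CLI flags for the latency run. A hedged sketch of that translation; the snake_case-to---kebab-case rule is an assumption inferred from the key names, not code taken from this repo:

# Sketch: turn a config entry's "parameters" into CLI flags. The real harness
# lives in vLLM's benchmark suite; this only illustrates the convention.
entry = {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "tensor_parallel_size": 1,
        "load_format": "dummy",
        "num_iters_warmup": 5,
        "num_iters": 15,
    },
}

flags = []
for key, value in entry["parameters"].items():
    flag = "--" + key.replace("_", "-")
    if value == "":          # bare switches like "trust_remote_code": ""
        flags.append(flag)
    else:
        flags.extend([flag, str(value)])

print(" ".join(flags))
# --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 1 ...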
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  }
]
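
Per the docstring change in generate_vllm_benchmark_matrix.py above, the matrix generator reads these files only for the model name and tensor-parallel size. A small sketch of that extraction; "serving-tests.json" is a placeholder path, since this capture does not show the new files' names:

import json

# Sketch: pull (model, tensor_parallel_size) pairs out of a benchmark config,
# the two fields the matrix generator says it consumes.
with open("serving-tests.json") as f:  # placeholder file name
    tests = json.load(f)

for test in tests:
    # Serving configs nest these under "server_parameters"; latency and
    # throughput configs use a flat "parameters" block.
    params = test.get("server_parameters") or test.get("parameters", {})
    print(test["test_name"], params["model"], params["tensor_parallel_size"])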
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
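
All three new configs carry an "environment_variables" block. A sketch of how a harness would plausibly inject those into the benchmark process (the child command here is a stand-in, and values must be stringified before entering the environment):

import os
import subprocess
import sys

env_vars = {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40}
env = {**os.environ, **{k: str(v) for k, v in env_vars.items()}}

# Placeholder child process: just prove the variables are visible to it.
subprocess.run(
    [sys.executable, "-c", "import os; print(os.environ['VLLM_CPU_KVCACHE_SPACE'])"],
    env=env,
    check=True,
)  # prints: 40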
