Skip to content

Commit beaad7c

Browse files
committed
Re-enable Arm CPU vLLM HUD Benchmarks
- Re-enables the benchmarks disabled in #114; related to vllm-project/vllm#26494 (not sure which needs to go in first).
- Uses the default block_size in serving benchmarks (i.e. 128, instead of setting it to 16).

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
1 parent a069415 commit beaad7c

File tree

4 files changed

+15
-25
lines changed

4 files changed

+15
-25
lines changed

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"linux.rocm.gpu.gfx942.1",
2020
"linux.24xl.spr-metal",
2121
"linux.24xl.gnr",
22-
# "linux.arm64.m7g.4xlarge", # TODO (huydhn): This is not working yet
22+
"linux.arm64.m8g.4xlarge",
2323
"linux.dgx.b200",
2424
"linux.hpu.gaudi3.8",
2525
],
@@ -60,7 +60,7 @@
6060
"linux.rocm.gpu.gfx942.8": "rocm",
6161
"linux.24xl.spr-metal": "cpu",
6262
"linux.24xl.gnr": "cpu",
63-
# "linux.arm64.m7g.4xlarge": "arm64-cpu", # TODO (huydhn): This is not working yet
63+
"linux.arm64.m8g.4xlarge": "arm64-cpu",
6464
"linux.hpu.gaudi3.8": "hpu",
6565
}
6666

.github/scripts/test_generate_vllm_benchmark_matrix.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_generate_benchmark_matrix():
2222
{
2323
"include": [
2424
{
25-
"runner": "linux.arm64.m7g.4xlarge",
25+
"runner": "linux.arm64.m8g.4xlarge",
2626
"models": "meta-llama/meta-llama-3.1-8b-instruct"
2727
},
2828
{
@@ -209,7 +209,7 @@ def test_generate_benchmark_matrix():
209209
{
210210
"include": [
211211
{
212-
"runner": "linux.arm64.m7g.4xlarge",
212+
"runner": "linux.arm64.m8g.4xlarge",
213213
"models": "meta-llama/meta-llama-3.1-8b-instruct"
214214
},
215215
{
@@ -247,7 +247,7 @@ def test_generate_benchmark_matrix():
247247
{
248248
"include": [
249249
{
250-
"runner": "linux.arm64.m7g.4xlarge",
250+
"runner": "linux.arm64.m8g.4xlarge",
251251
"models": "meta-llama/meta-llama-3.1-8b-instruct"
252252
},
253253
{
@@ -286,7 +286,7 @@ def test_generate_benchmark_matrix():
286286
{
287287
"include": [
288288
{
289-
"runner": "linux.arm64.m7g.4xlarge",
289+
"runner": "linux.arm64.m8g.4xlarge",
290290
"models": "meta-llama/meta-llama-3.1-8b-instruct"
291291
},
292292
{
@@ -321,7 +321,7 @@ def test_generate_benchmark_matrix():
321321
{
322322
"include": [
323323
{
324-
"runner": "linux.arm64.m7g.4xlarge",
324+
"runner": "linux.arm64.m8g.4xlarge",
325325
"models": "meta-llama/meta-llama-3.1-8b-instruct"
326326
},
327327
{
@@ -409,7 +409,7 @@ def test_generate_benchmark_matrix():
409409

410410
# Select multiple runners
411411
models = []
412-
runners = ["h100", "spr", "m7g"]
412+
runners = ["h100", "spr", "m8g"]
413413
output = json.dumps(
414414
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
415415
)
@@ -419,7 +419,7 @@ def test_generate_benchmark_matrix():
419419
{
420420
"include": [
421421
{
422-
"runner": "linux.arm64.m7g.4xlarge",
422+
"runner": "linux.arm64.m8g.4xlarge",
423423
"models": "meta-llama/meta-llama-3.1-8b-instruct"
424424
},
425425
{
@@ -624,7 +624,7 @@ def test_generate_benchmark_matrix():
624624
"meta-llama/meta-llama-3.1-8b-instruct",
625625
"mistralai/mixtral-8x7b-instruct-v0.1",
626626
]
627-
runners = ["rocm", "spr", "m7g"]
627+
runners = ["rocm", "spr", "m8g"]
628628
output = json.dumps(
629629
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
630630
)
@@ -634,7 +634,7 @@ def test_generate_benchmark_matrix():
634634
{
635635
"include": [
636636
{
637-
"runner": "linux.arm64.m7g.4xlarge",
637+
"runner": "linux.arm64.m8g.4xlarge",
638638
"models": "meta-llama/meta-llama-3.1-8b-instruct"
639639
},
640640
{

.github/workflows/vllm-benchmark.yml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ on:
2525
A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
2626
required: true
2727
type: string
28-
# TODO (huydhn): Remove aarch64 CPU benchmark running on m7g until the change
29-
# from https://github.com/vllm-project/vllm/pull/26494#issuecomment-3537415441
30-
# is resolved and merged
31-
default: h100,rocm,spr,gnr,b200,gaudi3
28+
default: h100,rocm,spr,gnr,b200,m8g,gaudi3
3229
pull_request:
3330
paths:
3431
- .github/workflows/vllm-benchmark.yml
@@ -306,13 +303,11 @@ jobs:
306303
run: |
307304
set -eux
308305
309-
ON_ARM64_CPU=0
310306
ON_CPU=0
311307
312-
case "$DEVICE_NAME" in
313-
cpu) ON_CPU=1 ;;
314-
arm64-cpu) ON_ARM64_CPU=1 ;;
315-
esac
308+
if [ "$DEVICE_NAME" == "cpu" ] || [ "$DEVICE_NAME" == "arm64-cpu" ]; then
309+
ON_CPU=1
310+
fi
316311
317312
container_name=$(docker run \
318313
${GPU_FLAG:-} \
@@ -325,7 +320,6 @@ jobs:
325320
-e ENGINE_VERSION \
326321
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
327322
-e ON_CPU="${ON_CPU}" \
328-
-e ON_ARM64_CPU="${ON_ARM64_CPU}" \
329323
--ipc=host \
330324
--tty \
331325
--detach \

vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
"device": "cpu",
1515
"dtype": "bfloat16",
1616
"distributed_executor_backend": "mp",
17-
"block_size": 16,
1817
"trust_remote_code": "",
1918
"disable_log_stats": "",
2019
"disable_log_requests": "",
@@ -43,7 +42,6 @@
4342
"device": "cpu",
4443
"dtype": "bfloat16",
4544
"distributed_executor_backend": "mp",
46-
"block_size": 16,
4745
"trust_remote_code": "",
4846
"disable_log_stats": "",
4947
"disable_log_requests": "",
@@ -72,7 +70,6 @@
7270
"device": "cpu",
7371
"dtype": "bfloat16",
7472
"distributed_executor_backend": "mp",
75-
"block_size": 16,
7673
"trust_remote_code": "",
7774
"disable_log_stats": "",
7875
"disable_log_requests": "",
@@ -101,7 +98,6 @@
10198
"device": "cpu",
10299
"dtype": "bfloat16",
103100
"distributed_executor_backend": "mp",
104-
"block_size": 16,
105101
"trust_remote_code": "",
106102
"enable_chunked_prefill": "",
107103
"disable_log_stats": "",

0 commit comments

Comments (0)