From 0109e6a19b746c2292fa63a7732c9d974e4bf727 Mon Sep 17 00:00:00 2001
From: Donato Capitella <donato.capitella@reversec.com>
Date: Mon, 2 Feb 2026 08:45:13 +0000
Subject: [PATCH] feat: Optimize model `max_num_seqs` and global benchmark
 parameters for Strix Halo, and centralize configurations in `models.py`.

---
 benchmarks/find_max_context.py   | 20 +++++++++---
 benchmarks/vllm_cluster_bench.py | 52 +++++++++++++++++---------------
 scripts/models.py                | 20 ++++++------
 3 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/benchmarks/find_max_context.py b/benchmarks/find_max_context.py
index 5362c9d..79ec0ff 100644
--- a/benchmarks/find_max_context.py
+++ b/benchmarks/find_max_context.py
@@ -15,13 +15,25 @@ except ImportError:
     print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
     sys.exit(1)
 
-# Import configuration from average benchmark script
+# Make scripts/ importable so the shared config in scripts/models.py can be loaded
 try:
-    from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm
+    import sys, os
+    sys.path.append(str(Path(__file__).parent.parent / "scripts"))
+    import models
 except ImportError:
-    print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.")
+    print("Error: Could not import scripts/models.py.")
     sys.exit(1)
 
+# Import Utils from run_vllm_bench (keep utils shared)
+try:
+    from run_vllm_bench import get_gpu_count, kill_vllm
+except ImportError:
+    print("Error: Could not import run_vllm_bench.py.")
+    sys.exit(1)
+
+MODEL_TABLE = models.MODEL_TABLE
+MODELS_TO_RUN = models.MODELS_TO_RUN
+
 # =========================
 # 🧠 GROUNDING & METHODOLOGY
 # =========================
@@ -46,7 +58,7 @@ REPORT_FILE = Path("max_context_report.md")
 
 # We test these GPU Utilizations steps to see how much we can squeeze
 # 0.90 is default, but we want MAX context.
-# 0.98 is our target high. 0.95 is the fallback.
+# 0.95 is our target high. 0.90 is the fallback.
 GPU_UTIL_STEPS = ["0.95", "0.90"]
 # We test these concurrency settings
 CONCURRENCY_STEPS = [1, 4, 8, 16]
diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py
index be18a37..b9876a2 100755
--- a/benchmarks/vllm_cluster_bench.py
+++ b/benchmarks/vllm_cluster_bench.py
@@ -2,30 +2,7 @@
 import subprocess, time, json, sys, os, requests, argparse, re
 from pathlib import Path
 
-# =========================
-# ⚙️ GLOBAL SETTINGS
-# =========================
-
-# CLUSTER CONFIG: 2x Strix Halo (TP=2)
-# User requested specifically to test with TP=2 on the cluster.
-CLUSTER_TP = 2
-GPU_UTIL = "0.90" 
-
-# THROUGHPUT CONFIG (Same as run_vllm_bench)
-OFF_NUM_PROMPTS      = 200 
-OFF_FORCED_OUTPUT    = "512"
-DEFAULT_BATCH_TOKENS = "8192"
-
-RESULTS_DIR = Path("benchmark_results")
-RESULTS_DIR.mkdir(exist_ok=True)
-
-# Reuse the model table from the main benchmark script
-# We can just import it or copy it. Importing is cleaner but might rely on path.
-# For standalone robustness, I will copy the minimal needed config or import if possible.
-# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
-# Let's assume it's in the same dir as run_vllm_bench.py.
-
-
+# Import models first: the global settings below read their defaults from it
 try:
     import models
 except ImportError:
@@ -37,10 +14,35 @@ except ImportError:
     except ImportError:
         sys.path.append(str(Path(__file__).parent.parent / "scripts"))
         import models
-        
+
+# =========================
+# ⚙️ GLOBAL SETTINGS
+# =========================
+
+# CLUSTER CONFIG: 2x Strix Halo (TP=2)
+# User requested specifically to test with TP=2 on the cluster.
+CLUSTER_TP = 2
+GPU_UTIL = "0.90" 
+
+# THROUGHPUT CONFIG (Imported from models.py)
+OFF_NUM_PROMPTS      = models.OFF_NUM_PROMPTS
+OFF_FORCED_OUTPUT    = models.OFF_FORCED_OUTPUT
+DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS
+
+RESULTS_DIR = Path("benchmark_results")
+RESULTS_DIR.mkdir(exist_ok=True)
+
+# Reuse the model table from scripts/models.py instead of keeping a copy here.
+# Importing is cleaner than copying, but it depends on the import path.
+# The import block at the top of this file handles that: it tries a plain
+# `import models` first and, as a last resort, adds ../scripts to sys.path.
+# The throughput defaults above come from the same module, so edit them there.
+
+
 MODEL_TABLE = models.MODEL_TABLE
 MODELS_TO_RUN = models.MODELS_TO_RUN
 
+
 # =========================
 # UTILS (Adapted for Cluster)
 # =========================
diff --git a/scripts/models.py b/scripts/models.py
index bcca074..3ca4281 100644
--- a/scripts/models.py
+++ b/scripts/models.py
@@ -4,14 +4,14 @@ MODEL_TABLE = {
     "meta-llama/Meta-Llama-3.1-8B-Instruct": {
         "trust_remote": False,
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24", # Strix Halo Optimized (Bandwidth Limit)
         "max_tokens": "32768" 
     },
     
     "google/gemma-3-12b-it": {
         "trust_remote": False,
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24",
         "max_tokens": "32768" 
     },
     # 2. GPT-OSS 20B (MXFP4)
@@ -19,14 +19,14 @@ MODEL_TABLE = {
     "openai/gpt-oss-20b": {
         "trust_remote": True,
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24",
         "max_tokens": "8192"
     },
     
     "openai/gpt-oss-120b": {
         "trust_remote": True,
         "valid_tp": [1],
-        "max_num_seqs": "64",
+        "max_num_seqs": "16",
         "max_tokens": "8192"
     },
 
@@ -34,7 +34,7 @@ MODEL_TABLE = {
     "Qwen/Qwen3-14B-AWQ": {
         "trust_remote": True,
         "valid_tp": [1], # Too big for single GPU
-        "max_num_seqs": "32", # Lower concurrency for safety
+        "max_num_seqs": "24", # Strix Halo Optimized
         "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
         "enforce_eager": False, 
         "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
@@ -45,7 +45,7 @@ MODEL_TABLE = {
         "trust_remote": True,
         "enforce_eager": False, 
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24",
         "max_tokens": "32768"
     },
 
@@ -53,7 +53,7 @@ MODEL_TABLE = {
         "trust_remote": True,
         "enforce_eager": False, 
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24",
         "max_tokens": "32768"
     },
 
@@ -61,7 +61,7 @@ MODEL_TABLE = {
         "trust_remote": True,
         "enforce_eager": False, 
         "valid_tp": [1, 2],
-        "max_num_seqs": "64",
+        "max_num_seqs": "24",
         "max_tokens": "32768",
     },
 
@@ -71,7 +71,7 @@ MODEL_TABLE = {
      "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
         "trust_remote": True,
         "valid_tp": [1], # Too big for single GPU
-        "max_num_seqs": "32", # Lower concurrency for safety
+        "max_num_seqs": "16", # Large Model / Bandwidth Constrained
         "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
         "enforce_eager": True, 
         "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
@@ -93,6 +93,6 @@ MODELS_TO_RUN = [
 
 # Hardware / Global Defaults
 GPU_UTIL = "0.90"
-OFF_NUM_PROMPTS = 200
+OFF_NUM_PROMPTS = 100 # Reduced for Strix Halo (Bandwidth Limited)
 OFF_FORCED_OUTPUT = "512"
 DEFAULT_BATCH_TOKENS = "8192"