From 0109e6a19b746c2292fa63a7732c9d974e4bf727 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Mon, 2 Feb 2026 08:45:13 +0000 Subject: [PATCH] feat: Optimize model `max_num_seqs` and global benchmark parameters for Strix Halo, and centralize configurations in `models.py`. --- benchmarks/find_max_context.py | 20 +++++++++--- benchmarks/vllm_cluster_bench.py | 52 +++++++++++++++++--------------- scripts/models.py | 20 ++++++------ 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/benchmarks/find_max_context.py b/benchmarks/find_max_context.py index 5362c9d..79ec0ff 100644 --- a/benchmarks/find_max_context.py +++ b/benchmarks/find_max_context.py @@ -15,13 +15,25 @@ except ImportError: print("Error: 'transformers' not found. Please install it or run in vLLM environment.") sys.exit(1) -# Import configuration from average benchmark script +# Import path handling for scripts/models.py try: - from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm + import sys, os + sys.path.append(str(Path(__file__).parent.parent / "scripts")) + import models except ImportError: - print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.") + print("Error: Could not import scripts/models.py.") sys.exit(1) +# Import Utils from run_vllm_bench (keep utils shared) +try: + from run_vllm_bench import get_gpu_count, kill_vllm +except ImportError: + print("Error: Could not import run_vllm_bench.py.") + sys.exit(1) + +MODEL_TABLE = models.MODEL_TABLE +MODELS_TO_RUN = models.MODELS_TO_RUN + # ========================= # 🧠 GROUNDING & METHODOLOGY # ========================= @@ -46,7 +58,7 @@ REPORT_FILE = Path("max_context_report.md") # We test these GPU Utilizations steps to see how much we can squeeze # 0.90 is default, but we want MAX context. -# 0.98 is our target high. 0.95 is the fallback. +# 0.95 is our target high. 0.90 is the fallback. 
GPU_UTIL_STEPS = ["0.95", "0.90"] # We test these concurrency settings CONCURRENCY_STEPS = [1, 4, 8, 16] diff --git a/benchmarks/vllm_cluster_bench.py b/benchmarks/vllm_cluster_bench.py index be18a37..b9876a2 100755 --- a/benchmarks/vllm_cluster_bench.py +++ b/benchmarks/vllm_cluster_bench.py @@ -2,30 +2,7 @@ import subprocess, time, json, sys, os, requests, argparse, re from pathlib import Path -# ========================= -# ⚙️ GLOBAL SETTINGS -# ========================= - -# CLUSTER CONFIG: 2x Strix Halo (TP=2) -# User requested specifically to test with TP=2 on the cluster. -CLUSTER_TP = 2 -GPU_UTIL = "0.90" - -# THROUGHPUT CONFIG (Same as run_vllm_bench) -OFF_NUM_PROMPTS = 200 -OFF_FORCED_OUTPUT = "512" -DEFAULT_BATCH_TOKENS = "8192" - -RESULTS_DIR = Path("benchmark_results") -RESULTS_DIR.mkdir(exist_ok=True) - -# Reuse the model table from the main benchmark script -# We can just import it or copy it. Importing is cleaner but might rely on path. -# For standalone robustness, I will copy the minimal needed config or import if possible. -# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir. -# Let's assume it's in the same dir as run_vllm_bench.py. - - +# Import models immediately to access globals try: import models except ImportError: @@ -37,10 +14,35 @@ except ImportError: except ImportError: sys.path.append(str(Path(__file__).parent.parent / "scripts")) import models - + +# ========================= +# ⚙️ GLOBAL SETTINGS +# ========================= + +# CLUSTER CONFIG: 2x Strix Halo (TP=2) +# User requested specifically to test with TP=2 on the cluster. 
+CLUSTER_TP = 2 +GPU_UTIL = "0.90" + +# THROUGHPUT CONFIG (Imported from models.py) +OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS +OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT +DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS + +RESULTS_DIR = Path("benchmark_results") +RESULTS_DIR.mkdir(exist_ok=True) + +# Reuse the model table from the main benchmark script +# We can just import it or copy it. Importing is cleaner but might rely on path. +# For standalone robustness, I will copy the minimal needed config or import if possible. +# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir. +# Let's assume it's in the same dir as run_vllm_bench.py. + + MODEL_TABLE = models.MODEL_TABLE MODELS_TO_RUN = models.MODELS_TO_RUN + # ========================= # UTILS (Adapted for Cluster) # ========================= diff --git a/scripts/models.py b/scripts/models.py index bcca074..3ca4281 100644 --- a/scripts/models.py +++ b/scripts/models.py @@ -4,14 +4,14 @@ MODEL_TABLE = { "meta-llama/Meta-Llama-3.1-8B-Instruct": { "trust_remote": False, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", # Strix Halo Optimized (Bandwidth Limit) "max_tokens": "32768" }, "google/gemma-3-12b-it": { "trust_remote": False, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", "max_tokens": "32768" }, # 2. 
GPT-OSS 20B (MXFP4) @@ -19,14 +19,14 @@ MODEL_TABLE = { "openai/gpt-oss-20b": { "trust_remote": True, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", "max_tokens": "8192" }, "openai/gpt-oss-120b": { "trust_remote": True, "valid_tp": [1], - "max_num_seqs": "64", + "max_num_seqs": "16", "max_tokens": "8192" }, @@ -34,7 +34,7 @@ MODEL_TABLE = { "Qwen/Qwen3-14B-AWQ": { "trust_remote": True, "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "32", # Lower concurrency for safety + "max_num_seqs": "24", # Strix Halo Optimized "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive "enforce_eager": False, "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error @@ -45,7 +45,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", "max_tokens": "32768" }, @@ -53,7 +53,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", "max_tokens": "32768" }, @@ -61,7 +61,7 @@ MODEL_TABLE = { "trust_remote": True, "enforce_eager": False, "valid_tp": [1, 2], - "max_num_seqs": "64", + "max_num_seqs": "24", "max_tokens": "32768", }, @@ -71,7 +71,7 @@ MODEL_TABLE = { "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { "trust_remote": True, "valid_tp": [1], # Too big for single GPU - "max_num_seqs": "32", # Lower concurrency for safety + "max_num_seqs": "16", # Large Model / Bandwidth Constrained "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive "enforce_eager": True, "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error @@ -93,6 +93,6 @@ MODELS_TO_RUN = [ # Hardware / Global Defaults GPU_UTIL = "0.90" -OFF_NUM_PROMPTS = 200 +OFF_NUM_PROMPTS = 100 # Reduced for Strix Halo (Bandwidth Limited) OFF_FORCED_OUTPUT = "512" DEFAULT_BATCH_TOKENS = "8192"