feat: Optimize model max_num_seqs and global benchmark parameters for Strix Halo, and centralize configurations in models.py.

Tá an tiomantas seo le fáil i:
Donato Capitella
2026-02-02 08:45:13 +00:00
tuismitheoir 6f118ff936
tiomantas 0109e6a19b
D'athraigh 3 comhad le 53 breiseanna agus 39 scriosta
+16 -4
Féach ar an gComhad
@@ -15,13 +15,25 @@ except ImportError:
print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
sys.exit(1)
# Import configuration from average benchmark script
# Import path handling for scripts/models.py
try:
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm
import sys, os
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
import models
except ImportError:
print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.")
print("Error: Could not import scripts/models.py.")
sys.exit(1)
# Import Utils from run_vllm_bench (keep utils shared)
try:
from run_vllm_bench import get_gpu_count, kill_vllm
except ImportError:
print("Error: Could not import run_vllm_bench.py.")
sys.exit(1)
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
# =========================
# 🧠 GROUNDING & METHODOLOGY
# =========================
@@ -46,7 +58,7 @@ REPORT_FILE = Path("max_context_report.md")
# We test these GPU utilization steps to see how much we can squeeze
# 0.90 is default, but we want MAX context.
# 0.98 is our target high. 0.95 is the fallback.
# 0.95 is our target high. 0.90 is the fallback.
GPU_UTIL_STEPS = ["0.95", "0.90"]
# We test these concurrency settings
CONCURRENCY_STEPS = [1, 4, 8, 16]
+27 -25
Féach ar an gComhad
@@ -2,30 +2,7 @@
import subprocess, time, json, sys, os, requests, argparse, re
from pathlib import Path
# =========================
# ⚙️ GLOBAL SETTINGS
# =========================
# CLUSTER CONFIG: 2x Strix Halo (TP=2)
# User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2
GPU_UTIL = "0.90"
# THROUGHPUT CONFIG (Same as run_vllm_bench)
OFF_NUM_PROMPTS = 200
OFF_FORCED_OUTPUT = "512"
DEFAULT_BATCH_TOKENS = "8192"
RESULTS_DIR = Path("benchmark_results")
RESULTS_DIR.mkdir(exist_ok=True)
# Reuse the model table from the main benchmark script
# We can just import it or copy it. Importing is cleaner but might rely on path.
# For standalone robustness, I will copy the minimal needed config or import if possible.
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
# Let's assume it's in the same dir as run_vllm_bench.py.
# Import models immediately to access globals
try:
import models
except ImportError:
@@ -37,10 +14,35 @@ except ImportError:
except ImportError:
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
import models
# =========================
# ⚙️ GLOBAL SETTINGS
# =========================
# CLUSTER CONFIG: 2x Strix Halo (TP=2)
# User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2
GPU_UTIL = "0.90"
# THROUGHPUT CONFIG (Imported from models.py)
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT
DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS
RESULTS_DIR = Path("benchmark_results")
RESULTS_DIR.mkdir(exist_ok=True)
# Reuse the model table from the main benchmark script
# We can just import it or copy it. Importing is cleaner but might rely on path.
# For standalone robustness, I will copy the minimal needed config or import if possible.
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
# Let's assume it's in the same dir as run_vllm_bench.py.
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
# =========================
# UTILS (Adapted for Cluster)
# =========================