feat: Optimize model max_num_seqs and global benchmark parameters for Strix Halo, and centralize configurations in models.py.
Цей коміт міститься в:
@@ -15,13 +15,25 @@ except ImportError:
|
||||
print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
|
||||
sys.exit(1)
|
||||
|
||||
# Import configuration from average benchmark script
|
||||
# Import path handling for scripts/models.py
|
||||
try:
|
||||
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm
|
||||
import sys, os
|
||||
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
|
||||
import models
|
||||
except ImportError:
|
||||
print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.")
|
||||
print("Error: Could not import scripts/models.py.")
|
||||
sys.exit(1)
|
||||
|
||||
# Import Utils from run_vllm_bench (keep utils shared)
|
||||
try:
|
||||
from run_vllm_bench import get_gpu_count, kill_vllm
|
||||
except ImportError:
|
||||
print("Error: Could not import run_vllm_bench.py.")
|
||||
sys.exit(1)
|
||||
|
||||
MODEL_TABLE = models.MODEL_TABLE
|
||||
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||
|
||||
# =========================
|
||||
# 🧠 GROUNDING & METHODOLOGY
|
||||
# =========================
|
||||
@@ -46,7 +58,7 @@ REPORT_FILE = Path("max_context_report.md")
|
||||
|
||||
# We test these GPU Utilizations steps to see how much we can squeeze
|
||||
# 0.90 is default, but we want MAX context.
|
||||
# 0.98 is our target high. 0.95 is the fallback.
|
||||
# 0.95 is our target high. 0.90 is the fallback.
|
||||
GPU_UTIL_STEPS = ["0.95", "0.90"]
|
||||
# We test these concurrency settings
|
||||
CONCURRENCY_STEPS = [1, 4, 8, 16]
|
||||
|
||||
@@ -2,30 +2,7 @@
|
||||
import subprocess, time, json, sys, os, requests, argparse, re
|
||||
from pathlib import Path
|
||||
|
||||
# =========================
|
||||
# ⚙️ GLOBAL SETTINGS
|
||||
# =========================
|
||||
|
||||
# CLUSTER CONFIG: 2x Strix Halo (TP=2)
|
||||
# User requested specifically to test with TP=2 on the cluster.
|
||||
CLUSTER_TP = 2
|
||||
GPU_UTIL = "0.90"
|
||||
|
||||
# THROUGHPUT CONFIG (Same as run_vllm_bench)
|
||||
OFF_NUM_PROMPTS = 200
|
||||
OFF_FORCED_OUTPUT = "512"
|
||||
DEFAULT_BATCH_TOKENS = "8192"
|
||||
|
||||
RESULTS_DIR = Path("benchmark_results")
|
||||
RESULTS_DIR.mkdir(exist_ok=True)
|
||||
|
||||
# Reuse the model table from the main benchmark script
|
||||
# We can just import it or copy it. Importing is cleaner but might rely on path.
|
||||
# For standalone robustness, I will copy the minimal needed config or import if possible.
|
||||
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
|
||||
# Let's assume it's in the same dir as run_vllm_bench.py.
|
||||
|
||||
|
||||
# Import models immediately to access globals
|
||||
try:
|
||||
import models
|
||||
except ImportError:
|
||||
@@ -37,10 +14,35 @@ except ImportError:
|
||||
except ImportError:
|
||||
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
|
||||
import models
|
||||
|
||||
|
||||
# =========================
|
||||
# ⚙️ GLOBAL SETTINGS
|
||||
# =========================
|
||||
|
||||
# CLUSTER CONFIG: 2x Strix Halo (TP=2)
|
||||
# User requested specifically to test with TP=2 on the cluster.
|
||||
CLUSTER_TP = 2
|
||||
GPU_UTIL = "0.90"
|
||||
|
||||
# THROUGHPUT CONFIG (Imported from models.py)
|
||||
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
|
||||
OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT
|
||||
DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS
|
||||
|
||||
RESULTS_DIR = Path("benchmark_results")
|
||||
RESULTS_DIR.mkdir(exist_ok=True)
|
||||
|
||||
# Reuse the model table from the shared models.py configuration module
|
||||
# We can just import it or copy it. Importing is cleaner but might rely on path.
|
||||
# For standalone robustness, I will copy the minimal needed config or import if possible.
|
||||
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
|
||||
# Let's assume it's in the same dir as run_vllm_bench.py.
|
||||
|
||||
|
||||
MODEL_TABLE = models.MODEL_TABLE
|
||||
MODELS_TO_RUN = models.MODELS_TO_RUN
|
||||
|
||||
|
||||
# =========================
|
||||
# UTILS (Adapted for Cluster)
|
||||
# =========================
|
||||
|
||||
+10
-10
@@ -4,14 +4,14 @@ MODEL_TABLE = {
|
||||
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
|
||||
"trust_remote": False,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24", # Strix Halo Optimized (Bandwidth Limit)
|
||||
"max_tokens": "32768"
|
||||
},
|
||||
|
||||
"google/gemma-3-12b-it": {
|
||||
"trust_remote": False,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24",
|
||||
"max_tokens": "32768"
|
||||
},
|
||||
# 2. GPT-OSS 20B (MXFP4)
|
||||
@@ -19,14 +19,14 @@ MODEL_TABLE = {
|
||||
"openai/gpt-oss-20b": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24",
|
||||
"max_tokens": "8192"
|
||||
},
|
||||
|
||||
"openai/gpt-oss-120b": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [1],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "16",
|
||||
"max_tokens": "8192"
|
||||
},
|
||||
|
||||
@@ -34,7 +34,7 @@ MODEL_TABLE = {
|
||||
"Qwen/Qwen3-14B-AWQ": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [1], # Too big for single GPU
|
||||
"max_num_seqs": "32", # Lower concurrency for safety
|
||||
"max_num_seqs": "24", # Strix Halo Optimized
|
||||
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
||||
"enforce_eager": False,
|
||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||
@@ -45,7 +45,7 @@ MODEL_TABLE = {
|
||||
"trust_remote": True,
|
||||
"enforce_eager": False,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24",
|
||||
"max_tokens": "32768"
|
||||
},
|
||||
|
||||
@@ -53,7 +53,7 @@ MODEL_TABLE = {
|
||||
"trust_remote": True,
|
||||
"enforce_eager": False,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24",
|
||||
"max_tokens": "32768"
|
||||
},
|
||||
|
||||
@@ -61,7 +61,7 @@ MODEL_TABLE = {
|
||||
"trust_remote": True,
|
||||
"enforce_eager": False,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_num_seqs": "24",
|
||||
"max_tokens": "32768",
|
||||
},
|
||||
|
||||
@@ -71,7 +71,7 @@ MODEL_TABLE = {
|
||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [1], # Too big for single GPU
|
||||
"max_num_seqs": "32", # Lower concurrency for safety
|
||||
"max_num_seqs": "16", # Large Model / Bandwidth Constrained
|
||||
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
|
||||
"enforce_eager": True,
|
||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||
@@ -93,6 +93,6 @@ MODELS_TO_RUN = [
|
||||
|
||||
# Hardware / Global Defaults
|
||||
GPU_UTIL = "0.90"
|
||||
OFF_NUM_PROMPTS = 200
|
||||
OFF_NUM_PROMPTS = 100 # Reduced for Strix Halo (Bandwidth Limited)
|
||||
OFF_FORCED_OUTPUT = "512"
|
||||
DEFAULT_BATCH_TOKENS = "8192"
|
||||
|
||||
Посилання в новій задачі
Заблокувати користувача