feat: centralize model configurations and benchmark settings into a new models.py module and update Dockerfile and scripts to use it.

This commit is contained in:
Donato Capitella
2026-02-01 21:17:15 +00:00
förälder 4b09188776
incheckning ba503f6e61
6 ändrade filer med 154 tillägg och 97 borttagningar
+22 -85
Visa fil
@@ -2,17 +2,32 @@
import subprocess, time, json, sys, os, requests, argparse
from pathlib import Path
# =========================
# ⚙️ GLOBAL SETTINGS
# =========================
# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5)
GPU_UTIL = "0.90"
# 1. THROUGHPUT CONFIG
OFF_NUM_PROMPTS = 200
OFF_FORCED_OUTPUT = "512"
# Default fallback if not specified in MODEL_TABLE
DEFAULT_BATCH_TOKENS = "8192"
try:
import models
except ImportError:
# If running locally and models.py is in ../scripts?
# Or if running in /opt where models.py is alongside.
# We will try adding current dir to path just in case
sys.path.append(os.getcwd())
try:
import models
except ImportError:
# Fallback for local structure: assuming this is in benchmarks/ and models is in scripts/
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
import models
# Import from shared config
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
GPU_UTIL = models.GPU_UTIL
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT
DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS
# Fallbacks
FALLBACK_INPUT_LEN = 1024
@@ -21,84 +36,6 @@ FALLBACK_OUTPUT_LEN = 512
RESULTS_DIR = Path("benchmark_results")
RESULTS_DIR.mkdir(exist_ok=True)
# =========================
# 🛠️ MODEL CONFIGURATION 🛠️
# =========================
MODEL_TABLE = {
# 1. Llama 3.1 8B Instruct
# MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
"trust_remote": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
"google/gemma-3-12b-it": {
"trust_remote": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
# 2. GPT-OSS 20B (MXFP4)
# MAD Row 0 uses 8192. We match this exactly.
"openai/gpt-oss-20b": {
"trust_remote": True,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "8192"
},
"openai/gpt-oss-120b": {
"trust_remote": True,
"valid_tp": [1],
"max_num_seqs": "64",
"max_tokens": "8192"
},
"Qwen/Qwen3-14B-AWQ": {
"trust_remote": True,
"valid_tp": [1], # Too big for single GPU
"max_num_seqs": "32", # Lower concurrency for safety
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
"enforce_eager": False,
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
# 4. Qwen 30B 4-bit
"cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": {
"trust_remote": True,
"enforce_eager": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
# 5. Qwen 80B AWQ
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
# Config: 20k ctx fits in that cache. Eager mode required for stability.
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
"trust_remote": True,
"valid_tp": [1], # Too big for single GPU
"max_num_seqs": "32", # Lower concurrency for safety
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
"enforce_eager": True,
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
}
MODELS_TO_RUN = [
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"google/gemma-3-12b-it",
"Qwen/Qwen3-14B-AWQ",
"openai/gpt-oss-20b",
"openai/gpt-oss-120b",
"cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
]
# =========================
# UTILS
+13 -4
Visa fil
@@ -25,12 +25,21 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Since this is a new file in root/benchmarks? No, likely scripts/ or same dir.
# Let's assume it's in the same dir as run_vllm_bench.py.
try:
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
import models
except ImportError:
# Fallback if run directly and path issues
sys.path.append(os.path.dirname(__file__))
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
# If in /opt, this should work if path includes ., otherwise:
sys.path.append(os.getcwd())
try:
import models
# Also try parent/scripts for local dev if above failed?
except ImportError:
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
import models
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
# =========================
# UTILS (Adapted for Cluster)