feat: centralize model configurations and benchmark settings into a new models.py module and update Dockerfile and scripts to use it.

Cette révision appartient à :
Donato Capitella
2026-02-01 21:17:15 +00:00
Parent 4b09188776
révision ba503f6e61
6 fichiers modifiés avec 154 ajouts et 97 suppressions
+98
Voir le fichier
@@ -0,0 +1,98 @@
MODEL_TABLE = {
# 1. Llama 3.1 8B Instruct
# MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
"trust_remote": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
"google/gemma-3-12b-it": {
"trust_remote": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
# 2. GPT-OSS 20B (MXFP4)
# MAD Row 0 uses 8192. We match this exactly.
"openai/gpt-oss-20b": {
"trust_remote": True,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "8192"
},
"openai/gpt-oss-120b": {
"trust_remote": True,
"valid_tp": [1],
"max_num_seqs": "64",
"max_tokens": "8192"
},
"Qwen/Qwen3-14B-AWQ": {
"trust_remote": True,
"valid_tp": [1], # Too big for single GPU
"max_num_seqs": "32", # Lower concurrency for safety
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
"enforce_eager": False,
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
# 4. Qwen 30B 4-bit
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit": {
"trust_remote": True,
"enforce_eager": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit": {
"trust_remote": True,
"enforce_eager": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
},
"zai-org/GLM-4.7-Flash": {
"trust_remote": True,
"enforce_eager": False,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768",
},
# 5. Qwen 80B AWQ
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
# Config: 20k ctx fits in that cache. Eager mode required for stability.
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
"trust_remote": True,
"valid_tp": [1], # Too big for single GPU
"max_num_seqs": "32", # Lower concurrency for safety
"max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive
"enforce_eager": True,
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
}
MODELS_TO_RUN = [
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"google/gemma-3-12b-it",
"Qwen/Qwen3-14B-AWQ",
"openai/gpt-oss-20b",
"openai/gpt-oss-120b",
"zai-org/GLM-4.7-Flash",
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
]
# Hardware / Global Defaults
GPU_UTIL = "0.90"
OFF_NUM_PROMPTS = 200
OFF_FORCED_OUTPUT = "512"
DEFAULT_BATCH_TOKENS = "8192"
+8 -3
Voir le fichier
@@ -12,16 +12,21 @@ SCRIPT_DIR = Path(__file__).parent.resolve()
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
OPT_DIR = Path("/opt")
# Check /opt first (Container), then local fallback
# Check /opt first (Container), then local fallback for results file location
if (OPT_DIR / "run_vllm_bench.py").exists():
sys.path.append(str(OPT_DIR))
else:
sys.path.append(str(BENCH_DIR))
# Also ensure current script dir is in path for local 'models' import if not already
sys.path.append(str(SCRIPT_DIR))
try:
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
import models
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
except ImportError:
print("Error: Could not import run_vllm_bench.py config.")
print("Error: Could not import models.py config.")
sys.exit(1)
if (OPT_DIR / "max_context_results.json").exists():
+6 -2
Voir le fichier
@@ -13,16 +13,20 @@ SCRIPT_DIR = Path(__file__).parent.resolve()
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
OPT_DIR = Path("/opt")
# Check /opt first (Container), then local fallback
if (OPT_DIR / "run_vllm_bench.py").exists():
sys.path.append(str(OPT_DIR))
else:
sys.path.append(str(BENCH_DIR))
sys.path.append(str(SCRIPT_DIR))
try:
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
import models
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
except ImportError:
print("Error: Could not import run_vllm_bench.py config.")
print("Error: Could not import models.py config.")
sys.exit(1)
if (OPT_DIR / "max_context_results.json").exists():