Bu işleme şunda yer alıyor:
Donato Capitella
2025-12-20 11:37:06 +00:00
ebeveyn f19932b360
işleme 5e8b6bb545
20 değiştirilmiş dosya ile 3612 ekleme ve 248 silme
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
{
"elapsed_time": 1302.7062463890015,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.15352655332265747,
"tokens_per_second": 112.69232830266365
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 540.2676798280002,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.37018686748700586,
"tokens_per_second": 271.7264154071495
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1303.4944151099999,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.15343372221746138,
"tokens_per_second": 112.62418795067208
}
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
{
"elapsed_time": 914.8563823220001,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.21861354838273012,
"tokens_per_second": 162.71078485804028
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 522.8661062630126,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.38250710383471637,
"tokens_per_second": 278.99494393048457
}
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
{
"elapsed_time": 1339.915984058,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.14926308990977954,
"tokens_per_second": 109.73523843987172
}
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
{
"elapsed_time": 468.4791132300161,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.42691337639593563,
"tokens_per_second": 313.85817605876395
}
+575
Dosyayı Görüntüle
@@ -0,0 +1,575 @@
#!/usr/bin/env python3
import subprocess
import time
import socket
import json
import sys
import os
import requests
import re
import argparse
from pathlib import Path
try:
from transformers import AutoConfig
except ImportError:
print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
sys.exit(1)
# Import configuration from average benchmark script
try:
from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm
except ImportError:
print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.")
sys.exit(1)
# =========================
# 🧠 GROUNDING & METHODOLOGY
# =========================
# This script finds the Maximum Working Context (MWC) for vLLM models.
#
# Methodology:
# 1. **Inspect**: Use `transformers.AutoConfig` to determine the model's theoretical limit
# (e.g., `max_position_embeddings`).
# 2. **Probe**: Launch `vllm serve` at this limit.
# 3. **React**:
# - If stable ("Application startup complete"): Success.
# - If OOM ("KV cache capacity... is X"): Retry with vLLM's suggested X.
# - If Config Error ("max_model_len... is Y"): Retry with vLLM's suggested Y.
# =========================
# ⚙️ CONFIG
# =========================
# Address of the local vLLM server that each probe launches and then queries.
HOST = "127.0.0.1"
PORT = 8000
# Probe results are written to this JSON file after every new probe, and are
# loaded again at startup so an interrupted sweep can resume.
RESULTS_FILE = Path("max_context_results.json")
# NOTE(review): not referenced in the visible part of this script; presumably
# consumed by the separate report script — confirm.
REPORT_FILE = Path("max_context_report.md")
# GPU memory utilization values passed to `--gpu-memory-utilization`.
# They are tried in list order: 0.95 first, then 0.90 as the fallback.
# NOTE(review): an earlier comment named 0.98 as the target high, but 0.98 is
# not in this list.
GPU_UTIL_STEPS = ["0.95", "0.90"]
# Values passed to `--max-num-seqs`, tried in ascending order.
CONCURRENCY_STEPS = [1, 4, 8, 16]
def log(msg):
    """Print *msg* to stdout with the ``[MAX-CTX]`` prefix and flush immediately."""
    line = f"[MAX-CTX] {msg}"
    print(line, flush=True)
def get_hf_context_limit(model_name, trust_remote=False):
    """
    Return the context length advertised by the model's Hugging Face config.

    Lookup order:
      1. ``text_config.max_position_embeddings`` (multi-config models such as Gemma 3).
      2. The first of ``max_position_embeddings``, ``seq_length``, ``max_seq_len``,
         ``n_positions`` on the top-level config that is not None.
      3. 8192 when none of the attributes is present.

    Any exception while loading or reading the config is logged and 32768 is
    returned instead.
    """
    candidate_attrs = (
        "max_position_embeddings",
        "seq_length",
        "max_seq_len",
        "n_positions",
    )
    try:
        hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote)

        # Nested text config takes precedence when it carries the attribute.
        if hasattr(hf_config, "text_config"):
            text_cfg = hf_config.text_config
            if hasattr(text_cfg, "max_position_embeddings"):
                return int(text_cfg.max_position_embeddings)

        # Lazily walk the standard attribute names; stop at the first real value.
        values = (getattr(hf_config, name, None) for name in candidate_attrs)
        limit = next((v for v in values if v is not None), None)
        return 8192 if limit is None else int(limit)
    except Exception as exc:
        log(f"Warning: Could not read config for {model_name}: {exc}. Defaulting to 32768.")
        return 32768
def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
    """
    Build the ``vllm serve`` argument list and environment for one probe.

    Returns:
        (cmd, env): ``cmd`` is the argv list; ``env`` is a copy of the current
        environment overlaid with the model's ``env`` entry from MODEL_TABLE.
    """
    model_cfg = MODEL_TABLE[model]

    # Boolean MODEL_TABLE switches and the CLI flag each one enables.
    switch_flags = (
        ("trust_remote", "--trust-remote-code"),
        ("enforce_eager", "--enforce-eager"),
    )
    extra_flags = [flag for key, flag in switch_flags if model_cfg.get(key)]

    argv = [
        "vllm", "serve", model,
        "--gpu-memory-utilization", str(util),
        "--max-model-len", str(max_len),
        "--tensor-parallel-size", str(tp_size),
        "--max-num-seqs", str(max_seqs),
        "--dtype", "auto",
    ] + extra_flags

    # Model-specific variables override whatever the parent process has set.
    launch_env = {**os.environ, **model_cfg.get("env", {})}
    return argv, launch_env
def is_port_free(port, host="localhost"):
    """
    Return True when nothing accepts TCP connections on ``host:port``.

    Args:
        port: TCP port number to probe.
        host: IPv4 host name or address to probe. Defaults to ``"localhost"``,
            which is what this function always probed before the parameter
            existed, so existing callers are unaffected.

    Returns:
        bool: True if the connection attempt fails (port considered free),
        False if a listener accepted the connection.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # connect_ex returns 0 on success and an errno value otherwise.
        return probe.connect_ex((host, port)) != 0
def force_cleanup(hard=False):
    """
    Kill every vLLM-related process and wait until PORT is free.

    Each loop iteration sends SIGKILL to all known vLLM process patterns, then
    checks with ``pgrep`` whether any survive and whether the port is still
    held. The function returns as soon as both checks are clean, or gives up
    after the time budget with a last-resort ``killall``.

    All external commands are run as argument lists (no shell). With
    ``shell=True`` the ``sh -c "<cmd> pattern"`` wrapper itself contains the
    pattern, so ``pgrep -f`` / ``pkill -f`` match their own wrapper; that made
    the debug listing misleading.

    Args:
        hard: When True the time budget is 20 seconds instead of 10.
    """
    # Patterns that get SIGKILLed on every iteration.
    kill_patterns = (
        "vllm.entrypoints.api_server",
        "vllm serve",
        "VLLM::",
        "multiprocessing.spawn",
        "ray::",
    )
    # Patterns whose survival means we are not clean yet.
    probe_patterns = (
        "vllm serve",
        "vllm.entrypoints.api_server",
        "VLLM::",
    )

    def _best_effort(argv):
        # Kill helpers are best-effort: a missing binary must not abort cleanup
        # (the former shell invocation failed silently in that case too).
        try:
            subprocess.run(argv, stderr=subprocess.DEVNULL)
        except FileNotFoundError:
            pass

    timeout = 20 if hard else 10
    start_time = time.time()

    while True:
        # 1. Aggressive kill: repeated every iteration until the processes die.
        for pattern in kill_patterns:
            _best_effort(["pkill", "-9", "-f", pattern])

        # 2. Check which patterns, if any, are still alive.
        dirty = False
        for pattern in probe_patterns:
            found = subprocess.run(
                ["pgrep", "-f", pattern], stdout=subprocess.DEVNULL
            ).returncode == 0
            if found:
                log(f"Clean waiting: Found '{pattern}' process:")
                subprocess.run(["pgrep", "-a", "-f", pattern])  # debug listing
                dirty = True

        if not dirty:
            # Processes are gone. Now check the port.
            if is_port_free(PORT):
                time.sleep(1)  # Final safety buffer
                return  # Clean!
            log(f"Clean: Processes gone, but Port {PORT} still held. Waiting...")
        else:
            log("Clean: Processes still detected. Retrying kill...")

        if time.time() - start_time > timeout:
            log("CRITICAL: Cleanup timed out! Force attempting `killall -9 vllm` as last resort.")
            _best_effort(["killall", "-9", "vllm"])
            break

        time.sleep(1.5)  # Wait a bit before hammering again
def wait_for_server_and_parse(process, timeout=300):
    """
    Read the server's output until it reports readiness, exits, or `timeout` elapses.

    While reading, the function extracts the KV-cache size (either the legacy
    "# GPU blocks: N" line or the newer "GPU KV cache size: N tokens" line),
    an optional "block_size=N", and remembers known failure hints.

    Args:
        process: Object exposing `poll()` and a `stdout` stream that yields
            bytes lines (each line is decoded as UTF-8 here).
        timeout: Seconds to wait before giving up.

    Returns:
        (ready_bool, gpu_blocks, block_size, max_len_clamped, failure_reason)
        - Success: (True, gpu_blocks, block_size, None, None). When the newer
          token-count line was seen, gpu_blocks holds the token count and
          block_size is 1 unless a later "block_size=N" line overrides it.
        - Failure: (False, 0, 0, None, reason). `reason` is a short tag, a
          captured log line, "Timeout", or None when the process died without
          any recognised message.
        `max_len_clamped` is never assigned after initialisation, so it is
        always None.
    """
    start = time.time()
    gpu_blocks = 0
    block_size = 16 # default
    max_len_clamped = None
    logs = []
    failure_reason = None
    # NOTE(review): readline() below presumably blocks until a line or EOF
    # arrives, so the timeout is only re-checked between lines — confirm.
    while time.time() - start < timeout:
        if process.poll() is not None:
            # Process died. Drain whatever output is left.
            for line in process.stdout:
                line_str = line.decode("utf-8", errors="replace").strip()
                logs.append(line_str)
            # SCAN FULL HISTORY if not found yet
            # Sometimes error was in previous lines or split
            if not failure_reason:
                full_log = "\n".join(logs)
                # Check 1: Sampler OOM
                if "warming up sampler" in full_log and "CUDA out of memory" in full_log:
                    failure_reason = "Sampler Warmup OOM"
                # Check 2: Explicit vLLM suggestion (Estimated)
                # "estimated maximum model length is 127120"
                elif "estimated maximum model length is" in full_log:
                    m = re.search(r"estimated maximum model length is (\d+)", full_log)
                    if m:
                        failure_reason = f"estimated maximum model length is {m.group(1)}"
                # Check 3: Derived Max Model Len
                # "derived max_model_len (max_position_embeddings=131072.0 ...)"
                elif "derived max_model_len" in full_log:
                    failure_reason = "derived max_model_len detected"
                # Check 4: Capacity/Value Error
                elif "ValueError" in full_log and "maximum number of tokens" in full_log:
                    failure_reason = "Capacity Error (Found in history)"
                # Check 5: Generic OOM
                elif "CUDA out of memory" in full_log or "hipErrorOutOfMemory" in full_log:
                    failure_reason = "OOM detected"
            if not failure_reason:
                # Unexpected death! Dump logs to see why.
                log("CRITICAL: Process died unexpectedly! Dumping last 100 lines:")
                print("=== vLLM SERVER LOGS (LAST 100 LINES) ===")
                for l in logs[-100:]:
                    print(l)
                print("=============================================")
            return False, 0, 0, None, failure_reason
        line = process.stdout.readline()
        if line:
            line_str = line.decode("utf-8", errors="replace").strip()
            logs.append(line_str)
            # 1. Parse Legacy "GPU blocks" (if present)
            m_blocks = re.search(r"# GPU blocks:\s*(\d+)", line_str)
            if m_blocks:
                gpu_blocks = int(m_blocks.group(1))
                block_size = 16 # assume default unless found
                log(f" -> Found GPU blocks: {gpu_blocks} (Legacy)")
            # 2. Parse Newer "GPU KV cache size" (vLLM 0.11+)
            # "GPU KV cache size: 111,536 tokens"
            m_kv_tokens = re.search(r"GPU KV cache size:\s*([\d,]+)\s*tokens", line_str)
            if m_kv_tokens:
                tokens_str = m_kv_tokens.group(1).replace(",", "")
                gpu_blocks = int(tokens_str) # We use 'gpu_blocks' variable to store total tokens now for simplicity
                block_size = 1 # Effectively 1 because we have the total count
                log(f" -> Found GPU KV Cache tokens: {gpu_blocks}")
            # 3. Parse Block Size (optional, mostly for legacy)
            m_bs = re.search(r"block_size=(\d+)", line_str)
            if m_bs:
                block_size = int(m_bs.group(1))
            # Failure hints: a later matching line overwrites an earlier one.
            if "ValueError" in line_str and "maximum number of tokens" in line_str:
                failure_reason = line_str
            if "derived max_model_len" in line_str:
                failure_reason = line_str
            if "warming up sampler" in line_str and "CUDA out of memory" in line_str:
                failure_reason = "Sampler Warmup OOM"
            elif "CUDA out of memory" in line_str or "hipErrorOutOfMemory" in line_str:
                failure_reason = "OOM detected"
            # Check for startup
            if "Application startup complete" in line_str or "Uvicorn running on" in line_str:
                if gpu_blocks > 0:
                    log(" -> Server signal detected. Waiting 5s for socket stability...")
                    time.sleep(5)
                    return True, gpu_blocks, block_size, max_len_clamped, None
                else:
                    # Startup was announced but no cache size was ever parsed.
                    return False, 0, 0, None, "Parsed Success but Token/Block Count was 0"
    # Timeout case
    log("CRITICAL: Server startup timed out! Dumping last 100 lines:")
    print("=== vLLM SERVER LOGS (LAST 100 LINES) ===")
    for l in logs[-100:]:
        print(l)
    print("=============================================")
    return False, 0, 0, None, "Timeout"
def verify_context(model, context_len):
    """
    Send one completion request to the local server as a liveness check.

    The prompt is "A " repeated ``int(context_len * 0.5)`` times, i.e. roughly
    half of ``context_len`` if each repetition is one token.

    Returns:
        (ok, message): ``(True, "Success")`` on HTTP 200; otherwise ``False``
        with the HTTP status and body prefix, the exception text, or a
        retry-exhausted message.
    """
    endpoint = f"http://{HOST}:{PORT}/v1/completions"
    request_body = dict(
        model=model,
        prompt="A " * int(context_len * 0.5),
        max_tokens=10,
        temperature=0,
    )

    # Only refused connections are retried (the server socket may not be up yet).
    total_attempts = 5
    for attempt_no in range(1, total_attempts + 1):
        try:
            # Generous timeout: prefilling a long prompt can take minutes.
            response = requests.post(endpoint, json=request_body, timeout=300)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}: {response.text[:200]}"
            return True, "Success"
        except requests.exceptions.ConnectionError:
            if attempt_no >= total_attempts:
                return False, "Connection Refused (Max Retries)"
            log(f" -> Connection refused. Retrying verification ({attempt_no}/{total_attempts})...")
            time.sleep(2)
        except Exception as exc:
            return False, str(exc)

    return False, "Unknown Error"
def run_probe(model, tp, util, max_seqs, start_limit=None):
    """
    Probe one (model, TP, utilization, max-seqs) configuration for a working context length.

    Starting from the model's advertised limit (optionally capped by
    ``start_limit``), the function launches ``vllm serve``, waits for it to
    come up, and verifies it with a small request. On failure it retries with
    the limit suggested by vLLM's error message, or 80% of the previous value,
    until the target drops below 2048.

    Args:
        model: Key into MODEL_TABLE / Hugging Face model id.
        tp: Tensor-parallel size.
        util: GPU memory utilization value for the server.
        max_seqs: Value for ``--max-num-seqs``.
        start_limit: Optional upper bound for the first attempt (a length that
            worked for a previous run).

    Returns:
        dict with keys ``model``, ``tp``, ``util``, ``max_seqs``,
        ``model_limit``, ``configured_len``, ``real_capacity``, ``status``
        ("success" or "fail") and ``error``. On success ``max_context_1_user``
        is added and ``error`` is reset to "" so that a failure from an earlier,
        larger attempt is not reported alongside a successful result.
    """
    trust_remote = MODEL_TABLE[model].get("trust_remote", False)

    # 1. Get the advertised limit from the model config.
    arch_limit = get_hf_context_limit(model, trust_remote)

    # If a lower limit already worked for a previous run, start there.
    target_len = arch_limit
    if start_limit:
        target_len = min(arch_limit, start_limit)
        log(f" -> Smart Start: Capping initial probe at {target_len} (based on previous run)")

    result_data = {
        "model": model,
        "tp": tp,
        "util": util,
        "max_seqs": max_seqs,
        "model_limit": arch_limit,
        "configured_len": 0,
        "real_capacity": 0,
        "status": "fail",
        "error": ""
    }

    log(f"Probing {model} | TP={tp} | Util={util} | Seqs={max_seqs} | Model Limit={arch_limit}")

    # Loop until we succeed OR we drop below a useful context size.
    while target_len >= 2048:
        force_cleanup()
        cmd, env = get_vllm_server_cmd(model, tp, util, target_len, max_seqs)
        log(f"DEBUG: Cmd: {' '.join(cmd)}")
        proc = None
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
            ready, blocks, block_size, _, fail_msg = wait_for_server_and_parse(proc)

            if ready:
                total_capacity = blocks * block_size
                workable_len = min(target_len, total_capacity)

                # The check is capped at 4096: it only needs to show the server
                # answers, not to fill the whole context.
                verify_len = min(workable_len, 4096)
                log(f" -> Server ready. Verifying stability with approx {int(verify_len * 0.5)} tokens (capped at 4k)...")
                v_ok, v_msg = verify_context(model, verify_len)

                if v_ok:
                    log(f" -> Success! capacity={total_capacity}, configured={workable_len}")
                    log(f" -> Verification passed: {v_msg}")

                    # Stop the successful server right away.
                    proc.terminate()
                    try:
                        proc.wait(timeout=5)
                    except subprocess.TimeoutExpired:
                        proc.kill()

                    result_data["status"] = "success"
                    # Clear any error left behind by an earlier failed attempt.
                    result_data["error"] = ""
                    result_data["configured_len"] = target_len
                    result_data["real_capacity"] = total_capacity
                    result_data["max_context_1_user"] = workable_len
                    return result_data

                log(f" -> Server started, but Verification FAILED: {v_msg}")
                # Treat as a crash/failure.
                fail_msg = "Verification Failed"
                # Terminate and print whatever output the server still had.
                proc.terminate()
                try:
                    outs, _errs = proc.communicate(timeout=5)
                    if outs:
                        print("=== vLLM SERVER LOGS (DURING VERIFICATION FAILURE) ===")
                        print(outs.decode('utf-8', errors='replace'))
                        print("======================================================")
                except Exception:
                    proc.kill()

            # Reaching this point means ready=False OR verification failed.
            log(f" -> Attempt failed at {target_len}")
            if fail_msg:
                log(f" Reason: {fail_msg}")
            result_data["error"] = fail_msg if fail_msg else "Process died or timed out"

            if fail_msg:
                # Case V: server came up but inference is unstable.
                # Checked first so no other case can claim the message.
                if "Verification Failed" in str(fail_msg):
                    log(" -> Verification Failed (Unstable). Aborting this Util, dropping to lower tier.")
                    break

                # Case S: sampler warm-up OOM is fatal for this Util/Seq combo.
                if "Sampler Warmup OOM" in fail_msg:
                    log(" -> Critical Sampler OOM. Utilization/Seqs too high. Aborting this configuration.")
                    break

                # Case X: memory still held by a previous run.
                # "Free memory on device (1.56/31.86 GiB) on startup is less than desired..."
                if "Free memory on device" in fail_msg and "less than desired" in fail_msg:
                    log(" -> Dirty VRAM detected (previous run didn't cleanup?). Retrying with HARD cleanup.")
                    force_cleanup(hard=True)
                    continue  # Retry SAME target_len

                # Case A: VRAM limit ("maximum number of tokens... is X").
                m_capacity = re.search(r"maximum number of tokens.*?KV cache is (\d+)", fail_msg)
                if m_capacity:
                    cap = int(m_capacity.group(1))
                    log(f" -> Found Hardware Capacity: {cap}")
                    target_len = cap
                    continue  # Retry at the exact capacity

                # Case B: requested length exceeds the model's derived limit.
                # "... is greater than the derived max_model_len (max_position_embeddings=131072.0 ...)"
                if "derived max_model_len" in fail_msg:
                    m_pos = re.search(r"max_position_embeddings=([\d\.]+)", fail_msg)
                    if m_pos:
                        limit = int(float(m_pos.group(1)))  # handles "131072.0"
                        log(f" -> Found Model Limit: {limit}")
                        target_len = limit
                        continue
                    # Fallback: older "derived max_model_len (N)" form.
                    m_derived = re.search(r"derived max_model_len\s*\((\d+)\)", fail_msg)
                    if m_derived:
                        limit = int(m_derived.group(1))
                        log(f" -> Found Model Limit (Legacy): {limit}")
                        target_len = limit
                        continue

                # Case C: vLLM's own estimate.
                # "estimated maximum model length is 111536"
                m_est = re.search(r"estimated maximum model length is (\d+)", fail_msg)
                if m_est:
                    limit = int(m_est.group(1))
                    log(f" -> Found vLLM Estimated Limit: {limit}")
                    target_len = limit
                    continue

            # Case D: generic OOM/crash/timeout, or no reason at all.
            target_len = int(target_len * 0.8)
            log(f" -> Backing off to: {target_len}")
            if target_len < 2048:
                log(" -> Give up (too small)")
                break
        finally:
            # Runs on every exit path (return, break, continue, exception).
            if proc is not None:
                try:
                    proc.terminate()
                except OSError:
                    pass
                try:
                    proc.kill()
                except OSError:
                    pass
                proc.wait()
            force_cleanup()

    return result_data
def main():
    """
    Sweep every selected model over TP sizes, GPU utilizations and concurrency levels.

    Command-line options:
        --model: only run models whose name contains this substring.
        --steps: stop after this many models (-1 means no limit); the limit is
            not applied when --model is given.

    Results are loaded from RESULTS_FILE at startup (resume support) and the
    whole list is rewritten to that file after each new probe.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="Filter to run only this model (substring match)")
    parser.add_argument("--steps", type=int, default=-1, help="Number of models to run (default: all)")
    args = parser.parse_args()
    gpu_count = get_gpu_count()
    # 1. Load existing results to support RESUME
    results = []
    if RESULTS_FILE.exists():
        try:
            with open(RESULTS_FILE, "r") as f:
                results = json.load(f)
            log(f"Loaded {len(results)} previous results. Resuming...")
        except Exception as e:
            log(f"Warning: Could not read existing results: {e}")
    count = 0
    for model in MODELS_TO_RUN:
        if args.model and args.model not in model:
            continue
        config = MODEL_TABLE[model]
        # Only TP sizes that fit on the detected number of GPUs.
        valid_tps = [t for t in config["valid_tp"] if t <= gpu_count]
        for tp in valid_tps:
            # Concurrency levels that already succeeded for this TP.
            # Since we iterate high-util -> low-util, a level in this set is
            # skipped at every lower utilization.
            successful_seqs = set()
            # Reset smart limit for each TP (TP2 should not inherit TP1's limit)
            last_working_len = None
            for util in GPU_UTIL_STEPS:
                for seqs in CONCURRENCY_STEPS:
                    if seqs in successful_seqs:
                        log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Already succeeded at higher util.")
                        continue
                    # Check if we already have this result
                    existing_res = next((r for r in results
                                         if r["model"] == model
                                         and r["tp"] == tp
                                         and str(r["util"]) == str(util)
                                         and r["max_seqs"] == seqs), None)
                    if existing_res:
                        res = existing_res
                        log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Found in results.")
                    else:
                        # New run
                        res = run_probe(model, tp, util, seqs, start_limit=last_working_len)
                        results.append(res)
                        # Save immediately
                        with open(RESULTS_FILE, "w") as f:
                            json.dump(results, f, indent=2)
                    # Update logic for Resume OR New Run:
                    if res["status"] == "success":
                        last_working_len = res["configured_len"]
                        successful_seqs.add(seqs) # Mark this seq count as done for this TP
                    # Smart Break: If we failed at this concurrency level (capacity=0),
                    # higher concurrency will also fail. This leaves only the
                    # concurrency loop; the next utilization step still runs.
                    if res["real_capacity"] == 0 or res["status"] == "fail":
                        log(f"Stopping higher concurrency tests for {model} (failed at {seqs} seqs)")
                        break
        count += 1
        if args.steps != -1 and count >= args.steps and not args.model:
            break
    # generate_report(results) - Moved to separate script
if __name__ == "__main__":
    main()
+333
Dosyayı Görüntüle
@@ -0,0 +1,333 @@
[
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 829952,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tp": 1,
"util": "0.95",
"max_seqs": 4,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 830064,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tp": 1,
"util": "0.95",
"max_seqs": 8,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 830080,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tp": 1,
"util": "0.95",
"max_seqs": 16,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 830064,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "google/gemma-3-12b-it",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 246032,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "google/gemma-3-12b-it",
"tp": 1,
"util": "0.95",
"max_seqs": 4,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 246064,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "google/gemma-3-12b-it",
"tp": 1,
"util": "0.95",
"max_seqs": 8,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 246064,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "google/gemma-3-12b-it",
"tp": 1,
"util": "0.95",
"max_seqs": 16,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 246064,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 40960,
"configured_len": 0,
"real_capacity": 0,
"status": "fail",
"error": "Verification Failed"
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"tp": 1,
"util": "0.90",
"max_seqs": 1,
"model_limit": 40960,
"configured_len": 40960,
"real_capacity": 655712,
"status": "success",
"error": "",
"max_context_1_user": 40960
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"tp": 1,
"util": "0.90",
"max_seqs": 4,
"model_limit": 40960,
"configured_len": 40960,
"real_capacity": 655616,
"status": "success",
"error": "",
"max_context_1_user": 40960
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"tp": 1,
"util": "0.90",
"max_seqs": 8,
"model_limit": 40960,
"configured_len": 40960,
"real_capacity": 655600,
"status": "success",
"error": "",
"max_context_1_user": 40960
},
{
"model": "Qwen/Qwen3-14B-AWQ",
"tp": 1,
"util": "0.90",
"max_seqs": 16,
"model_limit": 40960,
"configured_len": 40960,
"real_capacity": 655600,
"status": "success",
"error": "",
"max_context_1_user": 40960
},
{
"model": "openai/gpt-oss-20b",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 2232848,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-20b",
"tp": 1,
"util": "0.95",
"max_seqs": 4,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 2232560,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-20b",
"tp": 1,
"util": "0.95",
"max_seqs": 8,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 2232544,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-20b",
"tp": 1,
"util": "0.95",
"max_seqs": 16,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 2232544,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-120b",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 711360,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-120b",
"tp": 1,
"util": "0.95",
"max_seqs": 4,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 711168,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-120b",
"tp": 1,
"util": "0.95",
"max_seqs": 8,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 711168,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "openai/gpt-oss-120b",
"tp": 1,
"util": "0.95",
"max_seqs": 16,
"model_limit": 131072,
"configured_len": 131072,
"real_capacity": 711168,
"status": "success",
"error": "",
"max_context_1_user": 131072
},
{
"model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 262144,
"configured_len": 262144,
"real_capacity": 1097712,
"status": "success",
"error": "",
"max_context_1_user": 262144
},
{
"model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"tp": 1,
"util": "0.95",
"max_seqs": 4,
"model_limit": 262144,
"configured_len": 0,
"real_capacity": 0,
"status": "fail",
"error": "Verification Failed"
},
{
"model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"tp": 1,
"util": "0.90",
"max_seqs": 4,
"model_limit": 262144,
"configured_len": 209715,
"real_capacity": 1029856,
"status": "success",
"error": "Process died or timed out",
"max_context_1_user": 209715
},
{
"model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
"tp": 1,
"util": "0.90",
"max_seqs": 8,
"model_limit": 262144,
"configured_len": 0,
"real_capacity": 0,
"status": "fail",
"error": "Verification Failed"
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"tp": 1,
"util": "0.95",
"max_seqs": 1,
"model_limit": 262144,
"configured_len": 0,
"real_capacity": 0,
"status": "fail",
"error": "Verification Failed"
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"tp": 1,
"util": "0.90",
"max_seqs": 1,
"model_limit": 262144,
"configured_len": 262144,
"real_capacity": 696320,
"status": "success",
"error": "",
"max_context_1_user": 262144
},
{
"model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"tp": 1,
"util": "0.90",
"max_seqs": 4,
"model_limit": 262144,
"configured_len": 0,
"real_capacity": 0,
"status": "fail",
"error": "Verification Failed"
}
]
+266
Dosyayı Görüntüle
@@ -0,0 +1,266 @@
#!/usr/bin/env python3
import subprocess, time, json, sys, os, requests, argparse
from pathlib import Path
# =========================
# ⚙️ GLOBAL SETTINGS
# =========================
# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5)
# Default value for --gpu-memory-utilization; a model may override it with a
# "gpu_util" entry in MODEL_TABLE.
GPU_UTIL = "0.90"
# 1. THROUGHPUT CONFIG
# Passed to the throughput benchmark as --num-prompts and --output-len.
OFF_NUM_PROMPTS = 200
OFF_FORCED_OUTPUT = "512"
# Default --max-num-batched-tokens when a model has no "max_tokens" entry in MODEL_TABLE
DEFAULT_BATCH_TOKENS = "8192"
# Fallbacks
# NOTE(review): these two constants are not referenced in the visible code;
# the random-data fallback in run_throughput passes the literal "1024".
FALLBACK_INPUT_LEN = 1024
FALLBACK_OUTPUT_LEN = 512
# Directory for the per-model throughput JSON files.
# It is created at import time, relative to the current working directory.
RESULTS_DIR = Path("benchmark_results")
RESULTS_DIR.mkdir(exist_ok=True)
# =========================
# 🛠️ MODEL CONFIGURATION 🛠️
# =========================
# Per-model benchmark settings, keyed by Hugging Face model id.
# Keys read by the code in this file:
#   trust_remote  -> adds --trust-remote-code
#   enforce_eager -> adds --enforce-eager
#   valid_tp      -> tensor-parallel sizes the model is allowed to run with
#   max_num_seqs  -> value for --max-num-seqs (string)
#   max_tokens    -> value for --max-num-batched-tokens (string)
#   env           -> extra environment variables for the benchmark subprocess
#   gpu_util, ctx -> optional overrides read by get_model_args (no entry sets them)
MODEL_TABLE = {
    # 1. Llama 3.1 8B Instruct
    # MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
    # NOTE(review): "32GB VRAM" does not match the 128GB Strix Halo named in
    # GLOBAL SETTINGS — confirm this rationale still applies.
    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
        "trust_remote": False,
        "valid_tp": [1, 2],
        "max_num_seqs": "64",
        "max_tokens": "32768"
    },
    "google/gemma-3-12b-it": {
        "trust_remote": False,
        "valid_tp": [1, 2],
        "max_num_seqs": "64",
        "max_tokens": "32768"
    },
    # 2. GPT-OSS 20B (MXFP4)
    # MAD Row 0 uses 8192. We match this exactly.
    "openai/gpt-oss-20b": {
        "trust_remote": True,
        "valid_tp": [1, 2],
        "max_num_seqs": "64",
        "max_tokens": "8192"
    },
    "openai/gpt-oss-120b": {
        "trust_remote": True,
        "valid_tp": [1],
        "max_num_seqs": "64",
        "max_tokens": "8192"
    },
    # 3. Qwen3 14B AWQ
    "Qwen/Qwen3-14B-AWQ": {
        "trust_remote": True,
        "valid_tp": [1],  # NOTE(review): an earlier comment said "Too big for single GPU", which contradicts TP=1
        "max_num_seqs": "32",  # Lower concurrency for safety
        "max_tokens": "16384",  # NOTE(review): earlier rationale cited Eager mode, but enforce_eager is False here
        "enforce_eager": False,
        "env": {"VLLM_USE_TRITON_AWQ": "1"}  # Fixes "Unsupported Hardware" error
    },
    # 4. Qwen 30B 4-bit
    "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": {
        "trust_remote": True,
        "enforce_eager": False,
        "valid_tp": [1, 2],
        "max_num_seqs": "64",
        "max_tokens": "32768"
    },
    # 5. Qwen3-Next 80B GPTQ Int4 (The Big One) [NEW]
    # Size: ~48GB. Eager mode required for stability.
    # NOTE(review): earlier notes ("Fits on 2x32GB", "20k ctx fits in that cache")
    # describe a two-GPU setup, while valid_tp is [1] — confirm and update.
    "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
        "trust_remote": True,
        "valid_tp": [1],  # NOTE(review): an earlier comment said "Too big for single GPU", which contradicts TP=1
        "max_num_seqs": "32",  # Lower concurrency for safety
        "max_tokens": "16384",  # Lower batch size because Eager mode is CPU intensive
        "enforce_eager": True,
        # NOTE(review): the variable name refers to AWQ but this is a GPTQ model — confirm it is needed
        "env": {"VLLM_USE_TRITON_AWQ": "1"}  # Fixes "Unsupported Hardware" error
    },
}
# Models benchmarked by this run, in order. Every entry must be a key of
# MODEL_TABLE. Commented-out entries are disabled; uncomment to include them.
MODELS_TO_RUN = [
    #"meta-llama/Meta-Llama-3.1-8B-Instruct",
    #"google/gemma-3-12b-it",
    #"Qwen/Qwen3-14B-AWQ",
    #"openai/gpt-oss-20b",
    #"openai/gpt-oss-120b",
    "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
    "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
]
# =========================
# UTILS
# =========================
def log(msg):
    """Print *msg* to stdout on its own line, preceded by a blank line and the ``[BENCH]`` tag."""
    text = f"\n[BENCH] {msg}"
    print(text)
def get_gpu_count():
    """
    Return the number of GPUs to benchmark on — always 1.

    ``rocm-smi --showid`` is still invoked, but only to log a message when it
    fails or cannot be started; its output is not parsed. The count is forced
    to 1 for the Strix Halo APU.
    """
    try:
        exit_code = subprocess.run(
            ["rocm-smi", "--showid"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        ).returncode
        if exit_code != 0:
            log("rocm-smi failed, defaulting to 1 GPU (Hardcoded Fallback)")
    except Exception as exc:
        log(f"Error detecting GPUs: {exc}, defaulting to 1 GPU")
    # Every path yields the same answer.
    return 1
def kill_vllm():
    """
    SIGKILL every process whose command line matches ``vllm serve``, then wait 5 seconds.

    Uses ``pkill`` with an argument list instead of a ``pgrep | xargs kill``
    shell pipeline. With a shell, the ``sh -c`` wrapper's own command line
    contains the pattern, so it was matched and killed as well.
    """
    try:
        subprocess.run(
            ["pkill", "-9", "-f", "vllm serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        # pkill is not installed: stay a silent no-op, as the shell pipeline was.
        pass
    # Give the killed processes time to release their resources.
    time.sleep(5)
def nuke_vllm_cache():
    """
    Delete ``~/.cache/vllm`` and recreate it empty, then pause 2 seconds.

    Does nothing when the directory does not exist. Failures to delete or
    recreate the directory are logged and otherwise ignored (best effort).
    Unlike the previous bare ``except``, KeyboardInterrupt is no longer
    swallowed.
    """
    cache = Path.home() / ".cache" / "vllm"
    if not cache.exists():
        return
    try:
        subprocess.run(["rm", "-rf", str(cache)], check=True)
        cache.mkdir(parents=True, exist_ok=True)
        time.sleep(2)
    except (subprocess.CalledProcessError, OSError) as e:
        log(f"WARNING: Could not reset vLLM cache at {cache}: {e}")
def get_dataset():
    """
    Return the path of the local ShareGPT dataset file, downloading it if missing.

    The download is written to a temporary ``.part`` file and renamed only
    after it completes. An interrupted or failed download therefore never
    leaves a truncated file that a later call would mistake for the dataset.

    Returns:
        str path of the dataset file, or None when the download failed (the
        caller then falls back to random input).
    """
    data_path = Path("ShareGPT_V3_unfiltered_cleaned_split.json")
    if data_path.exists():
        return str(data_path)

    log("Downloading ShareGPT dataset...")
    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
    tmp_path = data_path.with_name(data_path.name + ".part")
    try:
        # The context manager releases the streamed connection in all cases.
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only once it is complete.
        tmp_path.replace(data_path)
        return str(data_path)
    except Exception as e:
        # Remove the partial download, if one was started.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        log(f"WARNING: ShareGPT download failed ({e}). using RANDOM.")
        return None
def get_model_args(model, tp_size):
    """
    Build the model-related CLI arguments shared by the vLLM benchmark commands.

    Settings come from MODEL_TABLE; an unknown model falls back to
    ``{"max_num_seqs": "32"}``. ``gpu_util`` overrides the global GPU_UTIL and
    ``ctx`` adds an explicit ``--max-model-len``; without ``ctx`` no length
    flag is passed.

    Returns:
        list of argument strings (without the ``vllm ...`` command itself).
    """
    model_cfg = MODEL_TABLE.get(model, {"max_num_seqs": "32"})

    argv = [
        "--model", model,
        "--gpu-memory-utilization", model_cfg.get("gpu_util", GPU_UTIL),
        "--dtype", "auto",
        "--tensor-parallel-size", str(tp_size),
        "--max-num-seqs", model_cfg["max_num_seqs"],
    ]

    # Optional hard context limit.
    if "ctx" in model_cfg:
        argv += ["--max-model-len", model_cfg["ctx"]]

    # Boolean switches and the flag each one enables.
    for key, flag in (
        ("trust_remote", "--trust-remote-code"),
        ("enforce_eager", "--enforce-eager"),
    ):
        if model_cfg.get(key):
            argv.append(flag)

    return argv
def run_throughput(model, tp_size):
    """
    Run ``vllm bench throughput`` for one model / TP size and store the JSON result.

    The run is skipped when ``tp_size`` is not listed in the model's
    ``valid_tp`` or when the result file already exists. Before launching,
    running ``vllm serve`` processes are killed and the vLLM cache is reset.

    A failing benchmark (non-zero exit or unlaunchable command) is logged and
    the function returns normally so the remaining models still run.
    KeyboardInterrupt is no longer swallowed, so Ctrl-C aborts the sweep.
    """
    if tp_size not in MODEL_TABLE[model]["valid_tp"]:
        return

    model_safe = model.replace("/", "_")
    output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json"
    if output_file.exists():
        log(f"SKIP Throughput {model} (TP={tp_size})")
        return

    # Use ShareGPT when available, otherwise fixed-length random input.
    dataset_path = get_dataset()
    if dataset_path:
        dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path]
    else:
        dataset_args = ["--input-len", "1024"]

    # Model-specific batch tokens
    batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
    log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...")

    kill_vllm()
    nuke_vllm_cache()

    cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size)
    cmd.extend([
        "--num-prompts", str(OFF_NUM_PROMPTS),
        "--max-num-batched-tokens", batch_tokens,
        "--output-len", OFF_FORCED_OUTPUT,
        "--output-json", str(output_file),
        "--disable-log-stats"
    ])
    cmd.extend(dataset_args)

    # Environment: inherited variables plus model-specific ones (e.g. for AWQ).
    env = os.environ.copy()
    env.update(MODEL_TABLE[model].get("env", {}))

    try:
        subprocess.run(cmd, check=True, env=env)
    except (subprocess.CalledProcessError, OSError) as e:
        log(f"ERROR: Throughput failed {model} ({e})")
def print_summary(tps):
    """
    Print a table of tokens/second for every model in MODELS_TO_RUN and TP size in ``tps``.

    Rows are printed only for TP sizes listed in the model's ``valid_tp``.
    A missing, unreadable or malformed result file shows as "N/A".

    Args:
        tps: Iterable of tensor-parallel sizes to report.
    """
    print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}")
    print("-" * 60)
    for m in MODELS_TO_RUN:
        msafe = m.replace("/", "_")
        name_cell = m.split('/')[-1]
        for tp in tps:
            if tp not in MODEL_TABLE[m]["valid_tp"]:
                continue
            result_path = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
            try:
                tdata = json.loads(result_path.read_text())
                tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
            except (OSError, ValueError, TypeError, AttributeError):
                # File missing/unreadable, invalid JSON, or unexpected JSON shape.
                tok_s = "N/A"
            print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}")
    print("-" * 60)
if __name__ == "__main__":
    # Script entry point: benchmark every selected model at each requested
    # TP size that fits on the detected GPUs, then print the summary table.
    cli = argparse.ArgumentParser()
    cli.add_argument("--tp", type=int, nargs="+", default=[1])
    args = cli.parse_args()

    gpu_count = get_gpu_count()
    log(f"Detected {gpu_count} AMD GPU(s)")

    # Drop TP sizes that need more GPUs than are available.
    runnable_tps = [size for size in args.tp if not size > gpu_count]
    if len(runnable_tps) == 0:
        log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. Nothing to run.")
        sys.exit(0)

    kill_vllm()
    # Outer order is TP size, inner order is model.
    jobs = ((tp, m) for tp in runnable_tps for m in MODELS_TO_RUN)
    for tp, m in jobs:
        run_throughput(m, tp)
    print_summary(runnable_tps)