updates

2025-12-20 11:37:06 +00:00
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1302.7062463890015,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.15352655332265747,
+    "tokens_per_second": 112.69232830266365
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 540.2676798280002,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.37018686748700586,
+    "tokens_per_second": 271.7264154071495
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1303.4944151099999,
+    "num_requests": 200,
+    "total_num_tokens": 146805,
+    "requests_per_second": 0.15343372221746138,
+    "tokens_per_second": 112.62418795067208
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 914.8563823220001,
+    "num_requests": 200,
+    "total_num_tokens": 148857,
+    "requests_per_second": 0.21861354838273012,
+    "tokens_per_second": 162.71078485804028
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 522.8661062630126,
+    "num_requests": 200,
+    "total_num_tokens": 145877,
+    "requests_per_second": 0.38250710383471637,
+    "tokens_per_second": 278.99494393048457
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 1339.915984058,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.14926308990977954,
+    "tokens_per_second": 109.73523843987172
+}
@@ -0,0 +1,7 @@
+{
+    "elapsed_time": 468.4791132300161,
+    "num_requests": 200,
+    "total_num_tokens": 147036,
+    "requests_per_second": 0.42691337639593563,
+    "tokens_per_second": 313.85817605876395
+}
@@ -0,0 +1,575 @@
+#!/usr/bin/env python3
+import subprocess
+import time
+import socket
+import json
+import sys
+import os
+import requests
+import re
+import argparse
+from pathlib import Path
+try:
+    from transformers import AutoConfig
+except ImportError:
+    print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
+    sys.exit(1)
+
+# Import configuration from average benchmark script
+try:
+    from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm
+except ImportError:
+    print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.")
+    sys.exit(1)
+
+# =========================
+# 🧠 GROUNDING & METHODOLOGY
+# =========================
+# This script finds the Maximum Working Context (MWC) for vLLM models.
+#
+# Methodology:
+# 1. **Inspect**: Use `transformers.AutoConfig` to determine the model's theoretical limit 
+#    (e.g., `max_position_embeddings`). 
+# 2. **Probe**: Launch `vllm serve` at this limit.
+# 3. **React**: 
+#    - If stable ("Application startup complete"): Success.
+#    - If OOM ("KV cache capacity... is X"): Retry with vLLM's suggested X.
+#    - If Config Error ("max_model_len... is Y"): Retry with vLLM's suggested Y.
+
+# =========================
+# ⚙️ CONFIG
+# =========================
+# Address of the vLLM server under test; verify_context() builds its request
+# URL from HOST/PORT and force_cleanup() waits for PORT to be released.
+HOST = "127.0.0.1"
+PORT = 8000
+# Probe results are rewritten here as JSON after every new probe and are
+# reloaded at start-up so an interrupted run can resume.
+RESULTS_FILE = Path("max_context_results.json")
+# Not referenced in the visible part of this script (report generation was
+# moved to a separate script).
+REPORT_FILE = Path("max_context_report.md")
+
+# GPU memory utilization values to try, in this order (highest first), to see
+# how much context we can squeeze in.
+# 0.90 is the default, but we want MAX context, so 0.95 is tried first and
+# 0.90 is the fallback tier.
+# (An earlier note named 0.98 as the target; it is not part of the list below.)
+GPU_UTIL_STEPS = ["0.95", "0.90"]
+# Concurrency settings to test (passed to vLLM as --max-num-seqs), ascending.
+CONCURRENCY_STEPS = [1, 4, 8, 16]
+
+def log(msg):    print(f"[MAX-CTX] {msg}", flush=True)
+
+def get_hf_context_limit(model_name, trust_remote=False):
+    try:
+        cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote)
+
+        # Gemma 3 and similar multi-config models
+        if hasattr(cfg, "text_config"):
+            tc = cfg.text_config
+            if hasattr(tc, "max_position_embeddings"):
+                return int(tc.max_position_embeddings)
+
+        # Standard HF attributes
+        for attr in (
+            "max_position_embeddings",
+            "seq_length",
+            "max_seq_len",
+            "n_positions",
+        ):
+            val = getattr(cfg, attr, None)
+            if val is not None:
+                return int(val)
+
+        return 8192
+
+    except Exception as e:
+        log(f"Warning: Could not read config for {model_name}: {e}. Defaulting to 32768.")
+        return 32768
+
+def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
+    """
+    Constructs the vLLM serve command.
+    """
+    config = MODEL_TABLE[model]
+    
+    cmd = [
+        "vllm", "serve", model,
+        "--gpu-memory-utilization", str(util),
+        "--max-model-len", str(max_len),
+        "--tensor-parallel-size", str(tp_size),
+        "--max-num-seqs", str(max_seqs),
+        "--dtype", "auto",
+        # "--disable-log-stats" # Cleaner output, but user managed without it
+    ]
+    
+    if config.get("trust_remote"): cmd.append("--trust-remote-code")
+    if config.get("enforce_eager"): cmd.append("--enforce-eager")
+    
+    # Add model specific env vars
+    env = os.environ.copy()
+    env.update(config.get("env", {}))
+    
+    return cmd, env
+
+def is_port_free(port):
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex(('localhost', port)) != 0
+
+def force_cleanup(hard=False):
+    """
+    Kills vLLM using multiple methods and ensures port is free.
+    BLOCKS until processes are definitely gone.
+    """
+    timeout = 20 if hard else 10
+    start_time = time.time()
+    
+    while True:
+        # 1. Aggressive Kill Commands
+        # We send these EVERY loop iteration until they die.
+        subprocess.run("pkill -9 -f 'vllm.entrypoints.api_server'", shell=True, stderr=subprocess.DEVNULL)
+        subprocess.run("pkill -9 -f 'vllm serve'", shell=True, stderr=subprocess.DEVNULL)
+        subprocess.run("pkill -9 -f 'VLLM::'", shell=True, stderr=subprocess.DEVNULL)
+        subprocess.run("pkill -9 -f 'multiprocessing.spawn'", shell=True, stderr=subprocess.DEVNULL)
+        subprocess.run("pkill -9 -f ray::", shell=True, stderr=subprocess.DEVNULL)
+
+        # 2. Check if they are still there
+        # We check specifically for the persistence of any vllm-related process
+        # We use explicit list to know WHICH one triggered it
+        # CRITICAL FIX: We MUST use shell=False otherwise 'pgrep -f pattern' 
+        # matches the 'sh -c pgrep ... pattern' command content itself!
+        dirty = False
+        
+        # Check 1: vllm serve
+        if subprocess.run(["pgrep", "-f", "vllm serve"], stdout=subprocess.DEVNULL).returncode == 0:
+            # Double check it's not us (Python script)
+            # But simpler to just proceed if we trust shell=False works
+            log("Clean waiting: Found 'vllm serve' process:")
+            subprocess.run("pgrep -a -f 'vllm serve'", shell=True) # debug
+            dirty = True
+            
+        # Check 2: api_server
+        if subprocess.run(["pgrep", "-f", "vllm.entrypoints.api_server"], stdout=subprocess.DEVNULL).returncode == 0:
+            log("Clean waiting: Found 'vllm.entrypoints.api_server' process:")
+            subprocess.run("pgrep -a -f 'vllm.entrypoints.api_server'", shell=True) # debug
+            dirty = True
+            
+        # Check 3: VLLM:: (Ray workers)
+        if subprocess.run(["pgrep", "-f", "VLLM::"], stdout=subprocess.DEVNULL).returncode == 0:
+            log("Clean waiting: Found 'VLLM::' process:")
+            subprocess.run("pgrep -a -f 'VLLM::'", shell=True) # debug
+            dirty = True
+
+        if not dirty:
+            # Processes are gone. Now check port.
+            if is_port_free(PORT):
+                time.sleep(1) # Final safety buffer
+                return # Clean!
+            else:
+                log("Clean: Processes gone, but Port 8000 still held. Waiting...")
+        else:
+            log("Clean: Processes still detected. Retrying kill...")
+        
+        if time.time() - start_time > timeout:
+            log("CRITICAL: Cleanup timed out! Force attempting `killall -9 vllm` as last resort.")
+            subprocess.run("killall -9 vllm", shell=True, stderr=subprocess.DEVNULL)
+            break
+            
+        time.sleep(1.5) # Wait a bit before hammering again
+
+
+def wait_for_server_and_parse(process, timeout=300):
+    """
+    Waits for server to be ready.
+    Parses stdout for "Count of GPU blocks" and "Block size".
+    Returns: (ready_bool, gpu_blocks, block_size, max_len_clamped, failure_reason)
+    """
+    start = time.time()
+    gpu_blocks = 0
+    block_size = 16 # default
+    max_len_clamped = None
+    
+    logs = []
+    failure_reason = None
+    
+    while time.time() - start < timeout:
+        if process.poll() is not None:
+            # Process died.
+            for line in process.stdout:
+                line_str = line.decode("utf-8", errors="replace").strip()
+                logs.append(line_str)
+            
+            # SCAN FULL HISTORY if not found yet
+            # Sometimes error was in previous lines or split
+            if not failure_reason:
+                full_log = "\n".join(logs)
+                
+                # Check 1: Sampler OOM
+                if "warming up sampler" in full_log and "CUDA out of memory" in full_log:
+                     failure_reason = "Sampler Warmup OOM"
+                
+                # Check 2: Explicit vLLM suggestion (Estimated)
+                # "estimated maximum model length is 127120"
+                elif "estimated maximum model length is" in full_log:
+                     m = re.search(r"estimated maximum model length is (\d+)", full_log)
+                     if m:
+                         failure_reason = f"estimated maximum model length is {m.group(1)}"
+                         
+                # Check 3: Derived Max Model Len
+                # "derived max_model_len (max_position_embeddings=131072.0 ...)"
+                elif "derived max_model_len" in full_log:
+                     failure_reason = "derived max_model_len detected"
+                
+                # Check 4: Capacity/Value Error
+                elif "ValueError" in full_log and "maximum number of tokens" in full_log:
+                     failure_reason = "Capacity Error (Found in history)"
+                
+                # Check 5: Generic OOM
+                elif "CUDA out of memory" in full_log or "hipErrorOutOfMemory" in full_log:
+                     failure_reason = "OOM detected"
+            
+            if not failure_reason:
+                # Unexpected death! Dump logs to see why.
+                log("CRITICAL: Process died unexpectedly! Dumping last 100 lines:")
+                print("=== vLLM SERVER LOGS (LAST 100 LINES) ===")
+                for l in logs[-100:]:
+                    print(l)
+                print("=============================================")
+                    
+            return False, 0, 0, None, failure_reason
+            
+        line = process.stdout.readline()
+        if line:
+            line_str = line.decode("utf-8", errors="replace").strip()
+            logs.append(line_str)
+            
+            # 1. Parse Legacy "GPU blocks" (if present)
+            m_blocks = re.search(r"# GPU blocks:\s*(\d+)", line_str)
+            if m_blocks:
+                gpu_blocks = int(m_blocks.group(1))
+                block_size = 16 # assume default unless found
+                log(f"  -> Found GPU blocks: {gpu_blocks} (Legacy)")
+
+            # 2. Parse Newer "GPU KV cache size" (vLLM 0.11+)
+            # "GPU KV cache size: 111,536 tokens"
+            m_kv_tokens = re.search(r"GPU KV cache size:\s*([\d,]+)\s*tokens", line_str)
+            if m_kv_tokens:
+                tokens_str = m_kv_tokens.group(1).replace(",", "")
+                gpu_blocks = int(tokens_str) # We use 'gpu_blocks' variable to store total tokens now for simplicity
+                block_size = 1 # Effectively 1 because we have the total count
+                log(f"  -> Found GPU KV Cache tokens: {gpu_blocks}")
+
+            # 3. Parse Block Size (optional, mostly for legacy)
+            m_bs = re.search(r"block_size=(\d+)", line_str)
+            if m_bs:
+                block_size = int(m_bs.group(1))
+
+            # Failure hints
+            if "ValueError" in line_str and "maximum number of tokens" in line_str:
+                failure_reason = line_str
+            if "derived max_model_len" in line_str:
+                failure_reason = line_str
+            if "warming up sampler" in line_str and "CUDA out of memory" in line_str:
+                failure_reason = "Sampler Warmup OOM"
+            elif "CUDA out of memory" in line_str or "hipErrorOutOfMemory" in line_str:
+                failure_reason = "OOM detected"
+
+            # Check for startup
+            if "Application startup complete" in line_str or "Uvicorn running on" in line_str:
+                if gpu_blocks > 0:
+                    log("  -> Server signal detected. Waiting 5s for socket stability...")
+                    time.sleep(5)
+                    return True, gpu_blocks, block_size, max_len_clamped, None
+                else:
+                    return False, 0, 0, None, "Parsed Success but Token/Block Count was 0"
+                
+    # Timeout case
+    log("CRITICAL: Server startup timed out! Dumping last 100 lines:")
+    print("=== vLLM SERVER LOGS (LAST 100 LINES) ===")
+    for l in logs[-100:]:
+        print(l)
+    print("=============================================")
+    return False, 0, 0, None, "Timeout"
+
+def verify_context(model, context_len):
+    """
+    Sends a request to the server with length ~context_len to verify stability.
+    """
+    url = f"http://{HOST}:{PORT}/v1/completions"
+    
+    # We use a simple "A " * N prompt.
+    # Llama 3 tokenizer: "A" is usually 1 token.
+    
+    prompt = "A " * int(context_len * 0.5) # 50% fill to be safe/approx
+    
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "max_tokens": 10,
+        "temperature": 0
+    }
+    
+    # Retry loop for connection refusals (race condition)
+    max_retries = 5
+    for attempt in range(max_retries):
+        try:
+            # Increased timeout to 300s because prefilling 60k+ tokens takes time!
+            r = requests.post(url, json=payload, timeout=300)
+            if r.status_code == 200:
+                return True, "Success"
+            else:
+                # If 500 or 400 error, maybe we shouldn't retry? Usually yes for 500 if transient.
+                # But for now let's just fail or retry.
+                # If we are OOMing, we will likely get a 500 or it will hang.
+                return False, f"HTTP {r.status_code}: {r.text[:200]}"
+        except requests.exceptions.ConnectionError:
+            if attempt < max_retries - 1:
+                log(f"  -> Connection refused. Retrying verification ({attempt+1}/{max_retries})...")
+                time.sleep(2)
+            else:
+                return False, "Connection Refused (Max Retries)"
+        except Exception as e:
+            return False, str(e)
+            
+    return False, "Unknown Error"
+
+def run_probe(model, tp, util, max_seqs, start_limit=None):
+    """
+    Probes a specific configuration starting from the model's architectural limit.
+    """
+    trust_remote = MODEL_TABLE[model].get("trust_remote", False)
+    # 1. Get the Advertised Limit (The "Smart" Way)
+    arch_limit = get_hf_context_limit(model, trust_remote)
+    
+    # Intelligent Start: If we know a lower limit worked for lower concurrency, start there.
+    target_len = arch_limit
+    if start_limit:
+        target_len = min(arch_limit, start_limit)
+        log(f"  -> Smart Start: Capping initial probe at {target_len} (based on previous run)")
+    
+    result_data = {
+        "model": model,
+        "tp": tp,
+        "util": util,
+        "max_seqs": max_seqs,
+        "model_limit": arch_limit,
+        "configured_len": 0,
+        "real_capacity": 0,
+        "status": "fail",
+        "error": ""
+    }
+
+    log(f"Probing {model} | TP={tp} | Util={util} | Seqs={max_seqs} | Model Limit={arch_limit}")
+    
+    # We loop until we succeed OR we drop below a useful context size.
+    while target_len >= 2048:
+        force_cleanup()
+        
+        cmd, env = get_vllm_server_cmd(model, tp, util, target_len, max_seqs)
+        log(f"DEBUG: Cmd: {' '.join(cmd)}")
+        
+        proc = None
+        try:
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
+            ready, blocks, block_size, _, fail_msg = wait_for_server_and_parse(proc)
+            
+            if ready:
+                # Success - but let's VERIFY it actually answers
+                total_capacity = blocks * block_size
+                workable_len = min(target_len, total_capacity)
+                
+                # Verify with actual request
+                # We cap verification at 4096 because we just want to know if it crashes, 
+                # we don't need to wait for a 128k context fill just for a liveness check.
+                verify_len = min(workable_len, 4096)
+                log(f"  -> Server ready. Verifying stability with approx {int(verify_len * 0.5)} tokens (capped at 4k)...")
+                v_ok, v_msg = verify_context(model, verify_len)
+                
+                if v_ok:
+                    log(f"  -> Success! capacity={total_capacity}, configured={workable_len}")
+                    log(f"  -> Verification passed: {v_msg}")
+                    
+                    # Cleanup SUCCESSFUL process immediately
+                    proc.terminate()
+                    try: proc.wait(timeout=5)
+                    except: proc.kill()
+                    
+                    result_data["status"] = "success"
+                    result_data["configured_len"] = target_len
+                    result_data["real_capacity"] = total_capacity
+                    result_data["max_context_1_user"] = workable_len
+                    
+                    return result_data
+                else:
+                    log(f"  -> Server started, but Verification FAILED: {v_msg}")
+                    # Treat as a crash/failure, back off
+                    fail_msg = "Verification Failed"
+                    
+                    # Capture any remaining logs if the process is dead or dying
+                    # Or just read what's currently available non-blocking? 
+                    # Simpler: just terminate and read output.
+                    proc.terminate()
+                    try: 
+                        outs, errs = proc.communicate(timeout=5)
+                        if outs:
+                            print("=== vLLM SERVER LOGS (DURING VERIFICATION FAILURE) ===")
+                            print(outs.decode('utf-8', errors='replace'))
+                            print("======================================================")
+                    except: 
+                        proc.kill()
+
+            
+            # If we fall through here, ready=False OR verify=False
+            log(f"  -> Attempt failed at {target_len}")
+            if fail_msg: log(f"     Reason: {fail_msg}")
+            result_data["error"] = fail_msg if fail_msg else "Process died or timed out"
+                
+            if fail_msg:
+                # Case V: Verification Failed (Server up, but unstable inference)
+                # User requests drop to 0.95 tier immediately.
+                # Must check this FIRST to ensure we don't fall through.
+                if "Verification Failed" in str(fail_msg):
+                    log("  -> Verification Failed (Unstable). Aborting this Util, dropping to lower tier.")
+                    break
+
+                # Case S: Sampler Warmup OOM (Fatal for this Util)
+                if "Sampler Warmup OOM" in fail_msg:
+                    log("  -> Critical Sampler OOM. Utilization/Seqs too high. Aborting this configuration.")
+                    break # Give up on this Util/Seq combo immediately
+
+                # Case X: Dirty State / Zombie VRAM 
+                # "Free memory on device (1.56/31.86 GiB) on startup is less than desired..."
+                if "Free memory on device" in fail_msg and "less than desired" in fail_msg:
+                        log("  -> Dirty VRAM detected (previous run didn't cleanup?). Retrying with HARD cleanup.")
+                        force_cleanup(hard=True)
+                        continue # Retry SAME target_len
+
+                # Case A: VRAM Limit ("maximum number of tokens... is X")
+                m_capacity = re.search(r"maximum number of tokens.*?KV cache is (\d+)", fail_msg)
+                if m_capacity:
+                    cap = int(m_capacity.group(1))
+                    log(f"  -> Found Hardware Capacity: {cap}")
+                    target_len = cap
+                    continue # Retry Exact Cap
+
+                # Case B: Model Limit mismatch 
+                # "Value error, User-specified max_model_len (500000) is greater than the derived max_model_len (max_position_embeddings=131072.0 ...)"
+                # We regex for 'derived max_model_len' and then look for numbers in the proximity.
+                
+                if "derived max_model_len" in fail_msg:
+                    # Try to capture "max_position_embeddings=131072"
+                    m_pos = re.search(r"max_position_embeddings=([\d\.]+)", fail_msg)
+                    if m_pos:
+                        limit = int(float(m_pos.group(1))) # handle 131072.0
+                        log(f"  -> Found Model Limit: {limit}")
+                        target_len = limit
+                        continue
+                        
+                    # Fallback: look for simple parenthesis pattern if the above fails
+                    m_derived = re.search(r"derived max_model_len\s*\((\d+)\)", fail_msg)
+                    if m_derived:
+                        limit = int(m_derived.group(1))
+                        log(f"  -> Found Model Limit (Legacy): {limit}")
+                        target_len = limit
+                        continue
+
+                # Case C: Estimated Max Length (New vLLM Safe Limit)
+                # "estimated maximum model length is 111536"
+                m_est = re.search(r"estimated maximum model length is (\d+)", fail_msg)
+                if m_est:
+                    limit = int(m_est.group(1))
+                    log(f"  -> Found vLLM Estimated Limit: {limit}")
+                    target_len = limit
+                    continue
+
+            # Case D: Generic OOM/Crash
+            target_len = int(target_len * 0.8)
+            log(f"  -> Backing off to: {target_len}")
+                
+            if target_len < 2048:
+                log("  -> Give up (too small)")
+                break
+        finally:
+            if proc:
+                try: proc.terminate()
+                except: pass
+                try: proc.kill() 
+                except: pass
+                proc.wait() 
+            force_cleanup()
+                
+    return result_data
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, help="Filter to run only this model (substring match)")
+    parser.add_argument("--steps", type=int, default=-1, help="Number of models to run (default: all)")
+    args = parser.parse_args()
+
+    gpu_count = get_gpu_count()
+    
+    # 1. Load existing results to support RESUME
+    results = []
+    if RESULTS_FILE.exists():
+        try:
+            with open(RESULTS_FILE, "r") as f:
+                results = json.load(f)
+            log(f"Loaded {len(results)} previous results. Resuming...")
+        except Exception as e:
+            log(f"Warning: Could not read existing results: {e}")
+
+    count = 0
+    for model in MODELS_TO_RUN:
+        if args.model and args.model not in model:
+            continue
+            
+        config = MODEL_TABLE[model]
+        valid_tps = [t for t in config["valid_tp"] if t <= gpu_count]
+        
+        for tp in valid_tps:
+            # Track successful seqs for this TP to skip lower utils
+            # effectively: {seqs_count: max_working_util}
+            # Since we iterate high-util -> low-util, if we succeeded already for this 'seqs', we skip.
+            successful_seqs = set() 
+            
+            # Reset smart limit for each TP (TP2 should not inherit TP1's limit)
+            last_working_len = None 
+            
+            for util in GPU_UTIL_STEPS:
+                
+                for seqs in CONCURRENCY_STEPS:
+                    if seqs in successful_seqs:
+                        log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Already succeeded at higher util.")
+                        continue
+
+                    # Check if we already have this result
+                    existing_res = next((r for r in results 
+                                         if r["model"] == model 
+                                         and r["tp"] == tp 
+                                         and str(r["util"]) == str(util) 
+                                         and r["max_seqs"] == seqs), None)
+                    
+                    if existing_res:
+                        res = existing_res
+                        log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Found in results.")
+                    else:
+                        # New run
+                        res = run_probe(model, tp, util, seqs, start_limit=last_working_len)
+                        results.append(res)
+                        
+                        # Save immediately
+                        with open(RESULTS_FILE, "w") as f:
+                            json.dump(results, f, indent=2)
+
+                    # Update logic for Resume OR New Run:
+                    if res["status"] == "success":
+                        last_working_len = res["configured_len"]
+                        successful_seqs.add(seqs) # Mark this seq count as done for this TP
+
+                    # Smart Break: If we failed at this concurrency level (capacity=0), 
+                    # higher concurrency will also fail.
+                    if res["real_capacity"] == 0 or res["status"] == "fail":
+                        log(f"Stopping higher concurrency tests for {model} (failed at {seqs} seqs)")
+                        break
+
+        count += 1
+        if args.steps != -1 and count >= args.steps and not args.model:
+             break
+
+    # generate_report(results) - Moved to separate script
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,333 @@
+[
+  {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 829952,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 830064,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 830080,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 830064,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "google/gemma-3-12b-it",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 246032,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "google/gemma-3-12b-it",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 246064,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "google/gemma-3-12b-it",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 246064,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "google/gemma-3-12b-it",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 246064,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "Qwen/Qwen3-14B-AWQ",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 40960,
+    "configured_len": 0,
+    "real_capacity": 0,
+    "status": "fail",
+    "error": "Verification Failed"
+  },
+  {
+    "model": "Qwen/Qwen3-14B-AWQ",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 1,
+    "model_limit": 40960,
+    "configured_len": 40960,
+    "real_capacity": 655712,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 40960
+  },
+  {
+    "model": "Qwen/Qwen3-14B-AWQ",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 4,
+    "model_limit": 40960,
+    "configured_len": 40960,
+    "real_capacity": 655616,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 40960
+  },
+  {
+    "model": "Qwen/Qwen3-14B-AWQ",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 8,
+    "model_limit": 40960,
+    "configured_len": 40960,
+    "real_capacity": 655600,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 40960
+  },
+  {
+    "model": "Qwen/Qwen3-14B-AWQ",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 16,
+    "model_limit": 40960,
+    "configured_len": 40960,
+    "real_capacity": 655600,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 40960
+  },
+  {
+    "model": "openai/gpt-oss-20b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 2232848,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-20b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 2232560,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-20b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 2232544,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-20b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 2232544,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-120b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 711360,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-120b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 711168,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-120b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 8,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 711168,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "openai/gpt-oss-120b",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 16,
+    "model_limit": 131072,
+    "configured_len": 131072,
+    "real_capacity": 711168,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 131072
+  },
+  {
+    "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 1097712,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 4,
+    "model_limit": 262144,
+    "configured_len": 0,
+    "real_capacity": 0,
+    "status": "fail",
+    "error": "Verification Failed"
+  },
+  {
+    "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 4,
+    "model_limit": 262144,
+    "configured_len": 209715,
+    "real_capacity": 1029856,
+    "status": "success",
+    "error": "Process died or timed out",
+    "max_context_1_user": 209715
+  },
+  {
+    "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 8,
+    "model_limit": 262144,
+    "configured_len": 0,
+    "real_capacity": 0,
+    "status": "fail",
+    "error": "Verification Failed"
+  },
+  {
+    "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+    "tp": 1,
+    "util": "0.95",
+    "max_seqs": 1,
+    "model_limit": 262144,
+    "configured_len": 0,
+    "real_capacity": 0,
+    "status": "fail",
+    "error": "Verification Failed"
+  },
+  {
+    "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 1,
+    "model_limit": 262144,
+    "configured_len": 262144,
+    "real_capacity": 696320,
+    "status": "success",
+    "error": "",
+    "max_context_1_user": 262144
+  },
+  {
+    "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+    "tp": 1,
+    "util": "0.90",
+    "max_seqs": 4,
+    "model_limit": 262144,
+    "configured_len": 0,
+    "real_capacity": 0,
+    "status": "fail",
+    "error": "Verification Failed"
+  }
+]
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+import subprocess, time, json, sys, os, requests, argparse
+from pathlib import Path
+
+# =========================
+# ⚙️ GLOBAL SETTINGS
+# =========================
+
+# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5)
+# Default value for --gpu-memory-utilization; a MODEL_TABLE entry may override
+# it with a "gpu_util" key. Kept as a string because it is placed directly
+# into the command line.
+GPU_UTIL = "0.90" 
+# 1. THROUGHPUT CONFIG
+# Passed to the benchmark as --num-prompts and --output-len respectively.
+OFF_NUM_PROMPTS      = 200 
+OFF_FORCED_OUTPUT    = "512"
+# Value for --max-num-batched-tokens when a MODEL_TABLE entry has no "max_tokens".
+DEFAULT_BATCH_TOKENS = "8192"
+
+# Fallbacks
+# NOTE(review): neither constant is referenced anywhere else in this script
+# (the random-data fallback hard-codes "1024"); presumably kept for scripts
+# that import this module - confirm before removing.
+FALLBACK_INPUT_LEN  = 1024
+FALLBACK_OUTPUT_LEN = 512
+
+# Directory that receives one <model>_tp<N>_throughput.json per run.
+# It is created at import time, relative to the current working directory.
+RESULTS_DIR = Path("benchmark_results")
+RESULTS_DIR.mkdir(exist_ok=True)
+
+# =========================
+# 🛠️ MODEL CONFIGURATION 🛠️
+# =========================
+
+# Per-model launch settings, keyed by Hugging Face model id.
+# Keys read by this script:
+#   trust_remote   -> adds --trust-remote-code when truthy
+#   enforce_eager  -> adds --enforce-eager when truthy
+#   valid_tp       -> tensor-parallel sizes this model is benchmarked with
+#   max_num_seqs   -> value for --max-num-seqs
+#   max_tokens     -> value for --max-num-batched-tokens
+#                     (DEFAULT_BATCH_TOKENS when absent)
+#   env            -> extra environment variables for the benchmark process
+#   gpu_util, ctx  -> optional overrides for --gpu-memory-utilization and
+#                     --max-model-len (no entry below sets them)
+MODEL_TABLE = {
+    # 1. Llama 3.1 8B Instruct
+    # MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety.
+    # NOTE(review): the 32GB figure does not match the 128GB Strix Halo named
+    # in the global settings - presumably inherited from another machine.
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
+        "trust_remote": False,
+        "valid_tp": [1, 2],
+        "max_num_seqs": "64",
+        "max_tokens": "32768" 
+    },
+    
+    # Gemma 3 12B: same settings as the Llama 3.1 8B entry above.
+    "google/gemma-3-12b-it": {
+        "trust_remote": False,
+        "valid_tp": [1, 2],
+        "max_num_seqs": "64",
+        "max_tokens": "32768" 
+    },
+    # 2. GPT-OSS 20B (MXFP4)
+    # MAD Row 0 uses 8192. We match this exactly.
+    "openai/gpt-oss-20b": {
+        "trust_remote": True,
+        "valid_tp": [1, 2],
+        "max_num_seqs": "64",
+        "max_tokens": "8192"
+    },
+    
+    # GPT-OSS 120B: same batch budget as the 20B entry, TP=1 only.
+    "openai/gpt-oss-120b": {
+        "trust_remote": True,
+        "valid_tp": [1],
+        "max_num_seqs": "64",
+        "max_tokens": "8192"
+    },
+
+
+    # 3. Qwen3 14B AWQ
+    "Qwen/Qwen3-14B-AWQ": {
+        "trust_remote": True,
+        "valid_tp": [1], # TP=1 only. NOTE(review): this line used to say "too big for single GPU", which contradicts TP=1 - presumably stale
+        "max_num_seqs": "32", # Lower concurrency for safety
+        "max_tokens": "16384", # Smaller batch budget. NOTE(review): the original rationale cited eager mode, but enforce_eager is False here
+        "enforce_eager": False, 
+        "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
+    },
+
+    # 4. Qwen 30B 4-bit
+    "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": {
+        "trust_remote": True,
+        "enforce_eager": False, 
+        "valid_tp": [1, 2],
+        "max_num_seqs": "64",
+        "max_tokens": "32768"
+    },
+
+    # 5. Qwen3-Next 80B GPTQ Int4A16 (The Big One) [NEW]
+    # Original sizing notes: ~48GB of weights, fits on 2x32GB (64GB) with
+    # ~16GB left for cache, 20k ctx; eager mode required for stability.
+    # NOTE(review): those notes describe a dual-GPU setup, while this entry
+    # only allows TP=1 and sets no "ctx" - confirm they still apply.
+     "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
+        "trust_remote": True,
+        "valid_tp": [1], # TP=1 only. NOTE(review): this line used to say "too big for single GPU", which contradicts TP=1 - presumably stale
+        "max_num_seqs": "32", # Lower concurrency for safety
+        "max_tokens": "16384", # Lower batch size because eager mode is CPU intensive
+        "enforce_eager": True, 
+        "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error. NOTE(review): AWQ-named variable on a GPTQ checkpoint - confirm it is needed
+    },
+
+}
+
+# Models benchmarked by this script, in execution order. Each id must be a key
+# of MODEL_TABLE, because run_throughput() and print_summary() index the table
+# directly. Commented-out ids are disabled; uncomment a line to re-enable it.
+MODELS_TO_RUN = [
+    #"meta-llama/Meta-Llama-3.1-8B-Instruct",
+    #"google/gemma-3-12b-it",
+    #"Qwen/Qwen3-14B-AWQ",
+    #"openai/gpt-oss-20b",
+    #"openai/gpt-oss-120b",
+    "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit",
+    "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
+]
+
+# =========================
+# UTILS
+# =========================
+
+def log(msg): print(f"\n[BENCH] {msg}")
+
+def get_gpu_count():
+    try:
+        # Using rocm-smi --showid to list GPUs. 
+        # Output format: "GPU[0] : Device Name: ..."
+        res = subprocess.run(["rocm-smi", "--showid"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if res.returncode == 0:
+            # Filter specifically for the target GPU as requested
+            # target_gpu = "AMD Radeon AI PRO R9700"
+            # count = 0
+            # for line in res.stdout.strip().split('\n'):
+            #     if "Device Name" in line and target_gpu in line:
+            #         count += 1
+            
+            # return count if count > 0 else 1
+            return 1 # Force return 1 for Strix Halo APU
+        else:
+            log("rocm-smi failed, defaulting to 1 GPU (Hardcoded Fallback)")
+            return 1
+    except Exception as e:
+        log(f"Error detecting GPUs: {e}, defaulting to 1 GPU")
+        return 1
+
+def kill_vllm():
+    subprocess.run("pgrep -f 'vllm serve' | xargs -r kill -9", 
+                   shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(5)
+
+def nuke_vllm_cache():
+    cache = Path.home() / ".cache" / "vllm"
+    if cache.exists():
+        try:
+            subprocess.run(["rm", "-rf", str(cache)], check=True)
+            cache.mkdir(parents=True, exist_ok=True)
+            time.sleep(2)
+        except: pass
+
+def get_dataset():
+    data_path = Path("ShareGPT_V3_unfiltered_cleaned_split.json")
+    if data_path.exists(): return str(data_path)
+    
+    log("Downloading ShareGPT dataset...")
+    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+    try:
+        r = requests.get(url, stream=True, timeout=15)
+        r.raise_for_status()
+        with open(data_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192): f.write(chunk)
+        return str(data_path)
+    except Exception as e:
+        log(f"WARNING: ShareGPT download failed ({e}). using RANDOM.")
+        return None
+
+
+
+def get_model_args(model, tp_size):
+    config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
+    
+    # Allow per-model GPU utilization override
+    util = config.get("gpu_util", GPU_UTIL)
+
+    cmd = [
+        "--model", model,
+        "--gpu-memory-utilization", util,
+        "--dtype", "auto",
+        "--tensor-parallel-size", str(tp_size),
+        "--max-num-seqs", config["max_num_seqs"]
+    ]
+    
+    # Optional: if a model really needs a hard limit, we can still support "ctx" in config,
+    # but by default we rely on auto.
+    if "ctx" in config:
+        cmd.extend(["--max-model-len", config["ctx"]])
+        
+    if config.get("trust_remote"): cmd.append("--trust-remote-code")
+    if config.get("enforce_eager"): cmd.append("--enforce-eager")
+    
+    return cmd
+
+def run_throughput(model, tp_size):
+    if tp_size not in MODEL_TABLE[model]["valid_tp"]: return
+    
+    model_safe = model.replace("/", "_")
+    output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json"
+    
+    if output_file.exists():
+        log(f"SKIP Throughput {model} (TP={tp_size})")
+        return
+
+    dataset_path = get_dataset()
+    dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"]
+    
+    # Retrieve Model-Specific Batch Tokens
+    batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
+
+    log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...")
+    kill_vllm()
+    nuke_vllm_cache()
+
+    cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size)
+    cmd.extend([
+        "--num-prompts", str(OFF_NUM_PROMPTS),
+        "--max-num-batched-tokens", batch_tokens,
+        "--output-len", OFF_FORCED_OUTPUT,
+        "--output-json", str(output_file),
+        "--disable-log-stats"
+    ])
+    cmd.extend(dataset_args)
+
+    # ENV Setup: Global + Model Specific
+    env = os.environ.copy()
+    
+    # Inject model specific env vars (e.g. for AWQ)
+    model_env = MODEL_TABLE[model].get("env", {})
+    env.update(model_env)
+
+    try: 
+        subprocess.run(cmd, check=True, env=env)
+    except: 
+        log(f"ERROR: Throughput failed {model}")
+
+
+
+def print_summary(tps):
+    print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}")
+    print("-" * 60)
+    
+    for m in MODELS_TO_RUN:
+        msafe = m.replace("/", "_")
+        for tp in tps:
+            if tp not in MODEL_TABLE[m]["valid_tp"]: continue
+            
+            try: 
+                tdata = json.loads((RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json").read_text())
+                tok_s = f"{tdata.get('tokens_per_second', 0):.1f}"
+            except: tok_s = "N/A"
+
+            name_cell = m.split('/')[-1]
+            print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}")
+    print("-" * 60)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tp", type=int, nargs="+", default=[1])
+    args = parser.parse_args()
+    
+    gpu_count = get_gpu_count()
+    log(f"Detected {gpu_count} AMD GPU(s)")
+    
+    valid_tp_args = [t for t in args.tp if t <= gpu_count]
+    if not valid_tp_args:
+        log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. Nothing to run.")
+        sys.exit(0)
+
+    kill_vllm()
+    for tp in valid_tp_args:
+        for m in MODELS_TO_RUN:
+            run_throughput(m, tp)
+    print_summary(valid_tp_args)