amd-strix-halo-vllm-toolboxes/scripts/start_vllm.py

#!/usr/bin/env python3
import sys
import os
import json
import shutil
import tempfile
import subprocess
from pathlib import Path

# Add benchmarks dir to path to import config
SCRIPT_DIR = Path(__file__).parent.resolve()
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
OPT_DIR = Path("/opt")

# Check /opt first (Container), then local fallback
if (OPT_DIR / "run_vllm_bench.py").exists():
    sys.path.append(str(OPT_DIR))
else:
    sys.path.append(str(BENCH_DIR))

try:
    from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
except ImportError:
    print("Error: Could not import run_vllm_bench.py config.")
    sys.exit(1)

if (OPT_DIR / "max_context_results.json").exists():
    RESULTS_FILE = OPT_DIR / "max_context_results.json"
else:
    RESULTS_FILE = BENCH_DIR / "max_context_results.json"
HOST = os.getenv("HOST", "0.0.0.0")
PORT = os.getenv("PORT", "8000")

def check_dependencies():
    if not shutil.which("dialog"):
        print("Error: 'dialog' is required. Please install it (apt-get install dialog).")
        sys.exit(1)

def detect_gpus():
    """Detects AMD GPUs via rocm-smi or /dev/dri."""
    try:
        # Try rocm-smi first
        res = subprocess.run(["rocm-smi", "--showid", "--csv"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if res.returncode == 0:
            count = res.stdout.count("GPU")
            if count > 0: return count
    except: pass

    # Fallback to /dev/dri/render*
    try:
        return len(list(Path("/dev/dri").glob("renderD*")))
    except:
        return 1

def get_verified_config(model_id, tp_size, max_seqs):
    """
    Reads max_context_results.json to find the best verified configuration.
    Returns dict: {'ctx': int, 'util': float}
    """
    default_config = {
        "ctx": int(MODEL_TABLE.get(model_id, {}).get("ctx", 8192)),
        "util": 0.90 # Safe default
    }

    if not RESULTS_FILE.exists():
        return default_config

    try:
        with open(RESULTS_FILE, "r") as f:
            data = json.load(f)

        # Filter for Model + TP + Sequences
        matches = [r for r in data
                  if r["model"] == model_id
                  and r["tp"] == tp_size
                  and r["max_seqs"] == max_seqs
                  and r["status"] == "success"]

        if not matches:
            # Fallback 1: Try finding match with SAME TP but ANY Sequences (e.g. 1) to get base context?
            # Actually, safer to fallback to default or try finding nearest sequence?
            # Let's try finding exact match first. If fail, return default.
            return default_config

        # Sort by Util desc, then Context desc
        # We prefer higher utilization if available (performance), as long as it is verified success
        matches.sort(key=lambda x: (float(x["util"]), x["max_context_1_user"]), reverse=True)

        best = matches[0]
        return {
            "ctx": best["max_context_1_user"],
            "util": float(best["util"])
        }

    except Exception as e:
        return default_config

def run_dialog(args):
    """Runs dialog and returns stderr (selection)."""
    with tempfile.NamedTemporaryFile(mode="w+") as tf:
        cmd = ["dialog"] + args
        try:
            subprocess.run(cmd, stderr=tf, check=True)
            tf.seek(0)
            return tf.read().strip()
        except subprocess.CalledProcessError:
            return None # User cancelled

def nuke_vllm_cache():
    """Removes vLLM cache directory to fix potential graph/incompatibility issues."""
    cache = Path.home() / ".cache" / "vllm"
    if cache.exists():
        try:
            print(f"Clearing vLLM cache at {cache}...", end="", flush=True)
            subprocess.run(["rm", "-rf", str(cache)], check=True)
            cache.mkdir(parents=True, exist_ok=True)
            print(" Done.")
            time.sleep(1)
        except Exception as e:
            print(f" Failed: {e}")

def configure_and_launch(model_idx, gpu_count):
    model_id = MODELS_TO_RUN[model_idx]
    config = MODEL_TABLE[model_id]

    # Static Config
    valid_tps = config.get("valid_tp", [1])
    max_tp = max(valid_tps) if valid_tps else 1

    # Defaults
    current_tp = min(gpu_count, max_tp)
    current_seqs = 1 # Default to 1 concurrent user/request for stability

    # Initial Lookup
    verified = get_verified_config(model_id, current_tp, current_seqs)
    current_ctx = verified["ctx"]
    current_util = verified["util"]

    clear_cache = False
    use_eager = config.get("enforce_eager", False) # Default to model config, usually False
    use_rocm_attn = False # Default to Triton

    name = model_id.split("/")[-1]

    while True:
        cache_status = "YES" if clear_cache else "NO"
        eager_status = "YES" if use_eager else "NO"
        attn_backend = "ROCm" if use_rocm_attn else "Triton"

        menu_args = [
            "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})",
            "--title", f"Configuration: {name}",
            "--menu", "Customize Launch Parameters:", "22", "65", "9",
            "1", f"Tensor Parallelism:   {current_tp}",
            "2", f"Concurrent Requests:  {current_seqs}",
            "3", f"Context Length:       {current_ctx} (Verified)",
            "4", f"GPU Utilization:      {current_util} (Verified)",
            "5", f"Attention Backend:    {attn_backend}",
            "6", f"Erase vLLM Cache:     {cache_status}",
            "7", f"Force Eager Mode:     {eager_status}",
            "8", "LAUNCH SERVER"
        ]

        choice = run_dialog(menu_args)
        if not choice: return False # Back/Cancel

        if choice == "1":
            # TP Selection
            new_tp = run_dialog([
                "--title", "Tensor Parallelism",
                "--rangebox", f"Set TP Size (1-{max_tp})", "10", "40", "1", str(max_tp), str(current_tp)
            ])
            if new_tp:
                new_tp_int = int(new_tp)
                if new_tp_int != current_tp:
                    current_tp = new_tp_int
                    # RE-CALCULATE Config
                    verified = get_verified_config(model_id, current_tp, current_seqs)
                    current_ctx = verified["ctx"]
                    current_util = verified["util"]

        elif choice == "2":
            # Max Seqs Selection
            new_seqs = run_dialog([
                "--title", "Concurrent Requests",
                "--menu", "Select Max Concurrent Requests:", "12", "40", "4",
                "1", "1 (Latency Focus)",
                "4", "4 (Balanced)",
                "8", "8 (Throughput)",
                "16", "16 (Max Load)"
            ])
            if new_seqs:
                current_seqs = int(new_seqs)
                # RE-CALCULATE Config based on new concurrency
                verified = get_verified_config(model_id, current_tp, current_seqs)
                current_ctx = verified["ctx"]
                current_util = verified["util"]

        elif choice == "3":
            # Configured Length Override
            new_ctx = run_dialog([
                "--title", "Context Length",
                "--inputbox", f"Override verified limit ({current_ctx}):", "10", "40", str(current_ctx)
            ])
            if new_ctx: current_ctx = int(new_ctx)

        elif choice == "4":
             # Util Override
             pass

        elif choice == "5":
            # Toggle Attention Backend
            use_rocm_attn = not use_rocm_attn

        elif choice == "6":
            # Toggle Cache
            if not clear_cache:
                # Enabling it -> Show Warning
                warn_msg = (
                    "WARNING: Erasing the vLLM cache will remove the compiled compute graphs.\n\n"
                    "This is useful if you are experiencing crashes, 'invalid graph' errors,\n"
                    "or have switched vLLM versions recently.\n\n"
                    "However, the next startup will take longer as graphs are re-compiled.\n\n"
                    "Are you sure you want to enable this?"
                )
                confirm = run_dialog([
                    "--title", "Erase Cache Warning",
                    "--yesno", warn_msg, "12", "60"
                ])

                # If confirm is not None (exit 0), it is YES.
                if confirm is not None:
                     clear_cache = True
            else:
                # Disabling it -> No warning needed
                clear_cache = False

        elif choice == "7":
            # Toggle Eager Mode
            use_eager = not use_eager

        elif choice == "8":
            # Launch
            break

    # Build Command
    subprocess.run(["clear"])

    if clear_cache:
        nuke_vllm_cache()

    cmd = [
        "vllm", "serve", model_id,
        "--host", HOST,
        "--port", PORT,
        "--tensor-parallel-size", str(current_tp),
        "--max-num-seqs", str(current_seqs),
        "--max-model-len", str(current_ctx),
        "--gpu-memory-utilization", str(current_util),
        "--dtype", "auto"
    ]

    if config.get("trust_remote"): cmd.append("--trust-remote-code")
    if use_eager: cmd.append("--enforce-eager")

    # Env Vars
    env = os.environ.copy()
    env.update(config.get("env", {}))

    if use_rocm_attn:
        env["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "1"
        env["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
        # Optional: Explicitly mention these in print


    print("\n" + "="*60)
    print(f" Launching: {name}")
    print(f" Config:    TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx} | Util={current_util}")
    print(f" Backend:   {'ROCm' if use_rocm_attn else 'Triton'}")
    if clear_cache:
        print(f" Action:    Clearing vLLM Cache (~/.cache/vllm)")
    print(f" Command:   {' '.join(cmd)}")
    print("="*60 + "\n")

    os.execvpe("vllm", cmd, env)

def main():
    check_dependencies()
    gpu_count = detect_gpus()

    while True:
        # Build Model Menu
        menu_items = []
        for i, m_id in enumerate(MODELS_TO_RUN):
            name = m_id.split("/")[-1]
            # Pre-calc verified ctx for 'default' TP to show in menu?
            # Or just show names. Just names is cleaner.
            config = MODEL_TABLE[m_id]
            menu_items.extend([str(i), name])

        choice = run_dialog([
            "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})",
            "--title", "Select Model",
            "--menu", "Choose a model to serve:", "20", "60", "10"
        ] + menu_items)

        if not choice:
            subprocess.run(["clear"])
            print("Selection cancelled.")
            sys.exit(0)

        configure_and_launch(int(choice), gpu_count)

if __name__ == "__main__":
    main()