amd-strix-halo-vllm-toolboxes/scripts/start_vllm.py

#!/usr/bin/env python3
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path

# Directories searched for the shared `models` config module and for the
# benchmark results file: this script's own directory, the sibling
# "benchmarks" directory, and /opt.
SCRIPT_DIR = Path(__file__).parent.resolve()
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
OPT_DIR = Path("/opt")


# Make `models` importable. When /opt contains run_vllm_bench.py (the
# container layout, per the original note) /opt is appended to sys.path;
# otherwise the local benchmarks directory and this script's directory are
# appended instead. This must happen before the `import models` below.
if (OPT_DIR / "run_vllm_bench.py").exists():
    sys.path.append(str(OPT_DIR))
else:
    sys.path.append(str(BENCH_DIR))
    # Also allow a `models.py` that lives next to this script.
    sys.path.append(str(SCRIPT_DIR))

# Pull the model table and the default model list from models.py; without
# them the launcher cannot build a menu, so a failed import exits with status 1.
try:
    import models
    MODEL_TABLE = models.MODEL_TABLE
    MODELS_TO_RUN = models.MODELS_TO_RUN
except ImportError:
    print("Error: Could not import models.py config.")
    sys.exit(1)

# Results file location: prefer /opt/max_context_results.json when it exists,
# else fall back to the copy in the benchmarks directory (which may not exist;
# readers check RESULTS_FILE.exists() before opening it).
if (OPT_DIR / "max_context_results.json").exists():
    RESULTS_FILE = OPT_DIR / "max_context_results.json"
else:
    RESULTS_FILE = BENCH_DIR / "max_context_results.json"
# Bind address and port for the server, overridable through the HOST / PORT
# environment variables. Both stay strings (os.getenv returns str).
HOST = os.getenv("HOST", "0.0.0.0")
PORT = os.getenv("PORT", "8000")

def detect_gpus():
    """Return the number of AMD GPUs detected, never less than 1.

    Detection order:
      1. ``rocm-smi --showid --csv``: when the command exits with status 0,
         the number of occurrences of the substring "GPU" in its stdout is
         used if it is positive.
      2. The number of ``/dev/dri/renderD*`` nodes.

    When neither probe finds a device, 1 is returned (the same value the
    error path has always used) so that a failed probe does not report
    zero GPUs.
    """
    try:
        # Try rocm-smi first
        res = subprocess.run(
            ["rocm-smi", "--showid", "--csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (OSError, ValueError):
        # OSError: rocm-smi is missing or not executable.
        # ValueError: its output could not be decoded as text.
        res = None

    if res is not None and res.returncode == 0:
        # NOTE(review): this counts every "GPU" substring in the CSV, which
        # assumes one occurrence per device -- confirm against rocm-smi output.
        count = res.stdout.count("GPU")
        if count > 0:
            return count

    # Fallback to /dev/dri/render*
    try:
        render_nodes = len(list(Path("/dev/dri").glob("renderD*")))
    except OSError:
        render_nodes = 0

    # A missing /dev/dri yields an empty glob (0), not an exception; clamp so
    # the "assume one GPU" fallback applies in that case as well.
    return max(render_nodes, 1)

def get_discovered_models():
    """
    Build the list of model ids the launcher should offer.

    If RESULTS_FILE exists, the list is every model that has at least one
    record with status "success", is present in MODEL_TABLE, and whose
    smallest valid_tp fits the detected GPU count (sorted by model id).

    If there is no results file, reading it fails, or that list is empty,
    the fallback is MODELS_TO_RUN filtered by the same MODEL_TABLE /
    GPU-count rule.

    Returns:
        list of model id strings (possibly empty).
    """
    gpu_count = detect_gpus()

    def _fits_hardware(model_id):
        """True if the model is configured and its minimum TP size fits."""
        if model_id not in MODEL_TABLE:
            return False
        # A missing or empty valid_tp is treated as single-GPU ([1]) instead
        # of letting min() raise ValueError on an empty sequence.
        valid_tps = MODEL_TABLE[model_id].get("valid_tp") or [1]
        return min(valid_tps) <= gpu_count

    try:
        if RESULTS_FILE.exists():
            with open(RESULTS_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Models with at least one successful run; records without a
            # "model" key are skipped rather than aborting discovery.
            verified_models = {
                r["model"]
                for r in data
                if r.get("status") == "success" and "model" in r
            }

            final_list = [m for m in sorted(verified_models) if _fits_hardware(m)]
            if final_list:
                return final_list

    except Exception as e:
        # Best effort: any problem with the results file falls through to
        # the default list below instead of stopping the launcher.
        print(f"Warning: Model discovery failed ({e}). Using default list.")

    # Fallback if no results file or error: return all models compatible with current hardware
    return [m for m in MODELS_TO_RUN if _fits_hardware(m)]

# Replace the MODELS_TO_RUN imported from models.py with the list computed by
# get_discovered_models(). This runs once, at import time, so the results file
# and GPU probe are evaluated before any menu is shown.
MODELS_TO_RUN = get_discovered_models()

def check_dependencies():
    """Exit with status 1 (after printing a hint) unless `dialog` is on PATH."""
    dialog_path = shutil.which("dialog")
    if dialog_path:
        return

    print("Error: 'dialog' is required. Please install it (apt-get install dialog).")
    sys.exit(1)

def get_verified_config(model_id, tp_size, max_seqs):
    """
    Pick launch settings for a model from the benchmark results file.

    Only records whose model, tp and max_seqs equal the arguments and whose
    status is "success" are considered. Among those, the record with the
    highest util wins, with max_context_1_user as the tie-breaker.

    Returns:
        dict with keys 'ctx' and 'util'. When the results file is absent,
        has no matching record, or cannot be processed, the fallback is the
        model's 'ctx' from MODEL_TABLE (8192 if unset) with util 0.90.
    """
    fallback = {
        "ctx": int(MODEL_TABLE.get(model_id, {}).get("ctx", 8192)),
        "util": 0.90,  # Safe default
    }

    if not RESULTS_FILE.exists():
        return fallback

    def _rank(record):
        # Higher utilization first, then larger context.
        return (float(record["util"]), record["max_context_1_user"])

    try:
        with open(RESULTS_FILE, "r") as handle:
            records = json.load(handle)

        # Exact match on model + TP + concurrency, successful runs only.
        candidates = []
        for record in records:
            if (
                record["model"] == model_id
                and record["tp"] == tp_size
                and record["max_seqs"] == max_seqs
                and record["status"] == "success"
            ):
                candidates.append(record)

        # No exact match: do not guess from other TP/concurrency settings.
        if not candidates:
            return fallback

        winner = sorted(candidates, key=_rank, reverse=True)[0]
        return {
            "ctx": winner["max_context_1_user"],
            "util": float(winner["util"]),
        }

    except Exception:
        # Any malformed file or record falls back to the defaults.
        return fallback

def run_dialog(args):
    """Run `dialog` with *args* and return what it wrote to stderr.

    dialog's stderr is captured through a temporary file. The stripped text
    is returned when dialog exits with status 0 (this can be an empty
    string); None is returned when it exits non-zero, e.g. on cancel.
    """
    command = ["dialog"] + args
    with tempfile.NamedTemporaryFile(mode="w+") as capture:
        try:
            subprocess.run(command, stderr=capture, check=True)
        except subprocess.CalledProcessError:
            # Non-zero exit status: user cancelled
            return None
        capture.seek(0)
        selection = capture.read()
        return selection.strip()

def nuke_vllm_cache(settle_seconds=1.0):
    """Remove and recreate ~/.cache/vllm to fix potential graph/incompatibility issues.

    Does nothing when the directory does not exist. Progress is printed to
    stdout; a failure is reported there rather than raised, so the launch
    can still proceed.

    Args:
        settle_seconds: Pause after a successful clear, in seconds, so the
            " Done." message stays visible briefly. Values <= 0 skip it.
    """
    cache = Path.home() / ".cache" / "vllm"
    if not cache.exists():
        return

    print(f"Clearing vLLM cache at {cache}...", end="", flush=True)
    try:
        subprocess.run(["rm", "-rf", str(cache)], check=True)
        cache.mkdir(parents=True, exist_ok=True)
    except (OSError, subprocess.SubprocessError) as e:
        print(f" Failed: {e}")
        return

    print(" Done.")
    # The pause sits outside the try block so a problem with it can never be
    # reported as a failed cache clear.
    if settle_seconds > 0:
        time.sleep(settle_seconds)

def configure_and_launch(model_idx, gpu_count):
    """
    Show the per-model configuration menu, then replace this process with
    ``vllm serve``.

    Args:
        model_idx: Index into MODELS_TO_RUN of the model to configure.
        gpu_count: Detected GPU count; caps the default tensor-parallel size
            and is shown in the dialog backtitle.

    Returns:
        False when the user backs out of the menu. On launch the process is
        replaced via os.execvpe, so the function does not return; if the
        exec itself fails, an error is printed and the script exits with
        status 1.
    """
    model_id = MODELS_TO_RUN[model_idx]
    config = MODEL_TABLE[model_id]

    # Static Config
    valid_tps = config.get("valid_tp", [1])
    max_tp = max(valid_tps) if valid_tps else 1

    # Defaults (never below 1, even if GPU detection reported 0)
    current_tp = max(1, min(gpu_count, max_tp))
    current_seqs = 1  # Default to 1 concurrent user/request for stability

    # Initial Lookup
    verified = get_verified_config(model_id, current_tp, current_seqs)
    current_ctx = verified["ctx"]
    current_util = verified["util"]
    # Menu labels: switch to "Custom" once the user overrides a looked-up value.
    ctx_tag = "Verified"
    util_tag = "Verified"

    clear_cache = False
    use_eager = config.get("enforce_eager", False)  # Default to model config, usually False
    use_rocm_attn = False  # Default to Triton

    name = model_id.split("/")[-1]

    def show_error(message):
        """Display a blocking message box for rejected input."""
        run_dialog(["--title", "Invalid Input", "--msgbox", message, "8", "60"])

    while True:
        cache_status = "YES" if clear_cache else "NO"
        eager_status = "YES" if use_eager else "NO"
        attn_backend = "ROCm" if use_rocm_attn else "Triton"

        menu_args = [
            "--clear", "--backtitle", f"AMD Strix Halo vLLM Launcher (GPUs: {gpu_count})",
            "--title", f"Configuration: {name}",
            "--menu", "Customize Launch Parameters:", "22", "65", "9",
            "1", f"Tensor Parallelism:   {current_tp}",
            "2", f"Concurrent Requests:  {current_seqs}",
            "3", f"Context Length:       {current_ctx} ({ctx_tag})",
            "4", f"GPU Utilization:      {current_util} ({util_tag})",
            "5", f"Attention Backend:    {attn_backend}",
            "6", f"Erase vLLM Cache:     {cache_status}",
            "7", f"Force Eager Mode:     {eager_status}",
            "8", "LAUNCH SERVER"
        ]

        choice = run_dialog(menu_args)
        if not choice:
            return False  # Back/Cancel

        if choice == "1":
            # TP Selection
            new_tp = run_dialog([
                "--title", "Tensor Parallelism",
                "--rangebox", f"Set TP Size (1-{max_tp})", "10", "40", "1", str(max_tp), str(current_tp)
            ])
            if new_tp:
                new_tp_int = int(new_tp)
                if new_tp_int != current_tp:
                    current_tp = new_tp_int
                    # RE-CALCULATE Config (discards any manual overrides)
                    verified = get_verified_config(model_id, current_tp, current_seqs)
                    current_ctx = verified["ctx"]
                    current_util = verified["util"]
                    ctx_tag = util_tag = "Verified"

        elif choice == "2":
            # Max Seqs Selection
            new_seqs = run_dialog([
                "--title", "Concurrent Requests",
                "--menu", "Select Max Concurrent Requests:", "12", "40", "4",
                "1", "1 (Latency Focus)",
                "4", "4 (Balanced)",
                "8", "8 (Throughput)",
                "16", "16 (Max Load)"
            ])
            if new_seqs:
                current_seqs = int(new_seqs)
                # RE-CALCULATE Config based on new concurrency
                verified = get_verified_config(model_id, current_tp, current_seqs)
                current_ctx = verified["ctx"]
                current_util = verified["util"]
                ctx_tag = util_tag = "Verified"

        elif choice == "3":
            # Context Length Override
            new_ctx = run_dialog([
                "--title", "Context Length",
                "--inputbox", f"Override verified limit ({current_ctx}):", "10", "40", str(current_ctx)
            ])
            if new_ctx:
                # Free-text input: reject anything that is not a positive
                # integer instead of crashing the launcher.
                try:
                    parsed_ctx = int(new_ctx)
                except ValueError:
                    parsed_ctx = 0
                if parsed_ctx <= 0:
                    show_error(f"Context length must be a positive integer (got '{new_ctx}').")
                elif parsed_ctx != current_ctx:
                    current_ctx = parsed_ctx
                    ctx_tag = "Custom"

        elif choice == "4":
            # Util Override
            new_util = run_dialog([
                "--title", "GPU Utilization",
                "--inputbox", f"Override GPU memory utilization, 0-1 ({current_util}):", "10", "50", str(current_util)
            ])
            if new_util:
                try:
                    parsed_util = float(new_util)
                except ValueError:
                    parsed_util = 0.0
                # The chained comparison also rejects NaN and infinity.
                if not (0.0 < parsed_util <= 1.0):
                    show_error(f"GPU utilization must be a number in (0, 1] (got '{new_util}').")
                elif parsed_util != current_util:
                    current_util = parsed_util
                    util_tag = "Custom"

        elif choice == "5":
            # Toggle Attention Backend
            use_rocm_attn = not use_rocm_attn

        elif choice == "6":
            # Toggle Cache
            if not clear_cache:
                # Enabling it -> Show Warning
                warn_msg = (
                    "WARNING: Erasing the vLLM cache will remove the compiled compute graphs.\n\n"
                    "This is useful if you are experiencing crashes, 'invalid graph' errors,\n"
                    "or have switched vLLM versions recently.\n\n"
                    "However, the next startup will take longer as graphs are re-compiled.\n\n"
                    "Are you sure you want to enable this?"
                )
                confirm = run_dialog([
                    "--title", "Erase Cache Warning",
                    "--yesno", warn_msg, "12", "60"
                ])

                # run_dialog returns None only on a non-zero exit (No/Esc);
                # "Yes" exits 0 with empty output, i.e. an empty string.
                if confirm is not None:
                    clear_cache = True
            else:
                # Disabling it -> No warning needed
                clear_cache = False

        elif choice == "7":
            # Toggle Eager Mode
            use_eager = not use_eager

        elif choice == "8":
            # Launch
            break

    # Build Command
    subprocess.run(["clear"])

    if clear_cache:
        nuke_vllm_cache()

    cmd = [
        "vllm", "serve", model_id,
        "--host", HOST,
        "--port", PORT,
        "--tensor-parallel-size", str(current_tp),
        "--max-num-seqs", str(current_seqs),
        "--max-model-len", str(current_ctx),
        "--gpu-memory-utilization", str(current_util),
        "--dtype", "auto"
    ]

    if config.get("trust_remote"):
        cmd.append("--trust-remote-code")
    if use_eager:
        cmd.append("--enforce-eager")
    if use_rocm_attn:
        cmd.extend(["--attention-backend", "ROCM_ATTN"])

    # Env Vars: per-model overrides are applied last so they take precedence.
    # Keys and values are stringified because exec requires str entries.
    custom_env = config.get("env") or {}
    env = os.environ.copy()
    env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
    env.update({str(k): str(v) for k, v in custom_env.items()})

    print("\n" + "="*60)
    print(f" Launching: {name}")
    print(f" Config:    TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx} | Util={current_util}")
    print(f" Backend:   {'ROCm' if use_rocm_attn else 'Triton'}")
    if clear_cache:
        print(" Action:    Clearing vLLM Cache (~/.cache/vllm)")

    if custom_env:
        print("\n --- Environment Variables ---")
        for k, v in custom_env.items():
            print(f" export {k}={v}")

    print(f"\n Command:   {' '.join(cmd)}")
    print("="*60 + "\n")

    # exec replaces the process without flushing Python's buffers; flush so
    # the banner is not lost when stdout is a pipe or file.
    sys.stdout.flush()
    try:
        os.execvpe("vllm", cmd, env)
    except OSError as e:
        print(f"Error: failed to launch vllm: {e}")
        sys.exit(1)

def main():
    """
    Entry point: pick a model from a dialog menu, then configure and launch it.

    Exits with status 1 when `dialog` is missing or there are no models to
    offer, and with status 0 when the user cancels the model menu. Backing
    out of a model's configuration screen returns to the model menu.
    """
    check_dependencies()
    gpu_count = detect_gpus()

    # An empty menu makes dialog fail, which would otherwise be reported as
    # "Selection cancelled."; report the real cause instead.
    if not MODELS_TO_RUN:
        print("Error: No models are available for the detected hardware.")
        sys.exit(1)

    # Build Model Menu once: tag is the index into MODELS_TO_RUN, label is
    # the model id without its organisation prefix.
    menu_items = []
    for i, m_id in enumerate(MODELS_TO_RUN):
        menu_items.extend([str(i), m_id.split("/")[-1]])

    while True:
        choice = run_dialog([
            "--clear", "--backtitle", f"AMD Strix Halo vLLM Launcher (GPUs: {gpu_count})",
            "--title", "Select Model",
            "--menu", "Choose a model to serve:", "20", "60", "10"
        ] + menu_items)

        if not choice:
            subprocess.run(["clear"])
            print("Selection cancelled.")
            sys.exit(0)

        configure_and_launch(int(choice), gpu_count)

# Start the interactive launcher only when this file is executed as a script.
if __name__ == "__main__":
    main()