#!/usr/bin/env python3
"""Run vLLM offline throughput benchmarks for the configured models.

For every requested tensor-parallel size and every selected model the script
runs ``vllm bench throughput`` twice: once with the default attention backend
and once with ``--attention-backend ROCM_ATTN``.  Each run writes a JSON file;
a comparison table of the ``tokens_per_second`` values is printed at the end.

Model lists and tuning defaults come from the shared ``models`` module.
"""
import argparse
import json
import os
import shlex
import subprocess
import sys
import time
from pathlib import Path

import requests

try:
    import bench_utils
except ImportError:
    sys.path.append(str(Path(__file__).parent))
    import bench_utils

# =========================
# ⚙️ GLOBAL SETTINGS
# =========================
try:
    import models
except ImportError:
    # If running locally and models.py is in ../scripts?
    # Or if running in /opt where models.py is alongside.
    # We will try adding current dir to path just in case
    sys.path.append(os.getcwd())
    try:
        import models
    except ImportError:
        # Fallback for local structure: assuming this is in benchmarks/
        # and models is in scripts/
        sys.path.append(str(Path(__file__).parent.parent / "scripts"))
        import models

# Import from shared config
MODEL_TABLE = models.MODEL_TABLE
MODELS_TO_RUN = models.MODELS_TO_RUN
GPU_UTIL = models.GPU_UTIL
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
OFF_FORCED_OUTPUT = models.OFF_FORCED_OUTPUT
DEFAULT_BATCH_TOKENS = models.DEFAULT_BATCH_TOKENS

# Fallbacks
FALLBACK_INPUT_LEN = 1024
FALLBACK_OUTPUT_LEN = 512

RESULTS_DIR = Path("benchmark_results")
RESULTS_DIR.mkdir(exist_ok=True)

# Output directory for the runs that force the ROCM_ATTN backend.
ROCM_RESULTS_DIR = Path("benchmark_results_rocm")

# Every result file name ends with this suffix.
THROUGHPUT_SUFFIX = "_throughput.json"


# =========================
# UTILS
# =========================
def log(msg):
    """Print *msg* with the ``[BENCH]`` prefix, preceded by a blank line."""
    print(f"\n[BENCH] {msg}")


def get_gpu_count():
    """Return the number of GPUs to benchmark on.

    ``rocm-smi --showid`` is invoked only as a health probe; the result is
    deliberately forced to 1 (Strix Halo APU) whether or not the probe
    succeeds.
    """
    try:
        # The probe's output is never inspected, so discard it instead of
        # capturing and decoding it.
        res = subprocess.run(
            ["rocm-smi", "--showid"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as e:
        log(f"Error detecting GPUs: {e}, defaulting to 1 GPU")
        return 1
    if res.returncode != 0:
        log("rocm-smi failed, defaulting to 1 GPU (Hardcoded Fallback)")
    return 1  # Force return 1 for Strix Halo APU


def kill_vllm():
    """SIGKILL any process whose command line contains ``vllm serve``.

    Waits five seconds afterwards, as the original pipeline did, before the
    caller starts the next run.
    """
    try:
        # List form (no shell): pkill matches the full command line with -f.
        subprocess.run(
            ["pkill", "-9", "-f", "vllm serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as e:
        # Best effort: a missing pkill binary must not abort the benchmark.
        log(f"WARNING: could not run pkill ({e})")
    time.sleep(5)


def nuke_vllm_cache():
    """Delete and recreate ``~/.cache/vllm`` (best effort).

    Failures are reported but never raised, so a cache that cannot be
    cleared does not stop the benchmark.
    """
    cache = Path.home() / ".cache" / "vllm"
    if not cache.exists():
        return
    try:
        subprocess.run(["rm", "-rf", str(cache)], check=True)
        cache.mkdir(parents=True, exist_ok=True)
    except (subprocess.CalledProcessError, OSError) as e:
        log(f"WARNING: could not reset vLLM cache at {cache} ({e})")
        return
    time.sleep(2)


def get_dataset():
    """Return the path of the ShareGPT dataset, downloading it if needed.

    Returns:
        The dataset path as a string, or ``None`` when the download fails
        (the caller then falls back to random prompts).
    """
    data_path = Path("ShareGPT_V3_unfiltered_cleaned_split.json")
    if data_path.exists():
        return str(data_path)

    log("Downloading ShareGPT dataset...")
    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
    # Download into a sibling ".part" file and rename only on success, so an
    # interrupted transfer can never be mistaken for a complete dataset by
    # the exists() check above on a later run.
    tmp_path = data_path.with_name(data_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, data_path)
        return str(data_path)
    except (requests.RequestException, OSError) as e:
        log(f"WARNING: ShareGPT download failed ({e}). using RANDOM.")
        try:
            if tmp_path.exists():
                tmp_path.unlink()
        except OSError:
            pass  # leftover .part file is harmless; it is never read
        return None


def get_model_args(model, tp_size, overrides=None):
    """Build the model-related CLI arguments for a vLLM invocation.

    Args:
        model: Model identifier, also used as key into ``MODEL_TABLE``.
        tp_size: Tensor-parallel size.
        overrides: Optional dict; the keys ``gpu_util``, ``max_num_seqs``
            and ``ctx`` take precedence over the ``MODEL_TABLE`` entry.

    Returns:
        A list of command-line tokens.
    """
    config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
    overrides = overrides or {}

    # Allow per-model GPU utilization override
    util = overrides.get("gpu_util", config.get("gpu_util", GPU_UTIL))
    max_seqs = overrides.get("max_num_seqs", config.get("max_num_seqs", "32"))

    cmd = [
        "--model", model,
        "--gpu-memory-utilization", str(util),
        "--dtype", "auto",
        "--tensor-parallel-size", str(tp_size),
        "--max-num-seqs", str(max_seqs),
    ]

    # Optional: if a model really needs a hard limit, we can still support
    # "ctx" in config, but by default we rely on auto.
    if "ctx" in overrides or "ctx" in config:
        ctx = overrides.get("ctx", config.get("ctx"))
        cmd.extend(["--max-model-len", str(ctx)])

    if config.get("trust_remote"):
        cmd.append("--trust-remote-code")
    if config.get("enforce_eager"):
        cmd.append("--enforce-eager")

    return cmd


def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None, overrides=None):
    """Run one ``vllm bench throughput`` job and store its JSON result.

    The run is skipped when *tp_size* is not listed in the model's
    ``valid_tp`` or when the output file already exists.

    Args:
        model: Model identifier; must be a key of ``MODEL_TABLE``.
        tp_size: Tensor-parallel size.
        backend_name: Label used in logs; the value ``"ROCm-Attn"`` also
            adds ``--attention-backend ROCM_ATTN`` to the command.
        output_dir: Directory for the result file (created if missing).
        extra_env: Optional environment variables applied last.
        overrides: Optional dict with ``tag``, ``max_tokens`` and the keys
            understood by :func:`get_model_args`.
    """
    model_config = MODEL_TABLE[model]
    if tp_size not in model_config["valid_tp"]:
        return

    overrides = overrides or {}
    model_safe = model.replace("/", "_")

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    tag = overrides.get("tag", "").strip()
    tag_suffix = f"_{tag}" if tag else ""
    output_file = output_dir_path / f"{model_safe}_tp{tp_size}{tag_suffix}{THROUGHPUT_SUFFIX}"

    if output_file.exists():
        log(f"SKIP {model} (TP={tp_size} | {backend_name})")
        return

    dataset_path = get_dataset()
    if dataset_path:
        dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path]
    else:
        dataset_args = ["--input-len", str(FALLBACK_INPUT_LEN)]

    # Retrieve Model-Specific Batch Tokens
    batch_tokens = str(
        overrides.get("max_tokens", model_config.get("max_tokens", DEFAULT_BATCH_TOKENS))
    )

    log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...")
    kill_vllm()
    nuke_vllm_cache()

    cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size, overrides)
    cmd.extend([
        "--num-prompts", str(OFF_NUM_PROMPTS),
        "--max-num-batched-tokens", batch_tokens,
        # str(): argv entries must be strings whatever type the config uses.
        "--output-len", str(OFF_FORCED_OUTPUT),
        "--output-json", str(output_file),
        "--disable-log-stats",
    ])
    cmd.extend(dataset_args)

    # Force Attention Backend via CLI if ROCm-Attn
    if backend_name == "ROCm-Attn":
        cmd.extend(["--attention-backend", "ROCM_ATTN"])

    # ENV Setup: Global + Model Specific
    env = os.environ.copy()
    env["VLLM_DISABLE_COMPILE_CACHE"] = "1"

    # Inject model specific env vars (e.g. for AWQ)
    env.update(model_config.get("env", {}))

    # Extra Env
    if extra_env:
        env.update(extra_env)

    try:
        subprocess.run(cmd, check=True, env=env)
    except (subprocess.CalledProcessError, OSError) as e:
        # Only benchmark failures are absorbed; KeyboardInterrupt propagates
        # so Ctrl-C stops the whole sweep instead of just the current model.
        log(f"ERROR: Failed {model} [{backend_name}] ({e})")


def _collect_tags(results_dir, prefix):
    """Return the set of filename tags found for *prefix* in *results_dir*."""
    tags = set()
    for path in Path(results_dir).glob(f"{prefix}*{THROUGHPUT_SUFFIX}"):
        middle = path.name[len(prefix):-len(THROUGHPUT_SUFFIX)]
        if not middle:
            tags.add("")
        elif middle.startswith("_"):
            # Drop exactly the one separator run_throughput inserted.
            tags.add(middle[1:])
        # Anything else belongs to another TP size that shares the leading
        # digits (e.g. prefix "..._tp1" vs file "..._tp16_..."): ignore it.
    return tags


def _read_tokens_per_second(path):
    """Return ``tokens_per_second`` from *path* formatted to one decimal.

    Returns ``"N/A"`` when the file is missing, unreadable or malformed.
    """
    try:
        if not path.exists():
            return "N/A"
        data = json.loads(path.read_text())
        return f"{data.get('tokens_per_second', 0):.1f}"
    except (OSError, ValueError, TypeError, AttributeError):
        return "N/A"


def print_summary(tps):
    """Print a table comparing default and ROCm-attention throughput.

    Args:
        tps: Iterable of tensor-parallel sizes to include.
    """
    print(f"\n{'MODEL':<40} | {'TP':<2} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
    print("-" * 92)

    for m in MODELS_TO_RUN:
        msafe = m.replace("/", "_")
        name_cell = m.split("/")[-1]
        for tp in tps:
            if tp not in MODEL_TABLE[m]["valid_tp"]:
                continue

            prefix = f"{msafe}_tp{tp}"
            tags = _collect_tags(RESULTS_DIR, prefix) | _collect_tags(ROCM_RESULTS_DIR, prefix)
            if not tags:
                tags.add("")  # Default empty tag if no files found

            for tag in sorted(tags):
                tag_suffix = f"_{tag}" if tag else ""
                file_name = f"{prefix}{tag_suffix}{THROUGHPUT_SUFFIX}"

                val1 = _read_tokens_per_second(RESULTS_DIR / file_name)
                val2 = _read_tokens_per_second(ROCM_RESULTS_DIR / file_name)

                display_tag = tag if tag else "(Default)"
                print(f"{name_cell:<40} | {tp:<2} | {display_tag:<15} | {val1:<8} | {val2:<8}")

    print("-" * 92)


def _select_models_tui():
    """Show the model checklist and return the chosen models.

    Exits the process when the dialog is cancelled or nothing is selected.
    """
    checklist_args = [
        "--clear",
        "--backtitle", "AMD vLLM Benchmark Launcher",
        "--title", "Model Selection",
        "--checklist", "Select models to benchmark:", "20", "65", "10",
    ]
    for m in MODELS_TO_RUN:
        # All selected "on" by default
        checklist_args.extend([m, m.split("/")[-1], "on"])

    choice = bench_utils.run_dialog(checklist_args)
    if choice is None:
        subprocess.run(["clear"])
        print("Cancelled by user.")
        sys.exit(0)

    # Parse space-separated quoted output from dialog checklist
    selected = shlex.split(choice)
    if not selected:
        subprocess.run(["clear"])
        print("No models selected. Exiting.")
        sys.exit(0)
    return selected


def _prompt_overrides_tui(model, tp):
    """Show the tuning form for *model* and return the overrides dict.

    Returns ``None`` when the user cancels the form.
    """
    config = MODEL_TABLE.get(model, {})
    default_seqs = config.get("max_num_seqs", "32")
    default_tokens = config.get("max_tokens", DEFAULT_BATCH_TOKENS)
    default_util = config.get("gpu_util", GPU_UTIL)
    default_ctx = config.get("ctx", "auto")

    form_args = [
        "--clear",
        "--backtitle", f"AMD vLLM Benchmark Configuration (TP: {tp})",
        "--title", f"Tune Parameters: {model.split('/')[-1]}",
        "--form", "Edit the options below. Leave tag empty for no suffix.",
        "15", "70", "5",
        "Max Concurrent Seqs:", "1", "1", str(default_seqs), "1", "25", "15", "0",
        "Max Batched Tokens:", "2", "1", str(default_tokens), "2", "25", "15", "0",
        "GPU Utilization (0-1):", "3", "1", str(default_util), "3", "25", "15", "0",
        "Max Context Length:", "4", "1", str(default_ctx), "4", "25", "15", "0",
        "Filename Tag (Optional):", "5", "1", "", "5", "25", "15", "0",
    ]

    form_res = bench_utils.run_dialog(form_args)
    if form_res is None:
        return None

    fields = [line.strip() for line in form_res.splitlines()]
    # NOTE(review): presumably run_dialog may strip trailing blank lines
    # (e.g. an empty tag) - confirm against bench_utils. Padding keeps the
    # leading fields usable in that case instead of discarding them all.
    fields += [""] * (5 - len(fields))
    seqs, tokens, util, ctx_val, tag = fields[:5]

    overrides = {}
    # A blanked-out field means "use the default", not "pass an empty value".
    if seqs:
        overrides["max_num_seqs"] = seqs
    if tokens:
        overrides["max_tokens"] = tokens
    if util:
        overrides["gpu_util"] = util
    if ctx_val and ctx_val.lower() != "auto":
        overrides["ctx"] = ctx_val
    overrides["tag"] = tag
    return overrides


def main(argv=None):
    """Parse arguments, run every benchmark combination, print the summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--tp", type=int, nargs="+", default=[1])
    parser.add_argument("--tui", action="store_true", help="Launch interactive configuration UI")
    args = parser.parse_args(argv)

    gpu_count = get_gpu_count()
    log(f"Detected {gpu_count} AMD GPU(s)")
    valid_tp_args = [t for t in args.tp if t <= gpu_count]
    if not valid_tp_args:
        log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. Nothing to run.")
        sys.exit(0)

    selected_models = _select_models_tui() if args.tui else MODELS_TO_RUN

    kill_vllm()
    for tp in valid_tp_args:
        for m in selected_models:
            overrides = {}
            if args.tui:
                overrides = _prompt_overrides_tui(m, tp)
                if overrides is None:
                    subprocess.run(["clear"])
                    print(f"Skipping {m} (TP={tp}) due to user cancellation.")
                    continue

            # 1. Default (Triton)
            run_throughput(m, tp, "Default", RESULTS_DIR, overrides=overrides)

            # 2. ROCm Attention
            # We force this via CLI argument --attention-backend ROCM_ATTN
            # inside run_throughput. No specific env vars needed if forcing
            # backend.
            rocm_env = {}
            print(f"[DEBUG] Forcing ROCm Env: {rocm_env} + CLI: --attention-backend ROCM_ATTN")
            run_throughput(m, tp, "ROCm-Attn", ROCM_RESULTS_DIR, rocm_env, overrides=overrides)

    print_summary(valid_tp_args)


if __name__ == "__main__":
    main()