updates
Этот коммит содержится в:
@@ -89,11 +89,11 @@ echo
|
||||
printf 'Machine: %s\n' "$MACHINE"
|
||||
printf 'GPU : %s\n\n' "$GPU"
|
||||
printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest\n\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
|
||||
printf 'Included:\n'
|
||||
printf ' - %-16s → %s\n' "start-vllm (wizard)" "Beginner-friendly launcher that guides you through model & settings"
|
||||
printf ' - %-16s → %s\n' "vLLM server" "vllm serve Qwen/Qwen2.5-7B-Instruct --download-dir ~/vllm-models"
|
||||
printf ' - %-16s → %s\n' "API test" "curl localhost:8000/v1/chat/completions (see README)"
|
||||
printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
|
||||
printf ' - %-16s → %s\n' "vLLM server" "vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
printf ' - %-16s → %s\n' "API test" "curl localhost:8000/v1/chat/completions"
|
||||
echo
|
||||
printf 'SSH tip: ssh -L 8000:localhost:8000 user@host\n\n'
|
||||
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
#!/usr/bin/env bash
# Interactive vLLM launcher: shows a numbered list of predefined models, reads
# the user's choice, and replaces this shell with `vllm serve` for that model.
set -euo pipefail

# Defaults (override via env: HOST, PORT, DOWNLOAD_DIR, EXTRA_FLAGS)
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}"
EXTRA_FLAGS="${EXTRA_FLAGS:-}"

# Each entry is "menu label|model repo id|per-model flags" (flags may be empty).
models=(
  "Llama 2 7B Chat|meta-llama/Llama-2-7b-chat-hf|"
  "Qwen2.5 7B Instruct|Qwen/Qwen2.5-7B-Instruct|"
  "Qwen3 30B A3B Instruct|Qwen/Qwen3-30B-A3B-Instruct-2507|"
  "Qwen3 14B AWQ|Qwen/Qwen3-14B-AWQ|--quantization awq --dtype float16 --enforce-eager"
  "Gemma 3 27B instruct|google/gemma-3-27b-it|"
  "Gemma 3 12B Instruct|google/gemma-3-12b-it|"
  "Gemma 3 4B Instruct|google/gemma-3-4b-it|"
)

echo "Select a model:"
for i in "${!models[@]}"; do
  # Everything before the first '|' is the menu label.
  name="${models[$i]%%|*}"
  printf " [%d] %s\n" "$((i+1))" "$name"
done

read -rp "Enter number: " choice
# Accept only a positive integer without leading zeros, then range-check it.
[[ "$choice" =~ ^[1-9][0-9]*$ ]] || { echo "Invalid choice."; exit 1; }
idx=$((choice-1))
(( idx >= 0 && idx < ${#models[@]} )) || { echo "Invalid choice."; exit 1; }

# The label field is not needed past the menu, so it is read into `_`.
IFS='|' read -r _ repo flags <<< "${models[$idx]}"

mkdir -p "$DOWNLOAD_DIR"

CMD=(vllm serve "$repo" --host "$HOST" --port "$PORT" --download-dir "$DOWNLOAD_DIR")

# Per-model flags (intentionally word-split into separate arguments)
if [[ -n "${flags:-}" ]]; then
  # shellcheck disable=SC2206
  CMD+=($flags)
fi

# Optional global extras: e.g. EXTRA_FLAGS="--gpu-memory-utilization 0.8"
if [[ -n "${EXTRA_FLAGS:-}" ]]; then
  # shellcheck disable=SC2206
  CMD+=($EXTRA_FLAGS)
fi

# printf with ${CMD[*]} joins the array into one word and, unlike `echo -e`,
# does not interpret backslash sequences that may occur in the arguments.
printf 'Running:\n\n %s\n\n' "${CMD[*]}"
echo "API test → curl -s http://localhost:${PORT}/v1/models | jq -r '.data[0].id'"
echo "SSH tip → ssh -L ${PORT}:localhost:${PORT} user@host"
echo

exec "${CMD[@]}"
|
||||
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# Locate the benchmark configuration module. /opt (container layout) is
# checked first; otherwise the ../benchmarks directory next to this script's
# parent directory is used.
SCRIPT_DIR = Path(__file__).parent.resolve()
BENCH_DIR = SCRIPT_DIR.parent / "benchmarks"
OPT_DIR = Path("/opt")

_config_dir = OPT_DIR if (OPT_DIR / "run_vllm_bench.py").exists() else BENCH_DIR
sys.path.append(str(_config_dir))

try:
    from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN
except ImportError:
    print("Error: Could not import run_vllm_bench.py config.")
    sys.exit(1)

# The results file follows the same rule: prefer /opt, else the benchmarks dir.
_opt_results = OPT_DIR / "max_context_results.json"
RESULTS_FILE = (
    _opt_results
    if _opt_results.exists()
    else BENCH_DIR / "max_context_results.json"
)

# Bind address and port for the server, overridable through the environment.
HOST = os.environ.get("HOST", "0.0.0.0")
PORT = os.environ.get("PORT", "8000")
|
||||
|
||||
def check_dependencies():
    """Exit with status 1 unless the ``dialog`` executable is found on PATH."""
    if shutil.which("dialog") is not None:
        return
    print("Error: 'dialog' is required. Please install it (apt-get install dialog).")
    sys.exit(1)
|
||||
|
||||
def detect_gpus():
    """Return the number of GPUs detected on this host, never less than 1.

    ``rocm-smi --showid --csv`` is tried first. If it is unavailable, fails,
    or reports nothing, the ``renderD*`` nodes under ``/dev/dri`` are counted
    instead.

    Returns:
        int: The detected GPU count. Falls back to 1 when nothing can be
        detected, so callers never derive a tensor-parallel size of 0.
    """
    try:
        res = subprocess.run(
            ["rocm-smi", "--showid", "--csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # rocm-smi is not installed, could not be started, or produced
        # undecodable output; fall through to the /dev/dri probe.
        res = None

    if res is not None and res.returncode == 0:
        # NOTE(review): this counts every occurrence of "GPU" in the CSV text,
        # which may include header cells - confirm against real rocm-smi output.
        count = res.stdout.count("GPU")
        if count > 0:
            return count

    # Fallback to /dev/dri/renderD*
    try:
        render_nodes = sum(1 for _ in Path("/dev/dri").glob("renderD*"))
    except OSError:
        return 1
    # A missing /dev/dri yields an empty glob rather than an error.
    return max(render_nodes, 1)
|
||||
|
||||
def get_verified_config(model_id, tp_size, max_seqs):
    """Look up the best recorded launch settings for a model.

    Reads ``RESULTS_FILE`` and keeps the entries whose ``model``, ``tp`` and
    ``max_seqs`` equal the arguments and whose ``status`` is ``"success"``.
    Among those, the entry with the highest ``util`` wins, with ties broken by
    the larger ``max_context_1_user``.

    Returns:
        dict: ``{'ctx': ..., 'util': float}``. When the results file is
        missing, has no matching entry, or cannot be processed, the fallback
        is the model's ``ctx`` from ``MODEL_TABLE`` (8192 if absent) with a
        utilization of 0.90.
    """
    fallback = {
        "ctx": int(MODEL_TABLE.get(model_id, {}).get("ctx", 8192)),
        "util": 0.90,  # Safe default
    }

    if not RESULTS_FILE.exists():
        return fallback

    try:
        with RESULTS_FILE.open("r") as handle:
            records = json.load(handle)

        # Exact match on model + TP + concurrent sequences, successes only.
        candidates = []
        for rec in records:
            if rec["model"] != model_id:
                continue
            if rec["tp"] != tp_size:
                continue
            if rec["max_seqs"] != max_seqs:
                continue
            if rec["status"] != "success":
                continue
            candidates.append(rec)

        if not candidates:
            return fallback

        # Highest utilization first, then largest context.
        ranked = sorted(
            candidates,
            key=lambda rec: (float(rec["util"]), rec["max_context_1_user"]),
            reverse=True,
        )
        winner = ranked[0]
        return {
            "ctx": winner["max_context_1_user"],
            "util": float(winner["util"]),
        }
    except Exception:
        # Any problem reading or interpreting the file falls back to defaults.
        return fallback
|
||||
|
||||
def run_dialog(args):
    """Run ``dialog`` with *args* and return what it wrote to stderr.

    The stderr stream is captured in a temporary file. Returns the captured
    text with surrounding whitespace stripped, or None when ``dialog`` exits
    with a non-zero status (for example when the user cancels).
    """
    with tempfile.NamedTemporaryFile(mode="w+") as capture:
        try:
            subprocess.run(["dialog", *args], stderr=capture, check=True)
        except subprocess.CalledProcessError:
            return None  # User cancelled
        capture.seek(0)
        return capture.read().strip()
|
||||
|
||||
def nuke_vllm_cache(pause=1.0):
    """Delete and recreate ``~/.cache/vllm``.

    Does nothing when the directory does not exist. Progress is printed to
    stdout; a failure is reported there instead of being raised, so a launch
    can continue after a failed clean-up.

    Args:
        pause: Seconds to wait after a successful clear so the message stays
            visible. Values <= 0 skip the wait.
    """
    # Imported here because the module's import block does not include
    # ``time``; the original call failed with NameError on every successful run.
    import time

    cache = Path.home() / ".cache" / "vllm"
    if not cache.exists():
        return

    try:
        print(f"Clearing vLLM cache at {cache}...", end="", flush=True)
        subprocess.run(["rm", "-rf", str(cache)], check=True)
        cache.mkdir(parents=True, exist_ok=True)
        print(" Done.")
        if pause > 0:
            time.sleep(pause)
    except (OSError, subprocess.SubprocessError) as e:
        print(f" Failed: {e}")
|
||||
|
||||
def _parse_number(text, cast):
    """Convert dialog input with *cast* (``int`` or ``float``); None if it does not parse."""
    try:
        return cast(text)
    except (TypeError, ValueError):
        return None


def _show_input_error(message):
    """Show *message* in a dialog message box titled "Invalid Input"."""
    run_dialog(["--title", "Invalid Input", "--msgbox", message, "8", "50"])


def configure_and_launch(model_idx, gpu_count):
    """Interactively configure one model, then replace this process with ``vllm serve``.

    A menu loop lets the user adjust tensor parallelism, concurrent requests,
    context length, GPU memory utilization, attention backend, cache erasure
    and eager mode. Changing tensor parallelism or concurrency re-reads the
    recorded settings via ``get_verified_config`` and resets context length
    and utilization to them.

    Args:
        model_idx: Index into ``MODELS_TO_RUN`` of the model to serve.
        gpu_count: Number of detected GPUs; caps the default tensor-parallel
            size.

    Returns:
        False when the user cancels the configuration menu. When the user
        launches, the function does not return: ``os.execvpe`` replaces the
        current process with ``vllm``.
    """
    model_id = MODELS_TO_RUN[model_idx]
    config = MODEL_TABLE[model_id]

    # Static Config
    valid_tps = config.get("valid_tp", [1])
    max_tp = max(valid_tps) if valid_tps else 1

    # Defaults. Clamp to 1 so a GPU count of 0 cannot yield a TP size of 0.
    current_tp = max(1, min(gpu_count, max_tp))
    current_seqs = 1  # Default to 1 concurrent user/request for stability

    # Initial Lookup
    verified = get_verified_config(model_id, current_tp, current_seqs)
    current_ctx = verified["ctx"]
    current_util = verified["util"]

    clear_cache = False
    use_eager = config.get("enforce_eager", False)  # Default comes from the model config
    use_rocm_attn = False  # Default to Triton

    name = model_id.split("/")[-1]

    while True:
        cache_status = "YES" if clear_cache else "NO"
        eager_status = "YES" if use_eager else "NO"
        attn_backend = "ROCm" if use_rocm_attn else "Triton"

        menu_args = [
            "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})",
            "--title", f"Configuration: {name}",
            "--menu", "Customize Launch Parameters:", "22", "65", "9",
            "1", f"Tensor Parallelism: {current_tp}",
            "2", f"Concurrent Requests: {current_seqs}",
            "3", f"Context Length: {current_ctx} (Verified)",
            "4", f"GPU Utilization: {current_util} (Verified)",
            "5", f"Attention Backend: {attn_backend}",
            "6", f"Erase vLLM Cache: {cache_status}",
            "7", f"Force Eager Mode: {eager_status}",
            "8", "LAUNCH SERVER",
        ]

        choice = run_dialog(menu_args)
        if not choice:
            return False  # Back/Cancel

        # Set when TP or concurrency changed, so the recorded settings are re-read.
        relookup = False

        if choice == "1":
            # TP Selection
            picked = run_dialog([
                "--title", "Tensor Parallelism",
                "--rangebox", f"Set TP Size (1-{max_tp})", "10", "40",
                "1", str(max_tp), str(current_tp),
            ])
            new_tp = _parse_number(picked, int) if picked else None
            if new_tp is not None and 1 <= new_tp <= max_tp and new_tp != current_tp:
                current_tp = new_tp
                relookup = True

        elif choice == "2":
            # Max Seqs Selection
            picked = run_dialog([
                "--title", "Concurrent Requests",
                "--menu", "Select Max Concurrent Requests:", "12", "40", "4",
                "1", "1 (Latency Focus)",
                "4", "4 (Balanced)",
                "8", "8 (Throughput)",
                "16", "16 (Max Load)",
            ])
            new_seqs = _parse_number(picked, int) if picked else None
            if new_seqs is not None:
                current_seqs = new_seqs
                relookup = True

        elif choice == "3":
            # Context length override; free-text input, so it must be validated.
            picked = run_dialog([
                "--title", "Context Length",
                "--inputbox", f"Override verified limit ({current_ctx}):", "10", "40", str(current_ctx),
            ])
            if picked:
                new_ctx = _parse_number(picked, int)
                if new_ctx is not None and new_ctx > 0:
                    current_ctx = new_ctx
                else:
                    _show_input_error("Context length must be a positive whole number.")

        elif choice == "4":
            # Utilization override; must be a fraction in (0, 1].
            picked = run_dialog([
                "--title", "GPU Utilization",
                "--inputbox", f"Override verified fraction ({current_util}):", "10", "40", str(current_util),
            ])
            if picked:
                new_util = _parse_number(picked, float)
                if new_util is not None and 0 < new_util <= 1:
                    current_util = new_util
                else:
                    _show_input_error("GPU utilization must be a number greater than 0 and at most 1.")

        elif choice == "5":
            # Toggle Attention Backend
            use_rocm_attn = not use_rocm_attn

        elif choice == "6":
            # Toggle Cache
            if clear_cache:
                # Disabling it -> No warning needed
                clear_cache = False
            else:
                # Enabling it -> Show Warning
                warn_msg = (
                    "WARNING: Erasing the vLLM cache will remove the compiled compute graphs.\n\n"
                    "This is useful if you are experiencing crashes, 'invalid graph' errors,\n"
                    "or have switched vLLM versions recently.\n\n"
                    "However, the next startup will take longer as graphs are re-compiled.\n\n"
                    "Are you sure you want to enable this?"
                )
                confirm = run_dialog([
                    "--title", "Erase Cache Warning",
                    "--yesno", warn_msg, "12", "60",
                ])
                # run_dialog returns None on a non-zero exit (No/Cancel) and an
                # empty string on Yes, so test identity rather than truthiness.
                if confirm is not None:
                    clear_cache = True

        elif choice == "7":
            # Toggle Eager Mode
            use_eager = not use_eager

        elif choice == "8":
            # Launch
            break

        if relookup:
            # Re-read the recorded settings for the new TP/concurrency pair.
            verified = get_verified_config(model_id, current_tp, current_seqs)
            current_ctx = verified["ctx"]
            current_util = verified["util"]

    # Build Command
    subprocess.run(["clear"])

    if clear_cache:
        nuke_vllm_cache()

    cmd = [
        "vllm", "serve", model_id,
        "--host", HOST,
        "--port", PORT,
        "--tensor-parallel-size", str(current_tp),
        "--max-num-seqs", str(current_seqs),
        "--max-model-len", str(current_ctx),
        "--gpu-memory-utilization", str(current_util),
        "--dtype", "auto",
    ]

    if config.get("trust_remote"):
        cmd.append("--trust-remote-code")
    if use_eager:
        cmd.append("--enforce-eager")

    # Env Vars: inherit the current environment, then apply per-model overrides.
    env = os.environ.copy()
    env.update(config.get("env", {}))

    if use_rocm_attn:
        env["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "1"
        env["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

    print("\n" + "=" * 60)
    print(f" Launching: {name}")
    print(f" Config: TP={current_tp} | Seqs={current_seqs} | Ctx={current_ctx} | Util={current_util}")
    print(f" Backend: {'ROCm' if use_rocm_attn else 'Triton'}")
    if clear_cache:
        print(" Action: Clearing vLLM Cache (~/.cache/vllm)")
    print(f" Command: {' '.join(cmd)}")
    print("=" * 60 + "\n")

    # Replaces the current process; nothing after this line runs on success.
    os.execvpe("vllm", cmd, env)
|
||||
|
||||
def main():
    """Run the launcher: pick a model from a menu, then configure and launch it.

    Checks that ``dialog`` is available, detects the GPU count, and then loops
    over the model menu. Cancelling the model menu clears the screen and exits
    with status 0. Cancelling the per-model configuration returns to the model
    menu.
    """
    check_dependencies()
    gpu_count = detect_gpus()

    # The menu entries do not change between iterations, so build them once.
    # Each entry is a (tag, label) pair: the tag is the index into
    # MODELS_TO_RUN, the label is the last path component of the model id.
    menu_items = []
    for i, m_id in enumerate(MODELS_TO_RUN):
        menu_items.extend([str(i), m_id.split("/")[-1]])

    while True:
        choice = run_dialog([
            "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})",
            "--title", "Select Model",
            "--menu", "Choose a model to serve:", "20", "60", "10",
        ] + menu_items)

        if not choice:
            subprocess.run(["clear"])
            print("Selection cancelled.")
            sys.exit(0)

        configure_and_launch(int(choice), gpu_count)
|
||||
|
||||
# Start the launcher only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Ссылка в новой задаче
Block a user