From 46f4003f79da1e77e7bdf72473de418f306d97ec Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Wed, 3 Sep 2025 22:37:26 +0100 Subject: [PATCH] added start-vllm script --- Dockerfile.vllm-therock-gfx1151-aotriton | 3 + scripts/start-vllm.sh | 211 +++++++++++++++++++++++ scripts/vllm-banner.sh | 119 +++++++++---- 3 files changed, 295 insertions(+), 38 deletions(-) create mode 100644 scripts/start-vllm.sh diff --git a/Dockerfile.vllm-therock-gfx1151-aotriton b/Dockerfile.vllm-therock-gfx1151-aotriton index 79c77bf..3d9b807 100644 --- a/Dockerfile.vllm-therock-gfx1151-aotriton +++ b/Dockerfile.vllm-therock-gfx1151-aotriton @@ -94,4 +94,7 @@ COPY scripts/vllm-env.sh /etc/profile.d/vllm-env.sh COPY scripts/vllm-banner.sh /etc/profile.d/vllm-banner.sh RUN chmod 644 /etc/profile.d/vllm-env.sh /etc/profile.d/vllm-banner.sh +COPY scripts/start_vllm /usr/local/bin/start-vllm +RUN chmod 755 /usr/local/bin/start-vllm + CMD ["bash", "-c", "source .venv/bin/activate && bash"] \ No newline at end of file diff --git a/scripts/start-vllm.sh b/scripts/start-vllm.sh new file mode 100644 index 0000000..2f30f9b --- /dev/null +++ b/scripts/start-vllm.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash +set -euo pipefail + +# start_vllm — interactive helper to launch vLLM on AMD Strix Halo (gfx1151) +# - Presents a curated list of recent HF models that fit within ~100GB memory (with FP16 or AWQ) +# - Asks for context length, concurrency, kv‑cache dtype, port, etc. +# - Starts vLLM with sensible ROCm defaults for Strix Halo +# +# Requirements inside the toolbox/container: +# - vLLM installed in /torch-therock/.venv (this image has it) +# - internet for first model download (or pre‑downloaded into ~/vllm-models) +# - optional: ~/.cache/vllm mapped to persist compile cache when using Podman/Docker +# +# Notes on quantization: +# - vLLM supports weight‑only quantized models like AWQ and GPTQ (load pre‑quantized repos). +# - For AMD GPUs, FP8 KV‑cache can be supported but is experimental on consumer APUs; INT8 KV‑cache is a safer saver. +# - Qwen3 provides AWQ variants officially; using them can materially reduce memory use. (You do NOT need GGUF; that is for llama.cpp.) +# +# Model memory rule of thumb (VERY rough): +# - FP16 weights ≈ 2 bytes/parameter. So 12B ≈ ~24 GB; 27B ≈ ~54 GB; 32B ≈ ~64 GB (weights only). +# - Plus KV‑cache, which grows with context & concurrency. If you OOM, lower max context or max concurrent requests. +# +# Default directories +DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}" +CACHE_DIR_DEFAULT="$HOME/.cache/vllm" +PORT_DEFAULT="8000" +HOST_DEFAULT="0.0.0.0" +GPU_UTIL_DEFAULT="0.92" +MAX_NUM_SEQS_DEFAULT="4" +MAX_MODEL_LEN_DEFAULT="16384" +KV_CACHE_DTYPE_DEFAULT="auto" # choices: auto|int8|fp8 (fp8_e4m3) +DTYPE_DEFAULT="float16" # choices: float16|bfloat16 + +VENV_ACTIVATE="/torch-therock/.venv/bin/activate" +if [[ -f "$VENV_ACTIVATE" ]]; then + # shellcheck disable=SC1090 + source "$VENV_ACTIVATE" +fi + +print_divider() { printf '\n%s\n' "────────────────────────────────────────────────────────"; } + +# --- curated model list (recent, likely to fit <= ~100GB with sane settings) --- +# Format: label|hf_repo|quant_hint +MODELS=( + # label | hf_repo | quant_hint | compat | note + "Llama-4 Scout 17B-16E Instruct FP4|nvidia/Llama-4-Scout-17B-16E-Instruct-FP4|modelopt|nvidia_only|Optimized for NVIDIA; FP4 path may not work on AMD/ROCm" + "Llama-4 Scout 17B-16E Instruct FP8|nvidia/Llama-4-Scout-17B-16E-Instruct-FP8|modelopt|nvidia_only|Optimized for NVIDIA; FP8 ModelOpt path may not work on AMD/ROCm" + "OpenAI GPT‑OSS 20B (MXFP4)|openai/gpt-oss-20b|mxfp4|experimental|MXFP4 support requires recent vLLM; performance/compat on AMD RDNA iGPU varies" + "OpenAI GPT‑OSS 120B (MXFP4, huge)|openai/gpt-oss-120b|mxfp4|too_large|~120B total params; not practical on a single APU" + "GLM‑4.5‑Air FP8 (12B active)|zai-org/GLM-4.5-Air-FP8|fp8|multi_gpu_fp8|Published FP8; vendor recommends multi‑GPU with native FP8" + "Gemma 3 12B IT (FP16)|google/gemma-3-12b-it|fp16|amd_ok|Good baseline" + "Gemma 3 27B IT (FP16)|google/gemma-3-27b-it|fp16|borderline|Large; consider GPTQ variant if memory tight" + "Gemma 3 27B IT (GPTQ 4bit)|ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g|gptq|amd_ok|Weight‑only INT4 reduces memory; throughput may drop" + "Qwen3 8B Instruct (FP16)|Qwen/Qwen3-8B-Instruct|fp16|amd_ok|Solid quality, easy fit" + "Qwen3 8B Instruct (AWQ 4bit)|Qwen/Qwen3-8B-AWQ|awq|amd_ok|Official AWQ" + "Qwen3 14B Instruct (FP16)|Qwen/Qwen3-14B-Instruct|fp16|amd_ok|" + "Qwen3 14B Instruct (AWQ 4bit)|Qwen/Qwen3-14B-AWQ|awq|amd_ok|" + "Qwen3 30B A3B Instruct (FP16)|Qwen/Qwen3-30B-A3B-Instruct-2507|fp16|amd_ok|MoE; fits with careful context/concurrency" + "Qwen3 30B A3B Instruct (AWQ 4bit)|cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit|awq|community|Community AWQ; quality varies" +)|Qwen/Qwen3-8B-Instruct|fp16" + "Qwen3 8B Instruct (AWQ int4)|Qwen/Qwen3-8B-AWQ|awq" + "Qwen3 14B Instruct (FP16)|Qwen/Qwen3-14B-Instruct|fp16" + "Qwen3 14B Instruct (AWQ int4)|Qwen/Qwen3-14B-AWQ|awq" + "Gemma 3 12B IT (FP16)|google/gemma-3-12b-it|fp16" + "Gemma 3 27B IT (FP16, borderline on memory)|google/gemma-3-27b-it|fp16" + "OpenAI GPT‑OSS 20B (FP16)|openai/gpt-oss-20b|fp16" + "Llama 4 Scout 17B‑16E (experimental; may require 4‑bit not always supported)|meta-llama/Llama-4-Scout-17B-16E|experimental" +) + +cat <<'HDR' +Start vLLM — AMD Strix Halo (gfx1151) +This helper will: + 1) Let you pick a model (FP16 or AWQ when available) + 2) Ask for context length, concurrency, and KV‑cache dtype + 3) Launch vLLM with Strix‑friendly defaults +HDR + +print_divider +printf 'Model download dir (persisted on host) [%s]: ' "$DOWNLOAD_DIR" +read -r REPLY_DL +[[ -n "${REPLY_DL:-}" ]] && DOWNLOAD_DIR="$REPLY_DL" +mkdir -p "$DOWNLOAD_DIR" + +printf 'Cache dir for compiled kernels [%s]: ' "$CACHE_DIR_DEFAULT" +read -r REPLY_CACHE +[[ -n "${REPLY_CACHE:-}" ]] && export VLLM_CACHE_DIR="$REPLY_CACHE" || export VLLM_CACHE_DIR="$CACHE_DIR_DEFAULT" +mkdir -p "$VLLM_CACHE_DIR" + +print_divider +printf 'Select a model:\n' +idx=1 +for m in "${MODELS[@]}"; do + IFS='|' read -r label _ _ <<<"$m" + printf ' [%d] %s\n' "$idx" "$label" + idx=$((idx+1)) +done + +printf 'Enter number: ' +read -r CHOICE +if ! [[ "$CHOICE" =~ ^[0-9]+$ ]] || (( CHOICE < 1 || CHOICE > ${#MODELS[@]} )); then + echo 'Invalid choice.'; exit 1 +fi +SEL="${MODELS[$((CHOICE-1))]}" +IFS='|' read -r SEL_LABEL HF_REPO QUANT_HINT COMPAT NOTE <<<"$SEL" + +# Quantization flag heuristic +QUANT_FLAG=() +case "$QUANT_HINT" in + awq) QUANT_FLAG=(--quantization awq) ;; + gptq) QUANT_FLAG=(--quantization gptq) ;; + mxfp4) QUANT_FLAG=(--quantization mxfp4) ;; + modelopt) QUANT_FLAG=(--quantization modelopt) ;; + fp16|fp8|bf16|auto|'') ;; # rely on model config +esac + +# Compatibility warnings +case "$COMPAT" in + nvidia_only) + echo "WARNING: This checkpoint is optimized for NVIDIA (TensorRT/ModelOpt). It may not run on AMD ROCm (RDNA iGPU)." ;; + multi_gpu_fp8) + echo "WARNING: Vendor docs indicate multi‑GPU FP8 is recommended. On a single Strix Halo APU this is likely impractical." ;; + too_large) + echo "WARNING: 120B‑class model is far beyond single‑APU capacity. Expect failure unless heavy offload/sharding is used." ;; + borderline) + echo "Note: Large model — keep context/concurrency modest or use a quantized variant." ;; + community) + echo "Note: Community quantization — quality/perf may vary." ;; + amd_ok|*) ;; +esac + +[[ -n "$NOTE" ]] && echo "Note: $NOTE" + +# fp16 default +esac + +print_divider +printf 'Max context tokens (--max-model-len) [%s]: ' "$MAX_MODEL_LEN_DEFAULT" +read -r REPLY_CTX +MAX_MODEL_LEN="${REPLY_CTX:-$MAX_MODEL_LEN_DEFAULT}" + +printf 'Max concurrent requests (--max-num-seqs) [%s]: ' "$MAX_NUM_SEQS_DEFAULT" +read -r REPLY_CONC +MAX_NUM_SEQS="${REPLY_CONC:-$MAX_NUM_SEQS_DEFAULT}" + +printf 'KV cache dtype (auto|int8|fp8) [%s]: ' "$KV_CACHE_DTYPE_DEFAULT" +read -r REPLY_KV +KV_CACHE_DTYPE="${REPLY_KV:-$KV_CACHE_DTYPE_DEFAULT}" + +printf 'Model dtype (float16|bfloat16) [%s]: ' "$DTYPE_DEFAULT" +read -r REPLY_DTYPE +DTYPE="${REPLY_DTYPE:-$DTYPE_DEFAULT}" + +printf 'GPU memory utilization (0.50‑0.98) [%s]: ' "$GPU_UTIL_DEFAULT" +read -r REPLY_UTIL +GPU_UTIL="${REPLY_UTIL:-$GPU_UTIL_DEFAULT}" + +printf 'Host bind address [%s]: ' "$HOST_DEFAULT" +read -r REPLY_HOST +HOST="${REPLY_HOST:-$HOST_DEFAULT}" + +printf 'Optional CPU offload in GB (0 to disable) [0]: ' +read -r REPLY_OFF +CPU_OFFLOAD_GB="${REPLY_OFF:-0}" + +printf 'Port [%s]: ' "$PORT_DEFAULT" +read -r REPLY_PORT +PORT="${REPLY_PORT:-$PORT_DEFAULT}" + +print_divider +CMD=( + vllm serve "$HF_REPO" + --host "$HOST" + --port "$PORT" + --download-dir "$DOWNLOAD_DIR" + --dtype "$DTYPE" + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$MAX_NUM_SEQS" + --gpu-memory-utilization "$GPU_UTIL" +) + +# Add CPU offload if requested +if [[ "$CPU_OFFLOAD_GB" =~ ^[0-9]+$ ]] && (( CPU_OFFLOAD_GB > 0 )); then + CMD+=(--cpu-offload-gb "$CPU_OFFLOAD_GB") +fi + +# kv‑cache dtype +if [[ "$KV_CACHE_DTYPE" != "auto" ]]; then + # Map fp8 -> fp8_e4m3 for AMD unless user typed explicit subtype already + if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then + CMD+=(--kv-cache-dtype fp8_e4m3) + else + CMD+=(--kv-cache-dtype "$KV_CACHE_DTYPE") + fi +fi + +# quantization flags (if any) +CMD+=("${QUANT_FLAG[@]}") + +# AMD ROCm/AOTriton helpful env +export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-gfx1151}" +export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 + +printf 'About to run:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' \\\n %q' "${CMD[$i]}"; done; printf '\n\n' + +read -r -p "Proceed? [Y/n] " yn +yn=${yn:-Y} +if [[ "$yn" =~ ^[Yy]$ ]]; then + exec "${CMD[@]}" +else + echo "Canceled." +fi diff --git a/scripts/vllm-banner.sh b/scripts/vllm-banner.sh index d1ee663..f3a1f50 100644 --- a/scripts/vllm-banner.sh +++ b/scripts/vllm-banner.sh @@ -1,5 +1,30 @@ #!/usr/bin/env bash -# vLLM Toolbox banner +# Lightweight banner with machine/GPU and ROCm version (vLLM edition) +# No Triton env sourcing, same info/format as the image/video banner. + +# Only show for interactive shells +case $- in *i*) ;; *) return 0 ;; esac + +oem_info() { + local v="" m="" d lv lm + for d in /sys/class/dmi/id /sys/devices/virtual/dmi/id; do + [[ -r "$d/sys_vendor" ]] && v=$(<"$d/sys_vendor") + [[ -r "$d/product_name" ]] && m=$(<"$d/product_name") + [[ -n "$v" || -n "$m" ]] && break + done + # ARM/SBC fallback + if [[ -z "$v" && -z "$m" && -r /proc/device-tree/model ]]; then + tr -d '\0' /dev/null | tail -n1 | cut -d, -f2) [[ -z "$name" ]] && name=$(rocm-smi --showproductname 2>/dev/null | grep -m1 -E 'Product Name|Card series' | sed 's/.*: //') fi - if [[ -z "$name" ]]; then - name="Unknown AMD GPU" + if [[ -z "$name" ]] && command -v rocminfo >/dev/null 2>&1; then + name=$(rocminfo 2>/dev/null | awk -F': ' '/^[[:space:]]*Name:/{print $2; exit}') fi - printf '%s\n' "$name" + if [[ -z "$name" ]] && command -v lspci >/dev/null 2>&1; then + name=$(lspci -nn 2>/dev/null | grep -Ei 'vga|display|gpu' | grep -i amd | head -n1 | cut -d: -f3-) + fi + # trim + name=$(printf '%s' "$name" | sed -e 's/^[[:space:]]\+//' -e 's/[[:space:]]\+$//' -e 's/[[:space:]]\{2,\}/ /g') + printf '%s\n' "${name:-Unknown AMD GPU}" } -vllm_version() { - python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown" -} - -# Simple model selector -vllm_start() { - echo - echo "Select a model to serve:" - echo "1) Qwen2.5-7B-Instruct (recommended, ~14GB VRAM)" - echo "2) Llama-3.1-8B-Instruct (~16GB VRAM)" - echo "3) Qwen3-8B (~16GB VRAM, latest with thinking mode)" - echo - read -p "Choose [1-3]: " choice - - case $choice in - 1) vllm serve Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 8000 --download-dir ~/models --dtype float16 --max-model-len 32768 ;; - 2) vllm serve meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 8000 --download-dir ~/models --dtype float16 --max-model-len 32768 ;; - 3) vllm serve Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 --download-dir ~/models --dtype float16 --max-model-len 32768 --enable-reasoning --reasoning-parser qwen3 ;; - *) echo "Invalid choice." ;; - esac +rocm_version() { + # Prefer the PyTorch HIP version from the venv, fallback to rocm pkg metadata + local PY="/torch-therock/.venv/bin/python" + [[ -x "$PY" ]] || PY="python" + "$PY" - <<'PY' 2>/dev/null || true +try: + import torch + v = getattr(getattr(torch, "version", None), "hip", "") or "" + if v: + print(v) + else: + raise Exception("no torch.version.hip") +except Exception: + try: + import importlib.metadata as im + try: + print(im.version("_rocm_sdk_core")) + except Exception: + print(im.version("rocm")) + except Exception: + print("") +PY } +MACHINE="$(oem_info)" GPU="$(gpu_name)" -VLLM_VER="$(vllm_version)" +ROCM_VER="$(rocm_version)" echo -echo "vLLM Toolbox - AMD STRIX HALO (gfx1151)" -echo "GPU: $GPU" -echo "vLLM: $VLLM_VER" -echo -echo "Commands:" -echo " vllm_start - Start model server" -echo " vllm_test - Test API" -echo " ls ~/models - List downloaded models" -echo -echo "Server will be available at: http://localhost:8000" -echo +cat <<'ASCII' +███████╗████████╗██████╗ ██╗██╗ ██╗ ██╗ ██╗ █████╗ ██╗ ██████╗ +██╔════╝╚══██╔══╝██╔══██╗██║╚██╗██╔╝ ██║ ██║██╔══██╗██║ ██╔═══██╗ +███████╗ ██║ ██████╔╝██║ ╚███╔╝ ███████║███████║██║ ██║ ██║ +╚════██║ ██║ ██╔══██╗██║ ██╔██╗ ██╔══██║██╔══██║██║ ██║ ██║ +███████║ ██║ ██║ ██║██║██╔╝ ██╗ ██║ ██║██║ ██║███████╗╚██████╔╝ +╚══════╝ ╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝ ╚═════╝ -# Test alias -alias vllm_test='curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '\''{"model":"auto","messages":[{"role":"user","content":"Hello!"}]}'\''' \ No newline at end of file + v L L M +ASCII +echo +printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n' +[[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER" +echo +printf 'Machine: %s\n' "$MACHINE" +printf 'GPU : %s\n\n' "$GPU" +printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n' +printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest\n\n' +printf 'Included:\n' +printf ' - %-16s → %s\n' "start-vllm (wizard)" "Beginner-friendly launcher that guides you through model & settings" +printf ' - %-16s → %s\n' "vLLM server" "vllm serve Qwen/Qwen2.5-7B-Instruct --download-dir ~/vllm-models" +printf ' - %-16s → %s\n' "API test" "curl localhost:8000/v1/chat/completions (see README)" +echo +printf 'SSH tip: ssh -L 8000:localhost:8000 user@host\n\n' \ No newline at end of file