From e9460b20ad9ff0679353055d3850d6083b9a707b Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Thu, 4 Sep 2025 13:33:53 +0100 Subject: [PATCH] updated with set of working models --- README.md | 59 +++++++++-- scripts/start-vllm.sh | 227 +++++------------------------------------- 2 files changed, 80 insertions(+), 206 deletions(-) diff --git a/README.md b/README.md index 905a0c6..f3697dd 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,34 @@ An **Arch-based** Docker/Podman container that is **Toolbx-compatible** (usable --- +## ⚠️ Status & Expectations (Experimental) + +This setup is **highly experimental** on ROCm/Strix Halo. Some models work; **many fail** due to missing custom kernels, unsupported quant types, or TorchInductor/AOTriton limitations on gfx1151. The matrix below lists combinations tested so far. **Please contribute fixes** or additional working recipes (see *Contributing*). + +--- + +## Tested Models (Experimental Matrix) + +> **Legend:** ✅ Works (with flags) · ❌ Fails · ⚠️ Notes include the *exact* error/symptom seen. + +| Model (Hugging Face) | Params / Quant | Status | Required flags (if any) | Notes / Errors | +| ---------------------------------- | -------------- | -------------------: | ---------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | +| `Qwen/Qwen2.5-7B-Instruct` | 7B FP16 | ✅ Works | (recommended) `--dtype float16` | Good baseline; simple serve works. | +| `meta-llama/Llama-2-7b-chat-hf` | 7B FP16 | ✅ Works | (recommended) `--dtype float16` | Stable. | +| `Qwen/Qwen3-30B-A3B-Instruct-2507` | 30B (A3B) FP16 | ✅ Works | (recommended) `--dtype float16` | Heavy; ensure **unified memory** tweaks. | +| `Qwen/Qwen3-14B-AWQ` | 14B AWQ | ✅ Works (with flags) | `--quantization awq --dtype float16 --enforce-eager` | On ROCm, eager avoids missing `awq_dequantize` during compile; vLLM auto‑sets `VLLM_USE_TRITON_AWQ`. 
| +| `openai/gpt-oss-20b` | 20B MXFP4 | ❌ Fails | — | `ModuleNotFoundError: triton_kernels.matmul_ogs` (MXFP4 path not available in this image). | +| `zai-org/GLM-4.5-Air-FP8` | FP8 | ❌ Fails | — | `ValueError: type fp8e4nv not supported (only 'fp8e5')`. | +| `cpatonn/GLM-4.5-Air-AWQ-4bit` | AWQ-4bit (MoE) | ❌ Fails | — | Missing custom op: `torch.ops._C.gptq_marlin_repack` (Marlin kernels). | + +> If you get a model to work, please open a PR that adds a new row with: **model name**, **exact flags**, vLLM version, `torch` & `triton` versions, and a note on your **gfx1151** driver/kernel stack. + +--- + ## 1) Toolbx vs Docker/Podman The `kyuz0/pytorch-therock-gfx1151-aotriton-builder` image can be used both as:  -## - * **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.  * **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container. @@ -55,14 +77,14 @@ vllm serve Qwen/Qwen2.5-7B-Instruct \ > > ```bash > du -sh ~/.cache/vllm/torch_compile_cache/ -> # e.g., 138M /home/kyuz0/.cache/vllm/torch_compile_cache/ +> # e.g., 138M /home/you/.cache/vllm/torch_compile_cache/ > ``` --- ## 3) Testing the API -Once the server is up (from section 2), hit the OpenAI‑compatible endpoint: +Once the server is up, hit the OpenAI‑compatible endpoint: ```bash curl -X POST http://localhost:8000/v1/chat/completions \ @@ -72,6 +94,19 @@ curl -X POST http://localhost:8000/v1/chat/completions \ You should receive a JSON response with a `choices[0].message.content` reply. 
+If you would rather not type the model name by hand, the following snippet (requires `jq`) asks the server which model is currently deployed and uses that ID in the request: + +```bash +MODEL=$(curl -s http://localhost:8000/v1/models | jq -r '.data[0].id') + +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$MODEL\", + \"messages\":[{\"role\":\"user\",\"content\":\"Hello! Test the performance.\"}] + }" +``` + --- ## 4) Quickstart — Podman/Docker @@ -156,9 +191,21 @@ Enable large GTT/unified memory so the iGPU can borrow system RAM for bigger mod --- -## 8) Acknowledgements & Links +## 8) Contributing + +Spotted a fix, a working flag combo, or a model that should be on the list? **PRs welcome!** Please include: + +* Model repo + exact version tag (if any) +* Full `vllm serve` command/flags that work +* vLLM version, `torch` & `triton` versions (`python -c "import torch, triton; print(torch.__version__, triton.__version__)"`) +* Short log snippet of success/failure (especially the **first** error) +* Any relevant kernel/AOTriton env vars (e.g., `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`) + +--- + +## 9) Acknowledgements & Links * Base images & docs: [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton) * Upstreams: [vLLM](https://github.com/vllm-project/vllm), [ROCm/TheRock](https://github.com/ROCm/TheRock), [AOTriton](https://github.com/ROCm/aotriton) * Community: **AMD Strix Halo Home Lab Discord** — [https://discord.gg/pnPRyucNrG](https://discord.gg/pnPRyucNrG) -* Big thanks to **lhl** and **ssweens** for prior art and inspiration. +* Big thanks to **lhl** and **ssweens** for doing the actual heavy lifting for this. 
diff --git a/scripts/start-vllm.sh b/scripts/start-vllm.sh index 2c79410..f36c022 100644 --- a/scripts/start-vllm.sh +++ b/scripts/start-vllm.sh @@ -1,218 +1,45 @@ #!/usr/bin/env bash set -euo pipefail -# start_vllm — interactive helper to launch vLLM on AMD Strix Halo (gfx1151) -# - Presents a curated list of recent HF models that fit within ~100GB memory (with FP16 or AWQ) -# - Asks for context length, concurrency, kv‑cache dtype, port, etc. -# - Starts vLLM with sensible ROCm defaults for Strix Halo -# -# Requirements inside the toolbox/container: -# - vLLM installed in /torch-therock/.venv (this image has it) -# - internet for first model download (or pre‑downloaded into ~/vllm-models) -# - optional: ~/.cache/vllm mapped to persist compile cache when using Podman/Docker -# -# Notes on quantization: -# - vLLM supports weight‑only quantized models like AWQ and GPTQ (load pre‑quantized repos). -# - For AMD GPUs, FP8 KV‑cache can be supported but is experimental on consumer APUs; INT8 KV‑cache is a safer saver. -# - Qwen3 provides AWQ variants officially; using them can materially reduce memory use. (You do NOT need GGUF; that is for llama.cpp.) -# -# Model memory rule of thumb (VERY rough): -# - FP16 weights ≈ 2 bytes/parameter. So 12B ≈ ~24 GB; 27B ≈ ~54 GB; 32B ≈ ~64 GB (weights only). -# - Plus KV‑cache, which grows with context & concurrency. If you OOM, lower max context or max concurrent requests. -# -# Default directories -DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}" -CACHE_DIR_DEFAULT="$HOME/.cache/vllm" -PORT_DEFAULT="8000" -HOST_DEFAULT="0.0.0.0" -GPU_UTIL_DEFAULT="0.92" -MAX_NUM_SEQS_DEFAULT="4" -MAX_MODEL_LEN_DEFAULT="16384" -KV_CACHE_DTYPE_DEFAULT="auto" # choices: auto|int8|fp8 (fp8_e4m3) -DTYPE_DEFAULT="float16" # choices: float16|bfloat16 +# start_vllm_basic — pick a known-good model, print the vLLM command, run it. +# No extra flags; uses vLLM defaults. 
-VENV_ACTIVATE="/torch-therock/.venv/bin/activate" -if [[ -f "$VENV_ACTIVATE" ]]; then - # shellcheck disable=SC1090 - source "$VENV_ACTIVATE" +# Optional: activate the toolbox venv if present +if [[ -f "/torch-therock/.venv/bin/activate" ]]; then + # shellcheck disable=SC1091 + source "/torch-therock/.venv/bin/activate" fi -print_divider() { printf '\n%s\n' "────────────────────────────────────────────────────────"; } - -# --- curated model list (recent, likely to fit <= ~100GB with sane settings) --- -# Format: label|hf_repo|quant_hint|compat|note +# Only the models you've reported working MODELS=( - "Llama-4 Scout 17B-16E Instruct FP4|nvidia/Llama-4-Scout-17B-16E-Instruct-FP4|modelopt|nvidia_only|Optimized for NVIDIA; FP4 path may not work on AMD/ROCm" - "Llama-4 Scout 17B-16E Instruct FP8|nvidia/Llama-4-Scout-17B-16E-Instruct-FP8|modelopt|nvidia_only|Optimized for NVIDIA; FP8 ModelOpt path may not work on AMD/ROCm" - "OpenAI GPT-OSS 20B (MXFP4)|openai/gpt-oss-20b|mxfp4|experimental|MXFP4 support requires recent vLLM; performance/compat on AMD RDNA iGPU varies" - "OpenAI GPT-OSS 120B (MXFP4, huge)|openai/gpt-oss-120b|mxfp4|too_large|~120B total params; not practical on a single APU" - "GLM-4.5-Air FP8 (12B active)|zai-org/GLM-4.5-Air-FP8|fp8|multi_gpu_fp8|Published FP8; vendor recommends multi-GPU with native FP8" - "Gemma 3 12B IT (FP16)|google/gemma-3-12b-it|fp16|amd_ok|Good baseline" - "Gemma 3 27B IT (FP16)|google/gemma-3-27b-it|fp16|borderline|Large; consider GPTQ variant if memory tight" - "Gemma 3 27B IT (GPTQ 4bit)|ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g|gptq|amd_ok|Weight-only INT4 reduces memory; throughput may drop" - "Qwen3 8B Instruct (FP16)|Qwen/Qwen3-8B-Instruct|fp16|amd_ok|Solid quality, easy fit" - "Qwen3 8B Instruct (AWQ 4bit)|Qwen/Qwen3-8B-AWQ|awq|amd_ok|Official AWQ" - "Qwen3 14B Instruct (FP16)|Qwen/Qwen3-14B-Instruct|fp16|amd_ok|" - "Qwen3 14B Instruct (AWQ 4bit)|Qwen/Qwen3-14B-AWQ|awq|amd_ok|" - "Qwen3 30B A3B Instruct 
(FP16)|Qwen/Qwen3-30B-A3B-Instruct-2507|fp16|amd_ok|MoE; fits with careful context/concurrency" - "Qwen3 30B A3B Instruct (AWQ 4bit)|cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit|awq|community|Community AWQ; quality varies" + "meta-llama/Llama-2-7b-chat-hf|Llama 2 7B Chat" + "Qwen/Qwen2.5-7B-Instruct|Qwen2.5 7B Instruct" + "Qwen/Qwen3-30B-A3B-Instruct-2507|Qwen3 30B A3B Instruct" + "Qwen/Qwen3-14B-AWQ|Qwen3 14B AWQ" ) - -cat <<'HDR' -Start vLLM — AMD Strix Halo (gfx1151) -This helper will: - 1) Let you pick a model (FP16 or AWQ when available) - 2) Ask for context length, concurrency, and KV‑cache dtype - 3) Launch vLLM with Strix‑friendly defaults -HDR - -print_divider -printf 'Model download dir (persisted on host) [%s]: ' "$DOWNLOAD_DIR" -read -r REPLY_DL -[[ -n "${REPLY_DL:-}" ]] && DOWNLOAD_DIR="$REPLY_DL" -mkdir -p "$DOWNLOAD_DIR" - -printf 'Cache dir for compiled kernels [%s]: ' "$CACHE_DIR_DEFAULT" -read -r REPLY_CACHE -[[ -n "${REPLY_CACHE:-}" ]] && export VLLM_CACHE_DIR="$REPLY_CACHE" || export VLLM_CACHE_DIR="$CACHE_DIR_DEFAULT" -mkdir -p "$VLLM_CACHE_DIR" - -print_divider -printf 'Select a model:\n' -idx=1 -for m in "${MODELS[@]}"; do - IFS='|' read -r label _ _ <<<"$m" - printf ' [%d] %s\n' "$idx" "$label" - idx=$((idx+1)) +echo "Select a model:" +for i in "${!MODELS[@]}"; do + IFS='|' read -r _ label <<<"${MODELS[$i]}" + printf " [%d] %s\n" "$((i+1))" "$label" done -printf 'Enter number: ' -read -r CHOICE -if ! [[ "$CHOICE" =~ ^[0-9]+$ ]] || (( CHOICE < 1 || CHOICE > ${#MODELS[@]} )); then - echo 'Invalid choice.'; exit 1 -fi -SEL="${MODELS[$((CHOICE-1))]}" -IFS='|' read -r SEL_LABEL HF_REPO QUANT_HINT COMPAT NOTE <<<"$SEL" - -# Model-specific dtype requirements -REQUIRED_DTYPE="" -if [[ "$QUANT_HINT" == "mxfp4" ]]; then - REQUIRED_DTYPE="bfloat16" +read -rp "Enter number: " choice +if ! [[ "$choice" =~ ^[1-9][0-9]*$ ]] || (( choice < 1 || choice > ${#MODELS[@]} )); then + echo "Invalid choice." 
>&2 + exit 1 fi -# Quantization flag heuristic -QUANT_FLAG=() -case "$QUANT_HINT" in - awq) QUANT_FLAG=(--quantization awq) ;; - gptq) QUANT_FLAG=(--quantization gptq) ;; - mxfp4) QUANT_FLAG=(--quantization mxfp4) ;; - modelopt) QUANT_FLAG=(--quantization modelopt) ;; - fp16|fp8|bf16|auto|'') ;; # rely on model config -esac +IFS='|' read -r MODEL _ <<<"${MODELS[$((choice-1))]}" -# Compatibility warnings -case "$COMPAT" in - nvidia_only) - echo "WARNING: This checkpoint is optimized for NVIDIA (TensorRT/ModelOpt). It may not run on AMD ROCm (RDNA iGPU)." ;; - multi_gpu_fp8) - echo "WARNING: Vendor docs indicate multi‑GPU FP8 is recommended. On a single Strix Halo APU this is likely impractical." ;; - too_large) - echo "WARNING: 120B‑class model is far beyond single‑APU capacity. Expect failure unless heavy offload/sharding is used." ;; - borderline) - echo "Note: Large model — keep context/concurrency modest or use a quantized variant." ;; - community) - echo "Note: Community quantization — quality/perf may vary." 
;; - amd_ok|*) ;; -esac +CMD=(vllm serve "$MODEL") -[[ -n "$NOTE" ]] && echo "Note: $NOTE" - -print_divider -printf 'Max context tokens (--max-model-len) [%s]: ' "$MAX_MODEL_LEN_DEFAULT" -read -r REPLY_CTX -MAX_MODEL_LEN="${REPLY_CTX:-$MAX_MODEL_LEN_DEFAULT}" - -printf 'Max concurrent requests (--max-num-seqs) [%s]: ' "$MAX_NUM_SEQS_DEFAULT" -read -r REPLY_CONC -MAX_NUM_SEQS="${REPLY_CONC:-$MAX_NUM_SEQS_DEFAULT}" - -printf 'KV cache dtype (auto|int8|fp8) [%s]: ' "$KV_CACHE_DTYPE_DEFAULT" -read -r REPLY_KV -KV_CACHE_DTYPE="${REPLY_KV:-$KV_CACHE_DTYPE_DEFAULT}" - -# Model dtype prompt (use required dtype if set) -dtype_default="$DTYPE_DEFAULT" -if [[ -n "$REQUIRED_DTYPE" ]]; then - dtype_default="$REQUIRED_DTYPE" -fi -printf 'Model dtype (float16|bfloat16) [%s]: ' "$dtype_default" -read -r REPLY_DTYPE -DTYPE="${REPLY_DTYPE:-$dtype_default}" - -# Enforce required dtype if user chose something else -if [[ -n "$REQUIRED_DTYPE" && "$DTYPE" != "$REQUIRED_DTYPE" ]]; then - echo "Note: this quantization requires --dtype=$REQUIRED_DTYPE; overriding." 
- DTYPE="$REQUIRED_DTYPE" +# Minimal, model-specific additions +if [[ "$MODEL" == "Qwen/Qwen3-14B-AWQ" ]]; then + # Needed on your ROCm setup for AWQ + CMD+=(--quantization awq --dtype float16 --enforce-eager) fi +printf 'Running:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' %q' "${CMD[$i]}"; done; printf '\n\n' -printf 'GPU memory utilization (0.50‑0.98) [%s]: ' "$GPU_UTIL_DEFAULT" -read -r REPLY_UTIL -GPU_UTIL="${REPLY_UTIL:-$GPU_UTIL_DEFAULT}" - -printf 'Host bind address [%s]: ' "$HOST_DEFAULT" -read -r REPLY_HOST -HOST="${REPLY_HOST:-$HOST_DEFAULT}" - -printf 'Optional CPU offload in GB (0 to disable) [0]: ' -read -r REPLY_OFF -CPU_OFFLOAD_GB="${REPLY_OFF:-0}" - -printf 'Port [%s]: ' "$PORT_DEFAULT" -read -r REPLY_PORT -PORT="${REPLY_PORT:-$PORT_DEFAULT}" - -print_divider -CMD=( - vllm serve "$HF_REPO" - --host "$HOST" - --port "$PORT" - --download-dir "$DOWNLOAD_DIR" - --dtype "$DTYPE" - --max-model-len "$MAX_MODEL_LEN" - --max-num-seqs "$MAX_NUM_SEQS" - --gpu-memory-utilization "$GPU_UTIL" -) - -# Add CPU offload if requested -if [[ "$CPU_OFFLOAD_GB" =~ ^[0-9]+$ ]] && (( CPU_OFFLOAD_GB > 0 )); then - CMD+=(--cpu-offload-gb "$CPU_OFFLOAD_GB") -fi - -# kv‑cache dtype -if [[ "$KV_CACHE_DTYPE" != "auto" ]]; then - # Map fp8 -> fp8_e4m3 for AMD unless user typed explicit subtype already - if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then - CMD+=(--kv-cache-dtype fp8_e4m3) - else - CMD+=(--kv-cache-dtype "$KV_CACHE_DTYPE") - fi -fi - -# quantization flags (if any) -CMD+=("${QUANT_FLAG[@]}") - -# AMD ROCm/AOTriton helpful env -export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-gfx1151}" -export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 - -printf 'About to run:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' \\\n %q' "${CMD[$i]}"; done; printf '\n\n' - -read -r -p "Proceed? [Y/n] " yn -yn=${yn:-Y} -if [[ "$yn" =~ ^[Yy]$ ]]; then - exec "${CMD[@]}" -else - echo "Canceled." -fi +exec "${CMD[@]}"