From e9460b20ad9ff0679353055d3850d6083b9a707b Mon Sep 17 00:00:00 2001
From: Donato Capitella <donato.capitella@withsecure.com>
Date: Thu, 4 Sep 2025 13:33:53 +0100
Subject: [PATCH] updated with set of working models

---
 README.md             |  59 +++++++++--
 scripts/start-vllm.sh | 227 +++++-------------------------------------
 2 files changed, 80 insertions(+), 206 deletions(-)

diff --git a/README.md b/README.md
index 905a0c6..f3697dd 100644
--- a/README.md
+++ b/README.md
@@ -7,12 +7,34 @@ An **Arch-based** Docker/Podman container that is **Toolbx-compatible** (usable
 
 ---
 
+## ⚠️ Status & Expectations (Experimental)
+
+This setup is **highly experimental** on ROCm/Strix Halo. Some models work; **many fail** due to missing custom kernels, unsupported quant types, or TorchInductor/AOTriton limitations on gfx1151. The matrix below lists combinations tested so far. **Please contribute fixes** or additional working recipes (see *Contributing*).
+
+---
+
+## Tested Models (Experimental Matrix)
+
+> **Legend:** ✅ Works (with flags) · ❌ Fails · ⚠️ Notes include the *exact* error/symptom seen.
+
+| Model (Hugging Face)               | Params / Quant |               Status | Required flags (if any)                              | Notes / Errors                                                                                       |
+| ---------------------------------- | -------------- | -------------------: | ---------------------------------------------------- | ---------------------------------------------------------------------------------------------------- |
+| `Qwen/Qwen2.5-7B-Instruct`         | 7B FP16        |              ✅ Works | (recommended) `--dtype float16`                      | Good baseline; simple serve works.                                                                   |
+| `meta-llama/Llama-2-7b-chat-hf`    | 7B FP16        |              ✅ Works | (recommended) `--dtype float16`                      | Stable.                                                                                              |
+| `Qwen/Qwen3-30B-A3B-Instruct-2507` | 30B (A3B) FP16 |              ✅ Works | (recommended) `--dtype float16`                      | Heavy; ensure **unified memory** tweaks.                                                             |
+| `Qwen/Qwen3-14B-AWQ`               | 14B AWQ        | ✅ Works (with flags) | `--quantization awq --dtype float16 --enforce-eager` | On ROCm, eager avoids missing `awq_dequantize` during compile; vLLM auto‑sets `VLLM_USE_TRITON_AWQ`. |
+| `openai/gpt-oss-20b`               | 20B MXFP4      |              ❌ Fails | —                                                    | `ModuleNotFoundError: triton_kernels.matmul_ogs` (MXFP4 path not available in this image).           |
+| `zai-org/GLM-4.5-Air-FP8`          | FP8            |              ❌ Fails | —                                                    | `ValueError: type fp8e4nv not supported (only 'fp8e5')`.                                             |
+| `cpatonn/GLM-4.5-Air-AWQ-4bit`     | AWQ-4bit (MoE) |              ❌ Fails | —                                                    | Missing custom op: `torch.ops._C.gptq_marlin_repack` (Marlin kernels).                               |
+
+> If you get a model to work, please PR a new row with: **model name**, **exact flags**, vLLM version, `torch` & `triton` versions, and a note on **gfx1151** driver/kernel stack.
+
+---
+
 ## 1) Toolbx vs Docker/Podman
 
 The `kyuz0/pytorch-therock-gfx1151-aotriton-builder` image can be used both as: 
 
-## &#x20;
-
 * **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean. 
 * **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
 
@@ -55,14 +77,14 @@ vllm serve Qwen/Qwen2.5-7B-Instruct \
 >
 > ```bash
 > du -sh ~/.cache/vllm/torch_compile_cache/
-> # e.g., 138M  /home/kyuz0/.cache/vllm/torch_compile_cache/
+> # e.g., 138M  /home/you/.cache/vllm/torch_compile_cache/
 > ```
 
 ---
 
 ## 3) Testing the API
 
-Once the server is up (from section 2), hit the OpenAI‑compatible endpoint:
+Once the server is up, hit the OpenAI‑compatible endpoint:
 
 ```bash
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -72,6 +94,19 @@ curl -X POST http://localhost:8000/v1/chat/completions \
 
 You should receive a JSON response with a `choices[0].message.content` reply.
 
+To avoid hard-coding the model name, first ask the server which model is currently deployed (requires `jq`), then use it in the request:
+
+```bash
+MODEL=$(curl -s http://localhost:8000/v1/models | jq -r '.data[0].id')
+
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"model\": \"$MODEL\",
+    \"messages\":[{\"role\":\"user\",\"content\":\"Hello! Test the performance.\"}]
+  }"
+```
+
 ---
 
 ## 4) Quickstart — Podman/Docker
@@ -156,9 +191,21 @@ Enable large GTT/unified memory so the iGPU can borrow system RAM for bigger mod
 
 ---
 
-## 8) Acknowledgements & Links
+## 8) Contributing
+
+Spotted a fix, a working flag combo, or a model that should be on the list? **PRs welcome!** Please include:
+
+* Model repo + exact version tag (if any)
+* Full `vllm serve` command/flags that work
+* vLLM version, `torch` & `triton` versions (`python -c "import torch, triton; print(torch.__version__, triton.__version__)"`)
+* Short log snippet of success/failure (especially the **first** error)
+* Any relevant kernel/AOTriton env vars (e.g., `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`)
+
+---
+
+## 9) Acknowledgements & Links
 
 * Base images & docs: [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton)
 * Upstreams: [vLLM](https://github.com/vllm-project/vllm), [ROCm/TheRock](https://github.com/ROCm/TheRock), [AOTriton](https://github.com/ROCm/aotriton)
 * Community: **AMD Strix Halo Home Lab Discord** — [https://discord.gg/pnPRyucNrG](https://discord.gg/pnPRyucNrG)
-* Big thanks to **lhl** and **ssweens** for prior art and inspiration.
+* Big thanks to **lhl** and **ssweens** for doing the actual heavy lifting.
diff --git a/scripts/start-vllm.sh b/scripts/start-vllm.sh
index 2c79410..f36c022 100644
--- a/scripts/start-vllm.sh
+++ b/scripts/start-vllm.sh
@@ -1,218 +1,45 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# start_vllm — interactive helper to launch vLLM on AMD Strix Halo (gfx1151)
-# - Presents a curated list of recent HF models that fit within ~100GB memory (with FP16 or AWQ)
-# - Asks for context length, concurrency, kv‑cache dtype, port, etc.
-# - Starts vLLM with sensible ROCm defaults for Strix Halo
-#
-# Requirements inside the toolbox/container:
-#   - vLLM installed in /torch-therock/.venv (this image has it)
-#   - internet for first model download (or pre‑downloaded into ~/vllm-models)
-#   - optional: ~/.cache/vllm mapped to persist compile cache when using Podman/Docker
-#
-# Notes on quantization:
-#   - vLLM supports weight‑only quantized models like AWQ and GPTQ (load pre‑quantized repos).
-#   - For AMD GPUs, FP8 KV‑cache can be supported but is experimental on consumer APUs; INT8 KV‑cache is a safer saver.
-#   - Qwen3 provides AWQ variants officially; using them can materially reduce memory use. (You do NOT need GGUF; that is for llama.cpp.)
-#
-# Model memory rule of thumb (VERY rough):
-#   - FP16 weights ≈ 2 bytes/parameter. So 12B ≈ ~24 GB; 27B ≈ ~54 GB; 32B ≈ ~64 GB (weights only).
-#   - Plus KV‑cache, which grows with context & concurrency. If you OOM, lower max context or max concurrent requests.
-#
-# Default directories
-DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}"
-CACHE_DIR_DEFAULT="$HOME/.cache/vllm"
-PORT_DEFAULT="8000"
-HOST_DEFAULT="0.0.0.0"
-GPU_UTIL_DEFAULT="0.92"
-MAX_NUM_SEQS_DEFAULT="4"
-MAX_MODEL_LEN_DEFAULT="16384"
-KV_CACHE_DTYPE_DEFAULT="auto"   # choices: auto|int8|fp8 (fp8_e4m3)
-DTYPE_DEFAULT="float16"         # choices: float16|bfloat16
+# start-vllm.sh — pick a known-good model, print the vLLM command, run it.
+# Uses vLLM defaults; only the flags a specific model requires are added.
 
-VENV_ACTIVATE="/torch-therock/.venv/bin/activate"
-if [[ -f "$VENV_ACTIVATE" ]]; then
-  # shellcheck disable=SC1090
-  source "$VENV_ACTIVATE"
+# Optional: activate the toolbox venv if present
+if [[ -f "/torch-therock/.venv/bin/activate" ]]; then
+  # shellcheck disable=SC1091
+  source "/torch-therock/.venv/bin/activate"
 fi
 
-print_divider() { printf '\n%s\n' "────────────────────────────────────────────────────────"; }
-
-# --- curated model list (recent, likely to fit <= ~100GB with sane settings) ---
-# Format: label|hf_repo|quant_hint|compat|note
+# Known-good models only (see "Tested Models" in README.md); entries are "hf_repo|label"
 MODELS=(
-  "Llama-4 Scout 17B-16E Instruct FP4|nvidia/Llama-4-Scout-17B-16E-Instruct-FP4|modelopt|nvidia_only|Optimized for NVIDIA; FP4 path may not work on AMD/ROCm"
-  "Llama-4 Scout 17B-16E Instruct FP8|nvidia/Llama-4-Scout-17B-16E-Instruct-FP8|modelopt|nvidia_only|Optimized for NVIDIA; FP8 ModelOpt path may not work on AMD/ROCm"
-  "OpenAI GPT-OSS 20B (MXFP4)|openai/gpt-oss-20b|mxfp4|experimental|MXFP4 support requires recent vLLM; performance/compat on AMD RDNA iGPU varies"
-  "OpenAI GPT-OSS 120B (MXFP4, huge)|openai/gpt-oss-120b|mxfp4|too_large|~120B total params; not practical on a single APU"
-  "GLM-4.5-Air FP8 (12B active)|zai-org/GLM-4.5-Air-FP8|fp8|multi_gpu_fp8|Published FP8; vendor recommends multi-GPU with native FP8"
-  "Gemma 3 12B IT (FP16)|google/gemma-3-12b-it|fp16|amd_ok|Good baseline"
-  "Gemma 3 27B IT (FP16)|google/gemma-3-27b-it|fp16|borderline|Large; consider GPTQ variant if memory tight"
-  "Gemma 3 27B IT (GPTQ 4bit)|ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g|gptq|amd_ok|Weight-only INT4 reduces memory; throughput may drop"
-  "Qwen3 8B Instruct (FP16)|Qwen/Qwen3-8B-Instruct|fp16|amd_ok|Solid quality, easy fit"
-  "Qwen3 8B Instruct (AWQ 4bit)|Qwen/Qwen3-8B-AWQ|awq|amd_ok|Official AWQ"
-  "Qwen3 14B Instruct (FP16)|Qwen/Qwen3-14B-Instruct|fp16|amd_ok|"
-  "Qwen3 14B Instruct (AWQ 4bit)|Qwen/Qwen3-14B-AWQ|awq|amd_ok|"
-  "Qwen3 30B A3B Instruct (FP16)|Qwen/Qwen3-30B-A3B-Instruct-2507|fp16|amd_ok|MoE; fits with careful context/concurrency"
-  "Qwen3 30B A3B Instruct (AWQ 4bit)|cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit|awq|community|Community AWQ; quality varies"
+  "meta-llama/Llama-2-7b-chat-hf|Llama 2 7B Chat"
+  "Qwen/Qwen2.5-7B-Instruct|Qwen2.5 7B Instruct"
+  "Qwen/Qwen3-30B-A3B-Instruct-2507|Qwen3 30B A3B Instruct"
+  "Qwen/Qwen3-14B-AWQ|Qwen3 14B AWQ"
 )
 
-
-cat <<'HDR'
-Start vLLM — AMD Strix Halo (gfx1151)
-This helper will:
-  1) Let you pick a model (FP16 or AWQ when available)
-  2) Ask for context length, concurrency, and KV‑cache dtype
-  3) Launch vLLM with Strix‑friendly defaults
-HDR
-
-print_divider
-printf 'Model download dir (persisted on host) [%s]: ' "$DOWNLOAD_DIR"
-read -r REPLY_DL
-[[ -n "${REPLY_DL:-}" ]] && DOWNLOAD_DIR="$REPLY_DL"
-mkdir -p "$DOWNLOAD_DIR"
-
-printf 'Cache dir for compiled kernels [%s]: ' "$CACHE_DIR_DEFAULT"
-read -r REPLY_CACHE
-[[ -n "${REPLY_CACHE:-}" ]] && export VLLM_CACHE_DIR="$REPLY_CACHE" || export VLLM_CACHE_DIR="$CACHE_DIR_DEFAULT"
-mkdir -p "$VLLM_CACHE_DIR"
-
-print_divider
-printf 'Select a model:\n'
-idx=1
-for m in "${MODELS[@]}"; do
-  IFS='|' read -r label _ _ <<<"$m"
-  printf '  [%d] %s\n' "$idx" "$label"
-  idx=$((idx+1))
+echo "Select a model:"
+for i in "${!MODELS[@]}"; do
+  IFS='|' read -r _ label <<<"${MODELS[$i]}"
+  printf "  [%d] %s\n" "$((i+1))" "$label"
 done
 
-printf 'Enter number: '
-read -r CHOICE
-if ! [[ "$CHOICE" =~ ^[0-9]+$ ]] || (( CHOICE < 1 || CHOICE > ${#MODELS[@]} )); then
-  echo 'Invalid choice.'; exit 1
-fi
-SEL="${MODELS[$((CHOICE-1))]}"
-IFS='|' read -r SEL_LABEL HF_REPO QUANT_HINT COMPAT NOTE <<<"$SEL"
-
-# Model-specific dtype requirements
-REQUIRED_DTYPE=""
-if [[ "$QUANT_HINT" == "mxfp4" ]]; then
-  REQUIRED_DTYPE="bfloat16"
+read -rp "Enter number: " choice
+if ! [[ "$choice" =~ ^[1-9][0-9]*$ ]] || (( choice < 1 || choice > ${#MODELS[@]} )); then
+  echo "Invalid choice." >&2
+  exit 1
 fi
 
-# Quantization flag heuristic
-QUANT_FLAG=()
-case "$QUANT_HINT" in
-  awq) QUANT_FLAG=(--quantization awq) ;;
-  gptq) QUANT_FLAG=(--quantization gptq) ;;
-  mxfp4) QUANT_FLAG=(--quantization mxfp4) ;;
-  modelopt) QUANT_FLAG=(--quantization modelopt) ;;
-  fp16|fp8|bf16|auto|'') ;; # rely on model config
-esac
+IFS='|' read -r MODEL _ <<<"${MODELS[$((choice-1))]}"
 
-# Compatibility warnings
-case "$COMPAT" in
-  nvidia_only)
-    echo "WARNING: This checkpoint is optimized for NVIDIA (TensorRT/ModelOpt). It may not run on AMD ROCm (RDNA iGPU)." ;;
-  multi_gpu_fp8)
-    echo "WARNING: Vendor docs indicate multi‑GPU FP8 is recommended. On a single Strix Halo APU this is likely impractical." ;;
-  too_large)
-    echo "WARNING: 120B‑class model is far beyond single‑APU capacity. Expect failure unless heavy offload/sharding is used." ;;
-  borderline)
-    echo "Note: Large model — keep context/concurrency modest or use a quantized variant." ;;
-  community)
-    echo "Note: Community quantization — quality/perf may vary." ;;
-  amd_ok|*) ;;
-esac
+CMD=(vllm serve "$MODEL")
 
-[[ -n "$NOTE" ]] && echo "Note: $NOTE"
-
-print_divider
-printf 'Max context tokens (--max-model-len) [%s]: ' "$MAX_MODEL_LEN_DEFAULT"
-read -r REPLY_CTX
-MAX_MODEL_LEN="${REPLY_CTX:-$MAX_MODEL_LEN_DEFAULT}"
-
-printf 'Max concurrent requests (--max-num-seqs) [%s]: ' "$MAX_NUM_SEQS_DEFAULT"
-read -r REPLY_CONC
-MAX_NUM_SEQS="${REPLY_CONC:-$MAX_NUM_SEQS_DEFAULT}"
-
-printf 'KV cache dtype (auto|int8|fp8) [%s]: ' "$KV_CACHE_DTYPE_DEFAULT"
-read -r REPLY_KV
-KV_CACHE_DTYPE="${REPLY_KV:-$KV_CACHE_DTYPE_DEFAULT}"
-
-# Model dtype prompt (use required dtype if set)
-dtype_default="$DTYPE_DEFAULT"
-if [[ -n "$REQUIRED_DTYPE" ]]; then
-  dtype_default="$REQUIRED_DTYPE"
-fi
-printf 'Model dtype (float16|bfloat16) [%s]: ' "$dtype_default"
-read -r REPLY_DTYPE
-DTYPE="${REPLY_DTYPE:-$dtype_default}"
-
-# Enforce required dtype if user chose something else
-if [[ -n "$REQUIRED_DTYPE" && "$DTYPE" != "$REQUIRED_DTYPE" ]]; then
-  echo "Note: this quantization requires --dtype=$REQUIRED_DTYPE; overriding."
-  DTYPE="$REQUIRED_DTYPE"
+# Minimal, model-specific additions
+if [[ "$MODEL" == "Qwen/Qwen3-14B-AWQ" ]]; then
+  # AWQ on ROCm: eager mode avoids the missing awq_dequantize op during compile (see README)
+  CMD+=(--quantization awq --dtype float16 --enforce-eager)
 fi
 
+printf 'Running:\n\n  %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' %q' "${CMD[$i]}"; done; printf '\n\n'
 
-printf 'GPU memory utilization (0.50‑0.98) [%s]: ' "$GPU_UTIL_DEFAULT"
-read -r REPLY_UTIL
-GPU_UTIL="${REPLY_UTIL:-$GPU_UTIL_DEFAULT}"
-
-printf 'Host bind address [%s]: ' "$HOST_DEFAULT"
-read -r REPLY_HOST
-HOST="${REPLY_HOST:-$HOST_DEFAULT}"
-
-printf 'Optional CPU offload in GB (0 to disable) [0]: '
-read -r REPLY_OFF
-CPU_OFFLOAD_GB="${REPLY_OFF:-0}"
-
-printf 'Port [%s]: ' "$PORT_DEFAULT"
-read -r REPLY_PORT
-PORT="${REPLY_PORT:-$PORT_DEFAULT}"
-
-print_divider
-CMD=(
-  vllm serve "$HF_REPO"
-  --host "$HOST"
-  --port "$PORT"
-  --download-dir "$DOWNLOAD_DIR"
-  --dtype "$DTYPE"
-  --max-model-len "$MAX_MODEL_LEN"
-  --max-num-seqs "$MAX_NUM_SEQS"
-  --gpu-memory-utilization "$GPU_UTIL"
-)
-
-# Add CPU offload if requested
-if [[ "$CPU_OFFLOAD_GB" =~ ^[0-9]+$ ]] && (( CPU_OFFLOAD_GB > 0 )); then
-  CMD+=(--cpu-offload-gb "$CPU_OFFLOAD_GB")
-fi
-
-# kv‑cache dtype
-if [[ "$KV_CACHE_DTYPE" != "auto" ]]; then
-  # Map fp8 -> fp8_e4m3 for AMD unless user typed explicit subtype already
-  if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
-    CMD+=(--kv-cache-dtype fp8_e4m3)
-  else
-    CMD+=(--kv-cache-dtype "$KV_CACHE_DTYPE")
-  fi
-fi
-
-# quantization flags (if any)
-CMD+=("${QUANT_FLAG[@]}")
-
-# AMD ROCm/AOTriton helpful env
-export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-gfx1151}"
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
-
-printf 'About to run:\n\n  %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' \\\n    %q' "${CMD[$i]}"; done; printf '\n\n'
-
-read -r -p "Proceed? [Y/n] " yn
-yn=${yn:-Y}
-if [[ "$yn" =~ ^[Yy]$ ]]; then
-  exec "${CMD[@]}"
-else
-  echo "Canceled."
-fi
+exec "${CMD[@]}"