From fb54a2a9b962fc79f239add147896984488e2c29 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Thu, 4 Sep 2025 13:58:51 +0100 Subject: [PATCH] Fixed missing parameters in start-vllm --- README.md | 25 +++++++++++++---- scripts/start-vllm.sh | 64 +++++++++++++++++++++++-------------------- 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index f3697dd..c8cc9a4 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,23 @@ This setup is **highly experimental** on ROCm/Strix Halo. Some models work; **ma --- +## Table of Contents + +* [1) Toolbx vs Docker/Podman](#1-toolbx-vs-dockerpodman) +* [2) Quickstart — Fedora Toolbx (development)](#2-quickstart--fedora-toolbx-development) +* [3) Testing the API](#3-testing-the-api) +* [4) Quickstart — Podman/Docker](#4-quickstart--podmandocker) +* [5) Models, dtypes & storage](#5-models-dtypes--storage) +* [6) Performance notes (short)](#6-performance-notes-short) +* [7) Requirements (host)](#7-requirements-host) +* [8) Acknowledgements & Links](#8-acknowledgements--links) +* [Tested Models](#tested-models) +* [Contributing](#contributing) + + ## 1) Toolbx vs Docker/Podman -The `kyuz0/pytorch-therock-gfx1151-aotriton-builder` image can be used both as:  +The `kyuz0/vllm-therock-gfx1151-aotriton:latest` image can be used both as:  * **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.  * **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container. 
@@ -63,14 +77,15 @@ toolbox enter vllm mkdir -p ~/vllm-models ``` -Serve a model with vLLM (downloads to `~/vllm-models`; if the model isn't present, it will be fetched from Hugging Face automatically): +Serve a model using the helper script **`start-vllm`** (it prints the exact `vllm serve` command and then runs it). Models download to `~/vllm-models` by default; if a model isn't present, it will be fetched from Hugging Face automatically: ```bash -vllm serve Qwen/Qwen2.5-7B-Instruct \ - --host 0.0.0.0 --port 8000 \ - --download-dir ~/vllm-models +start-vllm +# pick a model from the menu; the script prints the serve command and launches it ``` +> Defaults: `0.0.0.0:8000` and `~/vllm-models` for weights. You can still run `vllm serve` manually if you prefer. + > Toolbx shares HOME by design, so `~/vllm-models` stays on the host and survives toolbox updates. > > **Cache note (Toolbx):** vLLM will also write compiled kernels to `~/.cache/vllm/torch_compile_cache/` in your HOME. For example: diff --git a/scripts/start-vllm.sh b/scripts/start-vllm.sh index f36c022..bb79029 100644 --- a/scripts/start-vllm.sh +++ b/scripts/start-vllm.sh @@ -1,45 +1,51 @@ #!/usr/bin/env bash set -euo pipefail -# start_vllm_basic — pick a known-good model, print the vLLM command, run it. -# No extra flags; uses vLLM defaults. 
+# Defaults (override via env: HOST, PORT, DOWNLOAD_DIR, EXTRA_FLAGS)
+HOST="${HOST:-0.0.0.0}"
+PORT="${PORT:-8000}"
+DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}"
+EXTRA_FLAGS="${EXTRA_FLAGS:-}"
 
-# Optional: activate the toolbox venv if present
-if [[ -f "/torch-therock/.venv/bin/activate" ]]; then
-  # shellcheck disable=SC1091
-  source "/torch-therock/.venv/bin/activate"
-fi
-
-# Only the models you've reported working
-MODELS=(
-  "meta-llama/Llama-2-7b-chat-hf|Llama 2 7B Chat"
-  "Qwen/Qwen2.5-7B-Instruct|Qwen2.5 7B Instruct"
-  "Qwen/Qwen3-30B-A3B-Instruct-2507|Qwen3 30B A3B Instruct"
-  "Qwen/Qwen3-14B-AWQ|Qwen3 14B AWQ"
+models=(  # one entry per model: "menu label|Hugging Face repo|extra vllm serve flags"
+  "Llama 2 7B Chat|meta-llama/Llama-2-7b-chat-hf|"
+  "Qwen2.5 7B Instruct|Qwen/Qwen2.5-7B-Instruct|"
+  "Qwen3 30B A3B Instruct|Qwen/Qwen3-30B-A3B-Instruct-2507|"
+  "Qwen3 14B AWQ|Qwen/Qwen3-14B-AWQ|--quantization awq --dtype float16 --enforce-eager"
 )
 
 echo "Select a model:"
-for i in "${!MODELS[@]}"; do
-  IFS='|' read -r _ label <<<"${MODELS[$i]}"
-  printf " [%d] %s\n" "$((i+1))" "$label"
+for i in "${!models[@]}"; do
+  name="${models[$i]%%|*}"  # menu label = text before the first '|'
+  printf " [%d] %s\n" "$((i+1))" "$name"
 done
 
 read -rp "Enter number: " choice
-if ! [[ "$choice" =~ ^[1-9][0-9]*$ ]] || (( choice < 1 || choice > ${#MODELS[@]} )); then
-  echo "Invalid choice." >&2
-  exit 1
+[[ "$choice" =~ ^[1-9][0-9]*$ ]] || { echo "Invalid choice." >&2; exit 1; }
+idx=$((choice-1))
+(( idx >= 0 && idx < ${#models[@]} )) || { echo "Invalid choice." >&2; exit 1; }
+
+IFS='|' read -r _ repo flags <<< "${models[$idx]}"  # label is not needed past the menu
+
+mkdir -p "$DOWNLOAD_DIR"
+
+CMD=(vllm serve "$repo" --host "$HOST" --port "$PORT" --download-dir "$DOWNLOAD_DIR")
+
+# Per-model flags: split on whitespace only, no glob expansion (read -d '' returns 1 at EOF)
+if [[ -n "${flags:-}" ]]; then
+  read -r -d '' -a model_flags <<< "$flags" || true
+  CMD+=("${model_flags[@]}")
 fi
 
-IFS='|' read -r MODEL _ <<<"${MODELS[$((choice-1))]}"
-
-CMD=(vllm serve "$MODEL")
-
-# Minimal, model-specific additions
-if [[ "$MODEL" == "Qwen/Qwen3-14B-AWQ" ]]; then
-  # Needed on your ROCm setup for AWQ
-  CMD+=(--quantization awq --dtype float16 --enforce-eager)
+# Optional global extras, split like the per-model flags: e.g. EXTRA_FLAGS="--gpu-memory-utilization 0.8"
+if [[ -n "${EXTRA_FLAGS:-}" ]]; then
+  read -r -d '' -a extra_flags <<< "$EXTRA_FLAGS" || true
+  CMD+=("${extra_flags[@]}")
 fi
 
-printf 'Running:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' %q' "${CMD[$i]}"; done; printf '\n\n'
+printf 'Running:\n\n'; printf ' %q' "${CMD[@]}"; printf '\n\n'  # %q keeps the command copy-pasteable
+echo "API test → curl -s http://localhost:${PORT}/v1/models | jq -r '.data[0].id'"
+echo "SSH tip → ssh -L ${PORT}:localhost:${PORT} user@host"
+echo
 exec "${CMD[@]}"