Fixed missing parameters in start-vllm
This commit is contained in:
+20
-5
@@ -31,9 +31,23 @@ This setup is **highly experimental** on ROCm/Strix Halo. Some models work; **ma
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
* [1) Toolbx vs Docker/Podman](#1-toolbx-vs-dockerpodman)
|
||||
* [2) Quickstart — Fedora Toolbx (development)](#2-quickstart--fedora-toolbx-development)
|
||||
* [3) Testing the API](#3-testing-the-api)
|
||||
* [4) Quickstart — Podman/Docker](#4-quickstart--podmandocker)
|
||||
* [5) Models, dtypes & storage](#5-models-dtypes--storage)
|
||||
* [6) Performance notes (short)](#6-performance-notes-short)
|
||||
* [7) Requirements (host)](#7-requirements-host)
|
||||
* [8) Acknowledgements & Links](#8-acknowledgements--links)
|
||||
* [Tested Models](#tested-models)
|
||||
* [Contributing](#contributing)
|
||||
|
||||
|
||||
## 1) Toolbx vs Docker/Podman
|
||||
|
||||
The `kyuz0/pytorch-therock-gfx1151-aotriton-builder` image can be used both as:
|
||||
The `kyuz0/vllm-therock-gfx1151-aotriton:latest` image can be used both as:
|
||||
|
||||
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
|
||||
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
|
||||
@@ -63,14 +77,15 @@ toolbox enter vllm
|
||||
mkdir -p ~/vllm-models
|
||||
```
|
||||
|
||||
Serve a model with vLLM (downloads to `~/vllm-models`; if the model isn't present, it will be fetched from Hugging Face automatically):
|
||||
Serve a model using the helper script **`start-vllm`** (it prints the exact `vllm serve` command and then runs it). Models download to `~/vllm-models` by default; if a model isn't present, it will be fetched from Hugging Face automatically:
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-7B-Instruct \
|
||||
--host 0.0.0.0 --port 8000 \
|
||||
--download-dir ~/vllm-models
|
||||
start-vllm
|
||||
# pick a model from the menu; the script prints the serve command and launches it
|
||||
```
|
||||
|
||||
> Defaults: `0.0.0.0:8000` and `~/vllm-models` for weights. You can still run `vllm serve` manually if you prefer.
|
||||
|
||||
> Toolbx shares HOME by design, so `~/vllm-models` stays on the host and survives toolbox updates.
|
||||
>
|
||||
> **Cache note (Toolbx):** vLLM will also write compiled kernels to `~/.cache/vllm/torch_compile_cache/` in your HOME. For example:
|
||||
|
||||
@@ -1,45 +1,51 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# start-vllm — pick a known-good model, print the vLLM command, run it.
|
||||
# No extra flags; uses vLLM defaults.
|
||||
# Defaults (override via env: HOST, PORT, DOWNLOAD_DIR, EXTRA_FLAGS)
|
||||
HOST="${HOST:-0.0.0.0}"
|
||||
PORT="${PORT:-8000}"
|
||||
DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}"
|
||||
EXTRA_FLAGS="${EXTRA_FLAGS:-}"
|
||||
|
||||
# Optional: activate the toolbox venv if present
|
||||
if [[ -f "/torch-therock/.venv/bin/activate" ]]; then
|
||||
# shellcheck disable=SC1091
|
||||
source "/torch-therock/.venv/bin/activate"
|
||||
fi
|
||||
|
||||
# Only the models you've reported working
|
||||
MODELS=(
|
||||
"meta-llama/Llama-2-7b-chat-hf|Llama 2 7B Chat"
|
||||
"Qwen/Qwen2.5-7B-Instruct|Qwen2.5 7B Instruct"
|
||||
"Qwen/Qwen3-30B-A3B-Instruct-2507|Qwen3 30B A3B Instruct"
|
||||
"Qwen/Qwen3-14B-AWQ|Qwen3 14B AWQ"
|
||||
models=(
|
||||
"Llama 2 7B Chat|meta-llama/Llama-2-7b-chat-hf|"
|
||||
"Qwen2.5 7B Instruct|Qwen/Qwen2.5-7B-Instruct|"
|
||||
"Qwen3 30B A3B Instruct|Qwen/Qwen3-30B-A3B-Instruct-2507|"
|
||||
"Qwen3 14B AWQ|Qwen/Qwen3-14B-AWQ|--quantization awq --dtype float16 --enforce-eager"
|
||||
)
|
||||
|
||||
echo "Select a model:"
|
||||
for i in "${!MODELS[@]}"; do
|
||||
IFS='|' read -r _ label <<<"${MODELS[$i]}"
|
||||
printf " [%d] %s\n" "$((i+1))" "$label"
|
||||
for i in "${!models[@]}"; do
|
||||
name="${models[$i]%%|*}"
|
||||
printf " [%d] %s\n" "$((i+1))" "$name"
|
||||
done
|
||||
|
||||
read -rp "Enter number: " choice
|
||||
if ! [[ "$choice" =~ ^[1-9][0-9]*$ ]] || (( choice < 1 || choice > ${#MODELS[@]} )); then
|
||||
echo "Invalid choice." >&2
|
||||
exit 1
|
||||
[[ "$choice" =~ ^[1-9][0-9]*$ ]] || { echo "Invalid choice."; exit 1; }
|
||||
idx=$((choice-1))
|
||||
(( idx >= 0 && idx < ${#models[@]} )) || { echo "Invalid choice."; exit 1; }
|
||||
|
||||
IFS='|' read -r label repo flags <<< "${models[$idx]}"
|
||||
|
||||
mkdir -p "$DOWNLOAD_DIR"
|
||||
|
||||
CMD=(vllm serve "$repo" --host "$HOST" --port "$PORT" --download-dir "$DOWNLOAD_DIR")
|
||||
|
||||
# Per-model flags
|
||||
if [[ -n "${flags:-}" ]]; then
|
||||
# shellcheck disable=SC2206
|
||||
CMD+=($flags)
|
||||
fi
|
||||
|
||||
IFS='|' read -r MODEL _ <<<"${MODELS[$((choice-1))]}"
|
||||
|
||||
CMD=(vllm serve "$MODEL")
|
||||
|
||||
# Minimal, model-specific additions
|
||||
if [[ "$MODEL" == "Qwen/Qwen3-14B-AWQ" ]]; then
|
||||
# Needed on your ROCm setup for AWQ
|
||||
CMD+=(--quantization awq --dtype float16 --enforce-eager)
|
||||
# Optional global extras: e.g. EXTRA_FLAGS="--gpu-memory-utilization 0.8"
|
||||
if [[ -n "${EXTRA_FLAGS:-}" ]]; then
|
||||
# shellcheck disable=SC2206
|
||||
CMD+=($EXTRA_FLAGS)
|
||||
fi
|
||||
|
||||
printf 'Running:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' %q' "${CMD[$i]}"; done; printf '\n\n'
|
||||
echo -e "Running:\n\n ${CMD[@]}\n"
|
||||
echo "API test → curl -s http://localhost:${PORT}/v1/models | jq -r '.data[0].id'"
|
||||
echo "SSH tip → ssh -L ${PORT}:localhost:${PORT} user@host"
|
||||
echo
|
||||
|
||||
exec "${CMD[@]}"
|
||||
|
||||
Verwijs in nieuw issue
Block a user