Files
amd-strix-halo-vllm-toolboxes/scripts/start-vllm.sh
T
Donato Capitella fc12e2cc63 fixing quant
2025-09-03 23:08:45 +01:00

219 righe
8.3 KiB
Bash
Originale Blame Cronologia

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
set -euo pipefail
# start_vllm — interactive helper to launch vLLM on AMD Strix Halo (gfx1151)
# - Presents a curated list of recent HF models that fit within ~100GB memory (with FP16 or AWQ)
# - Asks for context length, concurrency, kvcache dtype, port, etc.
# - Starts vLLM with sensible ROCm defaults for Strix Halo
#
# Requirements inside the toolbox/container:
# - vLLM installed in /torch-therock/.venv (this image has it)
# - internet for first model download (or predownloaded into ~/vllm-models)
# - optional: ~/.cache/vllm mapped to persist compile cache when using Podman/Docker
#
# Notes on quantization:
# - vLLM supports weightonly quantized models like AWQ and GPTQ (load prequantized repos).
# - For AMD GPUs, FP8 KVcache can be supported but is experimental on consumer APUs; INT8 KVcache is a safer saver.
# - Qwen3 provides AWQ variants officially; using them can materially reduce memory use. (You do NOT need GGUF; that is for llama.cpp.)
#
# Model memory rule of thumb (VERY rough):
# - FP16 weights ≈ 2 bytes/parameter. So 12B ≈ ~24 GB; 27B ≈ ~54 GB; 32B ≈ ~64 GB (weights only).
# - Plus KVcache, which grows with context & concurrency. If you OOM, lower max context or max concurrent requests.
#
# Default directories
DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}"
CACHE_DIR_DEFAULT="$HOME/.cache/vllm"
PORT_DEFAULT="8000"
HOST_DEFAULT="0.0.0.0"
GPU_UTIL_DEFAULT="0.92"
MAX_NUM_SEQS_DEFAULT="4"
MAX_MODEL_LEN_DEFAULT="16384"
KV_CACHE_DTYPE_DEFAULT="auto" # choices: auto|int8|fp8 (fp8_e4m3)
DTYPE_DEFAULT="float16" # choices: float16|bfloat16
VENV_ACTIVATE="/torch-therock/.venv/bin/activate"
if [[ -f "$VENV_ACTIVATE" ]]; then
# shellcheck disable=SC1090
source "$VENV_ACTIVATE"
fi
print_divider() { printf '\n%s\n' "────────────────────────────────────────────────────────"; }
# --- curated model list (recent, likely to fit <= ~100GB with sane settings) ---
# Format: label|hf_repo|quant_hint|compat|note
MODELS=(
"Llama-4 Scout 17B-16E Instruct FP4|nvidia/Llama-4-Scout-17B-16E-Instruct-FP4|modelopt|nvidia_only|Optimized for NVIDIA; FP4 path may not work on AMD/ROCm"
"Llama-4 Scout 17B-16E Instruct FP8|nvidia/Llama-4-Scout-17B-16E-Instruct-FP8|modelopt|nvidia_only|Optimized for NVIDIA; FP8 ModelOpt path may not work on AMD/ROCm"
"OpenAI GPT-OSS 20B (MXFP4)|openai/gpt-oss-20b|mxfp4|experimental|MXFP4 support requires recent vLLM; performance/compat on AMD RDNA iGPU varies"
"OpenAI GPT-OSS 120B (MXFP4, huge)|openai/gpt-oss-120b|mxfp4|too_large|~120B total params; not practical on a single APU"
"GLM-4.5-Air FP8 (12B active)|zai-org/GLM-4.5-Air-FP8|fp8|multi_gpu_fp8|Published FP8; vendor recommends multi-GPU with native FP8"
"Gemma 3 12B IT (FP16)|google/gemma-3-12b-it|fp16|amd_ok|Good baseline"
"Gemma 3 27B IT (FP16)|google/gemma-3-27b-it|fp16|borderline|Large; consider GPTQ variant if memory tight"
"Gemma 3 27B IT (GPTQ 4bit)|ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g|gptq|amd_ok|Weight-only INT4 reduces memory; throughput may drop"
"Qwen3 8B Instruct (FP16)|Qwen/Qwen3-8B-Instruct|fp16|amd_ok|Solid quality, easy fit"
"Qwen3 8B Instruct (AWQ 4bit)|Qwen/Qwen3-8B-AWQ|awq|amd_ok|Official AWQ"
"Qwen3 14B Instruct (FP16)|Qwen/Qwen3-14B-Instruct|fp16|amd_ok|"
"Qwen3 14B Instruct (AWQ 4bit)|Qwen/Qwen3-14B-AWQ|awq|amd_ok|"
"Qwen3 30B A3B Instruct (FP16)|Qwen/Qwen3-30B-A3B-Instruct-2507|fp16|amd_ok|MoE; fits with careful context/concurrency"
"Qwen3 30B A3B Instruct (AWQ 4bit)|cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit|awq|community|Community AWQ; quality varies"
)
cat <<'HDR'
Start vLLM — AMD Strix Halo (gfx1151)
This helper will:
1) Let you pick a model (FP16 or AWQ when available)
2) Ask for context length, concurrency, and KVcache dtype
3) Launch vLLM with Strixfriendly defaults
HDR
print_divider
printf 'Model download dir (persisted on host) [%s]: ' "$DOWNLOAD_DIR"
read -r REPLY_DL
[[ -n "${REPLY_DL:-}" ]] && DOWNLOAD_DIR="$REPLY_DL"
mkdir -p "$DOWNLOAD_DIR"
printf 'Cache dir for compiled kernels [%s]: ' "$CACHE_DIR_DEFAULT"
read -r REPLY_CACHE
[[ -n "${REPLY_CACHE:-}" ]] && export VLLM_CACHE_DIR="$REPLY_CACHE" || export VLLM_CACHE_DIR="$CACHE_DIR_DEFAULT"
mkdir -p "$VLLM_CACHE_DIR"
print_divider
printf 'Select a model:\n'
idx=1
for m in "${MODELS[@]}"; do
IFS='|' read -r label _ _ <<<"$m"
printf ' [%d] %s\n' "$idx" "$label"
idx=$((idx+1))
done
printf 'Enter number: '
read -r CHOICE
if ! [[ "$CHOICE" =~ ^[0-9]+$ ]] || (( CHOICE < 1 || CHOICE > ${#MODELS[@]} )); then
echo 'Invalid choice.'; exit 1
fi
SEL="${MODELS[$((CHOICE-1))]}"
IFS='|' read -r SEL_LABEL HF_REPO QUANT_HINT COMPAT NOTE <<<"$SEL"
# Model-specific dtype requirements
REQUIRED_DTYPE=""
if [[ "$QUANT_HINT" == "mxfp4" ]]; then
REQUIRED_DTYPE="bfloat16"
fi
# Quantization flag heuristic
QUANT_FLAG=()
case "$QUANT_HINT" in
awq) QUANT_FLAG=(--quantization awq) ;;
gptq) QUANT_FLAG=(--quantization gptq) ;;
mxfp4) QUANT_FLAG=(--quantization mxfp4) ;;
modelopt) QUANT_FLAG=(--quantization modelopt) ;;
fp16|fp8|bf16|auto|'') ;; # rely on model config
esac
# Compatibility warnings
case "$COMPAT" in
nvidia_only)
echo "WARNING: This checkpoint is optimized for NVIDIA (TensorRT/ModelOpt). It may not run on AMD ROCm (RDNA iGPU)." ;;
multi_gpu_fp8)
echo "WARNING: Vendor docs indicate multiGPU FP8 is recommended. On a single Strix Halo APU this is likely impractical." ;;
too_large)
echo "WARNING: 120Bclass model is far beyond singleAPU capacity. Expect failure unless heavy offload/sharding is used." ;;
borderline)
echo "Note: Large model — keep context/concurrency modest or use a quantized variant." ;;
community)
echo "Note: Community quantization — quality/perf may vary." ;;
amd_ok|*) ;;
esac
[[ -n "$NOTE" ]] && echo "Note: $NOTE"
print_divider
printf 'Max context tokens (--max-model-len) [%s]: ' "$MAX_MODEL_LEN_DEFAULT"
read -r REPLY_CTX
MAX_MODEL_LEN="${REPLY_CTX:-$MAX_MODEL_LEN_DEFAULT}"
printf 'Max concurrent requests (--max-num-seqs) [%s]: ' "$MAX_NUM_SEQS_DEFAULT"
read -r REPLY_CONC
MAX_NUM_SEQS="${REPLY_CONC:-$MAX_NUM_SEQS_DEFAULT}"
printf 'KV cache dtype (auto|int8|fp8) [%s]: ' "$KV_CACHE_DTYPE_DEFAULT"
read -r REPLY_KV
KV_CACHE_DTYPE="${REPLY_KV:-$KV_CACHE_DTYPE_DEFAULT}"
# Model dtype prompt (use required dtype if set)
dtype_default="$DTYPE_DEFAULT"
if [[ -n "$REQUIRED_DTYPE" ]]; then
dtype_default="$REQUIRED_DTYPE"
fi
printf 'Model dtype (float16|bfloat16) [%s]: ' "$dtype_default"
read -r REPLY_DTYPE
DTYPE="${REPLY_DTYPE:-$dtype_default}"
# Enforce required dtype if user chose something else
if [[ -n "$REQUIRED_DTYPE" && "$DTYPE" != "$REQUIRED_DTYPE" ]]; then
echo "Note: this quantization requires --dtype=$REQUIRED_DTYPE; overriding."
DTYPE="$REQUIRED_DTYPE"
fi
printf 'GPU memory utilization (0.500.98) [%s]: ' "$GPU_UTIL_DEFAULT"
read -r REPLY_UTIL
GPU_UTIL="${REPLY_UTIL:-$GPU_UTIL_DEFAULT}"
printf 'Host bind address [%s]: ' "$HOST_DEFAULT"
read -r REPLY_HOST
HOST="${REPLY_HOST:-$HOST_DEFAULT}"
printf 'Optional CPU offload in GB (0 to disable) [0]: '
read -r REPLY_OFF
CPU_OFFLOAD_GB="${REPLY_OFF:-0}"
printf 'Port [%s]: ' "$PORT_DEFAULT"
read -r REPLY_PORT
PORT="${REPLY_PORT:-$PORT_DEFAULT}"
print_divider
CMD=(
vllm serve "$HF_REPO"
--host "$HOST"
--port "$PORT"
--download-dir "$DOWNLOAD_DIR"
--dtype "$DTYPE"
--max-model-len "$MAX_MODEL_LEN"
--max-num-seqs "$MAX_NUM_SEQS"
--gpu-memory-utilization "$GPU_UTIL"
)
# Add CPU offload if requested
if [[ "$CPU_OFFLOAD_GB" =~ ^[0-9]+$ ]] && (( CPU_OFFLOAD_GB > 0 )); then
CMD+=(--cpu-offload-gb "$CPU_OFFLOAD_GB")
fi
# kvcache dtype
if [[ "$KV_CACHE_DTYPE" != "auto" ]]; then
# Map fp8 -> fp8_e4m3 for AMD unless user typed explicit subtype already
if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
CMD+=(--kv-cache-dtype fp8_e4m3)
else
CMD+=(--kv-cache-dtype "$KV_CACHE_DTYPE")
fi
fi
# quantization flags (if any)
CMD+=("${QUANT_FLAG[@]}")
# AMD ROCm/AOTriton helpful env
export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-gfx1151}"
export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
printf 'About to run:\n\n %q' "${CMD[0]}"; for ((i=1;i<${#CMD[@]};i++)); do printf ' \\\n %q' "${CMD[$i]}"; done; printf '\n\n'
read -r -p "Proceed? [Y/n] " yn
yn=${yn:-Y}
if [[ "$yn" =~ ^[Yy]$ ]]; then
exec "${CMD[@]}"
else
echo "Canceled."
fi