From 5e8b6bb545bdb79b00fc7778d92175cf6ffe41fd Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sat, 20 Dec 2025 11:37:06 +0000 Subject: [PATCH] updates --- Dockerfile | 4 + README.md | 255 ++---- .../Qwen_Qwen3-14B-AWQ_tp1_throughput.json | 7 + ...A3B-Instruct-GPTQ-4bit_tp1_throughput.json | 7 + ...-Instruct-GPTQ-Int4A16_tp1_throughput.json | 7 + .../google_gemma-3-12b-it_tp1_throughput.json | 7 + ...-Llama-3.1-8B-Instruct_tp1_throughput.json | 7 + .../openai_gpt-oss-120b_tp1_throughput.json | 7 + .../openai_gpt-oss-20b_tp1_throughput.json | 7 + benchmarks/find_max_context.py | 575 +++++++++++++ benchmarks/max_context_results.json | 333 ++++++++ benchmarks/run_vllm_bench.py | 266 ++++++ docs/assets/index2.css | 401 +++++++++ docs/assets/index2.js | 542 ++++++++++++ docs/index.html | 782 ++++++++++++++++++ docs/parse_results.py | 181 ++++ docs/results.json | 95 +++ scripts/99-toolbox-banner.sh | 8 +- scripts/start-vllm.sh | 54 -- scripts/start_vllm.py | 315 +++++++ 20 files changed, 3612 insertions(+), 248 deletions(-) create mode 100644 benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/cpatonn_Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json create mode 100644 benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json create mode 100644 benchmarks/find_max_context.py create mode 100644 benchmarks/max_context_results.json create mode 100644 benchmarks/run_vllm_bench.py create mode 100644 docs/assets/index2.css create mode 100644 docs/assets/index2.js create mode 100644 docs/index.html 
create mode 100644 docs/parse_results.py create mode 100644 docs/results.json delete mode 100644 scripts/start-vllm.sh create mode 100644 scripts/start_vllm.py diff --git a/Dockerfile b/Dockerfile index caee0c7..53279c5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -154,6 +154,10 @@ RUN chmod -R a+rwX /opt && \ COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh +COPY scripts/start_vllm.py /usr/local/bin/start-vllm +COPY benchmarks/max_context_results.json /opt/max_context_results.json +COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py +RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmod 0644 /opt/max_context_results.json RUN chmod 0644 /etc/profile.d/*.sh RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh diff --git a/README.md b/README.md index 87b9703..9828798 100644 --- a/README.md +++ b/README.md @@ -1,53 +1,40 @@ -# AMD Strix Halo — vLLM Toolbox/Container (gfx1151, PyTorch + AOTriton) +# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container -An **Arch-based** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the PyTorch + AOTriton base to make ROCm on Strix Halo practical for day‑to‑day use. +An **Arch-based** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm. 
-> **Built on:** [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton) -> **Credits:** **lhl** (build tools/scripts), **ssweens** (Arch‑based Dockerfiles), and the **AMD Strix Halo Home Lab Discord** for testing/support. ---- - -## ⚠️ Status & Expectations (Experimental) - -This setup is **highly experimental** on ROCm/Strix Halo. Some models work; **many fail** due to missing custom kernels, unsupported quant types, or TorchInductor/AOTriton limitations on gfx1151. The matrix below lists combinations tested so far. **Please contribute fixes** or additional working recipes (see *Contributing*). - ---- - -## Tested Models (Experimental Matrix) - -> **Legend:** ✅ Works (with flags) · ❌ Fails · ⚠️ Notes include the *exact* error/symptom seen. - -| Model (Hugging Face) | Params / Quant | Status | Required flags (if any) | Notes / Errors | -| ---------------------------------- | -------------- | -------------------: | ---------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | -| `Qwen/Qwen2.5-7B-Instruct` | 7B FP16 | ✅ Works | (recommended) `--dtype float16` | Good baseline; simple serve works. | -| `meta-llama/Llama-2-7b-chat-hf` | 7B FP16 | ✅ Works | (recommended) `--dtype float16` | Stable. | -| `Qwen/Qwen3-30B-A3B-Instruct-2507` | 30B (A3B) FP16 | ✅ Works | (recommended) `--dtype float16` | | -| `Google/Gemma3-27B-Instruct` | 27B FP16 | ✅ Works | (recommended) `--dtype float16` | Slow | -| `Google/Gemma3-12B-Instruct` | 12B FP16 | ✅ Works | (recommended) `--dtype float16` | | -| `Google/Gemma3-4B-Instruct` |4B FP16 | ✅ Works | (recommended) `--dtype float16` | | -| `Qwen/Qwen3-14B-AWQ` | 14B AWQ | ✅ Works (with flags) | `--quantization awq --dtype float16 --enforce-eager` | On ROCm, eager avoids missing `awq_dequantize` during compile; vLLM auto‑sets `VLLM_USE_TRITON_AWQ`. 
| -| `openai/gpt-oss-20b` | 20B MXFP4 | ❌ Fails | — | `ModuleNotFoundError: triton_kernels.matmul_ogs` (MXFP4 path not available in this image). | -| `zai-org/GLM-4.5-Air-FP8` | FP8 | ❌ Fails | — | `ValueError: type fp8e4nv not supported (only 'fp8e5')`. | -| `cpatonn/GLM-4.5-Air-AWQ-4bit` | AWQ-4bit (MoE) | ❌ Fails | — | Missing custom op: `torch.ops._C.gptq_marlin_repack` (Marlin kernels). | - -> If you get a model to work, please PR a new row with: **model name**, **exact flags**, vLLM version, `torch` & `triton` versions, and a note on **gfx1151** driver/kernel stack. --- ## Table of Contents +* [Tested Models (Benchmarks)](#tested-models-benchmarks) * [1) Toolbx vs Docker/Podman](#1-toolbx-vs-dockerpodman) * [2) Quickstart — Fedora Toolbx (development)](#2-quickstart--fedora-toolbx-development) -* [3) Testing the API](#3-testing-the-api) -* [4) Quickstart — Podman/Docker](#4-quickstart--podmandocker) -* [5) Models, dtypes & storage](#5-models-dtypes--storage) -* [6) Performance notes (short)](#6-performance-notes-short) -* [7) Requirements (host)](#7-requirements-host) -* [8) Acknowledgements & Links](#8-acknowledgements--links) -* [Tested Models](#tested-models) -* [Contributing](#contributing) +* [3) Quickstart — Ubuntu (Distrobox)](#3-quickstart--ubuntu-distrobox) +* [4) Testing the API](#4-testing-the-api) +* [5) Use a Web UI for Chatting](#5-use-a-web-ui-for-chatting) +## Tested Models (Benchmarks) + +View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/](https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/) + +**Table Key:** Cell values represent `Max Context Length (GPU Memory Utilization)`. 
+ +| Model | TP | 1 Req | 4 Reqs | 8 Reqs | 16 Reqs | +| :--- | :--- | :--- | :--- | :--- | :--- | +| **`meta-llama/Meta-Llama-3.1-8B-Instruct`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) | +| **`google/gemma-3-12b-it`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) | +| **`openai/gpt-oss-20b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) | +| **`Qwen/Qwen3-14B-AWQ`** | 1 | 40k (0.90) | 40k (0.90) | 40k (0.90) | 40k (0.90) | +| **`cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit`** | 1 | 256k (0.95) | 204k (0.90) | - | - | +| **`dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16`** | 1 | 256k (0.90) | - | - | - | +| **`openai/gpt-oss-120b`** | 1 | 128k (0.95) | 128k (0.95) | 128k (0.95) | 128k (0.95) | + + +--- + ## 1) Toolbx vs Docker/Podman The `kyuz0/vllm-therock-gfx1151-aotriton:latest` image can be used both as:  @@ -63,7 +50,7 @@ Create a toolbox that exposes the GPU and relaxes seccomp to avoid ROCm syscall ```bash toolbox create vllm \ - --image docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \ + --image docker.io/kyuz0/vllm-therock-gfx1151:latest \ -- --device /dev/dri --device /dev/kfd \ --group-add video --group-add render --security-opt seccomp=unconfined ``` @@ -74,33 +61,45 @@ Enter it: toolbox enter vllm ``` -**Model storage (Toolbx):** keep weights **outside** the toolbox under your HOME so they persist. Recommended path: +**Model storage:** Models are downloaded to `~/.cache/huggingface` by default. This directory is shared with the host if you created the toolbox correctly, so downloads persist. -```bash -mkdir -p ~/vllm-models -``` +### Serving a Model (Easiest Way) -Serve a model using the helper script **`start-vllm`** (it prints the exact `vllm serve` command and then runs it). 
Models download to `~/vllm-models` by default; if a model isn't present, it will be fetched from Hugging Face automatically: +The toolbox includes a TUI wizard called **`start-vllm`** which includes pre-configured models and handles the launch flags for you. This is the easiest way to get started. ```bash start-vllm -# pick a model from the menu; the script prints the serve command and launches it ``` -> Defaults: `0.0.0.0:8000` and `~/vllm-models` for weights. You can still run `vllm serve` manually if you prefer. - -> Toolbx shares HOME by design, so `~/vllm-models` stays on the host and survives toolbox updates. -> -> **Cache note (Toolbx):** vLLM will also write compiled kernels to `~/.cache/vllm/torch_compile_cache/` in your HOME. For example: -> -> ```bash -> du -sh ~/.cache/vllm/torch_compile_cache/ -> # e.g., 138M /home/you/.cache/vllm/torch_compile_cache/ -> ``` +> **Cache note:** vLLM writes compiled kernels to `~/.cache/vllm/`. --- -## 3) Testing the API +## 3) Quickstart — Ubuntu (Distrobox) + +Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead: + +```bash +distrobox create -n vllm \ + --image docker.io/kyuz0/vllm-therock-gfx1151:latest \ + --additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined" + +distrobox enter vllm +``` + +> **Verification:** Run `rocm-smi` to check GPU status. + +### Serving a Model (Easiest Way) + +The toolbox includes a TUI wizard called **`start-vllm`** which includes pre-configured models and handles the launch flags for you. This is the easiest way to get started. + +```bash +start-vllm +``` + +--- + +## 4) Testing the API Once the server is up, hit the OpenAI‑compatible endpoint: @@ -125,145 +124,21 @@ MODEL=$(curl -s http://localhost:8000/v1/models | jq -r '.data[0].id') curl -X P --- -## 4) Quickstart — Podman/Docker +## 5) Use a Web UI for Chatting -Prefer this for persistent services. 
**Always mount a host directory for weights** so they live outside the container. If the model isn't present, vLLM will fetch it from **Hugging Face** into the mapped directory. - -**Qwen2.5 7B Instruct** +If vLLM is on a remote server, expose port 8000 via SSH port forwarding (replace `<user>@<remote-server>` with your SSH login for the machine running vLLM): ```bash -podman run -d --name vllm-qwen2p5-7b \ - --ipc=host \ - --network host \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --group-add render \ - -v ~/vllm-models:/models \ - -v ~/.cache/vllm:/root/.cache/vllm \ - docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \ - bash -lc 'source /torch-therock/.venv/bin/activate; \ - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \ - vllm serve Qwen/Qwen2.5-7B-Instruct --dtype float16 \ - --host 0.0.0.0 --port 8000 --download-dir /models' +ssh -L 0.0.0.0:8000:localhost:8000 <user>@<remote-server> ``` -> Not using `--network host`? Map a port instead: `-p 8000:8000`. - -For other models, you can try: - - -**Qwen3 30B A3B Instruct (2507)** +Then, you can start HuggingFace ChatUI like this (on your host): ```bash -podman run -d --name vllm-qwen3-30b-a3b \ - --ipc=host \ - --network host \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --group-add render \ - -v ~/vllm-models:/models \ - -v ~/.cache/vllm:/root/.cache/vllm \ - docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \ - bash -lc 'source /torch-therock/.venv/bin/activate; \ - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \ - vllm serve Qwen/Qwen3-30B-A3B-Instruct-2507 --dtype float16 \ - --host 0.0.0.0 --port 8000 --download-dir /models' +docker run -p 3000:3000 \ + --add-host=host.docker.internal:host-gateway \ + -e OPENAI_BASE_URL=http://host.docker.internal:8000/v1 \ + -e OPENAI_API_KEY=dummy \ + -v chat-ui-data:/data \ + ghcr.io/huggingface/chat-ui-db ``` - -**Qwen3 14B AWQ** *(requires extra flags on ROCm)* - -```bash -podman run -d --name vllm-qwen3-14b-awq \ - --ipc=host \ - --network host \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --group-add 
render \ - -v ~/vllm-models:/models \ - -v ~/.cache/vllm:/root/.cache/vllm \ - docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \ - bash -lc 'source /torch-therock/.venv/bin/activate; \ - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \ - vllm serve Qwen/Qwen3-14B-AWQ --quantization awq --dtype float16 --enforce-eager \ - --host 0.0.0.0 --port 8000 --download-dir /models' -``` - ---- - -## 5) Models, dtypes & storage - -* Start with **Qwen/Qwen2.5-7B-Instruct**; larger models may work but are less forgiving on unified memory. -* Use `--dtype float16` unless you have a reason to change. -* **Storage discipline:** - - * **Toolbx:** `--download-dir ~/vllm-models` (lives in your HOME on the host). - * **Podman/Docker:** `-v ~/vllm-models:/models` and `--download-dir /models`. - ---- - -## 6) Performance notes (short) - -* The image is built on the PyTorch + **AOTriton** base; enabling `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` can improve startup/throughput on some models. -* vLLM flags you might tune later: `--gpu-memory-utilization`, `--max-num-seqs`, `--max-model-len`. Start simple; add knobs only if needed. - ---- - -## 7) Requirements (host) - -**Hardware & drivers** - -* AMD Strix Halo APU (gfx1151). -* Working amdgpu stack with `/dev/kfd` (ROCm compute) and `/dev/dri` (graphics). -* Your user in the **video** and **render** groups. - -**Unified memory setup (HIGHLY recommended)** -Enable large GTT/unified memory so the iGPU can borrow system RAM for bigger models: - -1. **Kernel parameters** (append to your GRUB cmdline): - - ``` - amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=33554432 - ``` - - | Parameter | Purpose | - | -------------------------- | ---------------------------- | - | `amd_iommu=off` | Reduces latency | - | `amdgpu.gttsize=131072` | 128 GiB GTT (unified memory) | - | `ttm.pages_limit=33554432` | Large pinned allocations | - -2. **BIOS**: allocate **minimal VRAM** to the iGPU (e.g., **512 MB**) and rely on unified memory. - -3. 
**Fedora example** (GRUB): edit `/etc/default/grub` → `GRUB_CMDLINE_LINUX=...` then: - - ```bash - sudo grub2-mkconfig -o /boot/grub2/grub.cfg - sudo reboot - ``` - -**Container runtime** - -* Podman or Docker installed (examples use Podman; replace with Docker if preferred). - ---- - -## 8) Contributing - -Spotted a fix, a working flag combo, or a model that should be on the list? **PRs welcome!** Please include: - -* Model repo + exact version tag (if any) -* Full `vllm serve` command/flags that work -* vLLM version, `torch` & `triton` versions (`python -c "import torch, triton; print(torch.__version__, triton.__version__)"`) -* Short log snippet of success/failure (especially the **first** error) -* Any relevant kernel/AOTriton env vars (e.g., `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`) - ---- - -## 9) Acknowledgements & Links - -* Base images & docs: [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton) -* Upstreams: [vLLM](https://github.com/vllm-project/vllm), [ROCm/TheRock](https://github.com/ROCm/TheRock), [AOTriton](https://github.com/ROCm/aotriton) -* Community: **AMD Strix Halo Home Lab Discord** — [https://discord.gg/pnPRyucNrG](https://discord.gg/pnPRyucNrG) -* Big thanks to **lhl** and **ssweens** for doing the actual heavy lifting for this. 
diff --git a/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json new file mode 100644 index 0000000..d63771d --- /dev/null +++ b/benchmarks/benchmark_results/Qwen_Qwen3-14B-AWQ_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1302.7062463890015, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.15352655332265747, + "tokens_per_second": 112.69232830266365 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/cpatonn_Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit_tp1_throughput.json b/benchmarks/benchmark_results/cpatonn_Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit_tp1_throughput.json new file mode 100644 index 0000000..ecc0309 --- /dev/null +++ b/benchmarks/benchmark_results/cpatonn_Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 540.2676798280002, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.37018686748700586, + "tokens_per_second": 271.7264154071495 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json new file mode 100644 index 0000000..2e5e0ea --- /dev/null +++ b/benchmarks/benchmark_results/dazipe_Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1303.4944151099999, + "num_requests": 200, + "total_num_tokens": 146805, + "requests_per_second": 0.15343372221746138, + "tokens_per_second": 112.62418795067208 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json new file mode 100644 index 0000000..db3260c --- /dev/null +++ b/benchmarks/benchmark_results/google_gemma-3-12b-it_tp1_throughput.json @@ 
-0,0 +1,7 @@ +{ + "elapsed_time": 914.8563823220001, + "num_requests": 200, + "total_num_tokens": 148857, + "requests_per_second": 0.21861354838273012, + "tokens_per_second": 162.71078485804028 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json new file mode 100644 index 0000000..36031a5 --- /dev/null +++ b/benchmarks/benchmark_results/meta-llama_Meta-Llama-3.1-8B-Instruct_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 522.8661062630126, + "num_requests": 200, + "total_num_tokens": 145877, + "requests_per_second": 0.38250710383471637, + "tokens_per_second": 278.99494393048457 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json new file mode 100644 index 0000000..b2ee048 --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-120b_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 1339.915984058, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.14926308990977954, + "tokens_per_second": 109.73523843987172 +} \ No newline at end of file diff --git a/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json new file mode 100644 index 0000000..ce8eecc --- /dev/null +++ b/benchmarks/benchmark_results/openai_gpt-oss-20b_tp1_throughput.json @@ -0,0 +1,7 @@ +{ + "elapsed_time": 468.4791132300161, + "num_requests": 200, + "total_num_tokens": 147036, + "requests_per_second": 0.42691337639593563, + "tokens_per_second": 313.85817605876395 +} \ No newline at end of file diff --git a/benchmarks/find_max_context.py b/benchmarks/find_max_context.py new file mode 100644 index 0000000..5362c9d --- /dev/null +++ b/benchmarks/find_max_context.py @@ -0,0 
+1,575 @@ +#!/usr/bin/env python3 +import subprocess +import time +import socket +import json +import sys +import os +import requests +import re +import argparse +from pathlib import Path +try: + from transformers import AutoConfig +except ImportError: + print("Error: 'transformers' not found. Please install it or run in vLLM environment.") + sys.exit(1) + +# Import configuration from average benchmark script +try: + from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN, get_gpu_count, kill_vllm +except ImportError: + print("Error: Could not import run_vllm_bench.py. Make sure it is in the same directory.") + sys.exit(1) + +# ========================= +# 🧠 GROUNDING & METHODOLOGY +# ========================= +# This script finds the Maximum Working Context (MWC) for vLLM models. +# +# Methodology: +# 1. **Inspect**: Use `transformers.AutoConfig` to determine the model's theoretical limit +# (e.g., `max_position_embeddings`). +# 2. **Probe**: Launch `vllm serve` at this limit. +# 3. **React**: +# - If stable ("Application startup complete"): Success. +# - If OOM ("KV cache capacity... is X"): Retry with vLLM's suggested X. +# - If Config Error ("max_model_len... is Y"): Retry with vLLM's suggested Y. + +# ========================= +# ⚙️ CONFIG +# ========================= +HOST = "127.0.0.1" +PORT = 8000 +RESULTS_FILE = Path("max_context_results.json") +REPORT_FILE = Path("max_context_report.md") + +# We test these GPU Utilizations steps to see how much we can squeeze +# 0.90 is default, but we want MAX context. +# 0.98 is our target high. 0.95 is the fallback. 
GPU_UTIL_STEPS = ["0.95", "0.90"]  # listed highest utilization first
# We test these concurrency settings
CONCURRENCY_STEPS = [1, 4, 8, 16]

# Failure reasons that carry no numeric hint for the retry logic.
_GENERIC_OOM = "OOM detected"
_SAMPLER_OOM = "Sampler Warmup OOM"

# Upper bound on "dirty VRAM" retries so a permanently occupied GPU cannot
# make run_probe spin on the same context length forever.
_MAX_HARD_CLEANUP_RETRIES = 2

# Command-line patterns handed to `pkill -f` / `pgrep -f`.
_KILL_PATTERNS = (
    "vllm.entrypoints.api_server",
    "vllm serve",
    "VLLM::",
    "multiprocessing.spawn",
    "ray::",
)
_CHECK_PATTERNS = (
    "vllm serve",
    "vllm.entrypoints.api_server",
    "VLLM::",
)


def log(msg):
    """Print *msg* with the script's ``[MAX-CTX]`` prefix and flush immediately."""
    print(f"[MAX-CTX] {msg}", flush=True)


def get_hf_context_limit(model_name, trust_remote=False):
    """Return the context length advertised by the model's Hugging Face config.

    A nested ``text_config`` (multi-config models) is consulted first, then
    the usual top-level attributes. Returns 8192 when the config loads but
    exposes none of them, and 32768 when the config cannot be read at all.
    """
    try:
        cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote)

        # Gemma 3 and similar multi-config models
        if hasattr(cfg, "text_config"):
            tc = cfg.text_config
            if hasattr(tc, "max_position_embeddings"):
                return int(tc.max_position_embeddings)

        # Standard HF attributes
        for attr in (
            "max_position_embeddings",
            "seq_length",
            "max_seq_len",
            "n_positions",
        ):
            val = getattr(cfg, attr, None)
            if val is not None:
                return int(val)

        return 8192

    except Exception as e:
        # Best effort by design: a missing/unreadable config must not abort the sweep.
        log(f"Warning: Could not read config for {model_name}: {e}. Defaulting to 32768.")
        return 32768


def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
    """Build the ``vllm serve`` argv and environment for one probe.

    Returns:
        ``(cmd, env)``: the argument list, and a copy of ``os.environ``
        updated with the model's ``env`` entry from ``MODEL_TABLE``.
    """
    config = MODEL_TABLE[model]

    cmd = [
        "vllm", "serve", model,
        "--gpu-memory-utilization", str(util),
        "--max-model-len", str(max_len),
        "--tensor-parallel-size", str(tp_size),
        "--max-num-seqs", str(max_seqs),
        "--dtype", "auto",
    ]

    if config.get("trust_remote"):
        cmd.append("--trust-remote-code")
    if config.get("enforce_eager"):
        cmd.append("--enforce-eager")

    # Add model specific env vars
    env = os.environ.copy()
    env.update(config.get("env", {}))

    return cmd, env


def is_port_free(port):
    """Return True when nothing accepts TCP connections on ``HOST:port``."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((HOST, port)) != 0


def _run_quiet(argv):
    """Run *argv* without a shell, discarding stderr.

    Returns the exit status, or None when the executable is not installed
    (the previous ``shell=True`` form silently ignored that case too).
    """
    try:
        return subprocess.run(argv, stderr=subprocess.DEVNULL).returncode
    except FileNotFoundError:
        return None


def force_cleanup(hard=False):
    """Kill every vLLM-related process and wait until ``PORT`` is free.

    Blocks until no matching process remains and the port is released, or
    until the timeout (20s when *hard*, else 10s) expires, in which case a
    final ``killall -9 vllm`` is attempted.
    """
    timeout = 20 if hard else 10
    start_time = time.time()

    while True:
        # 1. Aggressive kill commands, re-sent on EVERY iteration until they die.
        #    List-form argv (no shell) so `-f` can never match a wrapping
        #    `sh -c "... pattern ..."` command line.
        for pattern in _KILL_PATTERNS:
            _run_quiet(["pkill", "-9", "-f", pattern])

        # 2. Check whether anything survived, reporting WHICH pattern matched.
        dirty = False
        for pattern in _CHECK_PATTERNS:
            found = subprocess.run(
                ["pgrep", "-f", pattern], stdout=subprocess.DEVNULL
            ).returncode == 0
            if found:
                log(f"Clean waiting: Found '{pattern}' process:")
                subprocess.run(["pgrep", "-a", "-f", pattern])  # debug listing
                dirty = True

        if not dirty:
            # Processes are gone. Now check port.
            if is_port_free(PORT):
                time.sleep(1)  # Final safety buffer
                return  # Clean!
            log(f"Clean: Processes gone, but Port {PORT} still held. Waiting...")
        else:
            log("Clean: Processes still detected. Retrying kill...")

        if time.time() - start_time > timeout:
            log("CRITICAL: Cleanup timed out! Force attempting `killall -9 vllm` as last resort.")
            _run_quiet(["killall", "-9", "vllm"])
            break

        time.sleep(1.5)  # Wait a bit before hammering again


def _failure_hint(line_str):
    """Classify one server log line; return a failure reason or None.

    Lines that carry a number the retry logic can use are returned verbatim
    so the caller's regexes can extract it.
    """
    if "ValueError" in line_str and "maximum number of tokens" in line_str:
        return line_str
    if "derived max_model_len" in line_str:
        return line_str
    if "estimated maximum model length is" in line_str:
        return line_str
    if "Free memory on device" in line_str and "less than desired" in line_str:
        return line_str
    if "warming up sampler" in line_str and "CUDA out of memory" in line_str:
        return _SAMPLER_OOM
    if "CUDA out of memory" in line_str or "hipErrorOutOfMemory" in line_str:
        return _GENERIC_OOM
    return None


def _merge_failure(current, hint):
    """Combine reasons: a generic OOM never replaces a more specific one."""
    if hint is None:
        return current
    if hint == _GENERIC_OOM and current:
        return current
    return hint


def _post_mortem_reason(reason, logs):
    """Refine *reason* using patterns that may span several log lines."""
    if reason not in (None, _GENERIC_OOM):
        return reason
    full_log = "\n".join(logs)
    if "warming up sampler" in full_log and "CUDA out of memory" in full_log:
        return _SAMPLER_OOM
    if reason is None and "ValueError" in full_log and "maximum number of tokens" in full_log:
        return "Capacity Error (Found in history)"
    return reason


def _dump_log_tail(header, logs):
    """Log *header* and print the last 100 captured server log lines."""
    log(header)
    print("=== vLLM SERVER LOGS (LAST 100 LINES) ===")
    for entry in logs[-100:]:
        print(entry)
    print("=============================================")


def wait_for_server_and_parse(process, timeout=300):
    """Follow the server's output until it is ready, dies, or times out.

    Args:
        process: object exposing ``poll()`` and a binary ``stdout`` stream
            (normally the ``subprocess.Popen`` started by ``run_probe``).
        timeout: seconds to wait for the startup banner.

    Returns:
        ``(ready, gpu_blocks, block_size, max_len_clamped, failure_reason)``.
        ``gpu_blocks * block_size`` is the KV-cache capacity in tokens; when
        the "GPU KV cache size" line is seen, ``gpu_blocks`` holds the token
        count and ``block_size`` is 1. ``max_len_clamped`` is always None and
        is kept only for the shape of the return tuple. On failure
        ``failure_reason`` is either the offending log line (so the caller
        can pull the suggested length out of it) or a short label.

    NOTE: ``readline()`` blocks, so the timeout is only re-evaluated when the
    server emits a line.
    """
    start = time.time()
    gpu_blocks = 0
    block_size = 16  # default
    max_len_clamped = None

    logs = []
    failure_reason = None

    while time.time() - start < timeout:
        if process.poll() is not None:
            # Process died: drain what is left and parse it exactly like live
            # output, so numeric hints in the final lines are not lost.
            for raw in process.stdout:
                line_str = raw.decode("utf-8", errors="replace").strip()
                logs.append(line_str)
                failure_reason = _merge_failure(failure_reason, _failure_hint(line_str))

            failure_reason = _post_mortem_reason(failure_reason, logs)

            if not failure_reason:
                # Unexpected death! Dump logs to see why.
                _dump_log_tail("CRITICAL: Process died unexpectedly! Dumping last 100 lines:", logs)

            return False, 0, 0, None, failure_reason

        line = process.stdout.readline()
        if not line:
            # EOF while the process has not been reaped yet: avoid a hot spin.
            time.sleep(0.05)
            continue

        line_str = line.decode("utf-8", errors="replace").strip()
        logs.append(line_str)

        # 1. Parse Legacy "GPU blocks" (if present)
        m_blocks = re.search(r"# GPU blocks:\s*(\d+)", line_str)
        if m_blocks:
            gpu_blocks = int(m_blocks.group(1))
            block_size = 16  # assume default unless found
            log(f" -> Found GPU blocks: {gpu_blocks} (Legacy)")

        # 2. Parse Newer "GPU KV cache size" (vLLM 0.11+)
        #    "GPU KV cache size: 111,536 tokens"
        m_kv_tokens = re.search(r"GPU KV cache size:\s*([\d,]+)\s*tokens", line_str)
        if m_kv_tokens:
            # 'gpu_blocks' stores the total token count here, hence block_size 1.
            gpu_blocks = int(m_kv_tokens.group(1).replace(",", ""))
            block_size = 1
            log(f" -> Found GPU KV Cache tokens: {gpu_blocks}")

        # 3. Parse Block Size (optional, mostly for legacy)
        m_bs = re.search(r"block_size=(\d+)", line_str)
        if m_bs:
            block_size = int(m_bs.group(1))

        # Failure hints
        failure_reason = _merge_failure(failure_reason, _failure_hint(line_str))

        # Check for startup
        if "Application startup complete" in line_str or "Uvicorn running on" in line_str:
            if gpu_blocks > 0:
                log(" -> Server signal detected. Waiting 5s for socket stability...")
                time.sleep(5)
                return True, gpu_blocks, block_size, max_len_clamped, None
            return False, 0, 0, None, "Parsed Success but Token/Block Count was 0"

    # Timeout case
    _dump_log_tail("CRITICAL: Server startup timed out! Dumping last 100 lines:", logs)
    return False, 0, 0, None, "Timeout"


def verify_context(model, context_len):
    """Send one completion request sized from *context_len* to check stability.

    The prompt is ``"A "`` repeated ``context_len * 0.5`` times. Returns
    ``(ok, message)``; connection refusals are retried up to five times.
    """
    url = f"http://{HOST}:{PORT}/v1/completions"

    prompt = "A " * int(context_len * 0.5)  # 50% fill to be safe/approx

    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": 10,
        "temperature": 0
    }

    # Retry loop for connection refusals (race condition)
    max_retries = 5
    for attempt in range(max_retries):
        try:
            # Generous timeout: prefilling a long prompt takes time.
            r = requests.post(url, json=payload, timeout=300)
            if r.status_code == 200:
                return True, "Success"
            return False, f"HTTP {r.status_code}: {r.text[:200]}"
        except requests.exceptions.ConnectionError:
            if attempt < max_retries - 1:
                log(f" -> Connection refused. Retrying verification ({attempt+1}/{max_retries})...")
                time.sleep(2)
            else:
                return False, "Connection Refused (Max Retries)"
        except Exception as e:
            return False, str(e)

    return False, "Unknown Error"


def _suggested_len(fail_msg):
    """Extract ``(label, length)`` from a vLLM failure message, or None."""
    # Case A: VRAM Limit ("maximum number of tokens... is X")
    m_capacity = re.search(r"maximum number of tokens.*?KV cache is (\d+)", fail_msg)
    if m_capacity:
        return "Hardware Capacity", int(m_capacity.group(1))

    # Case B: Model Limit mismatch
    # "... greater than the derived max_model_len (max_position_embeddings=131072.0 ...)"
    if "derived max_model_len" in fail_msg:
        m_pos = re.search(r"max_position_embeddings=([\d\.]+)", fail_msg)
        if m_pos:
            return "Model Limit", int(float(m_pos.group(1)))  # handle 131072.0
        m_derived = re.search(r"derived max_model_len\s*\((\d+)\)", fail_msg)
        if m_derived:
            return "Model Limit (Legacy)", int(m_derived.group(1))

    # Case C: Estimated Max Length ("estimated maximum model length is 111536")
    m_est = re.search(r"estimated maximum model length is (\d+)", fail_msg)
    if m_est:
        return "vLLM Estimated Limit", int(m_est.group(1))

    return None


def _stop_process(proc, grace=5):
    """Terminate *proc*, escalating to kill if it does not exit within *grace* seconds."""
    try:
        proc.terminate()
        proc.wait(timeout=grace)
    except subprocess.TimeoutExpired:
        proc.kill()
    except OSError:
        pass


def run_probe(model, tp, util, max_seqs, start_limit=None):
    """Find a working ``--max-model-len`` for one (model, tp, util, seqs) setup.

    Starts at the model's advertised limit (capped by *start_limit* when
    given), launches ``vllm serve``, and on failure retries with the length
    vLLM suggests, or with 80% of the previous one, until the server starts
    and answers a verification request or the length drops below 2048.

    Returns:
        A result dict whose ``status`` is ``"success"`` or ``"fail"``; on
        success it also carries ``configured_len``, ``real_capacity`` and
        ``max_context_1_user``, on failure ``error`` holds the last reason.
    """
    trust_remote = MODEL_TABLE[model].get("trust_remote", False)
    # 1. Get the Advertised Limit (The "Smart" Way)
    arch_limit = get_hf_context_limit(model, trust_remote)

    # Intelligent Start: If we know a lower limit worked for lower concurrency, start there.
    target_len = arch_limit
    if start_limit:
        target_len = min(arch_limit, start_limit)
        log(f" -> Smart Start: Capping initial probe at {target_len} (based on previous run)")

    result_data = {
        "model": model,
        "tp": tp,
        "util": util,
        "max_seqs": max_seqs,
        "model_limit": arch_limit,
        "configured_len": 0,
        "real_capacity": 0,
        "status": "fail",
        "error": ""
    }

    log(f"Probing {model} | TP={tp} | Util={util} | Seqs={max_seqs} | Model Limit={arch_limit}")

    hard_cleanup_retries = 0

    # We loop until we succeed OR we drop below a useful context size.
    while target_len >= 2048:
        force_cleanup()

        cmd, env = get_vllm_server_cmd(model, tp, util, target_len, max_seqs)
        log(f"DEBUG: Cmd: {' '.join(cmd)}")

        proc = None
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
            ready, blocks, block_size, _, fail_msg = wait_for_server_and_parse(proc)

            if ready:
                # Success - but let's VERIFY it actually answers
                total_capacity = blocks * block_size
                workable_len = min(target_len, total_capacity)

                # Liveness check only: cap at 4096 so we do not wait for a
                # full-context prefill.
                verify_len = min(workable_len, 4096)
                log(f" -> Server ready. Verifying stability with approx {int(verify_len * 0.5)} tokens (capped at 4k)...")
                v_ok, v_msg = verify_context(model, verify_len)

                if v_ok:
                    log(f" -> Success! capacity={total_capacity}, configured={workable_len}")
                    log(f" -> Verification passed: {v_msg}")

                    # Cleanup SUCCESSFUL process immediately
                    _stop_process(proc)

                    result_data["status"] = "success"
                    result_data["configured_len"] = target_len
                    result_data["real_capacity"] = total_capacity
                    result_data["max_context_1_user"] = workable_len

                    return result_data

                log(f" -> Server started, but Verification FAILED: {v_msg}")
                # Treat as a crash/failure, back off
                fail_msg = "Verification Failed"

                # Terminate and show whatever the server still had to say.
                proc.terminate()
                try:
                    outs, _ = proc.communicate(timeout=5)
                    if outs:
                        print("=== vLLM SERVER LOGS (DURING VERIFICATION FAILURE) ===")
                        print(outs.decode('utf-8', errors='replace'))
                        print("======================================================")
                except (subprocess.TimeoutExpired, OSError, ValueError):
                    proc.kill()

            # If we fall through here, ready=False OR verify=False
            log(f" -> Attempt failed at {target_len}")
            if fail_msg:
                log(f" Reason: {fail_msg}")
            result_data["error"] = fail_msg if fail_msg else "Process died or timed out"

            if fail_msg:
                # Case V: Verification Failed (server up, but unstable inference).
                # Checked FIRST: give up on this utilization instead of shrinking
                # the context (presumably the caller then tries the next step).
                if "Verification Failed" in str(fail_msg):
                    log(" -> Verification Failed (Unstable). Aborting this Util, dropping to lower tier.")
                    break

                # Case S: Sampler Warmup OOM (Fatal for this Util)
                if _SAMPLER_OOM in fail_msg:
                    log(" -> Critical Sampler OOM. Utilization/Seqs too high. Aborting this configuration.")
                    break  # Give up on this Util/Seq combo immediately

                # Case X: Dirty State / Zombie VRAM
                # "Free memory on device (1.56/31.86 GiB) on startup is less than desired..."
                dirty_vram = "Free memory on device" in fail_msg and "less than desired" in fail_msg
                if dirty_vram and hard_cleanup_retries < _MAX_HARD_CLEANUP_RETRIES:
                    hard_cleanup_retries += 1
                    log(" -> Dirty VRAM detected (previous run didn't cleanup?). Retrying with HARD cleanup.")
                    force_cleanup(hard=True)
                    continue  # Retry SAME target_len (bounded)

                # Cases A/B/C: vLLM told us which length to use. Only follow
                # the suggestion when it actually lowers the target, otherwise
                # we would retry the same failing length forever.
                suggestion = _suggested_len(fail_msg)
                if suggestion is not None and suggestion[1] < target_len:
                    label, limit = suggestion
                    log(f" -> Found {label}: {limit}")
                    target_len = limit
                    continue

            # Case D: Generic OOM/Crash
            target_len = int(target_len * 0.8)
            log(f" -> Backing off to: {target_len}")

            if target_len < 2048:
                log(" -> Give up (too small)")
                break
        finally:
            if proc:
                try:
                    proc.terminate()
                except OSError:
                    pass
                try:
                    proc.kill()
                except OSError:
                    pass
                proc.wait()
            force_cleanup()

    return result_data
main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Filter to run only this model (substring match)") + parser.add_argument("--steps", type=int, default=-1, help="Number of models to run (default: all)") + args = parser.parse_args() + + gpu_count = get_gpu_count() + + # 1. Load existing results to support RESUME + results = [] + if RESULTS_FILE.exists(): + try: + with open(RESULTS_FILE, "r") as f: + results = json.load(f) + log(f"Loaded {len(results)} previous results. Resuming...") + except Exception as e: + log(f"Warning: Could not read existing results: {e}") + + count = 0 + for model in MODELS_TO_RUN: + if args.model and args.model not in model: + continue + + config = MODEL_TABLE[model] + valid_tps = [t for t in config["valid_tp"] if t <= gpu_count] + + for tp in valid_tps: + # Track successful seqs for this TP to skip lower utils + # effectively: {seqs_count: max_working_util} + # Since we iterate high-util -> low-util, if we succeeded already for this 'seqs', we skip. 
+ successful_seqs = set() + + # Reset smart limit for each TP (TP2 should not inherit TP1's limit) + last_working_len = None + + for util in GPU_UTIL_STEPS: + + for seqs in CONCURRENCY_STEPS: + if seqs in successful_seqs: + log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Already succeeded at higher util.") + continue + + # Check if we already have this result + existing_res = next((r for r in results + if r["model"] == model + and r["tp"] == tp + and str(r["util"]) == str(util) + and r["max_seqs"] == seqs), None) + + if existing_res: + res = existing_res + log(f"Skipping {model} (TP={tp}, Util={util}, Seqs={seqs}) - Found in results.") + else: + # New run + res = run_probe(model, tp, util, seqs, start_limit=last_working_len) + results.append(res) + + # Save immediately + with open(RESULTS_FILE, "w") as f: + json.dump(results, f, indent=2) + + # Update logic for Resume OR New Run: + if res["status"] == "success": + last_working_len = res["configured_len"] + successful_seqs.add(seqs) # Mark this seq count as done for this TP + + # Smart Break: If we failed at this concurrency level (capacity=0), + # higher concurrency will also fail. 
+ if res["real_capacity"] == 0 or res["status"] == "fail": + log(f"Stopping higher concurrency tests for {model} (failed at {seqs} seqs)") + break + + count += 1 + if args.steps != -1 and count >= args.steps and not args.model: + break + + # generate_report(results) - Moved to separate script + +if __name__ == "__main__": + main() diff --git a/benchmarks/max_context_results.json b/benchmarks/max_context_results.json new file mode 100644 index 0000000..461cceb --- /dev/null +++ b/benchmarks/max_context_results.json @@ -0,0 +1,333 @@ +[ + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 829952, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 830064, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 830080, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 830064, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "google/gemma-3-12b-it", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 246032, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "google/gemma-3-12b-it", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 246064, + "status": "success", + 
"error": "", + "max_context_1_user": 131072 + }, + { + "model": "google/gemma-3-12b-it", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 246064, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "google/gemma-3-12b-it", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 246064, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 40960, + "configured_len": 0, + "real_capacity": 0, + "status": "fail", + "error": "Verification Failed" + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "tp": 1, + "util": "0.90", + "max_seqs": 1, + "model_limit": 40960, + "configured_len": 40960, + "real_capacity": 655712, + "status": "success", + "error": "", + "max_context_1_user": 40960 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "tp": 1, + "util": "0.90", + "max_seqs": 4, + "model_limit": 40960, + "configured_len": 40960, + "real_capacity": 655616, + "status": "success", + "error": "", + "max_context_1_user": 40960 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "tp": 1, + "util": "0.90", + "max_seqs": 8, + "model_limit": 40960, + "configured_len": 40960, + "real_capacity": 655600, + "status": "success", + "error": "", + "max_context_1_user": 40960 + }, + { + "model": "Qwen/Qwen3-14B-AWQ", + "tp": 1, + "util": "0.90", + "max_seqs": 16, + "model_limit": 40960, + "configured_len": 40960, + "real_capacity": 655600, + "status": "success", + "error": "", + "max_context_1_user": 40960 + }, + { + "model": "openai/gpt-oss-20b", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 2232848, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-20b", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + 
"model_limit": 131072, + "configured_len": 131072, + "real_capacity": 2232560, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-20b", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 2232544, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-20b", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 2232544, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-120b", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 711360, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-120b", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 711168, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-120b", + "tp": 1, + "util": "0.95", + "max_seqs": 8, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 711168, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "openai/gpt-oss-120b", + "tp": 1, + "util": "0.95", + "max_seqs": 16, + "model_limit": 131072, + "configured_len": 131072, + "real_capacity": 711168, + "status": "success", + "error": "", + "max_context_1_user": 131072 + }, + { + "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 1097712, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "tp": 1, + "util": "0.95", + "max_seqs": 4, + "model_limit": 262144, + 
"configured_len": 0, + "real_capacity": 0, + "status": "fail", + "error": "Verification Failed" + }, + { + "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "tp": 1, + "util": "0.90", + "max_seqs": 4, + "model_limit": 262144, + "configured_len": 209715, + "real_capacity": 1029856, + "status": "success", + "error": "Process died or timed out", + "max_context_1_user": 209715 + }, + { + "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "tp": 1, + "util": "0.90", + "max_seqs": 8, + "model_limit": 262144, + "configured_len": 0, + "real_capacity": 0, + "status": "fail", + "error": "Verification Failed" + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "tp": 1, + "util": "0.95", + "max_seqs": 1, + "model_limit": 262144, + "configured_len": 0, + "real_capacity": 0, + "status": "fail", + "error": "Verification Failed" + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "tp": 1, + "util": "0.90", + "max_seqs": 1, + "model_limit": 262144, + "configured_len": 262144, + "real_capacity": 696320, + "status": "success", + "error": "", + "max_context_1_user": 262144 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "tp": 1, + "util": "0.90", + "max_seqs": 4, + "model_limit": 262144, + "configured_len": 0, + "real_capacity": 0, + "status": "fail", + "error": "Verification Failed" + } +] \ No newline at end of file diff --git a/benchmarks/run_vllm_bench.py b/benchmarks/run_vllm_bench.py new file mode 100644 index 0000000..37b664a --- /dev/null +++ b/benchmarks/run_vllm_bench.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +import subprocess, time, json, sys, os, requests, argparse +from pathlib import Path + +# ========================= +# ⚙️ GLOBAL SETTINGS +# ========================= + +# HARDWARE: 1x Strix Halo (128GB, RDNA 3.5) +GPU_UTIL = "0.90" +# 1. 
THROUGHPUT CONFIG +OFF_NUM_PROMPTS = 200 +OFF_FORCED_OUTPUT = "512" +# Default fallback if not specified in MODEL_TABLE +DEFAULT_BATCH_TOKENS = "8192" + +# Fallbacks +FALLBACK_INPUT_LEN = 1024 +FALLBACK_OUTPUT_LEN = 512 + +RESULTS_DIR = Path("benchmark_results") +RESULTS_DIR.mkdir(exist_ok=True) + +# ========================= +# 🛠️ MODEL CONFIGURATION 🛠️ +# ========================= + +MODEL_TABLE = { + # 1. Llama 3.1 8B Instruct + # MAD uses 131k tokens. We scale to 32k for 32GB VRAM safety. + "meta-llama/Meta-Llama-3.1-8B-Instruct": { + "trust_remote": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + + "google/gemma-3-12b-it": { + "trust_remote": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + # 2. GPT-OSS 20B (MXFP4) + # MAD Row 0 uses 8192. We match this exactly. + "openai/gpt-oss-20b": { + "trust_remote": True, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "8192" + }, + + "openai/gpt-oss-120b": { + "trust_remote": True, + "valid_tp": [1], + "max_num_seqs": "64", + "max_tokens": "8192" + }, + + + "Qwen/Qwen3-14B-AWQ": { + "trust_remote": True, + "valid_tp": [1], # Too big for single GPU + "max_num_seqs": "32", # Lower concurrency for safety + "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive + "enforce_eager": False, + "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error + }, + + # 4. Qwen 30B 4-bit + "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit": { + "trust_remote": True, + "enforce_eager": False, + "valid_tp": [1, 2], + "max_num_seqs": "64", + "max_tokens": "32768" + }, + + # 5. Qwen 80B AWQ (The Big One) [NEW] + # Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB. + # Config: 20k ctx fits in that cache. Eager mode required for stability. 
+ "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": { + "trust_remote": True, + "valid_tp": [1], # Too big for single GPU + "max_num_seqs": "32", # Lower concurrency for safety + "max_tokens": "16384", # Lower batch size because Eager mode is CPU intensive + "enforce_eager": True, + "env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error + }, + +} + +MODELS_TO_RUN = [ + #"meta-llama/Meta-Llama-3.1-8B-Instruct", + #"google/gemma-3-12b-it", + #"Qwen/Qwen3-14B-AWQ", + #"openai/gpt-oss-20b", + #"openai/gpt-oss-120b", + "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", +] + +# ========================= +# UTILS +# ========================= + +def log(msg): print(f"\n[BENCH] {msg}") + +def get_gpu_count(): + try: + # Using rocm-smi --showid to list GPUs. + # Output format: "GPU[0] : Device Name: ..." + res = subprocess.run(["rocm-smi", "--showid"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if res.returncode == 0: + # Filter specifically for the target GPU as requested + # target_gpu = "AMD Radeon AI PRO R9700" + # count = 0 + # for line in res.stdout.strip().split('\n'): + # if "Device Name" in line and target_gpu in line: + # count += 1 + + # return count if count > 0 else 1 + return 1 # Force return 1 for Strix Halo APU + else: + log("rocm-smi failed, defaulting to 1 GPU (Hardcoded Fallback)") + return 1 + except Exception as e: + log(f"Error detecting GPUs: {e}, defaulting to 1 GPU") + return 1 + +def kill_vllm(): + subprocess.run("pgrep -f 'vllm serve' | xargs -r kill -9", + shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + time.sleep(5) + +def nuke_vllm_cache(): + cache = Path.home() / ".cache" / "vllm" + if cache.exists(): + try: + subprocess.run(["rm", "-rf", str(cache)], check=True) + cache.mkdir(parents=True, exist_ok=True) + time.sleep(2) + except: pass + +def get_dataset(): + data_path = Path("ShareGPT_V3_unfiltered_cleaned_split.json") + if 
data_path.exists(): return str(data_path) + + log("Downloading ShareGPT dataset...") + url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" + try: + r = requests.get(url, stream=True, timeout=15) + r.raise_for_status() + with open(data_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): f.write(chunk) + return str(data_path) + except Exception as e: + log(f"WARNING: ShareGPT download failed ({e}). using RANDOM.") + return None + + + +def get_model_args(model, tp_size): + config = MODEL_TABLE.get(model, {"max_num_seqs": "32"}) + + # Allow per-model GPU utilization override + util = config.get("gpu_util", GPU_UTIL) + + cmd = [ + "--model", model, + "--gpu-memory-utilization", util, + "--dtype", "auto", + "--tensor-parallel-size", str(tp_size), + "--max-num-seqs", config["max_num_seqs"] + ] + + # Optional: if a model really needs a hard limit, we can still support "ctx" in config, + # but by default we rely on auto. 
+ if "ctx" in config: + cmd.extend(["--max-model-len", config["ctx"]]) + + if config.get("trust_remote"): cmd.append("--trust-remote-code") + if config.get("enforce_eager"): cmd.append("--enforce-eager") + + return cmd + +def run_throughput(model, tp_size): + if tp_size not in MODEL_TABLE[model]["valid_tp"]: return + + model_safe = model.replace("/", "_") + output_file = RESULTS_DIR / f"{model_safe}_tp{tp_size}_throughput.json" + + if output_file.exists(): + log(f"SKIP Throughput {model} (TP={tp_size})") + return + + dataset_path = get_dataset() + dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"] + + # Retrieve Model-Specific Batch Tokens + batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS) + + log(f"START Throughput {model} (TP={tp_size}) [Batch: {batch_tokens}]...") + kill_vllm() + nuke_vllm_cache() + + cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size) + cmd.extend([ + "--num-prompts", str(OFF_NUM_PROMPTS), + "--max-num-batched-tokens", batch_tokens, + "--output-len", OFF_FORCED_OUTPUT, + "--output-json", str(output_file), + "--disable-log-stats" + ]) + cmd.extend(dataset_args) + + # ENV Setup: Global + Model Specific + env = os.environ.copy() + + # Inject model specific env vars (e.g. 
for AWQ) + model_env = MODEL_TABLE[model].get("env", {}) + env.update(model_env) + + try: + subprocess.run(cmd, check=True, env=env) + except: + log(f"ERROR: Throughput failed {model}") + + + +def print_summary(tps): + print(f"\n{'MODEL':<40} | {'TP':<2} | {'TOK/S':<8}") + print("-" * 60) + + for m in MODELS_TO_RUN: + msafe = m.replace("/", "_") + for tp in tps: + if tp not in MODEL_TABLE[m]["valid_tp"]: continue + + try: + tdata = json.loads((RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json").read_text()) + tok_s = f"{tdata.get('tokens_per_second', 0):.1f}" + except: tok_s = "N/A" + + name_cell = m.split('/')[-1] + print(f"{name_cell:<40} | {tp:<2} | {tok_s:<8}") + print("-" * 60) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--tp", type=int, nargs="+", default=[1]) + args = parser.parse_args() + + gpu_count = get_gpu_count() + log(f"Detected {gpu_count} AMD GPU(s)") + + valid_tp_args = [t for t in args.tp if t <= gpu_count] + if not valid_tp_args: + log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. 
Nothing to run.") + sys.exit(0) + + kill_vllm() + for tp in valid_tp_args: + for m in MODELS_TO_RUN: + run_throughput(m, tp) + print_summary(valid_tp_args) diff --git a/docs/assets/index2.css b/docs/assets/index2.css new file mode 100644 index 0000000..48d0e15 --- /dev/null +++ b/docs/assets/index2.css @@ -0,0 +1,401 @@ +:root { + --bg: #f8fafc; + --surface: #ffffff; + --text: #0f172a; + --text-sub: #64748b; + --accent: #d90007; + /* AMD Red */ + --accent-fade: #fff0f0; + --border: #e2e8f0; + --font: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; +} + +body { + margin: 0; + background: var(--bg); + color: var(--text); + font-family: var(--font); + display: flex; + flex-direction: column; + height: 100vh; + overflow-y: auto; +} + +header, +.controls, +.panel-split, +#tables { + max-width: 860px; + margin: 0 auto; + width: 100%; + box-sizing: border-box; +} + +table { + border-collapse: collapse; + width: 100%; + table-layout: fixed; +} + + +h1 { + margin: 0; + font-size: 1.25rem; + font-weight: 600; +} + +p { + margin: 4px 0 0; + font-size: 0.875rem; + color: var(--text-sub); +} + +.legend { + margin-top: 12px; + display: flex; + align-items: center; + gap: 12px; +} + +.legend label { + font-size: 0.75rem; + font-weight: 600; + text-transform: uppercase; + color: var(--text-sub); +} + +.legend-pills { + display: flex; + gap: 8px; +} + +.legend-pill { + cursor: default !important; +} + +.legend-pill-default::before { + content: ""; + display: inline-block; + width: 8px; + height: 8px; + background: #cbd5e1; + border-radius: 50%; + margin-right: 6px; +} + +.legend-pill-dual::before { + content: ""; + display: inline-block; + width: 8px; + height: 8px; + background: #d90007; + border-radius: 50%; + margin-right: 6px; +} + +.controls { + background: var(--surface); + border-bottom: 1px solid var(--border); + padding: 12px 24px; + display: flex; + gap: 24px; + align-items: center; + flex-shrink: 0; +} + +.control { + display: flex; + flex-direction: 
column; + gap: 4px; +} + +.control label { + font-size: 0.75rem; + font-weight: 600; + color: var(--text-sub); +} + +input[type="text"], +select { + padding: 6px 10px; + border: 1px solid var(--border); + border-radius: 4px; + font-size: 0.875rem; + background: var(--bg); + min-width: 180px; +} + +.range-wrap { + position: relative; + width: 200px; + height: 20px; +} + +.range-track { + position: absolute; + top: 50%; + left: 0; + right: 0; + height: 4px; + background: #e3e7f1; + border-radius: 2px; + transform: translateY(-50%); +} + +input[type=range] { + position: absolute; + width: 100%; + pointer-events: none; + appearance: none; + background: none; + margin: 0; + top: 50%; + transform: translateY(-50%); +} + +input[type=range]::-webkit-slider-thumb { + pointer-events: auto; + appearance: none; + width: 16px; + height: 16px; + border-radius: 50%; + background: var(--surface); + border: 2px solid var(--accent); + cursor: pointer; +} + +.range-values { + font-size: 0.75rem; + color: var(--text-sub); + margin-top: 4px; + text-align: center; +} + +.panel { + flex: 1; + display: flex; + flex-direction: column; + overflow: hidden; +} + +.panel.compact { + flex: 0 0 auto; +} + +#tables-panel { + flex: 1; + background: var(--bg); + padding: 0; + overflow-y: auto; + margin-top: 24px; + margin-bottom: 40px; +} + +.panel-split { + display: flex; + justify-content: space-between; + align-items: center; + padding: 12px 24px; + background: var(--surface); + border-bottom: 1px solid var(--border); +} + +.backend-header { + display: flex; + flex-direction: column; + gap: 8px; +} + +.backend-label { + display: flex; + align-items: center; + gap: 12px; +} + +.backend-label label { + font-size: 0.75rem; + font-weight: 600; + color: var(--text-sub); + text-transform: uppercase; +} + +.backend-list { + display: flex; + gap: 16px; + flex-wrap: wrap; +} + +.backend-item { + display: flex; + align-items: center; + gap: 6px; + font-size: 0.875rem; + cursor: pointer; + user-select: 
none; +} + +.test-block { + margin-bottom: 32px; + background: var(--surface); + border-top: 1px solid var(--border); + border-bottom: 1px solid var(--border); +} + +h2 { + padding: 16px 24px; + margin: 0; + font-size: 1rem; + background: #f1f5f9; + color: var(--text); + border-bottom: 1px solid var(--border); +} + +.table-wrap { + position: relative; + overflow: hidden; +} + +.table-scroll { + overflow-x: auto; + padding-bottom: 12px; + /* Scrollbar space */ +} + +/* ... */ + +.best { + background: #f0fdf4; +} + +.cell-error { + color: #ef4444; + font-size: 0.75rem; +} + +.cell-empty { + color: var(--border); + font-size: 0.75rem; + font-style: italic; +} + +/* Resize Overlay */ +.resize-overlay { + position: absolute; + top: 0; + left: 0; + pointer-events: none; + z-index: 5; +} + +.resize-bar { + position: absolute; + top: 0; + width: 6px; + height: 100%; + cursor: col-resize; + pointer-events: auto; + /* invisible usually, but can hover */ +} + +.resize-bar:hover { + background: rgba(0, 0, 0, 0.05); +} + +.resize-handle { + position: absolute; + right: 0; + top: 0; + bottom: 0; + width: 4px; + cursor: col-resize; +} + +.backend-header.dragging { + opacity: 0.5; +} + +.backend-header.drop-target { + border-left: 2px solid var(--accent); +} + +::-webkit-scrollbar { + width: 8px; + height: 8px; +} + +::-webkit-scrollbar-track { + background: transparent; +} + +::-webkit-scrollbar-thumb { + background: #cbd5e1; + border-radius: 4px; +} + +/* Modal Styles */ +.modal { + position: fixed; + top: 0; + left: 0; + width: 100vw; + height: 100vh; + background: rgba(0, 0, 0, 0.4); + display: flex; + justify-content: center; + align-items: center; + z-index: 1000; + opacity: 1; + transition: opacity 0.2s; +} + +.modal.hidden { + opacity: 0; + pointer-events: none; +} + +.modal-content { + background: var(--surface); + padding: 24px 32px; + border-radius: 8px; + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15); + max-width: 500px; + width: 90%; + position: relative; + transform: 
translateY(0); + transition: transform 0.2s; +} + +.modal.hidden .modal-content { + transform: translateY(20px); +} + +.modal-close { + position: absolute; + top: 12px; + right: 12px; + background: transparent; + border: none; + font-size: 1.5rem; + line-height: 1; + color: var(--text-sub); + cursor: pointer; + padding: 4px; +} + +.modal-close:hover { + color: var(--text); +} + +.modal-content h2 { + margin-top: 0; + background: none; + border: none; + padding: 0; + font-size: 1.25rem; + margin-bottom: 12px; +} + +.modal-content p { + margin-bottom: 12px; + line-height: 1.5; +} \ No newline at end of file diff --git a/docs/assets/index2.js b/docs/assets/index2.js new file mode 100644 index 0000000..2825a67 --- /dev/null +++ b/docs/assets/index2.js @@ -0,0 +1,542 @@ +const K_SIGMA = 1.0; +const MIN_TOL = 0.25; +const MODEL_COL_WIDTH = 300; +// Winner column removed + +const state = { + envs: ["TP1", "TP2"], + backendOrder: ["TP1", "TP2"], + columnWidths: { "TP1": 260, "TP2": 260 }, + filters: { + search: "", + quant: "", + backends: new Set(["TP1", "TP2"]), + sizeLo: null, + sizeHi: null, + }, + ui: {}, + sizeStats: { min: Infinity, max: -Infinity }, + draggingEnv: null, + quantOptions: [], +}; + +document.addEventListener("DOMContentLoaded", async () => { + cacheUI(); + setupModals(); + try { + const res = await fetch("results.json"); + const data = await res.json(); + prepareData(data?.runs || []); + initializeControls(); + renderTables(); + } catch (err) { + console.error("Failed to load results.json", err); + state.ui.stats.textContent = "Failed to load results.json"; + } +}); + +function cacheUI() { + state.ui = { + search: document.getElementById("filter-search"), + quant: document.getElementById("filter-quant"), + backendList: document.getElementById("backend-list"), + backendAll: document.getElementById("backend-all"), + backendNone: document.getElementById("backend-none"), + sizeLo: document.getElementById("sizeLo"), + sizeHi: 
document.getElementById("sizeHi"), + sizeTrack: document.getElementById("sizeTrack"), + sizeLoVal: document.getElementById("sizeLoVal"), + sizeHiVal: document.getElementById("sizeHiVal"), + stats: document.getElementById("stats-line"), + resetBtn: document.getElementById("reset-layout"), + tables: document.getElementById("tables"), + // Modal hooks + tp1ModalOpen: document.getElementById("tp1-modal-open"), + tp2ModalOpen: document.getElementById("tp2-modal-open"), + tp1Modal: document.getElementById("tp1-modal"), + tp2Modal: document.getElementById("tp2-modal"), + tp1ModalClose: document.getElementById("tp1-modal-close"), + tp2ModalClose: document.getElementById("tp2-modal-close"), + }; +} + +function setupModals() { + const modalConfigs = [ + { open: state.ui.tp1ModalOpen, modal: state.ui.tp1Modal, close: state.ui.tp1ModalClose }, + { open: state.ui.tp2ModalOpen, modal: state.ui.tp2Modal, close: state.ui.tp2ModalClose }, + ]; + + modalConfigs.forEach(({ open, modal, close }) => { + if (!open || !modal) return; + const openModal = () => modal.classList.remove("hidden"); + const closeModal = () => modal.classList.add("hidden"); + open.addEventListener("click", openModal); + close?.addEventListener("click", closeModal); + modal.addEventListener("click", (e) => { + if (e.target === modal) closeModal(); + }); + document.addEventListener("keydown", (e) => { + if (e.key === "Escape" && !modal.classList.contains("hidden")) { + closeModal(); + } + }); + }); +} + +function prepareData(runs) { + const quantSet = new Set(); + // Tests map: TestName -> { name: ..., models: Map(ModelName -> Row) } + const testsMap = new Map(); + + for (const run of runs) { + if (!run.test) continue; + const testKey = run.test; + + if (run.quant) quantSet.add(run.quant.toUpperCase()); + + if (!testsMap.has(testKey)) { + testsMap.set(testKey, { name: testKey, models: new Map() }); + } + const testEntry = testsMap.get(testKey); + + const modelName = run.model_clean || run.model; + + if 
(!testEntry.models.has(modelName)) { + testEntry.models.set(modelName, { + model: modelName, + quant: (run.quant || "Unknown").toUpperCase(), + sizeB: run.name_params_b ?? run.params_b ?? null, + backends: {}, + search_blob: [modelName, run.quant, run.env, run.test] + .filter(Boolean) + .map((s) => s.toString().toLowerCase()) + .join(" "), + }); + } + + const row = testEntry.models.get(modelName); + + // Update stats + if (row.sizeB != null) { + state.sizeStats.min = Math.min(state.sizeStats.min, row.sizeB); + state.sizeStats.max = Math.max(state.sizeStats.max, row.sizeB); + } + + // Add backend data + // run.env comes from python script as "TP1" or "TP2" + const env = run.env; + row.backends[env] = { + mean: typeof run.tps_mean === "number" ? run.tps_mean : null, + std: 0, // Not currently parsed + error: Boolean(run.error), + error_type: run.error_type || null, + }; + } + + state.tests = [...testsMap.values()].sort((a, b) => a.name.localeCompare(b.name)); + state.quantOptions = [...quantSet].sort(); +} + +function initializeControls() { + const { quant, backendList, search, resetBtn, sizeLo, sizeHi } = state.ui; + + quant.innerHTML = ""; + const anyOpt = document.createElement("option"); + anyOpt.value = ""; + anyOpt.textContent = "Any"; + quant.appendChild(anyOpt); + state.quantOptions.forEach((q) => { + const opt = document.createElement("option"); + opt.value = q; + opt.textContent = q; + quant.appendChild(opt); + }); + + renderBackendList(); + setupSizeSlider(); + + search.addEventListener("input", (e) => { + state.filters.search = (e.target.value || "").trim().toLowerCase(); + renderTables(); + }); + + quant.addEventListener("change", (e) => { + state.filters.quant = e.target.value; + renderTables(); + }); + + backendList.addEventListener("change", (e) => { + const checkbox = e.target.closest("input[data-env]"); + if (!checkbox) return; + const env = checkbox.dataset.env; + if (checkbox.checked) { + state.filters.backends.add(env); + } else { + 
/**
 * Rebuild the backend checkbox list from scratch.
 *
 * One <label class="backend-item"> is emitted per entry of
 * state.backendOrder, in that order. Each checkbox carries its environment
 * name in data-env and is ticked when the environment is in
 * state.filters.backends.
 */
function renderBackendList() {
  const listEl = state.ui.backendList;
  listEl.innerHTML = "";

  for (const env of state.backendOrder) {
    const item = document.createElement("label");
    item.className = "backend-item";

    const box = document.createElement("input");
    box.type = "checkbox";
    box.dataset.env = env;
    box.checked = state.filters.backends.has(env);
    item.appendChild(box);

    const caption = document.createElement("span");
    caption.textContent = env;
    item.appendChild(caption);

    listEl.appendChild(item);
  }
}
/**
 * Sync the two-handle size slider with state.filters and repaint its track.
 *
 * If the handles have crossed, the input that currently has focus keeps its
 * value and the other one is snapped to it. The resulting bounds are stored in
 * state.filters.sizeLo / sizeHi, echoed into the two value labels, and painted
 * onto the track as an accent-coloured segment.
 *
 * @param {boolean} triggerRender When truthy, renderTables() is called last.
 */
function updateSizeUI(triggerRender) {
  const { sizeLo, sizeHi, sizeLoVal, sizeHiVal, sizeTrack } = state.ui;

  // Un-cross the handles: the focused one wins.
  const crossed = +sizeLo.value > +sizeHi.value;
  if (crossed && document.activeElement === sizeLo) {
    sizeHi.value = sizeLo.value;
  } else if (crossed) {
    sizeLo.value = sizeHi.value;
  }

  // Read the values back from the inputs after any snapping above.
  const lo = +sizeLo.value;
  const hi = +sizeHi.value;
  const floor = +sizeLo.min;
  const ceiling = +sizeHi.max;

  // Swap stacking order near either extreme.
  // NOTE(review): presumably so the handle that can still move stays on top
  // when both sit at one end - confirm against the stylesheet.
  sizeLo.style.zIndex = lo >= ceiling - 1 ? 4 : 2;
  sizeHi.style.zIndex = hi <= floor + 1 ? 3 : 1;

  state.filters.sizeLo = lo;
  state.filters.sizeHi = hi;
  sizeLoVal.textContent = formatSizeLabel(lo);
  sizeHiVal.textContent = formatSizeLabel(hi);

  // Avoid dividing by zero when min and max coincide.
  const span = ceiling - floor || 1;
  const toPercent = (value) => ((value - floor) / span) * 100;
  const start = toPercent(lo);
  const end = toPercent(hi);
  sizeTrack.style.background = `linear-gradient(to right, #e3e7f1 ${start}%, var(--accent) ${start}%, var(--accent) ${end}%, #e3e7f1 ${end}%)`;

  if (triggerRender) renderTables();
}
document.createElement("div"); + scroller.className = "table-scroll"; + + const table = buildSingleTable(models, backendList); + scroller.appendChild(table); + tableWrap.appendChild(scroller); + block.appendChild(tableWrap); + setupResizeOverlay(scroller, backendList, table); + frag.appendChild(block); + } + + state.ui.tables.innerHTML = ""; + if (frag.childNodes.length) { + state.ui.tables.appendChild(frag); + } else { + state.ui.tables.innerHTML = "

No models match the current filters.

"; + } + state.ui.stats.textContent = `Showing ${totalRows.toLocaleString()} model rows across ${backendList.length} configurations`; +} + +function buildSingleTable(models, backendList) { + const table = document.createElement("table"); + const colgroup = document.createElement("colgroup"); + const colModel = document.createElement("col"); + colModel.style.width = `${MODEL_COL_WIDTH}px`; + colgroup.appendChild(colModel); + // Winner colGroup removed + + backendList.forEach((env) => { + const col = document.createElement("col"); + col.style.width = `${state.columnWidths[env] || 200}px`; + col.dataset.env = env; + colgroup.appendChild(col); + }); + table.appendChild(colgroup); + + const thead = document.createElement("thead"); + const headRow = document.createElement("tr"); + headRow.appendChild(makeHeaderCell("Model", "model")); + // Winner header removed + + backendList.forEach((env) => { + const th = makeHeaderCell(env, ""); // REMOVED "backend-header" class + attachHeaderInteractions(th, env); + headRow.appendChild(th); + }); + thead.appendChild(headRow); + table.appendChild(thead); + + const tbody = document.createElement("tbody"); + models.forEach((model) => { + const tr = document.createElement("tr"); + const tdModel = document.createElement("td"); + tdModel.className = "model"; + const head = document.createElement("div"); + head.className = "model-head"; + const nameSpan = document.createElement("span"); + nameSpan.className = "model-name"; + nameSpan.textContent = model.model; + head.appendChild(nameSpan); + tdModel.appendChild(head); + + const meta = document.createElement("div"); + meta.className = "meta"; + meta.textContent = `${model.quant} · ${formatSize(model.sizeB)}`; + tdModel.appendChild(meta); + tr.appendChild(tdModel); + + // Winner cell removed + + backendList.forEach((env) => { + const td = document.createElement("td"); + td.className = "data-cell"; + td.dataset.env = env; + const cell = model.backends[env]; + if (!cell) { + td.innerHTML = 
`N/A`; + } else if (cell.error || cell.mean == null) { + td.innerHTML = `FAIL`; + } else { + td.innerHTML = `
${cell.mean.toFixed(2)}
`; + } + tr.appendChild(td); + }); + tbody.appendChild(tr); + }); + table.appendChild(tbody); + return table; +} + +function makeHeaderCell(label, extra = "") { + const th = document.createElement("th"); + th.textContent = label; + if (extra) th.className = extra; + return th; +} + +function attachHeaderInteractions(th, env) { + const width = state.columnWidths[env] || 200; + th.style.width = `${width}px`; + th.style.minWidth = `${width}px`; + th.draggable = true; + th.addEventListener("dragstart", (e) => { + state.draggingEnv = env; + th.classList.add("dragging"); + e.dataTransfer.effectAllowed = "move"; + }); + th.addEventListener("dragend", () => { + state.draggingEnv = null; + th.classList.remove("dragging"); + document.querySelectorAll("th.drop-target").forEach((el) => el.classList.remove("drop-target")); + }); + th.addEventListener("dragover", (e) => { + if (!state.draggingEnv || state.draggingEnv === env) return; + e.preventDefault(); + th.classList.add("drop-target"); + }); + th.addEventListener("dragleave", () => th.classList.remove("drop-target")); + th.addEventListener("drop", (e) => { + if (!state.draggingEnv || state.draggingEnv === env) return; + e.preventDefault(); + moveBackend(state.draggingEnv, env); + th.classList.remove("drop-target"); + }); + + const handle = document.createElement("span"); + handle.className = "resize-handle"; + handle.addEventListener("mousedown", (e) => startResize(e, env)); + th.appendChild(handle); +} + +function moveBackend(from, to) { + const order = state.backendOrder; + const fromIdx = order.indexOf(from); + const toIdx = order.indexOf(to); + if (fromIdx === -1 || toIdx === -1) return; + const [col] = order.splice(fromIdx, 1); + order.splice(toIdx, 0, col); + renderBackendList(); + renderTables(); +} + +function filterModels(modelsMap) { + const models = []; + for (const model of modelsMap.values()) { + if (state.filters.search && !model.search_blob.includes(state.filters.search)) continue; + if (state.filters.quant && 
/**
 * Format a slider bound for display.
 *
 * Values of 1000 and above are divided by 1000 and shown with one decimal and
 * a "kB" suffix; smaller values are rounded to an integer with a "B" suffix.
 *
 * @param {number} size Slider value.
 * @returns {string} Display label.
 */
function formatSizeLabel(size) {
  return size >= 1000
    ? `${(size / 1000).toFixed(1)}kB`
    : `${Math.round(size)}B`;
}
overlay.innerHTML = ""; + } + + overlay.style.width = `${tableWrap.clientWidth}px`; + overlay.style.height = `${table.offsetHeight}px`; + + const bars = []; + let offset = MODEL_COL_WIDTH; // Winner column width removed + backendList.forEach((env) => { + const width = state.columnWidths[env] || 200; + const bar = document.createElement("div"); + bar.className = "resize-bar"; + bar.dataset.env = env; + bar.addEventListener("mousedown", (e) => startResize(e, env)); + overlay.appendChild(bar); + bars.push({ bar, offset, width, env }); + offset += width; + }); + + const positionBars = () => { + bars.forEach(({ bar, offset, width }) => { + const left = offset + width - 3 - tableWrap.scrollLeft; + bar.style.left = `${left}px`; + }); + }; + positionBars(); + + if (tableWrap._overlayScroll) { + tableWrap.removeEventListener("scroll", tableWrap._overlayScroll); + } + const onScroll = () => positionBars(); + tableWrap.addEventListener("scroll", onScroll); + tableWrap._overlayScroll = onScroll; + + if (tableWrap._overlayResize) { + tableWrap._overlayResize.disconnect(); + } + const resizeObserver = new ResizeObserver(() => { + overlay.style.width = `${tableWrap.clientWidth}px`; + overlay.style.height = `${table.offsetHeight}px`; + positionBars(); + }); + resizeObserver.observe(tableWrap); + tableWrap._overlayResize = resizeObserver; +} diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..250da66 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,782 @@ + + + + + + + AMD Strix Halo (gfx1151) vLLM Benchmarks + + + + + +
+
+

AMD Strix Halo (gfx1151) vLLM Benchmarks

+

+ View on GitHub → +

+
+ +
+ + +
+ + + +
+
Loading benchmark results...
+
+ +
+
System Configuration
+
+
+ System + Framework Desktop · AMD Ryzen AI MAX 395+ · 128GB unified RAM +
+
+ OS/Kernel + Fedora 42 · Linux 6.18.0-0.rc6.243.vanilla.fc42.x86_64 +
+
+
+
# Parameter count such as "8B", "30B" or "12b". The trailing look-ahead stops
# bit-width tags ("4bit", "8bit") from being read as a model size, which the
# looser module-level PARAMS_REGEX string allows under re.IGNORECASE.
_PARAMS_RE = re.compile(r"(\d+(?:\.\d+)?)B(?![A-Za-z])", re.IGNORECASE)
# Quantisation tag; the leftmost occurrence in the name wins.
_QUANT_RE = re.compile(r"(FP8|AWQ|GPTQ|BF16|4bit|Int4)", re.IGNORECASE)


def extract_meta(model_name):
    """Derive parameter count and quantisation label from a model name.

    Args:
        model_name: Model identifier, e.g. "Qwen/Qwen3-14B-AWQ".

    Returns:
        A ``(params_b, quant)`` tuple. ``params_b`` is the first "<number>B"
        token as a float, or ``None`` when the name has none. ``quant`` is the
        upper-cased leftmost quantisation tag; names without a tag are
        labelled "BF16". A leading "4bit"/"Int4" tag is refined to
        "GPTQ-4bit" / "AWQ-4bit" when the name also mentions that scheme,
        otherwise "4-bit".
    """
    params_match = _PARAMS_RE.search(model_name)
    params_b = float(params_match.group(1)) if params_match else None

    quant_match = _QUANT_RE.search(model_name)
    # No tag in the name is taken to mean an unquantised BF16 checkpoint.
    quant = quant_match.group(1).upper() if quant_match else "BF16"

    if quant in ("4BIT", "INT4"):
        # Compare case-insensitively, consistent with the regex above.
        upper_name = model_name.upper()
        if "GPTQ" in upper_name:
            quant = "GPTQ-4bit"
        elif "AWQ" in upper_name:
            quant = "AWQ-4bit"
        else:
            quant = "4-bit"

    return params_b, quant
+ # vLLM bench throughput usually streams tokens. + # Let's look at what run_vllm_bench.py produces. + # Throughput: --input-len 1024 --output-len 512. + # This is effectively a mixed batch. + # We'll label it "Throughput (1024/512)" or just "Throughput" + + # Latency: qps-based. + + files = list(BENCHMARK_DIR.glob("*.json")) + + for f in files: + fname = f.name + try: + data = json.loads(f.read_text()) + except: + print(f"Skipping bad JSON: {fname}") + continue + + # Infer metadata from filename + # Format: {model_safe}_tp{tp}_{suffix} + # Suffix can be: "throughput.json" or "qps{q}_latency.json" + + # We need model name. The script replaces / with _ in filenames. + # But we verify against the known models list? Or just parse string. + # We can reconstruct roughly. + + # Split by "_tp" which is a strong delimiter + parts = fname.split("_tp") + if len(parts) < 2: continue + + model_part = parts[0] + rest = parts[1] # "1_throughput.json" or "2_qps1.0_latency.json" + + # TP + tp_match = re.match(r"^(\d+)", rest) + if not tp_match: continue + tp = int(tp_match.group(1)) + + # Env mapping + env = f"TP{tp}" + + # Model Name Restoration (best effort or matching) + # In the script: model.replace("/", "_") + # We can reverse this if we have the list, but for now let's just use the clean string? + # The webapp uses "model_clean" and "model". 
+ # Let's assume standard "org_model" format -> "org/model" + if "_" in model_part: + # Heuristic: First _ is likely the slash + model_display = model_part.replace("_", "/", 1) + else: + model_display = model_part + + params_b, quant = extract_meta(model_display) + + base_run = { + "model": model_display, + "model_clean": model_display, + "env": env, + "gpu_config": "dual" if tp > 1 else "single", + "quant": quant, + "params_b": params_b, + "name_params_b": params_b, + # Defaults + "backend": "vLLM", + "error": False + } + + if "throughput" in fname: + # Throughput run + # data has "tokens_per_second" + tps = data.get("tokens_per_second", 0) + + run = base_run.copy() + run["test"] = "Throughput" + run["tps_mean"] = tps + # If tps is 0 or missing, it might be an error? + if tps == 0 and "error" in str(data).lower(): + run["error"] = True + + runs.append(run) + + elif "latency" in fname: + # Latency run + # raw_output has strings like "Mean TTFT: 12.3 ms", "Mean TPOT: 45.6 ms" + raw = data.get("raw_output", "") + qps_match = re.search(r"_qps([\d\.]+)_", fname) + qps = qps_match.group(1) if qps_match else "?" + + # Extract metrics + ttft = 0.0 + tpot = 0.0 + + ttft_m = re.search(r"(?:Mean TTFT|TTFT).*?([\d\.]+)", raw) + if ttft_m: ttft = float(ttft_m.group(1)) + + tpot_m = re.search(r"(?:Mean TPOT|TPOT).*?([\d\.]+)", raw) + if tpot_m: tpot = float(tpot_m.group(1)) + + # We create TWO entries? Or how does the webapp handle multiple metrics? + # Example webapp table columns are "Backends" showing ONE value. + # But grouping is by "Test". 
+ # So we can have a test called "TTFT (QPS 1.0)" and "TPOT (QPS 1.0)" + + # Entry 1: TTFT + r1 = base_run.copy() + r1["test"] = f"TTFT @ QPS {qps}" + r1["tps_mean"] = ttft # Using tps_mean field for the numeric value + runs.append(r1) + + # Entry 2: TPOT + r2 = base_run.copy() + r2["test"] = f"TPOT @ QPS {qps}" + r2["tps_mean"] = tpot + runs.append(r2) + + return runs + +if __name__ == "__main__": + data = {"runs": parse_logs()} + + runs_count = len(data["runs"]) + print(f"Parsed {runs_count} runs.") + + with open(OUTPUT_FILE, "w") as f: + json.dump(data, f, indent=2) + print(f"Written to {OUTPUT_FILE}") diff --git a/docs/results.json b/docs/results.json new file mode 100644 index 0000000..8d17292 --- /dev/null +++ b/docs/results.json @@ -0,0 +1,95 @@ +{ + "runs": [ + { + "model": "Qwen/Qwen3-14B-AWQ", + "model_clean": "Qwen/Qwen3-14B-AWQ", + "env": "TP1", + "gpu_config": "single", + "quant": "AWQ", + "params_b": 14.0, + "name_params_b": 14.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 112.69232830266365 + }, + { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_clean": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 8.0, + "name_params_b": 8.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 278.99494393048457 + }, + { + "model": "google/gemma-3-12b-it", + "model_clean": "google/gemma-3-12b-it", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 12.0, + "name_params_b": 12.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 162.71078485804028 + }, + { + "model": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "model_clean": "dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16", + "env": "TP1", + "gpu_config": "single", + "quant": "GPTQ", + "params_b": 80.0, + "name_params_b": 80.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 
112.62418795067208 + }, + { + "model": "openai/gpt-oss-20b", + "model_clean": "openai/gpt-oss-20b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 20.0, + "name_params_b": 20.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 313.85817605876395 + }, + { + "model": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "model_clean": "cpatonn/Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit", + "env": "TP1", + "gpu_config": "single", + "quant": "GPTQ", + "params_b": 30.0, + "name_params_b": 30.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 271.7264154071495 + }, + { + "model": "openai/gpt-oss-120b", + "model_clean": "openai/gpt-oss-120b", + "env": "TP1", + "gpu_config": "single", + "quant": "BF16", + "params_b": 120.0, + "name_params_b": 120.0, + "backend": "vLLM", + "error": false, + "test": "Throughput", + "tps_mean": 109.73523843987172 + } + ] +} \ No newline at end of file diff --git a/scripts/99-toolbox-banner.sh b/scripts/99-toolbox-banner.sh index 92bf392..95e954c 100644 --- a/scripts/99-toolbox-banner.sh +++ b/scripts/99-toolbox-banner.sh @@ -89,11 +89,11 @@ echo printf 'Machine: %s\n' "$MACHINE" printf 'GPU : %s\n\n' "$GPU" printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n' -printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest\n\n' +printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n' printf 'Included:\n' -printf ' - %-16s → %s\n' "start-vllm (wizard)" "Beginner-friendly launcher that guides you through model & settings" -printf ' - %-16s → %s\n' "vLLM server" "vllm serve Qwen/Qwen2.5-7B-Instruct --download-dir ~/vllm-models" -printf ' - %-16s → %s\n' "API test" "curl localhost:8000/v1/chat/completions (see README)" +printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling" +printf ' - %-16s → %s\n' "vLLM server" "vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct" +printf ' - %-16s → 
%s\n' "API test" "curl localhost:8000/v1/chat/completions" echo printf 'SSH tip: ssh -L 8000:localhost:8000 user@host\n\n' diff --git a/scripts/start-vllm.sh b/scripts/start-vllm.sh deleted file mode 100644 index f714154..0000000 --- a/scripts/start-vllm.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Defaults (override via env: HOST, PORT, DOWNLOAD_DIR, EXTRA_FLAGS) -HOST="${HOST:-0.0.0.0}" -PORT="${PORT:-8000}" -DOWNLOAD_DIR="${DOWNLOAD_DIR:-$HOME/vllm-models}" -EXTRA_FLAGS="${EXTRA_FLAGS:-}" - -models=( - "Llama 2 7B Chat|meta-llama/Llama-2-7b-chat-hf|" - "Qwen2.5 7B Instruct|Qwen/Qwen2.5-7B-Instruct|" - "Qwen3 30B A3B Instruct|Qwen/Qwen3-30B-A3B-Instruct-2507|" - "Qwen3 14B AWQ|Qwen/Qwen3-14B-AWQ|--quantization awq --dtype float16 --enforce-eager" - "Gemma 3 27B instruct|google/gemma-3-27b-it|" - "Gemma 3 12B Instruct|google/gemma-3-12b-it|" - "Gemma 3 4B Instruct|google/gemma-3-4b-it|" -) - -echo "Select a model:" -for i in "${!models[@]}"; do - name="${models[$i]%%|*}" - printf " [%d] %s\n" "$((i+1))" "$name" -done - -read -rp "Enter number: " choice -[[ "$choice" =~ ^[1-9][0-9]*$ ]] || { echo "Invalid choice."; exit 1; } -idx=$((choice-1)) -(( idx >= 0 && idx < ${#models[@]} )) || { echo "Invalid choice."; exit 1; } - -IFS='|' read -r label repo flags <<< "${models[$idx]}" - -mkdir -p "$DOWNLOAD_DIR" - -CMD=(vllm serve "$repo" --host "$HOST" --port "$PORT" --download-dir "$DOWNLOAD_DIR") - -# Per-model flags -if [[ -n "${flags:-}" ]]; then - # shellcheck disable=SC2206 - CMD+=($flags) -fi - -# Optional global extras: e.g. 
EXTRA_FLAGS="--gpu-memory-utilization 0.8" -if [[ -n "${EXTRA_FLAGS:-}" ]]; then - # shellcheck disable=SC2206 - CMD+=($EXTRA_FLAGS) -fi - -echo -e "Running:\n\n ${CMD[@]}\n" -echo "API test → curl -s http://localhost:${PORT}/v1/models | jq -r '.data[0].id'" -echo "SSH tip → ssh -L ${PORT}:localhost:${PORT} user@host" -echo - -exec "${CMD[@]}" diff --git a/scripts/start_vllm.py b/scripts/start_vllm.py new file mode 100644 index 0000000..7e3deb4 --- /dev/null +++ b/scripts/start_vllm.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +import sys +import os +import json +import shutil +import tempfile +import subprocess +from pathlib import Path + +# Add benchmarks dir to path to import config +SCRIPT_DIR = Path(__file__).parent.resolve() +BENCH_DIR = SCRIPT_DIR.parent / "benchmarks" +OPT_DIR = Path("/opt") + +# Check /opt first (Container), then local fallback +if (OPT_DIR / "run_vllm_bench.py").exists(): + sys.path.append(str(OPT_DIR)) +else: + sys.path.append(str(BENCH_DIR)) + +try: + from run_vllm_bench import MODEL_TABLE, MODELS_TO_RUN +except ImportError: + print("Error: Could not import run_vllm_bench.py config.") + sys.exit(1) + +if (OPT_DIR / "max_context_results.json").exists(): + RESULTS_FILE = OPT_DIR / "max_context_results.json" +else: + RESULTS_FILE = BENCH_DIR / "max_context_results.json" +HOST = os.getenv("HOST", "0.0.0.0") +PORT = os.getenv("PORT", "8000") + +def check_dependencies(): + if not shutil.which("dialog"): + print("Error: 'dialog' is required. 
def detect_gpus():
    """Return the number of AMD GPUs visible to this process (at least 1).

    Tries ``rocm-smi --showid --csv`` first and counts occurrences of "GPU"
    in its output. If that is unavailable or reports nothing, falls back to
    counting ``/dev/dri/renderD*`` nodes.

    The result is never below 1: callers use it directly as the default
    tensor-parallel size, and 0 is not a usable value there.
    """
    try:
        res = subprocess.run(
            ["rocm-smi", "--showid", "--csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # rocm-smi missing / not executable, or its output could not be decoded.
        res = None

    if res is not None and res.returncode == 0:
        # NOTE(review): plain substring count - a CSV header containing "GPU"
        # would be included. Verify against real rocm-smi output.
        count = res.stdout.count("GPU")
        if count > 0:
            return count

    # Fallback: one render node per GPU.
    try:
        render_nodes = len(list(Path("/dev/dri").glob("renderD*")))
    except OSError:
        render_nodes = 0

    # A missing /dev/dri yields an empty glob (0), not an exception.
    return max(1, render_nodes)
def nuke_vllm_cache(pause=1.0):
    """Delete and recreate ``~/.cache/vllm``.

    Does nothing when the directory does not exist. Progress is printed on a
    single line: "Clearing vLLM cache at ...", followed by " Done." or
    " Failed: <reason>".

    Args:
        pause: Seconds to wait after a successful wipe so the message stays
            readable before the caller continues. ``0`` disables the wait.
    """
    # Imported locally: the module header does not import `time`, so the
    # original `time.sleep(1)` raised NameError and printed " Failed: ..."
    # right after " Done.".
    import time

    cache = Path.home() / ".cache" / "vllm"
    if not cache.exists():
        return

    print(f"Clearing vLLM cache at {cache}...", end="", flush=True)
    try:
        subprocess.run(["rm", "-rf", str(cache)], check=True)
        cache.mkdir(parents=True, exist_ok=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # Best effort: a failed wipe must not prevent the server from starting.
        print(f" Failed: {e}")
        return

    print(" Done.")
    if pause > 0:
        time.sleep(pause)
use_rocm_attn else "Triton" + + menu_args = [ + "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})", + "--title", f"Configuration: {name}", + "--menu", "Customize Launch Parameters:", "22", "65", "9", + "1", f"Tensor Parallelism: {current_tp}", + "2", f"Concurrent Requests: {current_seqs}", + "3", f"Context Length: {current_ctx} (Verified)", + "4", f"GPU Utilization: {current_util} (Verified)", + "5", f"Attention Backend: {attn_backend}", + "6", f"Erase vLLM Cache: {cache_status}", + "7", f"Force Eager Mode: {eager_status}", + "8", "LAUNCH SERVER" + ] + + choice = run_dialog(menu_args) + if not choice: return False # Back/Cancel + + if choice == "1": + # TP Selection + new_tp = run_dialog([ + "--title", "Tensor Parallelism", + "--rangebox", f"Set TP Size (1-{max_tp})", "10", "40", "1", str(max_tp), str(current_tp) + ]) + if new_tp: + new_tp_int = int(new_tp) + if new_tp_int != current_tp: + current_tp = new_tp_int + # RE-CALCULATE Config + verified = get_verified_config(model_id, current_tp, current_seqs) + current_ctx = verified["ctx"] + current_util = verified["util"] + + elif choice == "2": + # Max Seqs Selection + new_seqs = run_dialog([ + "--title", "Concurrent Requests", + "--menu", "Select Max Concurrent Requests:", "12", "40", "4", + "1", "1 (Latency Focus)", + "4", "4 (Balanced)", + "8", "8 (Throughput)", + "16", "16 (Max Load)" + ]) + if new_seqs: + current_seqs = int(new_seqs) + # RE-CALCULATE Config based on new concurrency + verified = get_verified_config(model_id, current_tp, current_seqs) + current_ctx = verified["ctx"] + current_util = verified["util"] + + elif choice == "3": + # Configured Length Override + new_ctx = run_dialog([ + "--title", "Context Length", + "--inputbox", f"Override verified limit ({current_ctx}):", "10", "40", str(current_ctx) + ]) + if new_ctx: current_ctx = int(new_ctx) + + elif choice == "4": + # Util Override + pass + + elif choice == "5": + # Toggle Attention Backend + use_rocm_attn = not 
def main():
    """Run the interactive launcher.

    Checks that ``dialog`` is available, detects the GPU count once, then
    loops: show the model menu, hand the selection to
    ``configure_and_launch``. When that returns (the user backed out of the
    configuration screen) the model menu is shown again. Cancelling the model
    menu clears the screen and exits with status 0.
    """
    check_dependencies()
    gpu_count = detect_gpus()

    while True:
        # dialog expects a flat list of tag/label pairs. The tag is the index
        # into MODELS_TO_RUN, the label the repo name without its org prefix.
        menu_items = []
        for i, m_id in enumerate(MODELS_TO_RUN):
            menu_items.extend([str(i), m_id.split("/")[-1]])

        choice = run_dialog([
            "--clear", "--backtitle", f"AMD R9700 vLLM Launcher (GPUs: {gpu_count})",
            "--title", "Select Model",
            "--menu", "Choose a model to serve:", "20", "60", "10"
        ] + menu_items)

        if not choice:
            subprocess.run(["clear"])
            print("Selection cancelled.")
            sys.exit(0)

        configure_and_launch(int(choice), gpu_count)


if __name__ == "__main__":
    main()