30 Commits

Autor SHA1 Mensaje Fecha
devbadxyz 48a20990d3 Improve compilation support 2026-03-15 13:04:09 +01:00
Donato Capitella 039363b819 feat: set LD_LIBRARY_PATH to include ROCm library directories. 2026-03-14 13:41:09 +00:00
Donato Capitella cf2fd6ec11 chore: remove fix_block_size.py script and its execution from the Dockerfile. 2026-03-14 13:18:56 +00:00
Donato Capitella b78e8a9d82 fix: Remove vLLM block size validation checks by adding and running a new patching script in the Dockerfile. 2026-03-13 16:29:01 +00:00
Donato Capitella 16405e8943 config: Add VLLM_DISABLE_COMPILE_CACHE=1 to environment variables across VLLM scripts. 2026-03-09 14:07:43 +00:00
Donato Capitella 8de950d9ca feat: Override _get_gcn_arch function to return "gfx1151" and rename the original implementation to _old_get_gcn_arch. 2026-03-09 12:13:27 +00:00
Donato Capitella fb0aef0864 Downgrade Python to 3.12 and remove the --no-deps flag from a pip install command in the Dockerfile. 2026-03-09 11:08:11 +00:00
Donato Capitella 9997faaa1e build: Add --no-deps flag to local wheel installation. 2026-03-08 16:31:16 +00:00
Donato Capitella 8a20ec27b2 fixing https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/issues/21 2026-02-26 12:36:03 +00:00
Donato Capitella c27835d99f feat: Introduce v1 API structure, enhance quantization support, and expand model compatibility with various updates and new tests. 2026-02-25 11:50:23 +00:00
Donato Capitella b035bcb482 updated benchmarks including thunderbolt and configuratuion guides 2026-02-25 10:48:42 +00:00
Donato Capitella 6875f62ccf improve benchmarks 2026-02-25 09:29:46 +00:00
Donato Capitella a5a7b8fe04 fix: Ignore settings.json and default 'TP2 (Eth)' checkbox to unchecked in documentation. 2026-02-24 08:50:18 +00:00
Donato Capitella 1af159af81 removing llvm flags as they have no impact on performance 2026-02-24 08:27:57 +00:00
Donato Capitella e726d406fa updated benchmarks, fix start-vllm 2026-02-23 19:39:19 +00:00
Donato Capitella e0fadf426b force egaer mode to make gemma stable 2026-02-23 18:19:15 +00:00
Donato Capitella f968cb1f30 most of the time spent by devs is to ensure there is no standard way of passing flags - I have no idea why 2026-02-23 12:08:57 +00:00
Donato Capitella fedfa3c682 Trying fix for ROCm/llvm loop unrolling bug, to see if performance improves on custom complied kernels 2026-02-23 11:43:44 +00:00
Donato Capitella 13c5a929a3 feat: refactor vLLM Strix Halo patching into a dedicated script 2026-02-23 10:33:20 +00:00
Donato Capitella 5a7f0cc676 feat: Implement temporary patch for C10_CHECK macro import missing 2026-02-23 09:49:42 +00:00
Donato Capitella b3fcb0091f feat: Enhance find_max_context.py with Ray cluster support and fix C10_HIP_CHECK build error in Dockerfile. 2026-02-23 09:11:30 +00:00
Donato Capitella 91b6dbc270 feat: Display environment variables and allow to choose between RoCE/Ethernet and show RCCL debug information 2026-02-22 20:07:34 +00:00
Donato Capitella 4a5d6c7855 fix broken stuff 2026-02-19 20:29:28 +00:00
Donato Capitella 726cd5ae53 remove clang patch 2026-02-18 15:23:02 +00:00
Donato Capitella 49b85fc1fb add MiniMax 2026-02-18 15:22:12 +00:00
Donato Capitella 290beffb05 feat: Enhance quantization support for MoE layers with new FP8/INT8 configs and model-specific optimizations across various devices. 2026-02-12 11:10:28 +00:00
Donato Capitella 6754095398 feat: Introduce measure_bandwidth.sh script, install perfquery, and add the script to the Docker image for RDMA bandwidth monitoring. 2026-02-07 10:40:53 +00:00
Donato Capitella 9cf7eaeab2 fix: Correct 'buy me a coffee' URL in README. 2026-02-06 06:56:26 +00:00
Donato Capitella c3ecb9bbd5 feat: add project context and support sections to README. 2026-02-05 17:55:30 +00:00
Donato Capitella afe985afca added images to RDMA guide 2026-02-03 19:47:42 +00:00
Se han modificado 103 ficheros con 2727 adiciones y 607 borrados
+2 -2
Ver fichero
@@ -13,7 +13,7 @@ on:
default: ""
env:
IMAGE_REPO: kyuz0/vllm-therock-gfx1151
IMAGE_REPO: kyuz0/vllm-therock-gfx1150
DOCKER_BUILDKIT: "1"
jobs:
@@ -67,7 +67,7 @@ jobs:
uses: dawidd6/action-download-artifact@v6
with:
workflow: build-rccl.yml
name: librccl-gfx1151
name: librccl-gfx1150
run_id: ${{ github.event.inputs.rccl_run_id }}
path: custom_libs
if_no_artifact_found: warn
+6 -6
Ver fichero
@@ -5,7 +5,7 @@ on:
env:
ROCM_MAJOR_VER: 7
GFX: gfx1151
GFX: gfx1150
jobs:
build-rccl:
@@ -24,17 +24,17 @@ jobs:
shell: bash
run: |
source /etc/profile.d/rocm-sdk.sh
bash scripts/build_rccl_gfx1151.sh
bash scripts/build_rccl_gfx1150.sh
- name: Compress Artifact
run: |
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz
ls -lh librccl.so.1.gz
- name: Upload Artifact
uses: actions/upload-artifact@v4
with:
name: librccl-gfx1151
name: librccl-gfx1150
path: librccl.so.1.gz
+3
Ver fichero
@@ -1,2 +1,5 @@
*.pyc
__pycache__/
settings.json
custom_libs/
rocm-systems/
+20 -37
Ver fichero
@@ -1,4 +1,4 @@
FROM registry.fedoraproject.org/fedora:43
FROM debian:12-slim
# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
@@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh
# 2. Install "TheRock" ROCm SDK (Tarball Method)
WORKDIR /tmp
ARG ROCM_MAJOR_VER=7
ARG GFX=gfx1151
ARG GFX=gfx1150
# We pass ARGs to the script via ENV or rely on defaults.
# But let's be explicit and export them for the RUN command.
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
@@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \
/tmp/install_rocm_sdk.sh
# 4. Python Venv Setup
RUN /usr/bin/python3.13 -m venv /opt/venv
RUN /usr/bin/python3.11 -m venv /opt/venv
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:$PATH
ENV PIP_NO_CACHE_DIR=1
@@ -27,13 +27,14 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"
# 5. Install PyTorch (TheRock Nightly)
RUN python -m pip install \
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \
--pre torch torchaudio torchvision
WORKDIR /opt
# Flash-Attention
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH"
RUN git clone https://github.com/ROCm/flash-attention.git &&\
cd flash-attention &&\
@@ -46,38 +47,18 @@ RUN git clone https://github.com/vllm-project/vllm.git /opt/vllm
WORKDIR /opt/vllm
# --- PATCHING ---
RUN echo "import sys, re" > patch_strix.py && \
echo "from pathlib import Path" >> patch_strix.py && \
# Patch 1: __init__.py
echo "p = Path('vllm/platforms/__init__.py')" >> patch_strix.py && \
echo "txt = p.read_text()" >> patch_strix.py && \
echo "txt = txt.replace('import amdsmi', '# import amdsmi')" >> patch_strix.py && \
echo "txt = re.sub(r'is_rocm = .*', 'is_rocm = True', txt)" >> patch_strix.py && \
echo "txt = re.sub(r'if len\(amdsmi\.amdsmi_get_processor_handles\(\)\) > 0:', 'if True:', txt)" >> patch_strix.py && \
echo "txt = txt.replace('amdsmi.amdsmi_init()', 'pass')" >> patch_strix.py && \
echo "txt = txt.replace('amdsmi.amdsmi_shut_down()', 'pass')" >> patch_strix.py && \
echo "p.write_text(txt)" >> patch_strix.py && \
# Patch 2: rocm.py
echo "p = Path('vllm/platforms/rocm.py')" >> patch_strix.py && \
echo "txt = p.read_text()" >> patch_strix.py && \
echo "header = 'import sys\nfrom unittest.mock import MagicMock\nsys.modules[\"amdsmi\"] = MagicMock()\n'" >> patch_strix.py && \
echo "txt = header + txt" >> patch_strix.py && \
echo "txt = re.sub(r'device_type = .*', 'device_type = \"rocm\"', txt)" >> patch_strix.py && \
echo "txt = re.sub(r'device_name = .*', 'device_name = \"gfx1151\"', txt)" >> patch_strix.py && \
echo "txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return \"AMD-gfx1151\"\n'" >> patch_strix.py && \
echo "p.write_text(txt)" >> patch_strix.py && \
echo "print('Successfully patched vLLM for Strix Halo')" >> patch_strix.py && \
python patch_strix.py && \
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
RUN python /opt/vllm/patch_strix.py && \
sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt
# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
# Build-time Python tooling for the vLLM wheel build.
# Every version specifier is quoted: an unquoted `>=` is parsed by the shell as an
# output redirection, which would drop the constraint and write pip's stdout to a
# stray file named `=0.11`.
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 "amd-quark>=0.11"
ENV ROCM_HOME="/opt/rocm"
ENV HIP_PATH="/opt/rocm"
ENV VLLM_TARGET_DEVICE="rocm"
ENV PYTORCH_ROCM_ARCH="gfx1151"
ENV HIP_ARCHITECTURES="gfx1151"
ENV AMDGPU_TARGETS="gfx1151"
ENV PYTORCH_ROCM_ARCH="gfx1150"
ENV HIP_ARCHITECTURES="gfx1150"
ENV AMDGPU_TARGETS="gfx1150"
ENV MAX_JOBS="4"
# --- CRITICAL FIX FOR SEGFAULT ---
@@ -88,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
python -m pip install /tmp/dist/*.whl
@@ -105,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm"
# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
RUN cmake -S . \
-DGPU_TARGETS="gfx1151" \
-DBNB_ROCM_ARCH="gfx1151" \
-DGPU_TARGETS="gfx1150" \
-DBNB_ROCM_ARCH="gfx1150" \
-DCOMPUTE_BACKEND=hip \
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
@@ -120,17 +101,19 @@ RUN chmod -R a+rwX /opt && \
find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
rm -rf /root/.cache/pip || true && \
dnf clean all && rm -rf /var/cache/dnf/*
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh
COPY scripts/start_vllm.py /opt/start-vllm
COPY scripts/start_vllm_cluster.py /opt/start-vllm-cluster
COPY scripts/measure_bandwidth.sh /opt/measure_bandwidth.sh
COPY scripts/cluster_manager.py /opt/cluster_manager.py
COPY scripts/models.py /opt/models.py
COPY benchmarks/max_context_results.json /opt/max_context_results.json
COPY benchmarks/bench_utils.py /opt/bench_utils.py
COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py
COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py
COPY benchmarks/find_max_context.py /opt/find_max_context.py
@@ -145,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
RUN chmod 0644 /etc/profile.d/*.sh
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
RUN echo "Installing Custom RCCL..." && \
gzip -d /tmp/librccl.so.1.gz && \
@@ -163,4 +146,4 @@ RUN python -m pip install transformers==5.0.0
RUN chmod -R a+rwX /opt
CMD ["/bin/bash"]
CMD ["/bin/bash"]
+304
Ver fichero
@@ -0,0 +1,304 @@
# Guida: Usare vLLM con Podman su Strix Halo
Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman.
## Prerequisiti
- Podman installato e funzionante
- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile
- Accesso ai device `/dev/kfd` e `/dev/dri`
- Almeno 30GB di spazio disco per il modello e la cache
## 1. Buildare l'immagine
Dalla directory del progetto, esegui:
```bash
podman build -t vllm:rocm .
```
**Note:**
- Il build richiede 30-60 minuti a seconda della macchina
- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente
- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria
### Opzioni di build avanzate
Puoi passare argomenti personalizzati:
```bash
podman build \
--build-arg ROCM_MAJOR_VER=7 \
--build-arg GFX=gfx1150 \
--network=host \
-t vllm:rocm .
```
- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione)
- `--no-cache` - Ignorare la cache e ricompilare tutto
## 2. Preparare i filesystem locali
Crea le cartelle per modelli e cache:
```bash
mkdir -p ~/models
mkdir -p ~/.cache/huggingface
```
## 3. Lanciare il container con GPU
### Opzione A: Shell interattiva (Development)
Se vuoi esplorare il container e usare il TUI `start-vllm`:
```bash
podman run -it \
--device /dev/kfd \
--device /dev/dri \
--network host \
-v $HOME/models:/models \
-v $HOME/.cache/huggingface:/cache/huggingface \
-p 8000:8000 \
vllm:rocm \
/bin/bash
```
Dentro il container:
```bash
start-vllm
```
Oppure lancia direttamente:
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90
```
### Opzione B: Lanciare direttamente il servizio (Production)
Esegui vLLM in un unico comando senza shell interattiva:
```bash
podman run -d \
--device /dev/kfd \
--device /dev/dri \
--network host \
-v $HOME/models:/models \
-v $HOME/.cache/huggingface:/cache/huggingface \
-p 8000:8000 \
--name vllm-server \
vllm:rocm \
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90
```
**Opzioni spiegate:**
| Opzione | Significato |
|---------|------------|
| `-d` | Esegui in background |
| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) |
| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) |
| `--network host` | Usa la rete dell'host (migliore performance) |
| `-v $HOME/models:/models` | Monta la cartella modelli locale |
| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace |
| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible (ignorata con `--network host`, dove la porta 8000 è già raggiungibile direttamente sull'host) |
| `--name vllm-server` | Nome del container |
| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) |
| `--trust-remote-code` | Permetti codice remoto da HuggingFace |
| `--enforce-eager` | Modalità eager (debug/stability) |
| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU |
## 4. Monitorare il container
Se lanciato in background (`-d`):
```bash
# Visualizza i log
podman logs -f vllm-server
# Visualizza le ultime 50 righe dei log
podman logs --tail 50 vllm-server
# Controlla lo stato
podman ps | grep vllm-server
# Entra nel container
podman exec -it vllm-server /bin/bash
```
## 5. Testare l'API
Una volta che il server è up, puoi testare con cURL:
### Chat Completion
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
"messages": [{"role": "user", "content": "Write a Python function to sort a list"}],
"max_tokens": 200,
"temperature": 0.7
}'
```
### Completamento testo
```bash
curl -X POST http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
"prompt": "def fibonacci(",
"max_tokens": 100
}'
```
### Listare modelli disponibili
```bash
curl http://localhost:8000/v1/models
```
## 6. Usare da un altro host (SSH Port Forwarding)
Se vLLM è su un server remoto:
```bash
ssh -L 0.0.0.0:8000:localhost:8000 user@remote-host
```
Poi da client locale:
```bash
curl http://localhost:8000/v1/models
```
## 7. Stoppare il container
```bash
# Se lanciato in background
podman stop vllm-server
# Rimuovere il container
podman rm vllm-server
# Se in shell interattiva, usa Ctrl+C e poi
podman stop <container-id>
```
## 8. Usare con systemd (Quadlet)
Se hai già usato il file `vllm-rocm.container` generato:
```bash
mkdir -p ~/.config/containers/systemd/
cp vllm-rocm.container ~/.config/containers/systemd/
systemctl --user daemon-reload
systemctl --user start vllm-rocm
systemctl --user status vllm-rocm
```
Visualizza i log:
```bash
journalctl --user -u vllm-rocm -n 50 -f
```
## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit
### Caratteristiche
- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit
- **Vantaggi:**
- Occupa ~15-20GB di memoria (vs 50-60GB full precision)
- Esecuzione molto veloce
- Qualità vicina a quella del modello full precision
- **Caso d'uso:** Sviluppo di codice, task di programmazione
### Parametri consigliati
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90 \
--max-model-len 4096 \
--max-num-seqs 16
```
## Troubleshooting
### Errore: "Unable to locate package python3.13"
Il container usa Python 3.13, disponibile in Debian 13 (trixie). Verifica di usare `debian:trixie` o `debian:13-slim` nella base image: `debian:bookworm` è Debian 12 e non fornisce il pacchetto `python3.13`.
### Errore: "No GPU detected"
Verifica che i device siano accessibili:
```bash
ls -la /dev/kfd /dev/dri
```
Se non ci sono, potrebbe essere un problema di driver. Su Strix Halo:
```bash
rocm-smi
```
### Errore: "Out of memory"
Riduci `--gpu-memory-utilization` oppure `--max-model-len`:
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--gpu-memory-utilization 0.80 \
--max-model-len 2048
```
### Il container si ferma subito
Controlla i log:
```bash
podman logs vllm-server
```
Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova:
```bash
podman build --no-cache -t vllm:rocm .
```
## Link Utili
- [vLLM Documentation](https://docs.vllm.ai/)
- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c)
- [ROCm Documentation](https://rocmdocs.amd.com/)
## Domande Frequenti
**D: Posso usare più GPU con Tensor Parallelism?**
R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`.
**D: Come cambio modello senza riavviare il container?**
R: Devi stoppare e riavviare il container con un modello diverso.
**D: Posso usare questo con una Web UI?**
R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible.
**D: Il modello viene scaricato ogni volta?**
R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache.
+17 -7
Ver fichero
@@ -1,18 +1,28 @@
# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container
# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container
An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm.
A **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. Built on the **TheRock nightly builds** for ROCm.
---
## 🚀 High-Performance Clustering Support (New!)
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions.
---
### 📦 Project Context
This repository is part of the **[Strix Halo AI Toolboxes](https://strix-halo-toolboxes.com)** project. Check out the website for an overview of all toolboxes, tutorials, and host configuration guides.
### ❤️ Support
This is a hobby project maintained in my spare time. If you find these toolboxes and tutorials useful, you can **[buy me a coffee](https://buymeacoffee.com/dcapitella)** to support the work! ☕
---
## Table of Contents
* [Tested Models (Benchmarks)](#tested-models-benchmarks)
@@ -48,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
## 1) Toolbx vs Docker/Podman
The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as: 
The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as: 
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
@@ -71,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp:
```bash
toolbox create vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
-- --device /dev/dri --device /dev/kfd \
--group-add video --group-add render --security-opt seccomp=unconfined
```
@@ -102,7 +112,7 @@ Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead:
```bash
distrobox create -n vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
--additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined"
distrobox enter vllm
@@ -208,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u
**Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)
**Key Features:**
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`.
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`.
* **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices.
* **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM.
+14
Ver fichero
@@ -0,0 +1,14 @@
import subprocess
import tempfile
def run_dialog(args):
    """Run the ``dialog`` program and return what it wrote to stderr.

    ``args`` is the list of command-line arguments appended after the
    ``dialog`` executable name. stdout is left untouched so that ``dialog``
    can draw on the terminal; stderr is redirected into a temporary file,
    which is where the selection is read back from.

    Returns the captured stderr text with surrounding whitespace stripped,
    or ``None`` when ``dialog`` exits with a non-zero status (treated as the
    user cancelling or pressing ESC).
    """
    command = ["dialog"] + args
    with tempfile.NamedTemporaryFile(mode="w+") as capture:
        try:
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(command, stderr=capture, check=True)
        except subprocess.CalledProcessError:
            # Non-zero exit: no selection was made.
            return None
        # Rewind before reading: the child wrote through the same file.
        capture.seek(0)
        selection = capture.read()
    return selection.strip()
@@ -0,0 +1,7 @@
{
"elapsed_time": 524.2037815230142,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3815310134141399,
"tokens_per_second": 280.05330212131406
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 485.412814248004,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4120204373051785,
"tokens_per_second": 302.43330149293365
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 421.75657659699937,
"elapsed_time": 424.04632396099623,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4742071875054738,
"tokens_per_second": 348.0799308087054
"requests_per_second": 0.4716465836369236,
"tokens_per_second": 346.2003835540928
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 868.8101008250001,
"elapsed_time": 918.187000697013,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.2301999019234296,
"tokens_per_second": 168.9724830093454
"requests_per_second": 0.21782055272855774,
"tokens_per_second": 159.8857312165796
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 456.08530166203855,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4385144604993234,
"tokens_per_second": 321.88057686801585
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 503.28860085096676,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.397386310084984,
"tokens_per_second": 291.6914862601304
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 457.7749735690013,
"elapsed_time": 458.737264430034,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4368958801760569,
"tokens_per_second": 320.69249844623016
"requests_per_second": 0.4359794058773347,
"tokens_per_second": 320.0197833991106
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 644.1538858940003,
"elapsed_time": 686.8188757880125,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3104848148551126,
"tokens_per_second": 227.90361622402403
"requests_per_second": 0.29119758796747197,
"tokens_per_second": 213.74630950782364
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 534.8865945799625,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3739110346503573,
"tokens_per_second": 274.46004720922855
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 571.4193902639672,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.35000562355367393,
"tokens_per_second": 256.91287782898553
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 534.4193308840004,
"elapsed_time": 524.8208868440124,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3742379596733028,
"tokens_per_second": 274.7000183491961
"requests_per_second": 0.38108239403864297,
"tokens_per_second": 279.7240042842149
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 733.5017090729998,
"elapsed_time": 789.1420173590304,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.2726646680247824,
"tokens_per_second": 200.1426829468909
"requests_per_second": 0.2534398062712803,
"tokens_per_second": 186.03115379827653
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 805.9022228560061,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.24816906360082697,
"tokens_per_second": 182.16229690959702
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 824.4905019259895,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.2425740497104635,
"tokens_per_second": 178.05541683872298
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 879.0596038709991,
"elapsed_time": 748.1414223780157,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.22751585799106944,
"tokens_per_second": 167.00232766189475
"requests_per_second": 0.2673291359329993,
"tokens_per_second": 196.2262690032198
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1109.9732099440007,
"elapsed_time": 1168.3619703819859,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.18018452896722634,
"tokens_per_second": 132.2599488751683
"requests_per_second": 0.17117982703135376,
"tokens_per_second": 125.65027253668944
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 510.63144373201067,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.391671923958063,
"tokens_per_second": 291.5155379231269
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 572.292031740013,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.3494719285046033,
"tokens_per_second": 260.10671430704866
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 504.69023761399876,
"elapsed_time": 520.7929677469656,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.39628268013570256,
"tokens_per_second": 294.9472545848014
"requests_per_second": 0.3840297630462106,
"tokens_per_second": 285.8275921888489
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 876.911706677,
"elapsed_time": 930.6109793490032,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.22807313265081958,
"tokens_per_second": 169.75141153501525
"requests_per_second": 0.2149125729635249,
"tokens_per_second": 159.95620436815713
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 237.61095946098794,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.8417120172137385,
"tokens_per_second": 613.9321196754427
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 284.23000320699066,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.7036554823325597,
"tokens_per_second": 513.235753981134
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 244.51837097500174,
"elapsed_time": 247.22850671299966,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.8179344529513773,
"tokens_per_second": 596.5891209659404
"requests_per_second": 0.8089681997399035,
"tokens_per_second": 590.0492703672895
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 380.55349342600005,
"elapsed_time": 395.08209386101225,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.5255502930730307,
"tokens_per_second": 383.3285005130725
"requests_per_second": 0.5062239041143659,
"tokens_per_second": 369.23212230245684
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1361.426551499986,
"num_requests": 200,
"total_num_tokens": 146523,
"requests_per_second": 0.14690473002722398,
"tokens_per_second": 107.62460878889469
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1474.255295659008,
"num_requests": 200,
"total_num_tokens": 146523,
"requests_per_second": 0.13566171380825723,
"tokens_per_second": 99.38780646163637
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1482.2689266130328,
"num_requests": 200,
"total_num_tokens": 146523,
"requests_per_second": 0.13492828218223374,
"tokens_per_second": 98.85048345093716
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1724.1368565150187,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.11600007229371459,
"tokens_per_second": 85.2809331488931
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1338.8605944840237,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.1493807501871223,
"tokens_per_second": 109.82173992256857
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1307.2402118169994,
"elapsed_time": 1199.1163451180328,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.15299406963775225,
"tokens_per_second": 112.4781801162827
"requests_per_second": 0.16678948695367285,
"tokens_per_second": 122.62029501860121
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1886.751298176,
"elapsed_time": 1959.4152568069985,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.10600231211890418,
"tokens_per_second": 77.93077982357597
"requests_per_second": 0.10207126810164463,
"tokens_per_second": 75.0407548829671
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 243.98866786801955,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.819710201082723,
"tokens_per_second": 602.6345456319963
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 282.0738571010297,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.7090341588386423,
"tokens_per_second": 521.2677328949931
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 247.62527259899798,
"elapsed_time": 242.14750060701044,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.8076720033495051,
"tokens_per_second": 593.7843034224891
"requests_per_second": 0.825942863331829,
"tokens_per_second": 607.216674264294
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 341.2666312900001,
"elapsed_time": 357.72086531698005,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.5860520240258851,
"tokens_per_second": 430.8537270233502
"requests_per_second": 0.5590951476167821,
"tokens_per_second": 411.03557062490586
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 486.3392907420057,
"num_requests": 200,
"total_num_tokens": 146278,
"requests_per_second": 0.41123553824915293,
"tokens_per_second": 300.773560320048
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 455.7690629530116,
"num_requests": 200,
"total_num_tokens": 146278,
"requests_per_second": 0.4388187269758136,
"tokens_per_second": 320.9476287228403
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 422.7612150579989,
"elapsed_time": 398.827027003048,
"num_requests": 200,
"total_num_tokens": 146278,
"requests_per_second": 0.47308029420949094,
"tokens_per_second": 346.0061963818796
"requests_per_second": 0.5014705284716613,
"tokens_per_second": 366.77052981888835
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 594.5536415039987,
"elapsed_time": 610.5734472059994,
"num_requests": 200,
"total_num_tokens": 146278,
"requests_per_second": 0.33638680522429343,
"tokens_per_second": 246.02994547299596
"requests_per_second": 0.32756091984544267,
"tokens_per_second": 239.57478116575834
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 497.111974740983,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.40232384284085837,
"tokens_per_second": 295.31575874126105
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 471.133652363962,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4245079904534079,
"tokens_per_second": 311.59947769256274
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 395.26841144900027,
"elapsed_time": 399.3928133630543,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.5059852854591319,
"tokens_per_second": 371.4058491591393
"requests_per_second": 0.5007601371589951,
"tokens_per_second": 367.5704596781314
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 769.1666062429999,
"elapsed_time": 813.6141017450136,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.260021688898978,
"tokens_per_second": 190.86242019407229
"requests_per_second": 0.24581678165489804,
"tokens_per_second": 180.43566315423652
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 456.45958357997006,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.4381548929949473,
"tokens_per_second": 321.6166453306162
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 490.5911466999678,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.40767144157681784,
"tokens_per_second": 299.24102990342374
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 464.71097393700256,
"elapsed_time": 440.66104900900973,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.43037503139986644,
"tokens_per_second": 315.906032423287
"requests_per_second": 0.4538635771184551,
"tokens_per_second": 333.147212194374
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 638.3282979609994,
"elapsed_time": 683.9224744850071,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.31331839844615444,
"tokens_per_second": 229.9835374194385
"requests_per_second": 0.29243080533447857,
"tokens_per_second": 214.65152188564062
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 517.5916094129789,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.38640502736670695,
"tokens_per_second": 283.6309502128471
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 548.4156070559984,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3646869225214776,
"tokens_per_second": 267.6893183038276
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 502.6907218439992,
"elapsed_time": 497.59323585999664,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.3978589444944367,
"tokens_per_second": 292.0384117325289
"requests_per_second": 0.4019347241614679,
"tokens_per_second": 295.0301359026215
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 721.7994779089986,
"elapsed_time": 780.1687226030044,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.2770852655357769,
"tokens_per_second": 203.38751203489863
"requests_per_second": 0.2563548040386794,
"tokens_per_second": 188.17083503449163
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 802.5698999410379,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.24919947784572202,
"tokens_per_second": 182.9186467257061
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 839.8958681730437,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.23812475757863116,
"tokens_per_second": 174.78952518165474
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 886.8526372269989,
"elapsed_time": 757.2171181479935,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.2255166096425645,
"tokens_per_second": 165.5348293928834
"requests_per_second": 0.2641250378612165,
"tokens_per_second": 193.87438091607942
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1084.3601952080007,
"elapsed_time": 1144.2253085140255,
"num_requests": 200,
"total_num_tokens": 146805,
"requests_per_second": 0.18444055848217136,
"tokens_per_second": 135.3839809398758
"requests_per_second": 0.1747907501362075,
"tokens_per_second": 128.30078036872973
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 373.92354663898004,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.5348686965496139,
"tokens_per_second": 398.09474781142933
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 434.26602390100015,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.46054719686197254,
"tokens_per_second": 342.7783704164132
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 369.2837602610016,
"elapsed_time": 374.03978066996206,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.5415889392445647,
"tokens_per_second": 403.09652364564084
"requests_per_second": 0.5347024844303181,
"tokens_per_second": 397.9710386242193
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 509.0738683320001,
"elapsed_time": 555.4390292470343,
"num_requests": 200,
"total_num_tokens": 148857,
"requests_per_second": 0.39287029337276264,
"tokens_per_second": 292.4074663029466
"requests_per_second": 0.36007552488906747,
"tokens_per_second": 267.99881204205957
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 213.75922767800512,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.9356321229849724,
"tokens_per_second": 682.4360360233941
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 264.80451649799943,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.7552741269105618,
"tokens_per_second": 550.8856190566602
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 224.76228898300178,
"elapsed_time": 224.3753512299736,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.8898289873490544,
"tokens_per_second": 649.02791593759
"requests_per_second": 0.8913635071929533,
"tokens_per_second": 650.1471716939323
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 322.171811016,
"elapsed_time": 336.45260514499387,
"num_requests": 200,
"total_num_tokens": 145877,
"requests_per_second": 0.620786776376495,
"tokens_per_second": 452.7925628873698
"requests_per_second": 0.5944373648520577,
"tokens_per_second": 433.5736973626181
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1484.8385301349917,
"num_requests": 200,
"total_num_tokens": 146523,
"requests_per_second": 0.1346947805710681,
"tokens_per_second": 98.67941666807306
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1493.5249834020506,
"num_requests": 200,
"total_num_tokens": 146523,
"requests_per_second": 0.13391138562973798,
"tokens_per_second": 98.10548978313048
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1707.9124416089617,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.11710202181769186,
"tokens_per_second": 86.0910643999307
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 1320.7432732739835,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.1514298834959962,
"tokens_per_second": 111.32822174858649
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1315.035868578001,
"elapsed_time": 1242.463667072996,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.15208710635115047,
"tokens_per_second": 111.8113988472388
"requests_per_second": 0.16097050183460196,
"tokens_per_second": 118.34229353876268
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 1923.4690410719995,
"elapsed_time": 1966.935257990961,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.10397879858182421,
"tokens_per_second": 76.44313314138553
"requests_per_second": 0.10168102848706935,
"tokens_per_second": 74.75385852312364
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 299.5004001749912,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.6677787404729495,
"tokens_per_second": 490.93757442090305
}
@@ -0,0 +1,7 @@
{
"elapsed_time": 282.87595599895576,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.707023682142636,
"tokens_per_second": 519.7896706376232
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 246.0529060009976,
"elapsed_time": 244.54776988498634,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.8128333180474167,
"tokens_per_second": 597.5787987620997
"requests_per_second": 0.8178361229548825,
"tokens_per_second": 601.2567608739705
}
@@ -1,7 +1,7 @@
{
"elapsed_time": 333.59849170300004,
"elapsed_time": 362.9645123449736,
"num_requests": 200,
"total_num_tokens": 147036,
"requests_per_second": 0.5995230943012126,
"tokens_per_second": 440.75738846836555
"requests_per_second": 0.5510180560294371,
"tokens_per_second": 405.0974544317216
}
+88 -10
Ver fichero
@@ -15,18 +15,21 @@ except ImportError:
print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
sys.exit(1)
# Import path handling for scripts/models.py
try:
import sys, os
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
import models
import cluster_manager # Import shared cluster logic
except ImportError:
print("Error: Could not import scripts/models.py.")
print("Error: Could not import scripts/models.py or cluster_manager.py.")
sys.exit(1)
# Import Utils from run_vllm_bench (keep utils shared)
try:
from run_vllm_bench import get_gpu_count, kill_vllm
from run_vllm_bench import kill_vllm
# We do NOT import get_gpu_count because we are overriding it for cluster awareness
except ImportError:
print("Error: Could not import run_vllm_bench.py.")
sys.exit(1)
@@ -65,7 +68,30 @@ CONCURRENCY_STEPS = [1, 4, 8, 16]
def log(msg): print(f"[MAX-CTX] {msg}", flush=True)
def get_gpu_count():
    """
    Return the number of GPUs usable for tensor parallelism.

    If a Ray cluster is active (per ``cluster_manager.check_ray_status()``),
    the count is hard-coded to 2: this script targets a 2-node x 1-GPU setup
    and deliberately avoids building a Ray client just to count resources.

    Otherwise the local GPUs are counted from ``rocm-smi --showid`` output.
    Returns 1 when ``rocm-smi`` is missing, cannot be run, or exits non-zero.
    """
    import re  # local import: only this function needs it

    if cluster_manager.check_ray_status():
        # Ideally we'd query Ray for total resources, but for this specific
        # 2-node setup we assume 2 nodes x 1 GPU = 2 GPUs when the cluster is up.
        log("Ray Cluster Detected: Assuming 2 GPUs available.")
        return 2

    # Local fallback
    try:
        res = subprocess.run(
            ["rocm-smi", "--showid"],  # fixed argv, no shell needed
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (OSError, subprocess.SubprocessError):
        # rocm-smi not installed / not executable: assume a single GPU.
        return 1

    if res.returncode == 0:
        # rocm-smi prints several "GPU[n] : ..." lines per device, so counting
        # the raw substring "GPU" over-reports. Count distinct indices instead.
        gpu_ids = set(re.findall(r"GPU\[(\d+)\]", res.stdout))
        if gpu_ids:
            return len(gpu_ids)
        # Unrecognised output format: keep the previous substring heuristic.
        return res.stdout.count("GPU")

    return 1
def get_hf_context_limit(model_name, trust_remote=False):
# ... (Keep existing implementation)
try:
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote)
@@ -95,6 +121,7 @@ def get_hf_context_limit(model_name, trust_remote=False):
def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
"""
Constructs the vLLM serve command.
Using Ray Backend if tp_size > 1 (Cluster Mode).
"""
config = MODEL_TABLE[model]
@@ -105,16 +132,47 @@ def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
"--tensor-parallel-size", str(tp_size),
"--max-num-seqs", str(max_seqs),
"--dtype", "auto",
# "--disable-log-stats" # Cleaner output, but user managed without it
# "--disable-log-stats"
]
# Env Setup
env = os.environ.copy()
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
env.update(config.get("env", {}))
# CLUSTER / RAY LOGIC
# Only if we need more than 1 GPU do we engage the cluster machinery
if tp_size > 1:
log(f"TP={tp_size} > 1: Using Ray Distributed Backend")
cmd.extend(["--distributed-executor-backend", "ray"])
# Inject Cluster Env Vars (similar to start_vllm_cluster.py)
# We need to know Head IP and RDMA Interface
rdma_iface = cluster_manager.get_net_iface()
head_ip = cluster_manager.get_local_ip(rdma_iface) # Assuming we run this ON HEAD
# IMPORTANT: vLLM needs to bind to the Head IP for Ray workers to reach it?
# Or at least we should be explicit.
cmd.extend(["--host", head_ip])
# Update our own process env so verify_context knows where to look?
# No, verify_context runs in THIS process. We need to export it or pass it.
# Simplest is to set it in os.environ for OUR process too, but that might be messy.
# Better: We rely on standard PORT.
env["RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"] = "1"
env["VLLM_HOST_IP"] = head_ip
env["NCCL_SOCKET_IFNAME"] = rdma_iface
env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
else:
# Default Localhost bind for single node safety
cmd.extend(["--host", "127.0.0.1"])
if config.get("trust_remote"): cmd.append("--trust-remote-code")
if config.get("enforce_eager"): cmd.append("--enforce-eager")
# Add model specific env vars
env = os.environ.copy()
env.update(config.get("env", {}))
return cmd, env
def is_port_free(port):
@@ -300,7 +358,14 @@ def verify_context(model, context_len):
"""
Sends a request to the server with length ~context_len to verify stability.
"""
url = f"http://{HOST}:{PORT}/v1/completions"
# Use dynamic host if set (by cluster logic), else localhost
# But wait, the env var is set for the SERVER process, not necessarily us?
# Actually, we (the client script) need to know where to send requests.
# If we are on Head, localhost is fine for Head-based server.
# But if we use Ray, vLLM head usually binds to HOST IP.
target_host = os.getenv("VLLM_HOST_IP", "127.0.0.1")
url = f"http://{target_host}:{PORT}/v1/completions"
# We use a simple "A " * N prompt.
# Llama 3 tokenizer: "A" is usually 1 token.
@@ -529,9 +594,22 @@ def main():
continue
config = MODEL_TABLE[model]
valid_tps = [t for t in config["valid_tp"] if t <= gpu_count]
for tp in valid_tps:
# KEY CHANGES:
# We only want to test the MINIMUM required TP.
# If model supports 1 and 2, we ONLY test 1 (local is faster/easier).
# We only test 2 if model VALID_TP *starts* with 2 (or higher).
valid_tps = config.get("valid_tp", [1])
min_tp = min(valid_tps)
if min_tp > gpu_count:
log(f"Skipping {model}: Requires TP={min_tp} but only {gpu_count} GPUs available.")
continue
tps_to_test = [min_tp]
for tp in tps_to_test:
# Track successful seqs for this TP to skip lower utils
# effectively: {seqs_count: max_working_util}
# Since we iterate high-util -> low-util, if we succeeded already for this 'seqs', we skip.
+140 -30
Ver fichero
@@ -2,6 +2,12 @@
import subprocess, time, json, sys, os, requests, argparse
from pathlib import Path
try:
import bench_utils
except ImportError:
sys.path.append(str(Path(__file__).parent))
import bench_utils
# =========================
# ⚙️ GLOBAL SETTINGS
@@ -89,38 +95,43 @@ def get_dataset():
def get_model_args(model, tp_size):
def get_model_args(model, tp_size, overrides=None):
config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
overrides = overrides or {}
# Allow per-model GPU utilization override
util = config.get("gpu_util", GPU_UTIL)
util = overrides.get("gpu_util", config.get("gpu_util", GPU_UTIL))
max_seq_override = overrides.get("max_num_seqs", config.get("max_num_seqs", "32"))
cmd = [
"--model", model,
"--gpu-memory-utilization", util,
"--gpu-memory-utilization", str(util),
"--dtype", "auto",
"--tensor-parallel-size", str(tp_size),
"--max-num-seqs", config["max_num_seqs"]
"--max-num-seqs", str(max_seq_override)
]
# Optional: if a model really needs a hard limit, we can still support "ctx" in config,
# but by default we rely on auto.
if "ctx" in config:
cmd.extend(["--max-model-len", config["ctx"]])
if "ctx" in overrides or "ctx" in config:
cmd.extend(["--max-model-len", str(overrides.get("ctx", config.get("ctx")))])
if config.get("trust_remote"): cmd.append("--trust-remote-code")
if config.get("enforce_eager"): cmd.append("--enforce-eager")
return cmd
def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None):
def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None, overrides=None):
if tp_size not in MODEL_TABLE[model]["valid_tp"]: return
overrides = overrides or {}
model_safe = model.replace("/", "_")
output_dir_path = Path(output_dir)
output_dir_path.mkdir(parents=True, exist_ok=True)
output_file = output_dir_path / f"{model_safe}_tp{tp_size}_throughput.json"
tag = overrides.get("tag", "").strip()
tag_suffix = f"_{tag}" if tag else ""
output_file = output_dir_path / f"{model_safe}_tp{tp_size}{tag_suffix}_throughput.json"
if output_file.exists():
log(f"SKIP {model} (TP={tp_size} | {backend_name})")
@@ -130,13 +141,13 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"]
# Retrieve Model-Specific Batch Tokens
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
batch_tokens = str(overrides.get("max_tokens", MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)))
log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...")
kill_vllm()
nuke_vllm_cache()
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size)
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size, overrides)
cmd.extend([
"--num-prompts", str(OFF_NUM_PROMPTS),
"--max-num-batched-tokens", batch_tokens,
@@ -152,6 +163,7 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
# ENV Setup: Global + Model Specific
env = os.environ.copy()
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
# Inject model specific env vars (e.g. for AWQ)
model_env = MODEL_TABLE[model].get("env", {})
@@ -168,35 +180,64 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
def print_summary(tps):
print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 75)
print(f"\n{'MODEL':<40} | {'TP':<2} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 92)
for m in MODELS_TO_RUN:
msafe = m.replace("/", "_")
name_cell = m.split('/')[-1]
for tp in tps:
if tp not in MODEL_TABLE[m]["valid_tp"]: continue
# Default
try:
p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
except: val1 = "N/A"
prefix = f"{msafe}_tp{tp}"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{msafe}_tp{tp}_throughput.json"
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
except: val2 = "N/A"
tags = set()
for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
name_part = p.name[len(prefix):-len("_throughput.json")]
tag = name_part.lstrip("_")
tags.add(tag)
for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
name_part = p.name[len(prefix):-len("_throughput.json")]
tag = name_part.lstrip("_")
tags.add(tag)
if not tags:
tags.add("") # Default empty tag if no files found
for tag in sorted(list(tags)):
tag_suffix = f"_{tag}" if tag else ""
# Default
try:
p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
if p1.exists():
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
else:
val1 = "N/A"
except: val1 = "N/A"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
if p2.exists():
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
else:
val2 = "N/A"
except: val2 = "N/A"
name_cell = m.split('/')[-1]
print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}")
print("-" * 75)
display_tag = tag if tag else "(Default)"
print(f"{name_cell:<40} | {tp:<2} | {display_tag:<15} | {val1:<8} | {val2:<8}")
print("-" * 92)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--tp", type=int, nargs="+", default=[1])
parser.add_argument("--tui", action="store_true", help="Launch interactive configuration UI")
args = parser.parse_args()
gpu_count = get_gpu_count()
@@ -207,17 +248,86 @@ if __name__ == "__main__":
log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. Nothing to run.")
sys.exit(0)
selected_models = MODELS_TO_RUN
if args.tui:
# TUI Model Selection
checklist_args = [
"--clear", "--backtitle", "AMD vLLM Benchmark Launcher",
"--title", "Model Selection",
"--checklist", "Select models to benchmark:", "20", "65", "10"
]
for m in MODELS_TO_RUN:
m_name = m.split("/")[-1]
# All selected "on" by default
checklist_args.extend([m, m_name, "on"])
choice = bench_utils.run_dialog(checklist_args)
if choice is None:
subprocess.run(["clear"])
print("Cancelled by user.")
sys.exit(0)
# Parse space-separated quoted output from dialog checklist
import shlex
selected_models = [m for m in shlex.split(choice)]
if not selected_models:
subprocess.run(["clear"])
print("No models selected. Exiting.")
sys.exit(0)
kill_vllm()
for tp in valid_tp_args:
for m in MODELS_TO_RUN:
for m in selected_models:
overrides = {}
if args.tui:
config = MODEL_TABLE.get(m, {})
default_seqs = config.get("max_num_seqs", "32")
default_tokens = config.get("max_tokens", DEFAULT_BATCH_TOKENS)
default_util = config.get("gpu_util", GPU_UTIL)
default_ctx = config.get("ctx", "auto")
form_args = [
"--clear", "--backtitle", f"AMD vLLM Benchmark Configuration (TP: {tp})",
"--title", f"Tune Parameters: {m.split('/')[-1]}",
"--form", "Edit the options below. Leave tag empty for no suffix.",
"15", "70", "5",
"Max Concurrent Seqs:", "1", "1", str(default_seqs), "1", "25", "15", "0",
"Max Batched Tokens:", "2", "1", str(default_tokens), "2", "25", "15", "0",
"GPU Utilization (0-1):", "3", "1", str(default_util), "3", "25", "15", "0",
"Max Context Length:", "4", "1", str(default_ctx), "4", "25", "15", "0",
"Filename Tag (Optional):", "5", "1", "", "5", "25", "15", "0"
]
form_res = bench_utils.run_dialog(form_args)
if form_res is None:
subprocess.run(["clear"])
print(f"Skipping {m} (TP={tp}) due to user cancellation.")
continue
lines = form_res.splitlines()
if len(lines) >= 5:
overrides["max_num_seqs"] = lines[0].strip()
overrides["max_tokens"] = lines[1].strip()
overrides["gpu_util"] = lines[2].strip()
ctx_val = lines[3].strip()
if ctx_val and ctx_val.lower() != "auto":
overrides["ctx"] = ctx_val
overrides["tag"] = lines[4].strip()
# 1. Default (Triton)
run_throughput(m, tp, "Default", RESULTS_DIR)
run_throughput(m, tp, "Default", RESULTS_DIR, overrides=overrides)
# 2. ROCm Attention
# We force this via CLI argument --attention-backend ROCM_ATTN below
# No specific env vars needed if forcing backend.
rocm_env = {}
print(f"[DEBUG] Forcing ROCm Env: {rocm_env} + CLI: --attention-backend ROCM_ATTN")
run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm", rocm_env)
run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm", rocm_env, overrides=overrides)
print_summary(valid_tp_args)
+218 -46
Ver fichero
@@ -2,6 +2,12 @@
import subprocess, time, json, sys, os, requests, argparse, re
from pathlib import Path
try:
import bench_utils
except ImportError:
sys.path.append(str(Path(__file__).parent))
import bench_utils
# Import models immediately to access globals
try:
import models
@@ -23,6 +29,8 @@ except ImportError:
# User requested specifically to test with TP=2 on the cluster.
CLUSTER_TP = 2
GPU_UTIL = "0.90"
FORCE_ETH = False
FORCE_DEBUG_NCCL = False
# THROUGHPUT CONFIG (Imported from models.py)
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
@@ -66,6 +74,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
def restart_cluster():
log("Restarting Ray Cluster (Clean State)...")
# Push config to env so cluster_manager picks it up for daemon injection
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
if FORCE_DEBUG_NCCL:
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
else:
os.environ.pop("NCCL_DEBUG", None)
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
# 1. Stop Cluster (Best Effort)
cluster_manager.stop_cluster()
@@ -89,7 +106,8 @@ def restart_cluster():
log("Cluster Ready.")
def get_net_iface():
return cluster_manager.get_net_iface()
prefix = ".".join(HEAD_IP.split('.')[:3])
return cluster_manager.get_net_iface(prefix)
def get_local_ip(iface):
return cluster_manager.get_local_ip(iface)
@@ -122,6 +140,7 @@ def get_cluster_env():
host_ip = get_local_ip(rdma_iface)
env = os.environ.copy()
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
# Critical Cluster Envs (Match start_vllm_cluster.py)
env["RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"] = "1"
@@ -130,31 +149,37 @@ def get_cluster_env():
env["GLOO_SOCKET_IFNAME"] = rdma_iface
# RCCL specific
env["NCCL_IB_GID_INDEX"] = "1"
env["NCCL_IB_DISABLE"] = "0"
env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
env["NCCL_NET_GDR_LEVEL"] = "0"
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
if FORCE_DEBUG_NCCL:
env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
return env
def get_model_args(model):
def get_model_args(model, overrides=None):
config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
util = config.get("gpu_util", GPU_UTIL)
overrides = overrides or {}
util = overrides.get("gpu_util", config.get("gpu_util", GPU_UTIL))
max_seq_override = overrides.get("max_num_seqs", config.get("max_num_seqs", "32"))
cmd = [
"--model", model,
"--gpu-memory-utilization", util,
"--gpu-memory-utilization", str(util),
"--dtype", "auto",
"--tensor-parallel-size", str(CLUSTER_TP),
"--max-num-seqs", config["max_num_seqs"],
"--max-num-seqs", str(max_seq_override),
"--distributed-executor-backend", "ray"
]
# Optional ctx
if "ctx" in config:
cmd.extend(["--max-model-len", config["ctx"]])
if "ctx" in overrides or "ctx" in config:
cmd.extend(["--max-model-len", str(overrides.get("ctx", config.get("ctx")))])
if config.get("trust_remote"): cmd.append("--trust-remote-code")
@@ -163,16 +188,20 @@ def get_model_args(model):
return cmd
def get_benchmark_output_file(model, output_dir):
def get_benchmark_output_file(model, output_dir, tag=""):
model_safe = model.replace("/", "_")
output_dir_path = Path(output_dir)
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
eth_suffix = "_eth" if FORCE_ETH else ""
tag_suffix = f"_{tag}" if tag else ""
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}{tag_suffix}_throughput.json"
def run_bench_set(model, backend_name, output_dir, extra_env=None):
def run_bench_set(model, backend_name, output_dir, extra_env=None, overrides=None):
output_dir_path = Path(output_dir)
output_dir_path.mkdir(parents=True, exist_ok=True)
overrides = overrides or {}
output_file = get_benchmark_output_file(model, output_dir)
tag = overrides.get("tag", "").strip()
output_file = get_benchmark_output_file(model, output_dir, tag)
if output_file.exists():
log(f"SKIP {model} [{backend_name}] (Result exists)")
@@ -181,13 +210,13 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None):
dataset_path = get_dataset()
dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"]
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
batch_tokens = str(overrides.get("max_tokens", MODEL_TABLE.get(model, {}).get("max_tokens", DEFAULT_BATCH_TOKENS)))
log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...")
nuke_vllm_cache()
cmd = ["vllm", "bench", "throughput"] + get_model_args(model)
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, overrides)
cmd.extend([
"--num-prompts", str(OFF_NUM_PROMPTS),
"--max-num-batched-tokens", batch_tokens,
@@ -218,20 +247,24 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None):
except Exception as e:
log(f"ERROR: System error: {e}")
def run_cluster_throughput(model):
def run_cluster_throughput(model, overrides=None):
overrides = overrides or {}
tag = overrides.get("tag", "").strip()
# 1. Default Run (Triton)
if get_benchmark_output_file(model, RESULTS_DIR).exists():
if get_benchmark_output_file(model, RESULTS_DIR, tag).exists():
log(f"SKIP {model} [Default] (Result exists)")
else:
restart_cluster()
run_bench_set(
model,
"Default",
RESULTS_DIR
RESULTS_DIR,
overrides=overrides
)
# 2. ROCm Attention Run
if get_benchmark_output_file(model, "benchmark_results_rocm").exists():
if get_benchmark_output_file(model, "benchmark_results_rocm", tag).exists():
log(f"SKIP {model} [ROCm-Attn] (Result exists)")
else:
restart_cluster()
@@ -239,47 +272,186 @@ def run_cluster_throughput(model):
model,
"ROCm-Attn",
"benchmark_results_rocm",
extra_env={}
extra_env={},
overrides=overrides
)
def print_summary():
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 75)
eth_suffix = "_eth" if FORCE_ETH else ""
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
print(f"\n{f'MODEL (TP={CLUSTER_TP}){title_suffix}':<50} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
print("-" * 92)
for m in MODELS_TO_RUN:
msafe = m.replace("/", "_")
# Default
try:
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
except: val1 = "N/A"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
except: val2 = "N/A"
name_cell = m.split('/')[-1]
print(f"{name_cell:<50} | {val1:<8} | {val2:<8}")
print("-" * 75)
# Find all tags used for this model by looking at the files in RESULTS_DIR
prefix = f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}"
# Gather all unique tags from both directories
tags = set()
for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
# Extract tag: {prefix}_{tag}_throughput.json or {prefix}_throughput.json
name_part = p.name[len(prefix):-len("_throughput.json")]
tag = name_part.lstrip("_")
tags.add(tag)
for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
name_part = p.name[len(prefix):-len("_throughput.json")]
tag = name_part.lstrip("_")
tags.add(tag)
if not tags:
tags.add("") # Default empty tag if no files found
# Sort so empty tag (Default) comes first
for tag in sorted(list(tags)):
tag_suffix = f"_{tag}" if tag else ""
# Default (Triton)
try:
p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
if p1.exists():
d1 = json.loads(p1.read_text())
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
else:
val1 = "N/A"
except: val1 = "N/A"
# ROCm
try:
p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
if p2.exists():
d2 = json.loads(p2.read_text())
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
else:
val2 = "N/A"
except: val2 = "N/A"
display_tag = tag if tag else "(Default)"
print(f"{name_cell:<50} | {display_tag:<15} | {val1:<8} | {val2:<8}")
print("-" * 92)
if __name__ == "__main__":
# if not check_ray_status():
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
# sys.exit(1)
# We now handle this by restarting the cluster ourselves.
pass
parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
parser.add_argument("--tui", action="store_true", help="Launch interactive configuration UI")
args = parser.parse_args()
FORCE_ETH = args.eth_only
FORCE_DEBUG_NCCL = args.debug_nccl
selected_models = MODELS_TO_RUN
if args.tui:
# 1. Cluster IPs Configuration
form_args = [
"--clear", "--backtitle", "AMD VLLM Cluster Configuration",
"--title", "Cluster Network Details",
"--form", "Verify Head and Worker IPs for this run:",
"10", "60", "2",
"Head Node IP:", "1", "1", HEAD_IP, "1", "20", "20", "0",
"Worker Node IP:", "2", "1", WORKER_IP, "2", "20", "20", "0"
]
res = bench_utils.run_dialog(form_args)
if res is None:
subprocess.run(["clear"])
print("Cancelled by user.")
sys.exit(0)
lines = res.splitlines()
if len(lines) >= 2:
HEAD_IP = lines[0].strip()
WORKER_IP = lines[1].strip()
os.environ["VLLM_HEAD_IP"] = HEAD_IP
os.environ["VLLM_WORKER_IP"] = WORKER_IP
# 2. Network Options (ETH / Debug)
eth_status = "on" if FORCE_ETH else "off"
debug_status = "on" if FORCE_DEBUG_NCCL else "off"
check_args = [
"--title", "Network Overrides",
"--checklist", "Select custom backend flags:", "10", "60", "2",
"ETH_ONLY", "Force Ethernet (Disable RDMA/RoCE)", eth_status,
"DEBUG_NCCL", "Enable NCCL debug logs", debug_status
]
flags_res = bench_utils.run_dialog(check_args)
if flags_res is not None:
FORCE_ETH = "ETH_ONLY" in flags_res
FORCE_DEBUG_NCCL = "DEBUG_NCCL" in flags_res
# 3. Model Selection
checklist_args = [
"--title", "Model Selection",
"--checklist", "Select models to benchmark:", "20", "65", "10"
]
for m in MODELS_TO_RUN:
m_name = m.split("/")[-1]
checklist_args.extend([m, m_name, "on"])
choice = bench_utils.run_dialog(checklist_args)
if choice is None:
subprocess.run(["clear"])
print("Cancelled by user.")
sys.exit(0)
import shlex
selected_models = [m for m in shlex.split(choice)]
if not selected_models:
subprocess.run(["clear"])
print("No models selected. Exiting.")
sys.exit(0)
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
if FORCE_ETH:
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
if FORCE_DEBUG_NCCL:
log("Note: NCCL Debug mode enabled (Transport Logging).")
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
for m in MODELS_TO_RUN:
run_cluster_throughput(m)
for m in selected_models:
overrides = {}
if args.tui:
config = MODEL_TABLE.get(m, {})
default_seqs = config.get("max_num_seqs", "32")
default_tokens = config.get("max_tokens", DEFAULT_BATCH_TOKENS)
default_util = config.get("gpu_util", GPU_UTIL)
default_ctx = config.get("ctx", "auto")
form_args = [
"--clear", "--backtitle", f"AMD VLLM Cluster Benchmark Configuration (TP: {CLUSTER_TP})",
"--title", f"Tune Parameters: {m.split('/')[-1]}",
"--form", "Edit cluster model options. Leave tag empty for no suffix.",
"15", "70", "5",
"Max Concurrent Seqs:", "1", "1", str(default_seqs), "1", "25", "15", "0",
"Max Batched Tokens:", "2", "1", str(default_tokens), "2", "25", "15", "0",
"GPU Utilization (0-1):", "3", "1", str(default_util), "3", "25", "15", "0",
"Max Context Length:", "4", "1", str(default_ctx), "4", "25", "15", "0",
"Filename Tag (Optional):", "5", "1", "", "5", "25", "15", "0"
]
form_res = bench_utils.run_dialog(form_args)
if form_res is None:
subprocess.run(["clear"])
print(f"Skipping {m} due to user cancellation.")
continue
lines = form_res.splitlines()
if len(lines) >= 5:
overrides["max_num_seqs"] = lines[0].strip()
overrides["max_tokens"] = lines[1].strip()
overrides["gpu_util"] = lines[2].strip()
ctx_val = lines[3].strip()
if ctx_val and ctx_val.lower() != "auto":
overrides["ctx"] = ctx_val
overrides["tag"] = lines[4].strip()
run_cluster_throughput(m, overrides=overrides)
print_summary()
+41 -11
Ver fichero
@@ -4,7 +4,7 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AMD Strix Halo (gfx1151) vLLM Benchmarks</title>
<title>AMD Strix Halo (gfx1150) vLLM Benchmarks</title>
<style>
:root {
--bg-body: #f9fafb;
@@ -445,7 +445,7 @@
<div class="container">
<header>
<h1>AMD Strix Halo (gfx1151) vLLM Benchmarks</h1>
<h1>AMD Strix Halo (gfx1150) vLLM Benchmarks</h1>
<p style="margin: 4px 0 0 0; font-size: 0.9rem;">
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/" target="_blank"
style="color: var(--primary); text-decoration: none;">View on GitHub &rarr;</a>
@@ -469,6 +469,14 @@
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTP2" checked> TP2
</label>
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTP2Eth"> TP2 (Eth)
</label>
<label
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
<input type="checkbox" id="toggleTP2Usb"> TP2 (Thunderbolt)
</label>
</div>
<!-- Attention Group -->
@@ -544,6 +552,8 @@
activeTab: "Throughput",
showTP1: true,
showTP2: true,
showTP2Eth: false,
showTP2Usb: false,
showTriton: true,
showRocm: false
};
@@ -615,6 +625,8 @@
// Toggles
$('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); });
$('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); });
$('toggleTP2Eth').addEventListener('change', e => { state.showTP2Eth = e.target.checked; render(); });
$('toggleTP2Usb').addEventListener('change', e => { state.showTP2Usb = e.target.checked; render(); });
$('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); });
$('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); });
}
@@ -636,13 +648,23 @@
params: run.params_b || run.name_params_b,
results: {
1: { triton: null, rocm: null },
2: { triton: null, rocm: null }
2: { triton: null, rocm: null },
"2_eth": { triton: null, rocm: null },
"2_usb": { triton: null, rocm: null }
}
};
}
const m = testGroups[testName].models[modelName];
const tp = run.tp || 1;
let tp = run.tp || 1;
// Two-node runs are split into transport buckets so each link type gets its
// own column: a "usb" tag always selects the Thunderbolt bucket, otherwise an
// Ethernet run selects the Ethernet bucket, and anything else stays in the
// plain numeric 2 bucket.
if (tp === 2) {
    const taggedUsb = run.tag === "usb";
    const overEthernet = run.network === "Ethernet";
    if (taggedUsb) {
        tp = "2_usb";
    } else if (overEthernet) {
        tp = "2_eth";
    }
}
if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null };
@@ -749,8 +771,16 @@
if (state.showRocm) cols.push({ id: "tp1_rocm", label: "TP1 ROCm" });
}
if (state.showTP2) {
if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 Triton" });
if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 ROCm" });
if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 RoCE Triton" });
if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 RoCE ROCm" });
}
if (state.showTP2Eth) {
if (state.showTriton) cols.push({ id: "tp2_eth_triton", label: "TP2 Eth Triton" });
if (state.showRocm) cols.push({ id: "tp2_eth_rocm", label: "TP2 Eth ROCm" });
}
if (state.showTP2Usb) {
if (state.showTriton) cols.push({ id: "tp2_usb_triton", label: "TP2 TB Triton" });
if (state.showRocm) cols.push({ id: "tp2_usb_rocm", label: "TP2 TB ROCm" });
}
// Thead
@@ -790,11 +820,7 @@
// Data Cells
cols.forEach(c => {
let val = null;
if (c.id === "tp1_triton") val = m.results[1]?.triton;
if (c.id === "tp1_rocm") val = m.results[1]?.rocm;
if (c.id === "tp2_triton") val = m.results[2]?.triton;
if (c.id === "tp2_rocm") val = m.results[2]?.rocm;
let val = getVal(m, c.id);
const bg = c.id.startsWith("tp2") ? 'style="background:#fbfdff;"' : "";
rowHtml += `<td class="col-data" ${bg}>${formatVal(val, unit)}</td>`;
@@ -823,6 +849,10 @@
if (colId === "tp1_rocm") return m.results[1]?.rocm;
if (colId === "tp2_triton") return m.results[2]?.triton;
if (colId === "tp2_rocm") return m.results[2]?.rocm;
if (colId === "tp2_eth_triton") return m.results["2_eth"]?.triton;
if (colId === "tp2_eth_rocm") return m.results["2_eth"]?.rocm;
if (colId === "tp2_usb_triton") return m.results["2_usb"]?.triton;
if (colId === "tp2_usb_rocm") return m.results["2_usb"]?.rocm;
return null;
}
+26
Ver fichero
@@ -66,6 +66,30 @@ def parse_logs():
if not tp_match: continue
tp = int(tp_match.group(1))
# Network
network = "RoCE"
network_prefix = ""
if "_eth" in rest:
network = "Ethernet"
network_prefix = "_eth"
# Tag Extraction
tag = ""
test_type_str = ""
if "throughput" in fname:
test_type_str = "_throughput.json"
elif "latency" in fname:
qps_match = re.search(r"(_qps[\d\.]+)_latency\.json$", rest)
if qps_match:
test_type_str = qps_match.group(0)
else:
test_type_str = "_latency.json"
raw_prefix = f"{tp}{network_prefix}"
if rest.endswith(test_type_str):
tag_part = rest[len(raw_prefix):-len(test_type_str)]
tag = tag_part.lstrip("_")
# Model Name
if "_" in model_part:
model_display = model_part.replace("_", "/", 1)
@@ -87,6 +111,8 @@ def parse_logs():
"params_b": params_b,
"name_params_b": params_b,
"backend": backend_name, # "Triton" or "ROCm"
"network": network,
"tag": tag,
"error": False
}
+863 -171
Ver fichero
La diferencia del archivo ha sido suprimido porque es demasiado grande Cargar Diff
Archivo binario no mostrado.

Después

Anchura:  |  Altura:  |  Tamaño: 584 KiB

+134 -43
Ver fichero
@@ -3,62 +3,140 @@
# -------- dynamic config --------
HOST_ROCE="192.168.100.2"
HOST_ETH="192.168.1.127"
HOST_TB="192.168.2.2"
# Automatically detect local and remote RDMA device names
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
# Parse args
# Benchmark selection flags. Each one starts disabled and is switched on by
# its command-line option; if no option selects anything, every benchmark is
# enabled after parsing (see the fallback below).
RUN_ETH=false
RUN_ROCE=false
RUN_TB=false
RUN_RDMA=false
# Records whether at least one benchmark was explicitly requested.
ANY_SELECTED=false

while getopts "ertih" opt; do
    case ${opt} in
        e ) RUN_ETH=true; ANY_SELECTED=true ;;
        r ) RUN_ROCE=true; ANY_SELECTED=true ;;
        t ) RUN_TB=true; ANY_SELECTED=true ;;
        i ) RUN_RDMA=true; ANY_SELECTED=true ;;
        h ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)]"
            echo
            echo "Options:"
            echo " -e Run benchmarking for standard Ethernet (1G LAN)."
            echo " -r Run benchmarking for RoCE NIC (via Ethernet/TCP)."
            echo " -t Run benchmarking for Thunderbolt link."
            echo " -i Run benchmarking for RDMA (RoCE v2)."
            echo " -h Print this help message and exit."
            echo
            echo "If no arguments are provided, all benchmarks are executed."
            exit 0
            ;;
        # Invalid option: report the real script name, on stderr, and fail.
        \? ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)] [-h (Help)]" >&2
            exit 1
            ;;
    esac
done

# Nothing was explicitly selected (no arguments at all, or only non-option
# arguments): run every benchmark instead of silently running none.
if [ "$ANY_SELECTED" = false ]; then
    RUN_ETH=true
    RUN_ROCE=true
    RUN_TB=true
    RUN_RDMA=true
fi
# Automatically detect local and remote RDMA device names if needed
if [ "$RUN_RDMA" = true ]; then
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
fi
WORKDIR="/tmp/rdma_bench"
mkdir -p "$WORKDIR"
# -------- helpers --------
parse_ping_avg() {
grep rtt "$1" | awk -F'/' '{print $5}'
if [ -f "$1" ]; then
grep rtt "$1" | awk -F'/' '{print $5}'
else
echo "0"
fi
}
parse_iperf_gbps() {
grep receiver "$1" | tail -n1 | awk '
{
val=$(NF-2);
unit=$(NF-1);
if (unit=="Mbits/sec") printf "%.2f", val/1000;
else if (unit=="Gbits/sec") printf "%.2f", val;
else print "N/A";
}'
if [ -f "$1" ]; then
grep receiver "$1" | tail -n1 | awk '
{
val=$(NF-2);
unit=$(NF-1);
if (unit=="Mbits/sec") printf "%.2f", val/1000;
else if (unit=="Gbits/sec") printf "%.2f", val;
else print "0.00";
}'
else
echo "0.00"
fi
}
parse_rdma_lat_us() {
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
echo "${val:-0}"
if [ -f "$1" ]; then
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
echo "${val:-0}"
else
echo "0"
fi
}
parse_rdma_bw_mib() {
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
echo "${val:-0}"
if [ -f "$1" ]; then
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
echo "${val:-0}"
else
echo "0"
fi
}
# -------- normal ethernet --------
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
# Clear old results
rm -f "$WORKDIR"/*.txt
# -------- roce ethernet (tcp) --------
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
if [ "$RUN_ETH" = true ]; then
# -------- normal ethernet --------
echo "[*] Benchmarking Ethernet (1G LAN)..."
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
fi
# -------- rdma latency --------
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
sleep 2
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
if [ "$RUN_ROCE" = true ]; then
# -------- roce ethernet (tcp) --------
echo "[*] Benchmarking RoCE NIC (Ethernet/TCP)..."
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
fi
# -------- rdma bandwidth (maximized) --------
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
sleep 2
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
if [ "$RUN_TB" = true ]; then
# -------- thunderbolt ethernet (tcp) --------
echo "[*] Benchmarking Thunderbolt..."
ping -c 10 "$HOST_TB" > "$WORKDIR/ping_tb.txt"
ssh "$HOST_TB" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
sleep 1
iperf3 -c "$HOST_TB" -P 8 -t 10 > "$WORKDIR/iperf_tb.txt"
fi
if [ "$RUN_RDMA" = true ]; then
# -------- rdma latency --------
echo "[*] Benchmarking RDMA (RoCE v2)..."
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
sleep 2
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
# -------- rdma bandwidth (maximized) --------
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
sleep 2
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
fi
# -------- parse --------
ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
@@ -67,13 +145,17 @@ ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt")
ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt")
TB_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_tb.txt")
TB_BW=$(parse_iperf_gbps "$WORKDIR/iperf_tb.txt")
RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt")
# Convert units for dual display
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')")
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')")
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')")
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
TB_LAT_US=$(python3 -c "print(f'{float(${TB_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')" 2>/dev/null || echo "0.00")
RDMA_BW_GBPS=$(python3 - <<EOF
import sys
@@ -88,9 +170,18 @@ EOF
echo
echo "=== Network Comparison ==="
echo
printf "%-20s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
echo "----------------------------------------------------------------"
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS} ms" "${ETH_LAT_US} us" "${ETH_BW} Gbps"
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS} ms" "${ROCE_LAT_US} us" "${ROCE_BW} Gbps"
printf "%-20s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS} ms" "${RDMA_LAT_US} us" "${RDMA_BW_GBPS} Gbps"
printf "%-25s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
echo "-----------------------------------------------------------------------"
if [ "$RUN_ETH" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS:-0.00} ms" "${ETH_LAT_US:-0.00} us" "${ETH_BW:-0.00} Gbps"
fi
if [ "$RUN_ROCE" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS:-0.00} ms" "${ROCE_LAT_US:-0.00} us" "${ROCE_BW:-0.00} Gbps"
fi
if [ "$RUN_TB" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (Thunderbolt)" "${TB_LAT_MS:-0.00} ms" "${TB_LAT_US:-0.00} us" "${TB_BW:-0.00} Gbps"
fi
if [ "$RUN_RDMA" = true ]; then
printf "%-25s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS:-0.00} ms" "${RDMA_LAT_US:-0.00} us" "${RDMA_BW_GBPS:-0.00} Gbps"
fi
echo
Archivo binario no mostrado.

Después

Anchura:  |  Altura:  |  Tamaño: 6.5 MiB

+68 -3
Ver fichero
@@ -45,6 +45,8 @@ This guide details how to configure a two-node **AMD Strix Halo** cluster linked
## 2. Concepts & Architecture
![concepts](concepts.png)
To fully utilize the Strix Halo cluster, it is helpful to understand the technologies involved:
* **vLLM**: A high-performance inference engine. To run models larger than a single GPU (or APU) can handle, it splits the model using **Tensor Parallelism (TP)**.
@@ -55,15 +57,20 @@ To fully utilize the Strix Halo cluster, it is helpful to understand the technol
* **With RDMA**: Latency is ~5µs.
* **Why it matters**: For interactive token generation, high latency kills performance. RoCE makes the two nodes feel like a single machine.
---
## 3. Hardware Prerequisites
![cluster](cluster.png)
* **Nodes**: 2x [Framework Desktop Mainboards](https://frame.work/gb/en/products/framework-desktop-mainboard-amd-ryzen-ai-max-300-series?v=FRAFMK0006) with AMD Ryzen AI MAX+ "Strix Halo", 128GB of Unified Memory.
* **Network Cards**: [Intel Ethernet Controller E810-CQDA1](https://www.intel.com/content/www/us/en/products/sku/192558/intel-ethernet-network-adapter-e810cqda1/specifications.html) (or similar 100GbE QSFP28).
* **Connection**: Direct Attach Copper (DAC) cable (e.g., [QSFPTEK 100G QSFP28 DAC](https://www.amazon.co.uk/dp/B09F32F7VK)). No switch required for 2 nodes.
* **PCIe Note**: The Framework motherboard PCIe slot is physically **x4**, so a riser is required to plug in a 16x card (e.g., [CY PCI-E Express 4x to 16x Extender](https://www.amazon.co.uk/dp/B0837FZFJ6)). **Test Setup Note:** One of the boards in this setup has a modified PCIe slot (cut by Framework using an ultrasonic knife) to accept x16 cards directly. **This is not recommended for users.** Risers are the cheaper, safer, and easier solution. Performance is identical (~50Gbps bandwidth, ~5µs latency).
---
## 4. Host Configuration (Fedora)
@@ -214,7 +221,7 @@ The cluster management and verification scripts rely on SSH to execute commands
### 5.2 Installation
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1151` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1151-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1150` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1150-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
To install the toolbox on **both nodes**, run:
@@ -223,7 +230,7 @@ To install the toolbox on **both nodes**, run:
```
**What this does:**
1. Pulls the latest `kyuz0/vllm-therock-gfx1151` image.
1. Pulls the latest `kyuz0/vllm-therock-gfx1150` image.
2. Detects if `/dev/infiniband` exists on your host.
3. Creates the toolbox with flags to expose:
* **iGPU Access**: `/dev/dri`, `/dev/kfd` (Required for ROCm)
@@ -325,4 +332,62 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
## 8. References & Acknowledgements
* **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1150` support in upstream RCCL.
---
## 9. Alternative: Thunderbolt Networking
If you do not have dedicated 100GbE RDMA network cards, you can directly connect the two nodes using a high-quality **Thunderbolt 4 / USB4 cable**. This will create a `thunderbolt0` network interface.
While it lacks the ultra-low, microsecond-level latency of RDMA, it provides significantly more bandwidth than standard 1GbE/5GbE Ethernet and is easier to configure.
> **Note**: `thunderbolt-net` relies on the operating system's standard kernel TCP/IP stack.
### 9.1 Thunderbolt Configuration
**1. Establish Connection:**
Connect the nodes directly using a certified Thunderbolt 4 or USB4 cable. Verify the link is active:
```bash
ip link show thunderbolt0
```
**2. Network Configuration (Head - Node 1):**
Configure a persistent connection using `nmcli` with a static IP and Jumbo Frames (reduces CPU overhead).
*Note: Jumbo Frames may be unsupported on some Thunderbolt host controllers.*
```bash
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.1/24 mtu 9000
sudo nmcli connection up thunderbolt0
```
**3. Network Configuration (Worker - Node 2):**
```bash
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.2/24 mtu 9000
sudo nmcli connection up thunderbolt0
```
**4. Firewall Rules:**
To ensure Ray and NCCL can communicate freely over this link:
```bash
# Assign the interface to the trusted zone permanently
sudo firewall-cmd --permanent --zone=trusted --add-interface=thunderbolt0
sudo firewall-cmd --reload
```
### 9.2 Running vLLM over Thunderbolt
Our cluster scripts dynamically detect the network interface based on the provided IPs. There is no need to manually export environment variables!
1. Open the Toolbox: `toolbox enter vllm`
2. Launch the cluster manager: `start-vllm-cluster`
3. Select **Option 1 (Configure IPs)**.
4. Set the **Head IP** explicitly to `192.168.2.1` and the **Worker IP** to `192.168.2.2`.
5. Start the cluster normally (Option 2). The script will automatically discover and utilize `thunderbolt0` as the backend network for Ray orchestration and GPU synchronization.
### 9.3 Validating the Link
I have added Thunderbolt support to the `compare_eth_vs_rdma.sh` script. Run it from inside the toolbox to see the latency and bandwidth of your Thunderbolt link compared to your other network interfaces.
You can use the `-t` flag to ONLY benchmark the Thunderbolt connection (or `-e`, `-r`, `-i` for the others):
```bash
/opt/compare_eth_vs_rdma.sh -t
```
+7 -7
Ver fichero
@@ -1,9 +1,9 @@
# Issue Report: vLLM Tensor Parallelism over RDMA on AMD Strix Halo
> **✅ RESOLVED (Feb 2, 2026)**
> This issue is **SOLVED**. The root cause was indeed missing `gfx1151` support in the upstream RCCL library.
> This issue is **SOLVED**. The root cause was indeed missing `gfx1150` support in the upstream RCCL library.
>
> I have patched and built a custom version of RCCL with native `gfx1151` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1151`).
> I have patched and built a custom version of RCCL with native `gfx1150` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1150`).
>
> See the [RDMA Cluster Setup Guide](setup_guide.md) for instructions on how to run the cluster using the fixed container.
@@ -12,8 +12,8 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
- **Current Status:** RDMA communication is verified (low latency ~5us). Ray cluster is operational and can allocate tensors on both nodes.
- **Blocker:** vLLM fails with `HIP error: invalid kernel file` when initializing the distributed environment.
- **Suspected Cause:** Possible missing support for `gfx1151` in the RCCL library included with the ROCm nightly build.
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1151` support is indeed missing/required in RCCL.
- **Suspected Cause:** Possible missing support for `gfx1150` in the RCCL library included with the ROCm nightly build.
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1150` support is indeed missing/required in RCCL.
## Table of Contents
1. [Context & Goal](#1-context--goal)
@@ -24,7 +24,7 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
4. [The Issue: Invalid Kernel File](#4-the-issue-invalid-kernel-file)
- [4.1 Command & Configuration](#41-command--configuration)
- [4.2 Error Logs](#42-error-logs)
- [4.3 Hypothesis: RCCL Support for gfx1151](#43-hypothesis-rccl-support-for-gfx1151)
- [4.3 Hypothesis: RCCL Support for gfx1150](#43-hypothesis-rccl-support-for-gfx1150)
5. [Request for Help](#5-request-for-help)
## 1. Context & Goal
@@ -70,7 +70,7 @@ The environment is created using `toolbox` (wrapping Podman) with specific flags
```bash
toolbox create vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
-- \
--device /dev/dri \
--device /dev/kfd \
@@ -751,7 +751,7 @@ This results in an `HIP error: invalid kernel file` immediately upon engine init
### 4.1 - Possible reasons
This invalid kernel file might be related to RCCL not supporting gfx1151. There was a PR that was never merged:
This invalid kernel file might be related to RCCL not supporting gfx1150. There was a PR that was never merged:
https://github.com/ROCm/rccl/pull/2075
+1 -1
Ver fichero
@@ -3,7 +3,7 @@
set -e
TOOLBOX_NAME="vllm"
IMAGE="docker.io/kyuz0/vllm-therock-gfx1151:latest"
IMAGE="docker.io/kyuz0/vllm-therock-gfx1150:latest"
# Base options
OPTIONS="--device /dev/dri --device /dev/kfd --group-add video --group-add render --security-opt seccomp=unconfined"
+2 -2
Ver fichero
@@ -83,13 +83,13 @@ cat <<'ASCII'
v L L M
ASCII
echo
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n'
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1150, ROCm via TheRock)\n'
[[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER"
echo
printf 'Machine: %s\n' "$MACHINE"
printf 'GPU : %s\n\n' "$GPU"
printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1150:latest\n\n'
printf 'Included:\n'
printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
printf ' - %-16s → %s\n' "start-vllm-cluster" "Cluster launcher: Setup Ray Head/Worker & Launch vLLM RCCL"
+18 -9
Ver fichero
@@ -1,13 +1,13 @@
#!/bin/bash
set -e
# Configuration
REPO_URL="https://github.com/kyuz0/rocm-systems.git"
BRANCH="gfx1151-rccl"
BUILD_DIR="build_gfx1151"
REPO_URL="https://code.badstorm.xyz/AI/rocm-systems.git"
BRANCH="gfx1150-rccl"
BUILD_DIR="build_gfx1150"
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
# Project sub-directory
PROJECT_DIR="projects/rccl"
echo "=== Building RCCL for gfx1151 ==="
echo "=== Building RCCL for gfx1150 ==="
echo "Repo: $REPO_URL"
echo "Branch: $BRANCH"
echo "ROCm Path: $ROCM_PATH"
@@ -28,14 +28,14 @@ echo "Entering project directory..."
cd $PROJECT_DIR
mkdir -p $BUILD_DIR
cd $BUILD_DIR
echo "Configuring CMake for gfx1151..."
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
echo "Configuring CMake for gfx1150..."
# We explicitly set GPU_TARGETS to gfx1150 to override the default list.
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
CXX=$ROCM_PATH/bin/hipcc cmake .. \
-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc \
-DDEFAULT_GPUS="gfx1151" \
-DGPU_TARGETS="gfx1151" \
-DAMDGPU_TARGETS="gfx1151" \
-DDEFAULT_GPUS="gfx1150" \
-DGPU_TARGETS="gfx1150" \
-DAMDGPU_TARGETS="gfx1150" \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_TESTS=OFF \
-DGENERATE_SYM_KERNELS=OFF \
@@ -44,6 +44,15 @@ CXX=$ROCM_PATH/bin/hipcc cmake .. \
# 3. Build
echo "Building librccl.so..."
make -j$(nproc)
# Compress the real library file (not the symlink) and stage the archive in
# custom_libs/ of the toolbox checkout.
# TOOLBOX_REPO_DIR may be exported to point at a different checkout; it
# defaults to the location this script was originally written for.
TOOLBOX_REPO_DIR="${TOOLBOX_REPO_DIR:-/home/badstorm/Source/ai/amd-strix-halo-vllm-toolboxes}"
# Run in a subshell so the caller's working directory (the CMake build
# directory) is preserved for the "Libraries are located in" message below.
(
    cd "$TOOLBOX_REPO_DIR"
    RCCL_LIB="rocm-systems/$PROJECT_DIR/$BUILD_DIR/librccl.so.1.0"
    # -k keeps the original library; -f overwrites an archive left by a
    # previous build instead of aborting the script under `set -e`.
    gzip -kf "$RCCL_LIB"
    # Copy the .gz archive into custom_libs/
    mkdir -p custom_libs/
    cp "$RCCL_LIB.gz" custom_libs/librccl.so.1.gz
)
echo "=== Build Complete ==="
echo "Libraries are located in:"
echo " $(pwd)/librccl.so"
+96 -15
Ver fichero
@@ -2,13 +2,17 @@ import subprocess
import time
import os
def get_net_iface(ip_prefix="192.168.100"):
def get_net_iface(ip_prefix=None):
"""
Auto-detects the interface that serves the cluster network.
Assumes standard 192.168.100.x setup from start_vllm_cluster.py
Assumes standard 192.168.100.x setup from start_vllm_cluster.py, but parameterizable.
"""
if ip_prefix is None:
head_ip = os.getenv("VLLM_HEAD_IP", "192.168.100.1")
ip_prefix = ".".join(head_ip.split('.')[:3])
try:
# ip -o addr show | grep 192.168.100
# ip -o addr show | grep <ip_prefix>
cmd = f"ip -o addr show | grep {ip_prefix}"
res = subprocess.check_output(cmd, shell=True, text=True).strip()
# Output format: 2: eth0 inet 192.168.100.1/24 ...
@@ -31,35 +35,77 @@ def get_subnet_from_ip(ip):
parts = ip.split('.')
return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24"
def stop_cluster(nodes=None):
def stop_cluster(worker_ip=None):
"""
Stops Ray on the given nodes (list of IPs).
If nodes is None, does nothing (caller should identify nodes first if needed,
but typically for a clean start we might just rely on 'ray stop' on each setup).
Actually, to be safe, we can try to stop local ray.
Stops Ray locally and on the worker node if provided.
"""
print("Stopping Ray cluster locally...")
subprocess.run(["ray", "stop", "--force"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
if worker_ip:
print(f"Stopping Ray cluster on worker ({worker_ip})...")
ssh_cmd = [
"ssh", "-o", "StrictHostKeyChecking=no", worker_ip,
"toolbox", "run", "-c", "vllm", "--", "ray", "stop", "--force"
]
try:
subprocess.run(ssh_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except subprocess.CalledProcessError as e:
print(f"Warning: Failed to stop worker node completely: {e}")
def setup_worker_node(worker_ip, head_ip):
subnet = get_subnet_from_ip(worker_ip)
# Script to run on worker
# Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f"""
source /etc/profile
# Silece the kill command
# Silence the kill command
ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Worker Environment ({worker_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={worker_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={worker_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Worker on {worker_ip} connecting to {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Worker is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
"""
print(f"Setting up Worker Node ({worker_ip})...")
@@ -83,20 +129,55 @@ def setup_head_node(head_ip):
print(f"Setting up Head Node ({head_ip})...")
# Read overrides from current env
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
script = f"""
# Silence the kill command
ray stop --force > /dev/null 2>&1 || true
# Calculate Interface dynamically
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
echo "\\n--- Ray Head Environment ({head_ip}) ---"
echo "export RAY_DISABLE_METRICS=1"
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
echo "export RAY_memory_monitor_refresh_ms=0"
echo "export VLLM_HOST_IP={head_ip}"
echo "export RDMA_IFACE=$RDMA_IFACE"
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
echo "export NCCL_IB_TIMEOUT=23"
echo "export NCCL_IB_RETRY_CNT=7"
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
export RAY_DISABLE_METRICS=1
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
export RAY_memory_monitor_refresh_ms=0
export VLLM_HOST_IP={head_ip}
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
export RDMA_IFACE=$RDMA_IFACE
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
# Stability for RDMA
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
echo "Starting Ray Head on {head_ip}..."
export NCCL_IB_DISABLE={nccl_disable_val}
"""
if nccl_debug_val:
script += f"""
echo "export NCCL_DEBUG={nccl_debug_val}"
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
export NCCL_DEBUG={nccl_debug_val}
export NCCL_DEBUG_SUBSYS=INIT,NET
"""
script += f"""
echo "\\nStarting Ray Head on {head_ip}..."
if [ "{nccl_disable_val}" = "1" ]; then
echo "Note: Head is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
fi
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
"""
+9 -8
Ver fichero
@@ -2,11 +2,12 @@
set -e
# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
dnf -y install --setopt=install_weak_deps=False --nodocs \
python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl \
gcc gcc-c++ binutils make ffmpeg-free \
cmake ninja-build aria2c tar xz vim nano dialog \
libdrm-devel zlib-devel openssl-devel pgrep \
numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \
&& dnf clean all && rm -rf /var/cache/dnf/*
# Added 'libgoogle-perftools4' for tcmalloc (fixes double-free)
apt-get update
apt-get install -y --no-install-recommends \
python3.11 python3.11-dev python3.11-venv git rsync bash ca-certificates curl \
gcc g++ binutils make ffmpeg \
cmake ninja-build aria2 tar xz-utils vim nano dialog \
libdrm-dev zlib1g-dev libssl-dev procps \
libnuma-dev libgoogle-perftools4 iproute2 ibverbs-utils patch perftest iputils-ping iperf3 infiniband-diags \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
+3 -2
Ver fichero
@@ -3,7 +3,7 @@ set -euo pipefail
# Configuration with defaults matching Dockerfile ARGs
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
GFX="${GFX:-gfx1151}"
GFX="${GFX:-gfx1150}"
echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="
@@ -51,8 +51,9 @@ printf '%s\n' \
"export VLLM_TARGET_DEVICE=rocm" \
"export HIP_FORCE_DEV_KERNARG=1" \
"export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
"export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
"export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
> /etc/profile.d/rocm-sdk.sh
chmod 0644 /etc/profile.d/rocm-sdk.sh
echo "=== ROCm SDK Installation Complete ==="
+2 -2
Ver fichero
@@ -3,7 +3,7 @@ set -e
# Configuration
# Paths identified from your environment
ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
VENV_LIB_PATH="/opt/venv/lib/python3.11/site-packages/_rocm_sdk_libraries_gfx1150/lib/librccl.so.1"
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
# Files to replace
# We assume the new library is named 'librccl.so' or 'librccl.so.1' in the current directory or provided as arg
@@ -20,7 +20,7 @@ do_install() {
echo "Please provide the path to the newly built librccl.so.1"
exit 1
fi
echo "=== Installing Custom RCCL (gfx1151) ==="
echo "=== Installing Custom RCCL (gfx1150) ==="
echo "Creating backup directory: $BACKUP_DIR"
mkdir -p "$BACKUP_DIR"
# 1. Backup /opt/rocm location
+18
Ver fichero
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Live RDMA throughput monitor.
#
# Each pass reads the ip4InOctets / ip4OutOctets values reported by
# `rdma statistic`, sleeps one second, reads them again, and prints the
# differences multiplied by 8 (octets -> bits), scaled by `numfmt --to=iec`.
# Loops forever; stop it with Ctrl-C.

# Print the second field of every `rdma statistic` line that matches the
# counter name given as $1.
read_counter() {
    rdma statistic | awk -v name="$1" '$0 ~ name {print $2}'
}

while :; do
    # Sample before the one-second window.
    rx_start=$(read_counter ip4InOctets)
    tx_start=$(read_counter ip4OutOctets)

    sleep 1

    # Sample after the one-second window.
    rx_end=$(read_counter ip4InOctets)
    tx_end=$(read_counter ip4OutOctets)

    # Octet deltas converted to bits.
    rx_bits=$(( 8 * (rx_end - rx_start) ))
    tx_bits=$(( 8 * (tx_end - tx_start) ))
    total_bits=$(( rx_bits + tx_bits ))

    printf "%s RDMA RX: %7sbit/s TX: %7sbit/s SUM: %7sbit/s\n" \
        "$(date +%T)" \
        "$(numfmt --to=iec $rx_bits)" \
        "$(numfmt --to=iec $tx_bits)" \
        "$(numfmt --to=iec $total_bits)"
done
+12 -1
Ver fichero
@@ -10,6 +10,7 @@ MODEL_TABLE = {
"google/gemma-3-12b-it": {
"trust_remote": False,
"enforce_eager": True,
"valid_tp": [1, 2],
"max_num_seqs": "64",
"max_tokens": "32768"
@@ -68,7 +69,7 @@ MODEL_TABLE = {
# 5. Qwen 80B AWQ
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
# Config: 20k ctx fits in that cache. Eager mode required for stability.
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
"trust_remote": True,
"valid_tp": [1], # Too big for single GPU
"max_num_seqs": "64", # Large Model / Bandwidth Constrained
@@ -77,6 +78,15 @@ MODEL_TABLE = {
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
"mratsim/MiniMax-M2.5-BF16-INT4-AWQ": {
"trust_remote": True,
"valid_tp": [2],
"max_num_seqs": "64",
"max_tokens": "16384",
"enforce_eager": False,
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
},
}
MODELS_TO_RUN = [
@@ -89,6 +99,7 @@ MODELS_TO_RUN = [
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
"mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
]
# Hardware / Global Defaults

Algunos archivos no se mostraron porque demasiados archivos han cambiado en esta diferencia Ver más