1 Commits

Auteur SHA1 Bericht Datum
devbadxyz 48a20990d3 Improve compilation support 2026-03-15 13:04:09 +01:00
16 gewijzigde bestanden met toevoegingen van 387 en 70 verwijderingen
@@ -13,7 +13,7 @@ on:
default: ""
env:
IMAGE_REPO: kyuz0/vllm-therock-gfx1151
IMAGE_REPO: kyuz0/vllm-therock-gfx1150
DOCKER_BUILDKIT: "1"
jobs:
@@ -67,7 +67,7 @@ jobs:
uses: dawidd6/action-download-artifact@v6
with:
workflow: build-rccl.yml
name: librccl-gfx1151
name: librccl-gfx1150
run_id: ${{ github.event.inputs.rccl_run_id }}
path: custom_libs
if_no_artifact_found: warn
+6 -6
Bestand weergeven
@@ -5,7 +5,7 @@ on:
env:
ROCM_MAJOR_VER: 7
GFX: gfx1151
GFX: gfx1150
jobs:
build-rccl:
@@ -24,17 +24,17 @@ jobs:
shell: bash
run: |
source /etc/profile.d/rocm-sdk.sh
bash scripts/build_rccl_gfx1151.sh
bash scripts/build_rccl_gfx1150.sh
- name: Compress Artifact
run: |
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz
ls -lh librccl.so.1.gz
- name: Upload Artifact
uses: actions/upload-artifact@v4
with:
name: librccl-gfx1151
name: librccl-gfx1150
path: librccl.so.1.gz
+3 -1
Bestand weergeven
@@ -1,3 +1,5 @@
*.pyc
__pycache__/
settings.json
settings.json
custom_libs/
rocm-systems/
+15 -15
Bestand weergeven
@@ -1,4 +1,4 @@
FROM registry.fedoraproject.org/fedora:43
FROM debian:12-slim
# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
@@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh
# 2. Install "TheRock" ROCm SDK (Tarball Method)
WORKDIR /tmp
ARG ROCM_MAJOR_VER=7
ARG GFX=gfx1151
ARG GFX=gfx1150
# We pass ARGs to the script via ENV or rely on defaults.
# But let's be explicit and export them for the RUN command.
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
@@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \
/tmp/install_rocm_sdk.sh
# 4. Python Venv Setup
RUN /usr/bin/python3.12 -m venv /opt/venv
RUN /usr/bin/python3.11 -m venv /opt/venv
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:$PATH
ENV PIP_NO_CACHE_DIR=1
@@ -27,7 +27,7 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"
# 5. Install PyTorch (TheRock Nightly)
RUN python -m pip install \
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \
--pre torch torchaudio torchvision
WORKDIR /opt
@@ -49,16 +49,16 @@ WORKDIR /opt/vllm
# --- PATCHING ---
COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
RUN python /opt/vllm/patch_strix.py && \
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt
# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
# Build-time Python dependencies for the vLLM wheel.
# Every requirement that carries a version specifier is quoted: in shell-form RUN
# an unquoted '>' is parsed as an output redirection, which would silently drop
# the constraint and write pip's output to a file named '=0.11'.
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 "amd-quark>=0.11"
ENV ROCM_HOME="/opt/rocm"
ENV HIP_PATH="/opt/rocm"
ENV VLLM_TARGET_DEVICE="rocm"
ENV PYTORCH_ROCM_ARCH="gfx1151"
ENV HIP_ARCHITECTURES="gfx1151"
ENV AMDGPU_TARGETS="gfx1151"
ENV PYTORCH_ROCM_ARCH="gfx1150"
ENV HIP_ARCHITECTURES="gfx1150"
ENV AMDGPU_TARGETS="gfx1150"
ENV MAX_JOBS="4"
# --- CRITICAL FIX FOR SEGFAULT ---
@@ -69,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
python -m pip install /tmp/dist/*.whl
@@ -86,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm"
# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
RUN cmake -S . \
-DGPU_TARGETS="gfx1151" \
-DBNB_ROCM_ARCH="gfx1151" \
-DGPU_TARGETS="gfx1150" \
-DBNB_ROCM_ARCH="gfx1150" \
-DCOMPUTE_BACKEND=hip \
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
@@ -101,7 +101,7 @@ RUN chmod -R a+rwX /opt && \
find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
rm -rf /root/.cache/pip || true && \
dnf clean all && rm -rf /var/cache/dnf/*
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
@@ -128,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
RUN chmod 0644 /etc/profile.d/*.sh
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
RUN echo "Installing Custom RCCL..." && \
gzip -d /tmp/librccl.so.1.gz && \
@@ -146,4 +146,4 @@ RUN python -m pip install transformers==5.0.0
RUN chmod -R a+rwX /opt
CMD ["/bin/bash"]
CMD ["/bin/bash"]
+304
Bestand weergeven
@@ -0,0 +1,304 @@
# Guida: Usare vLLM con Podman su Strix Halo
Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman.
## Prerequisiti
- Podman installato e funzionante
- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile
- Accesso ai device `/dev/kfd` e `/dev/dri`
- Almeno 30GB di spazio disco per il modello e la cache
## 1. Buildare l'immagine
Dalla directory del progetto, esegui:
```bash
podman build -t vllm:rocm .
```
**Note:**
- Il build richiede 30-60 minuti a seconda della macchina
- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente
- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria
### Opzioni di build avanzate
Puoi passare argomenti personalizzati:
```bash
podman build \
--build-arg ROCM_MAJOR_VER=7 \
--build-arg GFX=gfx1150 \
--network=host \
-t vllm:rocm .
```
- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione)
- `--no-cache` - Ignorare la cache e ricompilare tutto
## 2. Preparare i filesystem locali
Crea le cartelle per modelli e cache:
```bash
mkdir -p ~/models
mkdir -p ~/.cache/huggingface
```
## 3. Lanciare il container con GPU
### Opzione A: Shell interattiva (Development)
Se vuoi esplorare il container e usare il TUI `start-vllm`:
```bash
podman run -it \
--device /dev/kfd \
--device /dev/dri \
--network host \
-v $HOME/models:/models \
-v $HOME/.cache/huggingface:/cache/huggingface \
-p 8000:8000 \
vllm:rocm \
/bin/bash
```
Dentro il container:
```bash
start-vllm
```
Oppure lancia direttamente:
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90
```
### Opzione B: Lanciare direttamente il servizio (Production)
Esegui vLLM in un unico comando senza shell interattiva:
```bash
podman run -d \
--device /dev/kfd \
--device /dev/dri \
--network host \
-v $HOME/models:/models \
-v $HOME/.cache/huggingface:/cache/huggingface \
-p 8000:8000 \
--name vllm-server \
vllm:rocm \
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90
```
**Opzioni spiegate:**
| Opzione | Significato |
|---------|------------|
| `-d` | Esegui in background |
| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) |
| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) |
| `--network host` | Usa la rete dell'host (migliore performance) |
| `-v $HOME/models:/models` | Monta la cartella modelli locale |
| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace |
| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible (ignorata con `--network host`, dove la porta è già raggiungibile direttamente sull'host) |
| `--name vllm-server` | Nome del container |
| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) |
| `--trust-remote-code` | Permetti codice remoto da HuggingFace |
| `--enforce-eager` | Modalità eager (debug/stability) |
| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU |
## 4. Monitorare il container
Se lanciato in background (`-d`):
```bash
# Visualizza i log
podman logs -f vllm-server
# Visualizza le ultime 50 righe dei log
podman logs --tail 50 vllm-server
# Controlla lo stato
podman ps | grep vllm-server
# Entra nel container
podman exec -it vllm-server /bin/bash
```
## 5. Testare l'API
Una volta che il server è up, puoi testare con cURL:
### Chat Completion
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
"messages": [{"role": "user", "content": "Write a Python function to sort a list"}],
"max_tokens": 200,
"temperature": 0.7
}'
```
### Completamento testo
```bash
curl -X POST http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
"prompt": "def fibonacci(",
"max_tokens": 100
}'
```
### Listare modelli disponibili
```bash
curl http://localhost:8000/v1/models
```
## 6. Usare da un altro host (SSH Port Forwarding)
Se vLLM è su un server remoto:
```bash
ssh -L 0.0.0.0:8000:localhost:8000 user@remote-host
```
Poi da client locale:
```bash
curl http://localhost:8000/v1/models
```
## 7. Stoppare il container
```bash
# Se lanciato in background
podman stop vllm-server
# Rimuovere il container
podman rm vllm-server
# Se in shell interattiva, usa Ctrl+C e poi
podman stop <container-id>
```
## 8. Usare con systemd (Quadlet)
Se hai già usato il file `vllm-rocm.container` generato:
```bash
mkdir -p ~/.config/containers/systemd/
cp vllm-rocm.container ~/.config/containers/systemd/
systemctl --user daemon-reload
systemctl --user start vllm-rocm
systemctl --user status vllm-rocm
```
Visualizza i log:
```bash
journalctl --user -u vllm-rocm -n 50 -f
```
## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit
### Caratteristiche
- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit
- **Vantaggi:**
- Occupa ~15-20GB di memoria (vs 50-60GB full precision)
- Esecuzione molto veloce
- Qualità vicina a quella del modello full precision
- **Caso d'uso:** Sviluppo code, task di programmazione
### Parametri consigliati
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--tensor-parallel-size 1 \
--trust-remote-code \
--enforce-eager \
--gpu-memory-utilization 0.90 \
--max-model-len 4096 \
--max-num-seqs 16
```
## Troubleshooting
### Errore: "Unable to locate package python3.11"
Il container usa Python 3.11, disponibile in Debian 12 (bookworm). Verifica di usare `debian:bookworm` o `debian:12-slim` nella base image.
### Errore: "No GPU detected"
Verifica che i device siano accessibili:
```bash
ls -la /dev/kfd /dev/dri
```
Se non ci sono, potrebbe essere un problema di driver. Su Strix Halo:
```bash
rocm-smi
```
### Errore: "Out of memory"
Riduci `--gpu-memory-utilization` oppure `--max-model-len`:
```bash
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
--gpu-memory-utilization 0.80 \
--max-model-len 2048
```
### Il container si ferma subito
Controlla i log:
```bash
podman logs vllm-server
```
Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova:
```bash
podman build --no-cache -t vllm:rocm .
```
## Link Utili
- [vLLM Documentation](https://docs.vllm.ai/)
- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c)
- [ROCm Documentation](https://rocmdocs.amd.com/)
## Domande Frequenti
**D: Posso usare più GPU con Tensor Parallelism?**
R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`.
**D: Come cambio modello senza riavviare il container?**
R: Devi stoppare e riavviare il container con un modello diverso.
**D: Posso usare questo con una Web UI?**
R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible.
**D: Il modello viene scaricato ogni volta?**
R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache.
+7 -7
Bestand weergeven
@@ -1,13 +1,13 @@
# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container
# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container
An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm.
A **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. Built on the **TheRock nightly builds** for ROCm.
---
## 🚀 High-Performance Clustering Support (New!)
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions.
@@ -58,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
## 1) Toolbx vs Docker/Podman
The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as: 
The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as: 
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
@@ -81,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp:
```bash
toolbox create vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
-- --device /dev/dri --device /dev/kfd \
--group-add video --group-add render --security-opt seccomp=unconfined
```
@@ -112,7 +112,7 @@ Ubuntus toolbox package still breaks GPU access, so use Distrobox instead:
```bash
distrobox create -n vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
--additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined"
distrobox enter vllm
@@ -218,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u
**Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)
**Key Features:**
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`.
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`.
* **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices.
* **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM.
+2 -2
Bestand weergeven
@@ -4,7 +4,7 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AMD Strix Halo (gfx1151) vLLM Benchmarks</title>
<title>AMD Strix Halo (gfx1150) vLLM Benchmarks</title>
<style>
:root {
--bg-body: #f9fafb;
@@ -445,7 +445,7 @@
<div class="container">
<header>
<h1>AMD Strix Halo (gfx1151) vLLM Benchmarks</h1>
<h1>AMD Strix Halo (gfx1150) vLLM Benchmarks</h1>
<p style="margin: 4px 0 0 0; font-size: 0.9rem;">
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/" target="_blank"
style="color: var(--primary); text-decoration: none;">View on GitHub &rarr;</a>
+3 -3
Bestand weergeven
@@ -221,7 +221,7 @@ The cluster management and verification scripts rely on SSH to execute commands
### 5.2 Installation
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1151` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1151-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1150` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1150-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
To install the toolbox on **both nodes**, run:
@@ -230,7 +230,7 @@ To install the toolbox on **both nodes**, run:
```
**What this does:**
1. Pulls the latest `kyuz0/vllm-therock-gfx1151` image.
1. Pulls the latest `kyuz0/vllm-therock-gfx1150` image.
2. Detects if `/dev/infiniband` exists on your host.
3. Creates the toolbox with flags to expose:
* **iGPU Access**: `/dev/dri`, `/dev/kfd` (Required for ROCm)
@@ -332,7 +332,7 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
## 8. References & Acknowledgements
* **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1150` support in upstream RCCL.
---
+7 -7
Bestand weergeven
@@ -1,9 +1,9 @@
# Issue Report: vLLM Tensor Parallelism over RDMA on AMD Strix Halo
> **✅ RESOLVED (Feb 2, 2026)**
> This issue is **SOLVED**. The root cause was indeed missing `gfx1151` support in the upstream RCCL library.
> This issue is **SOLVED**. The root cause was indeed missing `gfx1150` support in the upstream RCCL library.
>
> I have patched and built a custom version of RCCL with native `gfx1151` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1151`).
> I have patched and built a custom version of RCCL with native `gfx1150` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1150`).
>
> See the [RDMA Cluster Setup Guide](setup_guide.md) for instructions on how to run the cluster using the fixed container.
@@ -12,8 +12,8 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
- **Current Status:** RDMA communication is verified (low latency ~5us). Ray cluster is operational and can allocate tensors on both nodes.
- **Blocker:** vLLM fails with `HIP error: invalid kernel file` when initializing the distributed environment.
- **Suspected Cause:** Possible missing support for `gfx1151` in the RCCL library included with the ROCm nightly build.
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1151` support is indeed missing/required in RCCL.
- **Suspected Cause:** Possible missing support for `gfx1150` in the RCCL library included with the ROCm nightly build.
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1150` support is indeed missing/required in RCCL.
## Table of Contents
1. [Context & Goal](#1-context--goal)
@@ -24,7 +24,7 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
4. [The Issue: Invalid Kernel File](#4-the-issue-invalid-kernel-file)
- [4.1 Command & Configuration](#41-command--configuration)
- [4.2 Error Logs](#42-error-logs)
- [4.3 Hypothesis: RCCL Support for gfx1151](#43-hypothesis-rccl-support-for-gfx1151)
- [4.3 Hypothesis: RCCL Support for gfx1150](#43-hypothesis-rccl-support-for-gfx1150)
5. [Request for Help](#5-request-for-help)
## 1. Context & Goal
@@ -70,7 +70,7 @@ The environment is created using `toolbox` (wrapping Podman) with specific flags
```bash
toolbox create vllm \
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
-- \
--device /dev/dri \
--device /dev/kfd \
@@ -751,7 +751,7 @@ This results in an `HIP error: invalid kernel file` immediately upon engine init
### 4.1 - Possible reasons
This invalid kernel file might be related to RCCL not supporting gfx1151. There was a PR that was never merged:
This invalid kernel file might be related to RCCL not supporting gfx1150. There was a PR that was never merged:
https://github.com/ROCm/rccl/pull/2075
+1 -1
Bestand weergeven
@@ -3,7 +3,7 @@
set -e
TOOLBOX_NAME="vllm"
IMAGE="docker.io/kyuz0/vllm-therock-gfx1151:latest"
IMAGE="docker.io/kyuz0/vllm-therock-gfx1150:latest"
# Base options
OPTIONS="--device /dev/dri --device /dev/kfd --group-add video --group-add render --security-opt seccomp=unconfined"
+2 -2
Bestand weergeven
@@ -83,13 +83,13 @@ cat <<'ASCII'
v L L M
ASCII
echo
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n'
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1150, ROCm via TheRock)\n'
[[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER"
echo
printf 'Machine: %s\n' "$MACHINE"
printf 'GPU : %s\n\n' "$GPU"
printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1150:latest\n\n'
printf 'Included:\n'
printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
printf ' - %-16s → %s\n' "start-vllm-cluster" "Cluster launcher: Setup Ray Head/Worker & Launch vLLM RCCL"
+18 -9
Bestand weergeven
@@ -1,13 +1,13 @@
#!/bin/bash
set -e
# Configuration
REPO_URL="https://github.com/kyuz0/rocm-systems.git"
BRANCH="gfx1151-rccl"
BUILD_DIR="build_gfx1151"
REPO_URL="https://code.badstorm.xyz/AI/rocm-systems.git"
BRANCH="gfx1150-rccl"
BUILD_DIR="build_gfx1150"
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
# Project sub-directory
PROJECT_DIR="projects/rccl"
echo "=== Building RCCL for gfx1151 ==="
echo "=== Building RCCL for gfx1150 ==="
echo "Repo: $REPO_URL"
echo "Branch: $BRANCH"
echo "ROCm Path: $ROCM_PATH"
@@ -28,14 +28,14 @@ echo "Entering project directory..."
cd $PROJECT_DIR
mkdir -p $BUILD_DIR
cd $BUILD_DIR
echo "Configuring CMake for gfx1151..."
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
echo "Configuring CMake for gfx1150..."
# We explicitly set GPU_TARGETS to gfx1150 to override the default list.
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
CXX=$ROCM_PATH/bin/hipcc cmake .. \
-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc \
-DDEFAULT_GPUS="gfx1151" \
-DGPU_TARGETS="gfx1151" \
-DAMDGPU_TARGETS="gfx1151" \
-DDEFAULT_GPUS="gfx1150" \
-DGPU_TARGETS="gfx1150" \
-DAMDGPU_TARGETS="gfx1150" \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_TESTS=OFF \
-DGENERATE_SYM_KERNELS=OFF \
@@ -44,6 +44,15 @@ CXX=$ROCM_PATH/bin/hipcc cmake .. \
# 3. Build
echo "Building librccl.so..."
make -j$(nproc)
# Stage the freshly built library where the container build expects it
# (custom_libs/librccl.so.1.gz).
# The working directory is still the CMake build directory at this point, so the
# library is addressed relative to it instead of through a hard-coded,
# machine-specific checkout path (which aborts the script under `set -e` on any
# other host or in CI).
# Assumes the rocm-systems clone sits directly under the repository root, i.e.
# the build directory is <repo>/rocm-systems/projects/rccl/<build dir> — TODO
# confirm; export REPO_ROOT to override.
REPO_ROOT="${REPO_ROOT:-$(cd ../../../.. && pwd)}"
mkdir -p "$REPO_ROOT/custom_libs"
# Compress the real file (not the symlink); -c leaves the built library untouched.
gzip -c "$(pwd)/librccl.so.1.0" > "$REPO_ROOT/custom_libs/librccl.so.1.gz"
echo "=== Build Complete ==="
echo "Libraries are located in:"
echo " $(pwd)/librccl.so"
+9 -8
Bestand weergeven
@@ -2,11 +2,12 @@
set -e
# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
dnf -y install --setopt=install_weak_deps=False --nodocs \
python3.12 python3.12-devel git rsync libatomic bash ca-certificates curl \
gcc gcc-c++ binutils make ffmpeg-free \
cmake ninja-build aria2c tar xz vim nano dialog \
libdrm-devel zlib-devel openssl-devel pgrep \
numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 perfquery \
&& dnf clean all && rm -rf /var/cache/dnf/*
# Added 'libgoogle-perftools4' for tcmalloc (fixes double-free)
apt-get update
apt-get install -y --no-install-recommends \
python3.11 python3.11-dev python3.11-venv git rsync bash ca-certificates curl \
gcc g++ binutils make ffmpeg \
cmake ninja-build aria2 tar xz-utils vim nano dialog \
libdrm-dev zlib1g-dev libssl-dev procps \
libnuma-dev libgoogle-perftools4 iproute2 ibverbs-utils patch perftest iputils-ping iperf3 infiniband-diags \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
+3 -2
Bestand weergeven
@@ -3,7 +3,7 @@ set -euo pipefail
# Configuration with defaults matching Dockerfile ARGs
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
GFX="${GFX:-gfx1151}"
GFX="${GFX:-gfx1150}"
echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="
@@ -51,8 +51,9 @@ printf '%s\n' \
"export VLLM_TARGET_DEVICE=rocm" \
"export HIP_FORCE_DEV_KERNARG=1" \
"export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
"export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
"export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
> /etc/profile.d/rocm-sdk.sh
chmod 0644 /etc/profile.d/rocm-sdk.sh
echo "=== ROCm SDK Installation Complete ==="
+2 -2
Bestand weergeven
@@ -3,7 +3,7 @@ set -e
# Configuration
# Paths identified from your environment
ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
VENV_LIB_PATH="/opt/venv/lib/python3.11/site-packages/_rocm_sdk_libraries_gfx1150/lib/librccl.so.1"
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
# Files to replace
# We assume the new library is named 'librccl.so' or 'librccl.so.1' in the current directory or provided as arg
@@ -20,7 +20,7 @@ do_install() {
echo "Please provide the path to the newly built librccl.so.1"
exit 1
fi
echo "=== Installing Custom RCCL (gfx1151) ==="
echo "=== Installing Custom RCCL (gfx1150) ==="
echo "Creating backup directory: $BACKUP_DIR"
mkdir -p "$BACKUP_DIR"
# 1. Backup /opt/rocm location
+3 -3
Bestand weergeven
@@ -25,10 +25,10 @@ def patch_vllm():
txt = p_rocm.read_text()
header = 'import sys\nfrom unittest.mock import MagicMock\nsys.modules["amdsmi"] = MagicMock()\n'
txt = header + txt
txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n return "gfx1151"\n\ndef _old_get_gcn_arch() -> str:')
txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n return "gfx1150"\n\ndef _old_get_gcn_arch() -> str:')
txt = re.sub(r'device_type = .*', 'device_type = "rocm"', txt)
txt = re.sub(r'device_name = .*', 'device_name = "gfx1151"', txt)
txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return "AMD-gfx1151"\n'
txt = re.sub(r'device_name = .*', 'device_name = "gfx1150"', txt)
txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return "AMD-gfx1150"\n'
p_rocm.write_text(txt)
print(" -> Patched vllm/platforms/rocm.py")