Improve compilation support
This commit is contained in:
@@ -13,7 +13,7 @@ on:
|
||||
default: ""
|
||||
|
||||
env:
|
||||
IMAGE_REPO: kyuz0/vllm-therock-gfx1151
|
||||
IMAGE_REPO: kyuz0/vllm-therock-gfx1150
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
jobs:
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
uses: dawidd6/action-download-artifact@v6
|
||||
with:
|
||||
workflow: build-rccl.yml
|
||||
name: librccl-gfx1151
|
||||
name: librccl-gfx1150
|
||||
run_id: ${{ github.event.inputs.rccl_run_id }}
|
||||
path: custom_libs
|
||||
if_no_artifact_found: warn
|
||||
|
||||
@@ -5,7 +5,7 @@ on:
|
||||
|
||||
env:
|
||||
ROCM_MAJOR_VER: 7
|
||||
GFX: gfx1151
|
||||
GFX: gfx1150
|
||||
|
||||
jobs:
|
||||
build-rccl:
|
||||
@@ -24,17 +24,17 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
source /etc/profile.d/rocm-sdk.sh
|
||||
bash scripts/build_rccl_gfx1151.sh
|
||||
bash scripts/build_rccl_gfx1150.sh
|
||||
|
||||
- name: Compress Artifact
|
||||
run: |
|
||||
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
|
||||
ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
|
||||
gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
|
||||
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
|
||||
ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
|
||||
gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz
|
||||
ls -lh librccl.so.1.gz
|
||||
|
||||
- name: Upload Artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: librccl-gfx1151
|
||||
name: librccl-gfx1150
|
||||
path: librccl.so.1.gz
|
||||
|
||||
+3
-1
@@ -1,3 +1,5 @@
|
||||
*.pyc
|
||||
__pycache__/
|
||||
settings.json
|
||||
settings.json
|
||||
custom_libs/
|
||||
rocm-systems/
|
||||
+15
-15
@@ -1,4 +1,4 @@
|
||||
FROM registry.fedoraproject.org/fedora:43
|
||||
FROM debian:12-slim
|
||||
|
||||
# 1. System Base & Build Tools
|
||||
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
|
||||
@@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh
|
||||
# 2. Install "TheRock" ROCm SDK (Tarball Method)
|
||||
WORKDIR /tmp
|
||||
ARG ROCM_MAJOR_VER=7
|
||||
ARG GFX=gfx1151
|
||||
ARG GFX=gfx1150
|
||||
# We pass ARGs to the script via ENV or rely on defaults.
|
||||
# But let's be explicit and export them for the RUN command.
|
||||
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
|
||||
@@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \
|
||||
/tmp/install_rocm_sdk.sh
|
||||
|
||||
# 4. Python Venv Setup
|
||||
RUN /usr/bin/python3.12 -m venv /opt/venv
|
||||
RUN /usr/bin/python3.11 -m venv /opt/venv
|
||||
ENV VIRTUAL_ENV=/opt/venv
|
||||
ENV PATH=/opt/venv/bin:$PATH
|
||||
ENV PIP_NO_CACHE_DIR=1
|
||||
@@ -27,7 +27,7 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"
|
||||
|
||||
# 5. Install PyTorch (TheRock Nightly)
|
||||
RUN python -m pip install \
|
||||
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
|
||||
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \
|
||||
--pre torch torchaudio torchvision
|
||||
|
||||
WORKDIR /opt
|
||||
@@ -49,16 +49,16 @@ WORKDIR /opt/vllm
|
||||
# --- PATCHING ---
|
||||
COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
|
||||
RUN python /opt/vllm/patch_strix.py && \
|
||||
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
|
||||
sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt
|
||||
|
||||
# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
|
||||
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
|
||||
# Build-time Python tooling for the vLLM wheel build.
# Every version specifier is quoted: in shell-form RUN an unquoted `>=` or `<`
# is parsed by /bin/sh as a redirection, so a bare `amd-quark>=0.11` would
# install an unconstrained amd-quark and send stdout to a file named "=0.11".
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 "amd-quark>=0.11"
|
||||
ENV ROCM_HOME="/opt/rocm"
|
||||
ENV HIP_PATH="/opt/rocm"
|
||||
ENV VLLM_TARGET_DEVICE="rocm"
|
||||
ENV PYTORCH_ROCM_ARCH="gfx1151"
|
||||
ENV HIP_ARCHITECTURES="gfx1151"
|
||||
ENV AMDGPU_TARGETS="gfx1151"
|
||||
ENV PYTORCH_ROCM_ARCH="gfx1150"
|
||||
ENV HIP_ARCHITECTURES="gfx1150"
|
||||
ENV AMDGPU_TARGETS="gfx1150"
|
||||
ENV MAX_JOBS="4"
|
||||
|
||||
# --- CRITICAL FIX FOR SEGFAULT ---
|
||||
@@ -69,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
|
||||
|
||||
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
|
||||
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
|
||||
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
|
||||
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \
|
||||
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
|
||||
python -m pip install /tmp/dist/*.whl
|
||||
|
||||
@@ -86,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm"
|
||||
|
||||
# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
|
||||
RUN cmake -S . \
|
||||
-DGPU_TARGETS="gfx1151" \
|
||||
-DBNB_ROCM_ARCH="gfx1151" \
|
||||
-DGPU_TARGETS="gfx1150" \
|
||||
-DBNB_ROCM_ARCH="gfx1150" \
|
||||
-DCOMPUTE_BACKEND=hip \
|
||||
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
@@ -101,7 +101,7 @@ RUN chmod -R a+rwX /opt && \
|
||||
find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
|
||||
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
|
||||
rm -rf /root/.cache/pip || true && \
|
||||
dnf clean all && rm -rf /var/cache/dnf/*
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
|
||||
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
|
||||
@@ -128,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
|
||||
RUN chmod 0644 /etc/profile.d/*.sh
|
||||
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
||||
|
||||
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
|
||||
# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one
|
||||
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
|
||||
RUN echo "Installing Custom RCCL..." && \
|
||||
gzip -d /tmp/librccl.so.1.gz && \
|
||||
@@ -146,4 +146,4 @@ RUN python -m pip install transformers==5.0.0
|
||||
|
||||
RUN chmod -R a+rwX /opt
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
CMD ["/bin/bash"]
|
||||
+304
@@ -0,0 +1,304 @@
|
||||
# Guida: Usare vLLM con Podman su Strix Halo
|
||||
|
||||
Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman.
|
||||
|
||||
## Prerequisiti
|
||||
|
||||
- Podman installato e funzionante
|
||||
- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile
|
||||
- Accesso ai device `/dev/kfd` e `/dev/dri`
|
||||
- Almeno 30GB di spazio disco per il modello e la cache
|
||||
|
||||
## 1. Buildare l'immagine
|
||||
|
||||
Dalla directory del progetto, esegui:
|
||||
|
||||
```bash
|
||||
podman build -t vllm:rocm .
|
||||
```
|
||||
|
||||
**Note:**
|
||||
- Il build richiede 30-60 minuti a seconda della macchina
|
||||
- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente
|
||||
- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria
|
||||
|
||||
### Opzioni di build avanzate
|
||||
|
||||
Puoi passare argomenti personalizzati:
|
||||
|
||||
```bash
|
||||
podman build \
|
||||
--build-arg ROCM_MAJOR_VER=7 \
|
||||
--build-arg GFX=gfx1150 \
|
||||
--network=host \
|
||||
-t vllm:rocm .
|
||||
```
|
||||
|
||||
- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione)
|
||||
- `--no-cache` - Ignorare la cache e ricompilare tutto
|
||||
|
||||
## 2. Preparare i filesystem locali
|
||||
|
||||
Crea le cartelle per modelli e cache:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/models
|
||||
mkdir -p ~/.cache/huggingface
|
||||
```
|
||||
|
||||
## 3. Lanciare il container con GPU
|
||||
|
||||
### Opzione A: Shell interattiva (Development)
|
||||
|
||||
Se vuoi esplorare il container e usare il TUI `start-vllm`:
|
||||
|
||||
```bash
|
||||
podman run -it \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
--network host \
|
||||
-v $HOME/models:/models \
|
||||
-v $HOME/.cache/huggingface:/cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
vllm:rocm \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
Dentro il container:
|
||||
|
||||
```bash
|
||||
start-vllm
|
||||
```
|
||||
|
||||
Oppure lancia direttamente:
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90
|
||||
```
|
||||
|
||||
### Opzione B: Lanciare direttamente il servizio (Production)
|
||||
|
||||
Esegui vLLM in un unico comando senza shell interattiva:
|
||||
|
||||
```bash
|
||||
podman run -d \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
--network host \
|
||||
-v $HOME/models:/models \
|
||||
-v $HOME/.cache/huggingface:/cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--name vllm-server \
|
||||
vllm:rocm \
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90
|
||||
```
|
||||
|
||||
**Opzioni spiegate:**
|
||||
|
||||
| Opzione | Significato |
|
||||
|---------|------------|
|
||||
| `-d` | Esegui in background |
|
||||
| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) |
|
||||
| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) |
|
||||
| `--network host` | Usa la rete dell'host (migliore performance) |
|
||||
| `-v $HOME/models:/models` | Monta la cartella modelli locale |
|
||||
| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace |
|
||||
| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible |
|
||||
| `--name vllm-server` | Nome del container |
|
||||
| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) |
|
||||
| `--trust-remote-code` | Permetti codice remoto da HuggingFace |
|
||||
| `--enforce-eager` | Modalità eager (debug/stability) |
|
||||
| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU |
|
||||
|
||||
## 4. Monitorare il container
|
||||
|
||||
Se lanciato in background (`-d`):
|
||||
|
||||
```bash
|
||||
# Visualizza i log
|
||||
podman logs -f vllm-server
|
||||
|
||||
# Visualizza le ultime 50 righe dei log
|
||||
podman logs --tail 50 vllm-server
|
||||
|
||||
# Controlla lo stato
|
||||
podman ps | grep vllm-server
|
||||
|
||||
# Entra nel container
|
||||
podman exec -it vllm-server /bin/bash
|
||||
```
|
||||
|
||||
## 5. Testare l'API
|
||||
|
||||
Una volta che il server è up, puoi testare con cURL:
|
||||
|
||||
### Chat Completion
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
|
||||
"messages": [{"role": "user", "content": "Write a Python function to sort a list"}],
|
||||
"max_tokens": 200,
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
### Completamento testo
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
|
||||
"prompt": "def fibonacci(",
|
||||
"max_tokens": 100
|
||||
}'
|
||||
```
|
||||
|
||||
### Listare modelli disponibili
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models
|
||||
```
|
||||
|
||||
## 6. Usare da un altro host (SSH Port Forwarding)
|
||||
|
||||
Se vLLM è su un server remoto:
|
||||
|
||||
```bash
|
||||
ssh -L 0.0.0.0:8000:localhost:8000 user@remote-host
|
||||
```
|
||||
|
||||
Poi da client locale:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models
|
||||
```
|
||||
|
||||
## 7. Stoppare il container
|
||||
|
||||
```bash
|
||||
# Se lanciato in background
|
||||
podman stop vllm-server
|
||||
|
||||
# Rimuovere il container
|
||||
podman rm vllm-server
|
||||
|
||||
# Se in shell interattiva, usa Ctrl+C e poi
|
||||
podman stop <container-id>
|
||||
```
|
||||
|
||||
## 8. Usare con systemd (Quadlet)
|
||||
|
||||
Se hai già usato il file `vllm-rocm.container` generato:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.config/containers/systemd/
|
||||
cp vllm-rocm.container ~/.config/containers/systemd/
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user start vllm-rocm
|
||||
systemctl --user status vllm-rocm
|
||||
```
|
||||
|
||||
Visualizza i log:
|
||||
|
||||
```bash
|
||||
journalctl --user -u vllm-rocm -n 50 -f
|
||||
```
|
||||
|
||||
## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit
|
||||
|
||||
### Caratteristiche
|
||||
|
||||
- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit
|
||||
- **Vantaggi:**
|
||||
- Occupa ~15-20GB di memoria (vs 50-60GB full precision)
|
||||
- Esecuzione molto veloce
|
||||
- Qualità vicina al modello full precision
|
||||
- **Caso d'uso:** Sviluppo di codice, task di programmazione
|
||||
|
||||
### Parametri consigliati
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90 \
|
||||
--max-model-len 4096 \
|
||||
--max-num-seqs 16
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Errore: "Unable to locate package python3.13"
|
||||
|
||||
Il container usa Python 3.13, disponibile in Debian 13 (trixie). Verifica di usare `debian:trixie` o `debian:13-slim` come base image.
|
||||
|
||||
### Errore: "No GPU detected"
|
||||
|
||||
Verifica che i device siano accessibili:
|
||||
|
||||
```bash
|
||||
ls -la /dev/kfd /dev/dri
|
||||
```
|
||||
|
||||
Se non ci sono, potrebbe essere un problema di driver. Su Strix Halo:
|
||||
|
||||
```bash
|
||||
rocm-smi
|
||||
```
|
||||
|
||||
### Errore: "Out of memory"
|
||||
|
||||
Riduci `--gpu-memory-utilization` oppure `--max-model-len`:
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--gpu-memory-utilization 0.80 \
|
||||
--max-model-len 2048
|
||||
```
|
||||
|
||||
### Il container si ferma subito
|
||||
|
||||
Controlla i log:
|
||||
|
||||
```bash
|
||||
podman logs vllm-server
|
||||
```
|
||||
|
||||
Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova:
|
||||
|
||||
```bash
|
||||
podman build --no-cache -t vllm:rocm .
|
||||
```
|
||||
|
||||
## Link Utili
|
||||
|
||||
- [vLLM Documentation](https://docs.vllm.ai/)
|
||||
- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c)
|
||||
- [ROCm Documentation](https://rocmdocs.amd.com/)
|
||||
|
||||
## Domande Frequenti
|
||||
|
||||
**D: Posso usare più GPU con Tensor Parallelism?**
|
||||
R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`.
|
||||
|
||||
**D: Come cambio modello senza riavviare il container?**
|
||||
R: Devi stoppare e riavviare il container con un modello diverso.
|
||||
|
||||
**D: Posso usare questo con una Web UI?**
|
||||
R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible.
|
||||
|
||||
**D: Il modello viene scaricato ogni volta?**
|
||||
R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache.
|
||||
+7
-7
@@ -1,13 +1,13 @@
|
||||
# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container
|
||||
# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container
|
||||
|
||||
An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm.
|
||||
A **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. Built on the **TheRock nightly builds** for ROCm.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🚀 High-Performance Clustering Support (New!)
|
||||
|
||||
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
|
||||
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
|
||||
|
||||
👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions.
|
||||
|
||||
@@ -58,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
|
||||
|
||||
## 1) Toolbx vs Docker/Podman
|
||||
|
||||
The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as:
|
||||
The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as:
|
||||
|
||||
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
|
||||
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
|
||||
@@ -81,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp:
|
||||
|
||||
```bash
|
||||
toolbox create vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
-- --device /dev/dri --device /dev/kfd \
|
||||
--group-add video --group-add render --security-opt seccomp=unconfined
|
||||
```
|
||||
@@ -112,7 +112,7 @@ Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead:
|
||||
|
||||
```bash
|
||||
distrobox create -n vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
--additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined"
|
||||
|
||||
distrobox enter vllm
|
||||
@@ -218,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u
|
||||
**Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)
|
||||
|
||||
**Key Features:**
|
||||
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`.
|
||||
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`.
|
||||
* **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices.
|
||||
* **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM.
|
||||
@@ -4,7 +4,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>AMD Strix Halo (gfx1151) vLLM Benchmarks</title>
|
||||
<title>AMD Strix Halo (gfx1150) vLLM Benchmarks</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg-body: #f9fafb;
|
||||
@@ -445,7 +445,7 @@
|
||||
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1>AMD Strix Halo (gfx1151) vLLM Benchmarks</h1>
|
||||
<h1>AMD Strix Halo (gfx1150) vLLM Benchmarks</h1>
|
||||
<p style="margin: 4px 0 0 0; font-size: 0.9rem;">
|
||||
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/" target="_blank"
|
||||
style="color: var(--primary); text-decoration: none;">View on GitHub →</a>
|
||||
|
||||
@@ -221,7 +221,7 @@ The cluster management and verification scripts rely on SSH to execute commands
|
||||
|
||||
### 5.2 Installation
|
||||
|
||||
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1151` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1151-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
|
||||
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1150` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1150-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
|
||||
|
||||
To install the toolbox on **both nodes**, run:
|
||||
|
||||
@@ -230,7 +230,7 @@ To install the toolbox on **both nodes**, run:
|
||||
```
|
||||
|
||||
**What this does:**
|
||||
1. Pulls the latest `kyuz0/vllm-therock-gfx1151` image.
|
||||
1. Pulls the latest `kyuz0/vllm-therock-gfx1150` image.
|
||||
2. Detects if `/dev/infiniband` exists on your host.
|
||||
3. Creates the toolbox with flags to expose:
|
||||
* **iGPU Access**: `/dev/dri`, `/dev/kfd` (Required for ROCm)
|
||||
@@ -332,7 +332,7 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
|
||||
## 8. References & Acknowledgements
|
||||
|
||||
* **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
|
||||
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
|
||||
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1150` support in upstream RCCL.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Issue Report: vLLM Tensor Parallelism over RDMA on AMD Strix Halo
|
||||
|
||||
> **✅ RESOLVED (Feb 2, 2026)**
|
||||
> This issue is **SOLVED**. The root cause was indeed missing `gfx1151` support in the upstream RCCL library.
|
||||
> This issue is **SOLVED**. The root cause was indeed missing `gfx1150` support in the upstream RCCL library.
|
||||
>
|
||||
> I have patched and built a custom version of RCCL with native `gfx1151` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1151`).
|
||||
> I have patched and built a custom version of RCCL with native `gfx1150` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1150`).
|
||||
>
|
||||
> See the [RDMA Cluster Setup Guide](setup_guide.md) for instructions on how to run the cluster using the fixed container.
|
||||
|
||||
@@ -12,8 +12,8 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
|
||||
|
||||
- **Current Status:** RDMA communication is verified (low latency ~5us). Ray cluster is operational and can allocate tensors on both nodes.
|
||||
- **Blocker:** vLLM fails with `HIP error: invalid kernel file` when initializing the distributed environment.
|
||||
- **Suspected Cause:** Possible missing support for `gfx1151` in the RCCL library included with the ROCm nightly build.
|
||||
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1151` support is indeed missing/required in RCCL.
|
||||
- **Suspected Cause:** Possible missing support for `gfx1150` in the RCCL library included with the ROCm nightly build.
|
||||
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1150` support is indeed missing/required in RCCL.
|
||||
|
||||
## Table of Contents
|
||||
1. [Context & Goal](#1-context--goal)
|
||||
@@ -24,7 +24,7 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
|
||||
4. [The Issue: Invalid Kernel File](#4-the-issue-invalid-kernel-file)
|
||||
- [4.1 Command & Configuration](#41-command--configuration)
|
||||
- [4.2 Error Logs](#42-error-logs)
|
||||
- [4.3 Hypothesis: RCCL Support for gfx1151](#43-hypothesis-rccl-support-for-gfx1151)
|
||||
- [4.3 Hypothesis: RCCL Support for gfx1150](#43-hypothesis-rccl-support-for-gfx1150)
|
||||
5. [Request for Help](#5-request-for-help)
|
||||
|
||||
## 1. Context & Goal
|
||||
@@ -70,7 +70,7 @@ The environment is created using `toolbox` (wrapping Podman) with specific flags
|
||||
|
||||
```bash
|
||||
toolbox create vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
-- \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
@@ -751,7 +751,7 @@ This results in an `HIP error: invalid kernel file` immediately upon engine init
|
||||
|
||||
### 4.1 - Possible reasons
|
||||
|
||||
This invalid kernel file might be related to RCCL not supporting gfx1151. There was a PR that was never merged:
|
||||
This invalid kernel file might be related to RCCL not supporting gfx1150. There was a PR that was never merged:
|
||||
|
||||
https://github.com/ROCm/rccl/pull/2075
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
set -e
|
||||
|
||||
TOOLBOX_NAME="vllm"
|
||||
IMAGE="docker.io/kyuz0/vllm-therock-gfx1151:latest"
|
||||
IMAGE="docker.io/kyuz0/vllm-therock-gfx1150:latest"
|
||||
|
||||
# Base options
|
||||
OPTIONS="--device /dev/dri --device /dev/kfd --group-add video --group-add render --security-opt seccomp=unconfined"
|
||||
|
||||
@@ -83,13 +83,13 @@ cat <<'ASCII'
|
||||
v L L M
|
||||
ASCII
|
||||
echo
|
||||
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n'
|
||||
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1150, ROCm via TheRock)\n'
|
||||
[[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER"
|
||||
echo
|
||||
printf 'Machine: %s\n' "$MACHINE"
|
||||
printf 'GPU : %s\n\n' "$GPU"
|
||||
printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1150:latest\n\n'
|
||||
printf 'Included:\n'
|
||||
printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
|
||||
printf ' - %-16s → %s\n' "start-vllm-cluster" "Cluster launcher: Setup Ray Head/Worker & Launch vLLM RCCL"
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
# Configuration
|
||||
REPO_URL="https://github.com/kyuz0/rocm-systems.git"
|
||||
BRANCH="gfx1151-rccl"
|
||||
BUILD_DIR="build_gfx1151"
|
||||
REPO_URL="https://code.badstorm.xyz/AI/rocm-systems.git"
|
||||
BRANCH="gfx1150-rccl"
|
||||
BUILD_DIR="build_gfx1150"
|
||||
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
|
||||
# Project sub-directory
|
||||
PROJECT_DIR="projects/rccl"
|
||||
echo "=== Building RCCL for gfx1151 ==="
|
||||
echo "=== Building RCCL for gfx1150 ==="
|
||||
echo "Repo: $REPO_URL"
|
||||
echo "Branch: $BRANCH"
|
||||
echo "ROCm Path: $ROCM_PATH"
|
||||
@@ -28,14 +28,14 @@ echo "Entering project directory..."
|
||||
cd $PROJECT_DIR
|
||||
mkdir -p $BUILD_DIR
|
||||
cd $BUILD_DIR
|
||||
echo "Configuring CMake for gfx1151..."
|
||||
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
|
||||
echo "Configuring CMake for gfx1150..."
|
||||
# We explicitly set GPU_TARGETS to gfx1150 to override the default list.
|
||||
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
|
||||
CXX=$ROCM_PATH/bin/hipcc cmake .. \
|
||||
-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc \
|
||||
-DDEFAULT_GPUS="gfx1151" \
|
||||
-DGPU_TARGETS="gfx1151" \
|
||||
-DAMDGPU_TARGETS="gfx1151" \
|
||||
-DDEFAULT_GPUS="gfx1150" \
|
||||
-DGPU_TARGETS="gfx1150" \
|
||||
-DAMDGPU_TARGETS="gfx1150" \
|
||||
-DCMAKE_INSTALL_PREFIX=./install \
|
||||
-DBUILD_TESTS=OFF \
|
||||
-DGENERATE_SYM_KERNELS=OFF \
|
||||
@@ -44,6 +44,15 @@ CXX=$ROCM_PATH/bin/hipcc cmake .. \
|
||||
# 3. Build
|
||||
echo "Building librccl.so..."
|
||||
make -j$(nproc)
|
||||
|
||||
# Comprimi il file reale (non il symlink)
|
||||
cd /home/badstorm/Source/ai/amd-strix-halo-vllm-toolboxes
|
||||
gzip -k rocm-systems/projects/rccl/build_gfx1150/librccl.so.1.0
|
||||
|
||||
# Copia i file .gz in custom_libs/
|
||||
mkdir -p custom_libs/
|
||||
cp rocm-systems/projects/rccl/build_gfx1150/librccl.so.1.0.gz custom_libs/librccl.so.1.gz
|
||||
|
||||
echo "=== Build Complete ==="
|
||||
echo "Libraries are located in:"
|
||||
echo " $(pwd)/librccl.so"
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
set -e
|
||||
|
||||
# 1. System Base & Build Tools
|
||||
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
|
||||
dnf -y install --setopt=install_weak_deps=False --nodocs \
|
||||
python3.12 python3.12-devel git rsync libatomic bash ca-certificates curl \
|
||||
gcc gcc-c++ binutils make ffmpeg-free \
|
||||
cmake ninja-build aria2c tar xz vim nano dialog \
|
||||
libdrm-devel zlib-devel openssl-devel pgrep \
|
||||
numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 perfquery \
|
||||
&& dnf clean all && rm -rf /var/cache/dnf/*
|
||||
# Added 'libgoogle-perftools4' for tcmalloc (fixes double-free)
|
||||
apt-get update
|
||||
apt-get install -y --no-install-recommends \
|
||||
python3.11 python3.11-dev python3.11-venv git rsync bash ca-certificates curl \
|
||||
gcc g++ binutils make ffmpeg \
|
||||
cmake ninja-build aria2 tar xz-utils vim nano dialog \
|
||||
libdrm-dev zlib1g-dev libssl-dev procps \
|
||||
libnuma-dev libgoogle-perftools4 iproute2 ibverbs-utils patch perftest iputils-ping iperf3 infiniband-diags \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@@ -3,7 +3,7 @@ set -euo pipefail
|
||||
|
||||
# Configuration with defaults matching Dockerfile ARGs
|
||||
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
|
||||
GFX="${GFX:-gfx1151}"
|
||||
GFX="${GFX:-gfx1150}"
|
||||
|
||||
echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="
|
||||
|
||||
@@ -51,8 +51,9 @@ printf '%s\n' \
|
||||
"export VLLM_TARGET_DEVICE=rocm" \
|
||||
"export HIP_FORCE_DEV_KERNARG=1" \
|
||||
"export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
|
||||
"export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
|
||||
"export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
|
||||
> /etc/profile.d/rocm-sdk.sh
|
||||
|
||||
chmod 0644 /etc/profile.d/rocm-sdk.sh
|
||||
echo "=== ROCm SDK Installation Complete ==="
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ set -e
|
||||
# Configuration
|
||||
# Paths identified from your environment
|
||||
ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
|
||||
VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
|
||||
VENV_LIB_PATH="/opt/venv/lib/python3.11/site-packages/_rocm_sdk_libraries_gfx1150/lib/librccl.so.1"
|
||||
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
|
||||
# Files to replace
|
||||
# We assume the new library is named 'librccl.so' or 'librccl.so.1' in the current directory or provided as arg
|
||||
@@ -20,7 +20,7 @@ do_install() {
|
||||
echo "Please provide the path to the newly built librccl.so.1"
|
||||
exit 1
|
||||
fi
|
||||
echo "=== Installing Custom RCCL (gfx1151) ==="
|
||||
echo "=== Installing Custom RCCL (gfx1150) ==="
|
||||
echo "Creating backup directory: $BACKUP_DIR"
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
# 1. Backup /opt/rocm location
|
||||
|
||||
@@ -25,10 +25,10 @@ def patch_vllm():
|
||||
txt = p_rocm.read_text()
|
||||
header = 'import sys\nfrom unittest.mock import MagicMock\nsys.modules["amdsmi"] = MagicMock()\n'
|
||||
txt = header + txt
|
||||
txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n return "gfx1151"\n\ndef _old_get_gcn_arch() -> str:')
|
||||
txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n return "gfx1150"\n\ndef _old_get_gcn_arch() -> str:')
|
||||
txt = re.sub(r'device_type = .*', 'device_type = "rocm"', txt)
|
||||
txt = re.sub(r'device_name = .*', 'device_name = "gfx1151"', txt)
|
||||
txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return "AMD-gfx1151"\n'
|
||||
txt = re.sub(r'device_name = .*', 'device_name = "gfx1150"', txt)
|
||||
txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return "AMD-gfx1150"\n'
|
||||
p_rocm.write_text(txt)
|
||||
print(" -> Patched vllm/platforms/rocm.py")
|
||||
|
||||
|
||||
Viittaa uudesa ongelmassa
Block a user