From 48a20990d399cad8dfebe716b7069e61bacc95b7 Mon Sep 17 00:00:00 2001 From: BadStorm Developer Date: Sun, 15 Mar 2026 13:04:09 +0100 Subject: [PATCH] Improve compilation support --- .github/workflows/build-and-publish.yml | 4 +- .github/workflows/build-rccl.yml | 12 +- .gitignore | 4 +- Dockerfile | 30 +-- GUIDE.md | 304 ++++++++++++++++++++++++ README.md | 14 +- docs/index.html | 4 +- rdma_cluster/setup_guide.md | 6 +- rdma_cluster/troubleshooting_rccl.md | 14 +- refresh_toolbox.sh | 2 +- scripts/99-toolbox-banner.sh | 4 +- scripts/build_rccl_gfx1151.sh | 27 ++- scripts/install_deps.sh | 17 +- scripts/install_rocm_sdk.sh | 5 +- scripts/manage_rccl_install.sh | 4 +- scripts/patch_strix.py | 6 +- 16 files changed, 387 insertions(+), 70 deletions(-) create mode 100644 GUIDE.md diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml index b08eb24..b724814 100644 --- a/.github/workflows/build-and-publish.yml +++ b/.github/workflows/build-and-publish.yml @@ -13,7 +13,7 @@ on: default: "" env: - IMAGE_REPO: kyuz0/vllm-therock-gfx1151 + IMAGE_REPO: kyuz0/vllm-therock-gfx1150 DOCKER_BUILDKIT: "1" jobs: @@ -67,7 +67,7 @@ jobs: uses: dawidd6/action-download-artifact@v6 with: workflow: build-rccl.yml - name: librccl-gfx1151 + name: librccl-gfx1150 run_id: ${{ github.event.inputs.rccl_run_id }} path: custom_libs if_no_artifact_found: warn diff --git a/.github/workflows/build-rccl.yml b/.github/workflows/build-rccl.yml index 0b1c679..de7af73 100644 --- a/.github/workflows/build-rccl.yml +++ b/.github/workflows/build-rccl.yml @@ -5,7 +5,7 @@ on: env: ROCM_MAJOR_VER: 7 - GFX: gfx1151 + GFX: gfx1150 jobs: build-rccl: @@ -24,17 +24,17 @@ jobs: shell: bash run: | source /etc/profile.d/rocm-sdk.sh bash scripts/build_rccl_gfx1151.sh - name: Compress Artifact run: | - # Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 - ls -lh 
rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 - gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz + # Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 + ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 + gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz ls -lh librccl.so.1.gz - name: Upload Artifact uses: actions/upload-artifact@v4 with: - name: librccl-gfx1151 + name: librccl-gfx1150 path: librccl.so.1.gz diff --git a/.gitignore b/.gitignore index 52dc923..a6e9cd4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.pyc __pycache__/ -settings.json \ No newline at end of file +settings.json +custom_libs/ +rocm-systems/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 9b7120d..a31840f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.fedoraproject.org/fedora:43 +FROM debian:12-slim # 1. System Base & Build Tools # Added 'gperftools-libs' for tcmalloc (fixes double-free) @@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh # 2. Install "TheRock" ROCm SDK (Tarball Method) WORKDIR /tmp ARG ROCM_MAJOR_VER=7 -ARG GFX=gfx1151 +ARG GFX=gfx1150 # We pass ARGs to the script via ENV or rely on defaults. # But let's be explicit and export them for the RUN command. COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh @@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \ /tmp/install_rocm_sdk.sh # 4. Python Venv Setup -RUN /usr/bin/python3.12 -m venv /opt/venv +RUN /usr/bin/python3.11 -m venv /opt/venv ENV VIRTUAL_ENV=/opt/venv ENV PATH=/opt/venv/bin:$PATH ENV PIP_NO_CACHE_DIR=1 @@ -27,7 +27,7 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0" # 5. 
Install PyTorch (TheRock Nightly) RUN python -m pip install \ - --index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \ + --index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \ --pre torch torchaudio torchvision WORKDIR /opt @@ -49,16 +49,16 @@ WORKDIR /opt/vllm # --- PATCHING --- COPY scripts/patch_strix.py /opt/vllm/patch_strix.py RUN python /opt/vllm/patch_strix.py && \ - sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt + sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt # 7. Build vLLM (Wheel Method) with CLANG Host Compiler -RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 +RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 "amd-quark>=0.11" ENV ROCM_HOME="/opt/rocm" ENV HIP_PATH="/opt/rocm" ENV VLLM_TARGET_DEVICE="rocm" -ENV PYTORCH_ROCM_ARCH="gfx1151" -ENV HIP_ARCHITECTURES="gfx1151" -ENV AMDGPU_TARGETS="gfx1151" +ENV PYTORCH_ROCM_ARCH="gfx1150" +ENV HIP_ARCHITECTURES="gfx1150" +ENV AMDGPU_TARGETS="gfx1150" ENV MAX_JOBS="4" # --- CRITICAL FIX FOR SEGFAULT --- @@ -69,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++" RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \ echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \ - export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \ + export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \ python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \ python -m pip install /tmp/dist/*.whl @@ -86,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm" # Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++) RUN cmake -S . 
\ - -DGPU_TARGETS="gfx1151" \ - -DBNB_ROCM_ARCH="gfx1151" \ + -DGPU_TARGETS="gfx1150" \ + -DBNB_ROCM_ARCH="gfx1150" \ -DCOMPUTE_BACKEND=hip \ -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ @@ -101,7 +101,7 @@ RUN chmod -R a+rwX /opt && \ find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \ find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \ rm -rf /root/.cache/pip || true && \ - dnf clean all && rm -rf /var/cache/dnf/* + apt-get clean && rm -rf /var/lib/apt/lists/* COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh @@ -128,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py RUN chmod 0644 /etc/profile.d/*.sh RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh -# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one +# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz RUN echo "Installing Custom RCCL..." && \ gzip -d /tmp/librccl.so.1.gz && \ @@ -146,4 +146,4 @@ RUN python -m pip install transformers==5.0.0 RUN chmod -R a+rwX /opt -CMD ["/bin/bash"] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/GUIDE.md b/GUIDE.md new file mode 100644 index 0000000..55c6564 --- /dev/null +++ b/GUIDE.md @@ -0,0 +1,304 @@ +# Guida: Usare vLLM con Podman su Strix Halo + +Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman. + +## Prerequisiti + +- Podman installato e funzionante +- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile +- Accesso ai device `/dev/kfd` e `/dev/dri` +- Almeno 30GB di spazio disco per il modello e la cache + +## 1. 
Buildare l'immagine + +Dalla directory del progetto, esegui: + +```bash +podman build -t vllm:rocm . +``` + +**Note:** +- Il build richiede 30-60 minuti a seconda della macchina +- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente +- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria + +### Opzioni di build avanzate + +Puoi passare argomenti personalizzati: + +```bash +podman build \ + --build-arg ROCM_MAJOR_VER=7 \ + --build-arg GFX=gfx1150 \ + --network=host \ + -t vllm:rocm . +``` + +- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione) +- `--no-cache` - Ignorare la cache e ricompilare tutto + +## 2. Preparare i filesystem locali + +Crea le cartelle per modelli e cache: + +```bash +mkdir -p ~/models +mkdir -p ~/.cache/huggingface +``` + +## 3. Lanciare il container con GPU + +### Opzione A: Shell interattiva (Development) + +Se vuoi esplorare il container e usare il TUI `start-vllm`: + +```bash +podman run -it \ + --device /dev/kfd \ + --device /dev/dri \ + --network host \ + -v $HOME/models:/models \ + -v $HOME/.cache/huggingface:/cache/huggingface \ + -p 8000:8000 \ + vllm:rocm \ + /bin/bash +``` + +Dentro il container: + +```bash +start-vllm +``` + +Oppure lancia direttamente: + +```bash +vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \ + --tensor-parallel-size 1 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.90 +``` + +### Opzione B: Lanciare direttamente il servizio (Production) + +Esegui vLLM in un unico comando senza shell interattiva: + +```bash +podman run -d \ + --device /dev/kfd \ + --device /dev/dri \ + --network host \ + -v $HOME/models:/models \ + -v $HOME/.cache/huggingface:/cache/huggingface \ + -p 8000:8000 \ + --name vllm-server \ + vllm:rocm \ + vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \ + --tensor-parallel-size 1 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.90 +``` + +**Opzioni spiegate:** + +| 
Opzione | Significato | +|---------|------------| +| `-d` | Esegui in background | +| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) | +| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) | +| `--network host` | Usa la rete dell'host (migliore performance) | +| `-v $HOME/models:/models` | Monta la cartella modelli locale | +| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace | +| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible (ignorata da Podman quando si usa `--network host`) | +| `--name vllm-server` | Nome del container | +| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) | +| `--trust-remote-code` | Permetti codice remoto da HuggingFace | +| `--enforce-eager` | Modalità eager (debug/stability) | +| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU | + +## 4. Monitorare il container + +Se lanciato in background (`-d`): + +```bash +# Visualizza i log +podman logs -f vllm-server + +# Visualizza le ultime 50 righe dei log +podman logs --tail 50 vllm-server + +# Controlla lo stato +podman ps | grep vllm-server + +# Entra nel container +podman exec -it vllm-server /bin/bash +``` + +## 5. Testare l'API + +Una volta che il server è up, puoi testare con cURL: + +### Chat Completion + +```bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit", + "messages": [{"role": "user", "content": "Write a Python function to sort a list"}], + "max_tokens": 200, + "temperature": 0.7 + }' +``` + +### Completamento testo + +```bash +curl -X POST http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit", + "prompt": "def fibonacci(", + "max_tokens": 100 + }' +``` + +### Listare modelli disponibili + +```bash +curl http://localhost:8000/v1/models +``` + +## 6. 
Usare da un altro host (SSH Port Forwarding) + +Se vLLM è su un server remoto: + +```bash +ssh -L 0.0.0.0:8000:localhost:8000 user@remote-host +``` + +Poi da client locale: + +```bash +curl http://localhost:8000/v1/models +``` + +## 7. Stoppare il container + +```bash +# Se lanciato in background +podman stop vllm-server + +# Rimuovere il container +podman rm vllm-server + +# Se in shell interattiva, usa Ctrl+C e poi +podman stop +``` + +## 8. Usare con systemd (Quadlet) + +Se hai già usato il file `vllm-rocm.container` generato: + +```bash +mkdir -p ~/.config/containers/systemd/ +cp vllm-rocm.container ~/.config/containers/systemd/ +systemctl --user daemon-reload +systemctl --user start vllm-rocm +systemctl --user status vllm-rocm +``` + +Visualizza i log: + +```bash +journalctl --user -u vllm-rocm -n 50 -f +``` + +## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit + +### Caratteristiche + +- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit +- **Vantaggi:** + - Occupa ~15-20GB di memoria (vs 50-60GB full precision) + - Esecuzione molto veloce + - Qualità vicina al modello full precision +- **Caso d'uso:** Sviluppo di codice, task di programmazione + +### Parametri consigliati + +```bash +vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \ + --tensor-parallel-size 1 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.90 \ + --max-model-len 4096 \ + --max-num-seqs 16 +``` + +## Troubleshooting + +### Errore: "Unable to locate package python3.11" + +Il container usa Python 3.11, disponibile in Debian 12 (bookworm). Verifica che la base image sia `debian:12-slim` o `debian:bookworm-slim`. + +### Errore: "No GPU detected" + +Verifica che i device siano accessibili: + +```bash +ls -la /dev/kfd /dev/dri +``` + +Se non ci sono, potrebbe essere un problema di driver. 
Su Strix Halo: + +```bash +rocm-smi +``` + +### Errore: "Out of memory" + +Riduci `--gpu-memory-utilization` oppure `--max-model-len`: + +```bash +vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \ + --gpu-memory-utilization 0.80 \ + --max-model-len 2048 +``` + +### Il container si ferma subito + +Controlla i log: + +```bash +podman logs vllm-server +``` + +Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova: + +```bash +podman build --no-cache -t vllm:rocm . +``` + +## Link Utili + +- [vLLM Documentation](https://docs.vllm.ai/) +- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c) +- [ROCm Documentation](https://rocmdocs.amd.com/) + +## Domande Frequenti + +**D: Posso usare più GPU con Tensor Parallelism?** +R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`. + +**D: Come cambio modello senza riavviare il container?** +R: Devi stoppare e riavviare il container con un modello diverso. + +**D: Posso usare questo con una Web UI?** +R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible. + +**D: Il modello viene scaricato ogni volta?** +R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache. diff --git a/README.md b/README.md index a2b914a..b679e02 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container +# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container -An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm. +An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. 
Built on the **TheRock nightly builds** for ROCm. --- ## 🚀 High-Performance Clustering Support (New!) -**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU. +**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU. 👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions. @@ -58,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/ ## 1) Toolbx vs Docker/Podman -The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as:  +The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as:  * **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean. * **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container. 
@@ -81,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp: ```bash toolbox create vllm \ - --image docker.io/kyuz0/vllm-therock-gfx1151:latest \ + --image docker.io/kyuz0/vllm-therock-gfx1150:latest \ -- --device /dev/dri --device /dev/kfd \ --group-add video --group-add render --security-opt seccomp=unconfined ``` @@ -112,7 +112,7 @@ Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead: ```bash distrobox create -n vllm \ - --image docker.io/kyuz0/vllm-therock-gfx1151:latest \ + --image docker.io/kyuz0/vllm-therock-gfx1150:latest \ --additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined" distrobox enter vllm @@ -218,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u **Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md) **Key Features:** -* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`. +* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`. * **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices. * **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM. \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 0a2e252..3ab5bdd 100644 --- a/docs/index.html +++ b/docs/index.html @@ -4,7 +4,7 @@ - AMD Strix Halo (gfx1151) vLLM Benchmarks + AMD Strix Halo (gfx1150) vLLM Benchmarks