Improve compilation support

2026-03-15 13:04:09 +01:00
commit 48a20990d3
@@ -13,7 +13,7 @@ on:
        default: ""

 env:
-  IMAGE_REPO: kyuz0/vllm-therock-gfx1151
+  IMAGE_REPO: kyuz0/vllm-therock-gfx1150
  DOCKER_BUILDKIT: "1"

 jobs:
@@ -67,7 +67,7 @@ jobs:
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-rccl.yml
-          name: librccl-gfx1151
+          name: librccl-gfx1150
          run_id: ${{ github.event.inputs.rccl_run_id }}
          path: custom_libs
          if_no_artifact_found: warn
@@ -5,7 +5,7 @@ on:

 env:
  ROCM_MAJOR_VER: 7
-  GFX: gfx1151
+  GFX: gfx1150

 jobs:
  build-rccl:
@@ -24,17 +24,17 @@ jobs:
        shell: bash
        run: |
          source /etc/profile.d/rocm-sdk.sh
-          bash scripts/build_rccl_gfx1151.sh
+          bash scripts/build_rccl_gfx1150.sh
          
      - name: Compress Artifact
        run: |
-          # Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
-          ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
-          gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
+          # Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
+          ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
+          gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz
          ls -lh librccl.so.1.gz
          
      - name: Upload Artifact
        uses: actions/upload-artifact@v4
        with:
-          name: librccl-gfx1151
+          name: librccl-gfx1150
          path: librccl.so.1.gz
@@ -1,3 +1,5 @@
 *.pyc
 __pycache__/
-settings.json
+settings.json
+custom_libs/
+rocm-systems/
@@ -1,4 +1,4 @@
-FROM registry.fedoraproject.org/fedora:43
+FROM debian:12-slim

 # 1. System Base & Build Tools
 # Added 'gperftools-libs' for tcmalloc (fixes double-free)
@@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh
 # 2. Install "TheRock" ROCm SDK (Tarball Method)
 WORKDIR /tmp
 ARG ROCM_MAJOR_VER=7
-ARG GFX=gfx1151
+ARG GFX=gfx1150
 # We pass ARGs to the script via ENV or rely on defaults. 
 # But let's be explicit and export them for the RUN command.
 COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
@@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \
  /tmp/install_rocm_sdk.sh

 # 4. Python Venv Setup
-RUN /usr/bin/python3.12 -m venv /opt/venv
+RUN /usr/bin/python3.11 -m venv /opt/venv
 ENV VIRTUAL_ENV=/opt/venv
 ENV PATH=/opt/venv/bin:$PATH
 ENV PIP_NO_CACHE_DIR=1
@@ -27,7 +27,7 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"

 # 5. Install PyTorch (TheRock Nightly)
 RUN python -m pip install \
-  --index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
+  --index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \
  --pre torch torchaudio torchvision

 WORKDIR /opt
@@ -49,16 +49,16 @@ WORKDIR /opt/vllm
 # --- PATCHING ---
 COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
 RUN python /opt/vllm/patch_strix.py && \
-  sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt  
+  sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt  

 # 7. Build vLLM (Wheel Method) with CLANG Host Compiler
-RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
+RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 "amd-quark>=0.11"
 ENV ROCM_HOME="/opt/rocm"
 ENV HIP_PATH="/opt/rocm"
 ENV VLLM_TARGET_DEVICE="rocm"
-ENV PYTORCH_ROCM_ARCH="gfx1151"
-ENV HIP_ARCHITECTURES="gfx1151"          
-ENV AMDGPU_TARGETS="gfx1151"              
+ENV PYTORCH_ROCM_ARCH="gfx1150"
+ENV HIP_ARCHITECTURES="gfx1150"          
+ENV AMDGPU_TARGETS="gfx1150"              
 ENV MAX_JOBS="4"

 # --- CRITICAL FIX FOR SEGFAULT ---
@@ -69,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"

 RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
  echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
-  export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \   
+  export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \   
  python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
  python -m pip install /tmp/dist/*.whl

@@ -86,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm"

 # Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
 RUN cmake -S . \
-  -DGPU_TARGETS="gfx1151" \
-  -DBNB_ROCM_ARCH="gfx1151" \
+  -DGPU_TARGETS="gfx1150" \
+  -DBNB_ROCM_ARCH="gfx1150" \
  -DCOMPUTE_BACKEND=hip \
  -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
  -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
@@ -101,7 +101,7 @@ RUN chmod -R a+rwX /opt && \
  find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
  find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
  rm -rf /root/.cache/pip || true && \
-  dnf clean all && rm -rf /var/cache/dnf/*
+  apt-get clean && rm -rf /var/lib/apt/lists/*

 COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
 COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
@@ -128,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
 RUN chmod 0644 /etc/profile.d/*.sh
 RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh

-# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
+# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one
 COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
 RUN echo "Installing Custom RCCL..." && \
  gzip -d /tmp/librccl.so.1.gz && \
@@ -146,4 +146,4 @@ RUN python -m pip install transformers==5.0.0

 RUN chmod -R a+rwX /opt

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
@@ -0,0 +1,304 @@
+# Guida: Usare vLLM con Podman su Strix Halo
+
+Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman.
+
+## Prerequisiti
+
+- Podman installato e funzionante
+- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile
+- Accesso ai device `/dev/kfd` e `/dev/dri`
+- Almeno 30GB di spazio disco per il modello e la cache
+
+## 1. Buildare l'immagine
+
+Dalla directory del progetto, esegui:
+
+```bash
+podman build -t vllm:rocm .
+```
+
+**Note:**
+- Il build richiede 30-60 minuti a seconda della macchina
+- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente
+- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria
+
+### Opzioni di build avanzate
+
+Puoi passare argomenti personalizzati:
+
+```bash
+podman build \
+  --build-arg ROCM_MAJOR_VER=7 \
+  --build-arg GFX=gfx1150 \
+  --network=host \
+  -t vllm:rocm .
+```
+
+- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione)
+- `--no-cache` - Ignorare la cache e ricompilare tutto
+
+## 2. Preparare i filesystem locali
+
+Crea le cartelle per modelli e cache:
+
+```bash
+mkdir -p ~/models
+mkdir -p ~/.cache/huggingface
+```
+
+## 3. Lanciare il container con GPU
+
+### Opzione A: Shell interattiva (Development)
+
+Se vuoi esplorare il container e usare il TUI `start-vllm`:
+
+```bash
+podman run -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --network host \
+  -v $HOME/models:/models \
+  -v $HOME/.cache/huggingface:/cache/huggingface \
+  -p 8000:8000 \
+  vllm:rocm \
+  /bin/bash
+```
+
+Dentro il container:
+
+```bash
+start-vllm
+```
+
+Oppure lancia direttamente:
+
+```bash
+vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
+  --tensor-parallel-size 1 \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.90
+```
+
+### Opzione B: Lanciare direttamente il servizio (Production)
+
+Esegui vLLM in un unico comando senza shell interattiva:
+
+```bash
+podman run -d \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --network host \
+  -v $HOME/models:/models \
+  -v $HOME/.cache/huggingface:/cache/huggingface \
+  -p 8000:8000 \
+  --name vllm-server \
+  vllm:rocm \
+  vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
+    --tensor-parallel-size 1 \
+    --trust-remote-code \
+    --enforce-eager \
+    --gpu-memory-utilization 0.90
+```
+
+**Opzioni spiegate:**
+
+| Opzione | Significato |
+|---------|------------|
+| `-d` | Esegui in background |
+| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) |
+| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) |
+| `--network host` | Usa la rete dell'host (migliore performance) |
+| `-v $HOME/models:/models` | Monta la cartella modelli locale |
+| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace |
+| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible (con `--network host` viene ignorata: la porta è già raggiungibile direttamente sull'host) |
+| `--name vllm-server` | Nome del container |
+| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) |
+| `--trust-remote-code` | Permetti codice remoto da HuggingFace |
+| `--enforce-eager` | Modalità eager (debug/stability) |
+| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU |
+
+## 4. Monitorare il container
+
+Se lanciato in background (`-d`):
+
+```bash
+# Visualizza i log
+podman logs -f vllm-server
+
+# Visualizza le ultime 50 righe dei log
+podman logs --tail 50 vllm-server
+
+# Controlla lo stato
+podman ps | grep vllm-server
+
+# Entra nel container
+podman exec -it vllm-server /bin/bash
+```
+
+## 5. Testare l'API
+
+Una volta che il server è up, puoi testare con cURL:
+
+### Chat Completion
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
+    "messages": [{"role": "user", "content": "Write a Python function to sort a list"}],
+    "max_tokens": 200,
+    "temperature": 0.7
+  }'
+```
+
+### Completamento testo
+
+```bash
+curl -X POST http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
+    "prompt": "def fibonacci(",
+    "max_tokens": 100
+  }'
+```
+
+### Listare modelli disponibili
+
+```bash
+curl http://localhost:8000/v1/models
+```
+
+## 6. Usare da un altro host (SSH Port Forwarding)
+
+Se vLLM è su un server remoto:
+
+```bash
+ssh -L 8000:localhost:8000 user@remote-host
+```
+
+Poi da client locale:
+
+```bash
+curl http://localhost:8000/v1/models
+```
+
+## 7. Stoppare il container
+
+```bash
+# Se lanciato in background
+podman stop vllm-server
+
+# Rimuovere il container
+podman rm vllm-server
+
+# Se in shell interattiva, usa Ctrl+C e poi
+podman stop <container-id>
+```
+
+## 8. Usare con systemd (Quadlet)
+
+Se hai già usato il file `vllm-rocm.container` generato:
+
+```bash
+mkdir -p ~/.config/containers/systemd/
+cp vllm-rocm.container ~/.config/containers/systemd/
+systemctl --user daemon-reload
+systemctl --user start vllm-rocm
+systemctl --user status vllm-rocm
+```
+
+Visualizza i log:
+
+```bash
+journalctl --user -u vllm-rocm -n 50 -f
+```
+
+## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit
+
+### Caratteristiche
+
+- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit
+- **Vantaggi:**
+  - Occupa ~15-20GB di memoria (vs 50-60GB full precision)
+  - Esecuzione molto veloce
+  - Qualità vicina a quella del modello full precision
+- **Caso d'uso:** Sviluppo di codice, task di programmazione
+
+### Parametri consigliati
+
+```bash
+vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
+  --tensor-parallel-size 1 \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.90 \
+  --max-model-len 4096 \
+  --max-num-seqs 16
+```
+
+## Troubleshooting
+
+### Errore: "Unable to locate package python3.11"
+
+Il container usa Python 3.11, disponibile in Debian 12 (bookworm). Verifica di usare `debian:12-slim` o `debian:bookworm-slim` come base image.
+
+### Errore: "No GPU detected"
+
+Verifica che i device siano accessibili:
+
+```bash
+ls -la /dev/kfd /dev/dri
+```
+
+Se non ci sono, potrebbe essere un problema di driver. Su Strix Halo:
+
+```bash
+rocm-smi
+```
+
+### Errore: "Out of memory"
+
+Riduci `--gpu-memory-utilization` oppure `--max-model-len`:
+
+```bash
+vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
+  --gpu-memory-utilization 0.80 \
+  --max-model-len 2048
+```
+
+### Il container si ferma subito
+
+Controlla i log:
+
+```bash
+podman logs vllm-server
+```
+
+Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova:
+
+```bash
+podman build --no-cache -t vllm:rocm .
+```
+
+## Link Utili
+
+- [vLLM Documentation](https://docs.vllm.ai/)
+- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c)
+- [ROCm Documentation](https://rocmdocs.amd.com/)
+
+## Domande Frequenti
+
+**D: Posso usare più GPU con Tensor Parallelism?**  
+R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`.
+
+**D: Come cambio modello senza riavviare il container?**  
+R: Devi stoppare e riavviare il container con un modello diverso.
+
+**D: Posso usare questo con una Web UI?**  
+R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible.
+
+**D: Il modello viene scaricato ogni volta?**  
+R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache.
@@ -1,13 +1,13 @@
-# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container
+# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container

-An **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm.
+A **Debian 12** Docker/Podman container that is **Toolbx-compatible** (usable as a toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. Built on the **TheRock nightly builds** for ROCm.


 ---

 ## 🚀 High-Performance Clustering Support (New!)

-**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
+**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.

 👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions.

@@ -58,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/

 ## 1) Toolbx vs Docker/Podman

-The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as: 
+The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as: 

 * **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean. 
 * **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
@@ -81,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp:

 ```bash
 toolbox create vllm \
-  --image docker.io/kyuz0/vllm-therock-gfx1151:latest \
+  --image docker.io/kyuz0/vllm-therock-gfx1150:latest \
  -- --device /dev/dri --device /dev/kfd \
  --group-add video --group-add render --security-opt seccomp=unconfined
 ```
@@ -112,7 +112,7 @@ Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead:

 ```bash
 distrobox create -n vllm \
-  --image docker.io/kyuz0/vllm-therock-gfx1151:latest \
+  --image docker.io/kyuz0/vllm-therock-gfx1150:latest \
  --additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined"

 distrobox enter vllm
@@ -218,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u
 **Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)

 **Key Features:**
-*   **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`.
+*   **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`.
 *   **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices.
 *   **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM.
@@ -4,7 +4,7 @@
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>AMD Strix Halo (gfx1151) vLLM Benchmarks</title>
+    <title>AMD Strix Halo (gfx1150) vLLM Benchmarks</title>
    <style>
        :root {
            --bg-body: #f9fafb;
@@ -445,7 +445,7 @@

    <div class="container">
        <header>
-            <h1>AMD Strix Halo (gfx1151) vLLM Benchmarks</h1>
+            <h1>AMD Strix Halo (gfx1150) vLLM Benchmarks</h1>
            <p style="margin: 4px 0 0 0; font-size: 0.9rem;">
                <a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/" target="_blank"
                    style="color: var(--primary); text-decoration: none;">View on GitHub &rarr;</a>
@@ -221,7 +221,7 @@ The cluster management and verification scripts rely on SSH to execute commands

 ### 5.2 Installation

-The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1151` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1151-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
+The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1150` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1150-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.

 To install the toolbox on **both nodes**, run:

@@ -230,7 +230,7 @@ To install the toolbox on **both nodes**, run:
 ```

 **What this does:**
-1.  Pulls the latest `kyuz0/vllm-therock-gfx1151` image.
+1.  Pulls the latest `kyuz0/vllm-therock-gfx1150` image.
 2.  Detects if `/dev/infiniband` exists on your host.
 3.  Creates the toolbox with flags to expose:
    *   **iGPU Access**: `/dev/dri`, `/dev/kfd` (Required for ROCm)
@@ -332,7 +332,7 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
 ## 8. References & Acknowledgements

 *   **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
-    *   Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
+    *   Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1150` support in upstream RCCL.

 ---

@@ -1,9 +1,9 @@
 # Issue Report: vLLM Tensor Parallelism over RDMA on AMD Strix Halo

 > **✅ RESOLVED (Feb 2, 2026)**
-> This issue is **SOLVED**. The root cause was indeed missing `gfx1151` support in the upstream RCCL library.
+> This issue is **SOLVED**. The root cause was indeed missing `gfx1150` support in the upstream RCCL library.
 >
-> I have patched and built a custom version of RCCL with native `gfx1151` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1151`).
+> I have patched and built a custom version of RCCL with native `gfx1150` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1150`).
 >
 > See the [RDMA Cluster Setup Guide](setup_guide.md) for instructions on how to run the cluster using the fixed container.

@@ -12,8 +12,8 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R

 - **Current Status:** RDMA communication is verified (low latency ~5us). Ray cluster is operational and can allocate tensors on both nodes.
 - **Blocker:** vLLM fails with `HIP error: invalid kernel file` when initializing the distributed environment.
- **Suspected Cause:** Possible missing support for `gfx1151` in the RCCL library included with the ROCm nightly build.
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1151` support is indeed missing/required in RCCL.
+- **Suspected Cause:** Possible missing support for `gfx1150` in the RCCL library included with the ROCm nightly build.
+- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1150` support is indeed missing/required in RCCL.

 ## Table of Contents
 1. [Context & Goal](#1-context--goal)
@@ -24,7 +24,7 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
 4. [The Issue: Invalid Kernel File](#4-the-issue-invalid-kernel-file)
    - [4.1 Command & Configuration](#41-command--configuration)
    - [4.2 Error Logs](#42-error-logs)
-    - [4.3 Hypothesis: RCCL Support for gfx1151](#43-hypothesis-rccl-support-for-gfx1151)
+    - [4.3 Hypothesis: RCCL Support for gfx1150](#43-hypothesis-rccl-support-for-gfx1150)
 5. [Request for Help](#5-request-for-help)

 ## 1. Context & Goal
@@ -70,7 +70,7 @@ The environment is created using `toolbox` (wrapping Podman) with specific flags

 ```bash
 toolbox create vllm \
-  --image docker.io/kyuz0/vllm-therock-gfx1151:latest \
+  --image docker.io/kyuz0/vllm-therock-gfx1150:latest \
  -- \
  --device /dev/dri \
  --device /dev/kfd \
@@ -751,7 +751,7 @@ This results in an `HIP error: invalid kernel file` immediately upon engine init

 ### 4.1 - Possible reasons

-This invalid kernel file might be related to RCCL not supporting gfx1151. There was a PR that was never merged:
+This invalid kernel file might be related to RCCL not supporting gfx1150. There was a PR that was never merged:

 https://github.com/ROCm/rccl/pull/2075

@@ -3,7 +3,7 @@
 set -e

 TOOLBOX_NAME="vllm"
-IMAGE="docker.io/kyuz0/vllm-therock-gfx1151:latest"
+IMAGE="docker.io/kyuz0/vllm-therock-gfx1150:latest"

 # Base options
 OPTIONS="--device /dev/dri --device /dev/kfd --group-add video --group-add render --security-opt seccomp=unconfined"
@@ -83,13 +83,13 @@ cat <<'ASCII'
                               v L L M                                      
 ASCII
 echo
-printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n'
+printf 'AMD STRIX HALO — vLLM Toolbox (gfx1150, ROCm via TheRock)\n'
 [[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER"
 echo
 printf 'Machine: %s\n' "$MACHINE"
 printf 'GPU    : %s\n\n' "$GPU"
 printf 'Repo   : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
-printf 'Image  : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
+printf 'Image  : docker.io/kyuz0/vllm-therock-gfx1150:latest\n\n'
 printf 'Included:\n'
 printf '  - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
 printf '  - %-16s → %s\n' "start-vllm-cluster" "Cluster launcher: Setup Ray Head/Worker & Launch vLLM RCCL"
@@ -1,13 +1,13 @@
 #!/bin/bash
 set -e
 # Configuration
-REPO_URL="https://github.com/kyuz0/rocm-systems.git"
-BRANCH="gfx1151-rccl"
-BUILD_DIR="build_gfx1151"
+REPO_URL="https://code.badstorm.xyz/AI/rocm-systems.git"
+BRANCH="gfx1150-rccl"
+BUILD_DIR="build_gfx1150"
 ROCM_PATH=${ROCM_PATH:-/opt/rocm}
 # Project sub-directory
 PROJECT_DIR="projects/rccl"
-echo "=== Building RCCL for gfx1151 ==="
+echo "=== Building RCCL for gfx1150 ==="
 echo "Repo: $REPO_URL"
 echo "Branch: $BRANCH"
 echo "ROCm Path: $ROCM_PATH"
@@ -28,14 +28,14 @@ echo "Entering project directory..."
 cd $PROJECT_DIR
 mkdir -p $BUILD_DIR
 cd $BUILD_DIR
-echo "Configuring CMake for gfx1151..."
-# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
+echo "Configuring CMake for gfx1150..."
+# We explicitly set GPU_TARGETS to gfx1150 to override the default list.
 # We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
 CXX=$ROCM_PATH/bin/hipcc cmake .. \
    -DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc \
-    -DDEFAULT_GPUS="gfx1151" \
-    -DGPU_TARGETS="gfx1151" \
-    -DAMDGPU_TARGETS="gfx1151" \
+    -DDEFAULT_GPUS="gfx1150" \
+    -DGPU_TARGETS="gfx1150" \
+    -DAMDGPU_TARGETS="gfx1150" \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DBUILD_TESTS=OFF \
    -DGENERATE_SYM_KERNELS=OFF \
@@ -44,6 +44,15 @@ CXX=$ROCM_PATH/bin/hipcc cmake .. \
 # 3. Build
 echo "Building librccl.so..."
 make -j$(nproc)
+
+# Compress the real library file (not the symlink); make ran in this build dir, so no cd is needed
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # script lives in scripts/, so its parent is the repo root
+gzip -kf librccl.so.1.0  # -f so re-runs do not abort under `set -e` when the .gz already exists
+
+# Copy the compressed library into custom_libs/ at the repo root
+mkdir -p "$REPO_ROOT/custom_libs"
+cp librccl.so.1.0.gz "$REPO_ROOT/custom_libs/librccl.so.1.gz"
+
 echo "=== Build Complete ==="
 echo "Libraries are located in:"
 echo "  $(pwd)/librccl.so"
@@ -2,11 +2,12 @@
 set -e

 # 1. System Base & Build Tools
-# Added 'gperftools-libs' for tcmalloc (fixes double-free)
-dnf -y install --setopt=install_weak_deps=False --nodocs \
-  python3.12 python3.12-devel git rsync libatomic bash ca-certificates curl \
-  gcc gcc-c++ binutils make ffmpeg-free \
-  cmake ninja-build aria2c tar xz vim nano dialog \
-  libdrm-devel zlib-devel openssl-devel pgrep \
-  numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 perfquery \
-  && dnf clean all && rm -rf /var/cache/dnf/*
+# Added 'libgoogle-perftools4' for tcmalloc (fixes double-free)
+apt-get update
+apt-get install -y --no-install-recommends \
+  python3.11 python3.11-dev python3.11-venv git rsync bash ca-certificates curl \
+  gcc g++ binutils make ffmpeg \
+  cmake ninja-build aria2 tar xz-utils vim nano dialog \
+  libdrm-dev zlib1g-dev libssl-dev procps \
+  libnuma-dev libgoogle-perftools4 iproute2 ibverbs-utils patch perftest iputils-ping iperf3 infiniband-diags \
+  && apt-get clean && rm -rf /var/lib/apt/lists/*
@@ -3,7 +3,7 @@ set -euo pipefail

 # Configuration with defaults matching Dockerfile ARGs
 ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
-GFX="${GFX:-gfx1151}"
+GFX="${GFX:-gfx1150}"

 echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="

@@ -51,8 +51,9 @@ printf '%s\n' \
  "export VLLM_TARGET_DEVICE=rocm" \
  "export HIP_FORCE_DEV_KERNARG=1" \
  "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
-  "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
+  "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
  > /etc/profile.d/rocm-sdk.sh

 chmod 0644 /etc/profile.d/rocm-sdk.sh
 echo "=== ROCm SDK Installation Complete ==="
+
@@ -3,7 +3,7 @@ set -e
 # Configuration
 # Paths identified from your environment
 ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
-VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
+VENV_LIB_PATH="/opt/venv/lib/python3.11/site-packages/_rocm_sdk_libraries_gfx1150/lib/librccl.so.1"
 BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
 # Files to replace
 # We assume the new library is named 'librccl.so' or 'librccl.so.1' in the current directory or provided as arg
@@ -20,7 +20,7 @@ do_install() {
        echo "Please provide the path to the newly built librccl.so.1"
        exit 1
    fi
-    echo "=== Installing Custom RCCL (gfx1151) ==="
+    echo "=== Installing Custom RCCL (gfx1150) ==="
    echo "Creating backup directory: $BACKUP_DIR"
    mkdir -p "$BACKUP_DIR"
    # 1. Backup /opt/rocm location
@@ -25,10 +25,10 @@ def patch_vllm():
        txt = p_rocm.read_text()
        header = 'import sys\nfrom unittest.mock import MagicMock\nsys.modules["amdsmi"] = MagicMock()\n'
        txt = header + txt
-        txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n    return "gfx1151"\n\ndef _old_get_gcn_arch() -> str:')
+        txt = txt.replace('def _get_gcn_arch() -> str:', 'def _get_gcn_arch() -> str:\n    return "gfx1150"\n\ndef _old_get_gcn_arch() -> str:')
        txt = re.sub(r'device_type = .*', 'device_type = "rocm"', txt)
-        txt = re.sub(r'device_name = .*', 'device_name = "gfx1151"', txt)
-        txt += '\n    def get_device_name(self, device_id: int = 0) -> str:\n        return "AMD-gfx1151"\n'
+        txt = re.sub(r'device_name = .*', 'device_name = "gfx1150"', txt)
+        txt += '\n    def get_device_name(self, device_id: int = 0) -> str:\n        return "AMD-gfx1150"\n'
        p_rocm.write_text(txt)
        print(" -> Patched vllm/platforms/rocm.py")