Porovnat revize
30 Commity
| Autor | SHA1 | Datum | |
|---|---|---|---|
| 48a20990d3 | |||
| 039363b819 | |||
| cf2fd6ec11 | |||
| b78e8a9d82 | |||
| 16405e8943 | |||
| 8de950d9ca | |||
| fb0aef0864 | |||
| 9997faaa1e | |||
| 8a20ec27b2 | |||
| c27835d99f | |||
| b035bcb482 | |||
| 6875f62ccf | |||
| a5a7b8fe04 | |||
| 1af159af81 | |||
| e726d406fa | |||
| e0fadf426b | |||
| f968cb1f30 | |||
| fedfa3c682 | |||
| 13c5a929a3 | |||
| 5a7f0cc676 | |||
| b3fcb0091f | |||
| 91b6dbc270 | |||
| 4a5d6c7855 | |||
| 726cd5ae53 | |||
| 49b85fc1fb | |||
| 290beffb05 | |||
| 6754095398 | |||
| 9cf7eaeab2 | |||
| c3ecb9bbd5 | |||
| afe985afca |
@@ -13,7 +13,7 @@ on:
|
||||
default: ""
|
||||
|
||||
env:
|
||||
IMAGE_REPO: kyuz0/vllm-therock-gfx1151
|
||||
IMAGE_REPO: kyuz0/vllm-therock-gfx1150
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
jobs:
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
uses: dawidd6/action-download-artifact@v6
|
||||
with:
|
||||
workflow: build-rccl.yml
|
||||
name: librccl-gfx1151
|
||||
name: librccl-gfx1150
|
||||
run_id: ${{ github.event.inputs.rccl_run_id }}
|
||||
path: custom_libs
|
||||
if_no_artifact_found: warn
|
||||
|
||||
@@ -5,7 +5,7 @@ on:
|
||||
|
||||
env:
|
||||
ROCM_MAJOR_VER: 7
|
||||
GFX: gfx1151
|
||||
GFX: gfx1150
|
||||
|
||||
jobs:
|
||||
build-rccl:
|
||||
@@ -24,17 +24,17 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
source /etc/profile.d/rocm-sdk.sh
|
||||
bash scripts/build_rccl_gfx1151.sh
|
||||
bash scripts/build_rccl_gfx1150.sh
|
||||
|
||||
- name: Compress Artifact
|
||||
run: |
|
||||
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
|
||||
ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
|
||||
gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
|
||||
# Path determined from script logic: rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
|
||||
ls -lh rocm-systems/projects/rccl/build_gfx1150/librccl.so.1
|
||||
gzip -c rocm-systems/projects/rccl/build_gfx1150/librccl.so.1 > librccl.so.1.gz
|
||||
ls -lh librccl.so.1.gz
|
||||
|
||||
- name: Upload Artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: librccl-gfx1151
|
||||
name: librccl-gfx1150
|
||||
path: librccl.so.1.gz
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
*.pyc
|
||||
__pycache__/
|
||||
settings.json
|
||||
custom_libs/
|
||||
rocm-systems/
|
||||
+20
-37
@@ -1,4 +1,4 @@
|
||||
FROM registry.fedoraproject.org/fedora:43
|
||||
FROM debian:12-slim
|
||||
|
||||
# 1. System Base & Build Tools
|
||||
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
|
||||
@@ -8,7 +8,7 @@ RUN sh /tmp/install_deps.sh
|
||||
# 2. Install "TheRock" ROCm SDK (Tarball Method)
|
||||
WORKDIR /tmp
|
||||
ARG ROCM_MAJOR_VER=7
|
||||
ARG GFX=gfx1151
|
||||
ARG GFX=gfx1150
|
||||
# We pass ARGs to the script via ENV or rely on defaults.
|
||||
# But let's be explicit and export them for the RUN command.
|
||||
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
|
||||
@@ -18,7 +18,7 @@ RUN chmod +x /tmp/install_rocm_sdk.sh && \
|
||||
/tmp/install_rocm_sdk.sh
|
||||
|
||||
# 4. Python Venv Setup
|
||||
RUN /usr/bin/python3.13 -m venv /opt/venv
|
||||
RUN /usr/bin/python3.11 -m venv /opt/venv
|
||||
ENV VIRTUAL_ENV=/opt/venv
|
||||
ENV PATH=/opt/venv/bin:$PATH
|
||||
ENV PIP_NO_CACHE_DIR=1
|
||||
@@ -27,13 +27,14 @@ RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"
|
||||
|
||||
# 5. Install PyTorch (TheRock Nightly)
|
||||
RUN python -m pip install \
|
||||
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
|
||||
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1150/ \
|
||||
--pre torch torchaudio torchvision
|
||||
|
||||
WORKDIR /opt
|
||||
|
||||
# Flash-Attention
|
||||
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
|
||||
ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH"
|
||||
|
||||
RUN git clone https://github.com/ROCm/flash-attention.git &&\
|
||||
cd flash-attention &&\
|
||||
@@ -46,38 +47,18 @@ RUN git clone https://github.com/vllm-project/vllm.git /opt/vllm
|
||||
WORKDIR /opt/vllm
|
||||
|
||||
# --- PATCHING ---
|
||||
RUN echo "import sys, re" > patch_strix.py && \
|
||||
echo "from pathlib import Path" >> patch_strix.py && \
|
||||
# Patch 1: __init__.py
|
||||
echo "p = Path('vllm/platforms/__init__.py')" >> patch_strix.py && \
|
||||
echo "txt = p.read_text()" >> patch_strix.py && \
|
||||
echo "txt = txt.replace('import amdsmi', '# import amdsmi')" >> patch_strix.py && \
|
||||
echo "txt = re.sub(r'is_rocm = .*', 'is_rocm = True', txt)" >> patch_strix.py && \
|
||||
echo "txt = re.sub(r'if len\(amdsmi\.amdsmi_get_processor_handles\(\)\) > 0:', 'if True:', txt)" >> patch_strix.py && \
|
||||
echo "txt = txt.replace('amdsmi.amdsmi_init()', 'pass')" >> patch_strix.py && \
|
||||
echo "txt = txt.replace('amdsmi.amdsmi_shut_down()', 'pass')" >> patch_strix.py && \
|
||||
echo "p.write_text(txt)" >> patch_strix.py && \
|
||||
# Patch 2: rocm.py
|
||||
echo "p = Path('vllm/platforms/rocm.py')" >> patch_strix.py && \
|
||||
echo "txt = p.read_text()" >> patch_strix.py && \
|
||||
echo "header = 'import sys\nfrom unittest.mock import MagicMock\nsys.modules[\"amdsmi\"] = MagicMock()\n'" >> patch_strix.py && \
|
||||
echo "txt = header + txt" >> patch_strix.py && \
|
||||
echo "txt = re.sub(r'device_type = .*', 'device_type = \"rocm\"', txt)" >> patch_strix.py && \
|
||||
echo "txt = re.sub(r'device_name = .*', 'device_name = \"gfx1151\"', txt)" >> patch_strix.py && \
|
||||
echo "txt += '\n def get_device_name(self, device_id: int = 0) -> str:\n return \"AMD-gfx1151\"\n'" >> patch_strix.py && \
|
||||
echo "p.write_text(txt)" >> patch_strix.py && \
|
||||
echo "print('Successfully patched vLLM for Strix Halo')" >> patch_strix.py && \
|
||||
python patch_strix.py && \
|
||||
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
|
||||
COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
|
||||
RUN python /opt/vllm/patch_strix.py && \
|
||||
sed -i 's/gfx1200;gfx1201/gfx1150/' CMakeLists.txt
|
||||
|
||||
# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
|
||||
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
|
||||
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11 amd-quark>=0.11
|
||||
ENV ROCM_HOME="/opt/rocm"
|
||||
ENV HIP_PATH="/opt/rocm"
|
||||
ENV VLLM_TARGET_DEVICE="rocm"
|
||||
ENV PYTORCH_ROCM_ARCH="gfx1151"
|
||||
ENV HIP_ARCHITECTURES="gfx1151"
|
||||
ENV AMDGPU_TARGETS="gfx1151"
|
||||
ENV PYTORCH_ROCM_ARCH="gfx1150"
|
||||
ENV HIP_ARCHITECTURES="gfx1150"
|
||||
ENV AMDGPU_TARGETS="gfx1150"
|
||||
ENV MAX_JOBS="4"
|
||||
|
||||
# --- CRITICAL FIX FOR SEGFAULT ---
|
||||
@@ -88,7 +69,7 @@ ENV CXX="/opt/rocm/llvm/bin/clang++"
|
||||
|
||||
RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
|
||||
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
|
||||
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
|
||||
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1150 -DHIP_ARCHITECTURES=gfx1150" && \
|
||||
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
|
||||
python -m pip install /tmp/dist/*.whl
|
||||
|
||||
@@ -105,8 +86,8 @@ ENV CMAKE_PREFIX_PATH="/opt/rocm"
|
||||
|
||||
# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
|
||||
RUN cmake -S . \
|
||||
-DGPU_TARGETS="gfx1151" \
|
||||
-DBNB_ROCM_ARCH="gfx1151" \
|
||||
-DGPU_TARGETS="gfx1150" \
|
||||
-DBNB_ROCM_ARCH="gfx1150" \
|
||||
-DCOMPUTE_BACKEND=hip \
|
||||
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
@@ -120,17 +101,19 @@ RUN chmod -R a+rwX /opt && \
|
||||
find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
|
||||
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
|
||||
rm -rf /root/.cache/pip || true && \
|
||||
dnf clean all && rm -rf /var/cache/dnf/*
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
|
||||
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
|
||||
COPY scripts/zz-venv-last.sh /etc/profile.d/zz-venv-last.sh
|
||||
COPY scripts/start_vllm.py /opt/start-vllm
|
||||
COPY scripts/start_vllm_cluster.py /opt/start-vllm-cluster
|
||||
COPY scripts/measure_bandwidth.sh /opt/measure_bandwidth.sh
|
||||
COPY scripts/cluster_manager.py /opt/cluster_manager.py
|
||||
COPY scripts/models.py /opt/models.py
|
||||
|
||||
COPY benchmarks/max_context_results.json /opt/max_context_results.json
|
||||
COPY benchmarks/bench_utils.py /opt/bench_utils.py
|
||||
COPY benchmarks/run_vllm_bench.py /opt/run_vllm_bench.py
|
||||
COPY benchmarks/vllm_cluster_bench.py /opt/vllm_cluster_bench.py
|
||||
COPY benchmarks/find_max_context.py /opt/find_max_context.py
|
||||
@@ -145,7 +128,7 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
|
||||
RUN chmod 0644 /etc/profile.d/*.sh
|
||||
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
||||
|
||||
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
|
||||
# 9. Install Custom RCCL (gfx1150) - Replaces standard library with manually built one
|
||||
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
|
||||
RUN echo "Installing Custom RCCL..." && \
|
||||
gzip -d /tmp/librccl.so.1.gz && \
|
||||
@@ -163,4 +146,4 @@ RUN python -m pip install transformers==5.0.0
|
||||
|
||||
RUN chmod -R a+rwX /opt
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
CMD ["/bin/bash"]
|
||||
+304
@@ -0,0 +1,304 @@
|
||||
# Guida: Usare vLLM con Podman su Strix Halo
|
||||
|
||||
Questa guida ti spiega come buildare e usare il container vLLM con il modello `bullpoint/Qwen3-Coder-Next-AWQ-4bit` su Debian 13 con Podman.
|
||||
|
||||
## Prerequisiti
|
||||
|
||||
- Podman installato e funzionante
|
||||
- AMD Ryzen AI Max "Strix Halo" (gfx1150) o GPU ROCm compatibile
|
||||
- Accesso ai device `/dev/kfd` e `/dev/dri`
|
||||
- Almeno 30GB di spazio disco per il modello e la cache
|
||||
|
||||
## 1. Buildare l'immagine
|
||||
|
||||
Dalla directory del progetto, esegui:
|
||||
|
||||
```bash
|
||||
podman build -t vllm:rocm .
|
||||
```
|
||||
|
||||
**Note:**
|
||||
- Il build richiede 30-60 minuti a seconda della macchina
|
||||
- L'immagine compila vLLM, bitsandbytes e flash-attention da sorgente
|
||||
- Se il build fallisce, verifica di avere abbastanza spazio disco e memoria
|
||||
|
||||
### Opzioni di build avanzate
|
||||
|
||||
Puoi passare argomenti personalizzati:
|
||||
|
||||
```bash
|
||||
podman build \
|
||||
--build-arg ROCM_MAJOR_VER=7 \
|
||||
--build-arg GFX=gfx1150 \
|
||||
--network=host \
|
||||
-t vllm:rocm .
|
||||
```
|
||||
|
||||
- `--network=host` - Usare la rete dell'host per i download (utile se hai problemi di connessione)
|
||||
- `--no-cache` - Ignorare la cache e ricompilare tutto
|
||||
|
||||
## 2. Preparare i filesystem locali
|
||||
|
||||
Crea le cartelle per modelli e cache:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/models
|
||||
mkdir -p ~/.cache/huggingface
|
||||
```
|
||||
|
||||
## 3. Lanciare il container con GPU
|
||||
|
||||
### Opzione A: Shell interattiva (Development)
|
||||
|
||||
Se vuoi esplorare il container e usare il TUI `start-vllm`:
|
||||
|
||||
```bash
|
||||
podman run -it \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
--network host \
|
||||
-v $HOME/models:/models \
|
||||
-v $HOME/.cache/huggingface:/cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
vllm:rocm \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
Dentro il container:
|
||||
|
||||
```bash
|
||||
start-vllm
|
||||
```
|
||||
|
||||
Oppure lancia direttamente:
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90
|
||||
```
|
||||
|
||||
### Opzione B: Lanciare direttamente il servizio (Production)
|
||||
|
||||
Esegui vLLM in un unico comando senza shell interattiva:
|
||||
|
||||
```bash
|
||||
podman run -d \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
--network host \
|
||||
-v $HOME/models:/models \
|
||||
-v $HOME/.cache/huggingface:/cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--name vllm-server \
|
||||
vllm:rocm \
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90
|
||||
```
|
||||
|
||||
**Opzioni spiegate:**
|
||||
|
||||
| Opzione | Significato |
|
||||
|---------|------------|
|
||||
| `-d` | Esegui in background |
|
||||
| `--device /dev/kfd` | Accesso alla GPU ROCm (kernel compute queue) |
|
||||
| `--device /dev/dri` | Accesso agli acceleratori DRI (render engine) |
|
||||
| `--network host` | Usa la rete dell'host (migliore performance) |
|
||||
| `-v $HOME/models:/models` | Monta la cartella modelli locale |
|
||||
| `-v $HOME/.cache/huggingface:/cache/huggingface` | Monta la cache HuggingFace |
|
||||
| `-p 8000:8000` | Espone la porta dell'API OpenAI-compatible |
|
||||
| `--name vllm-server` | Nome del container |
|
||||
| `--tensor-parallel-size 1` | Usa 1 GPU (no parallelismo) |
|
||||
| `--trust-remote-code` | Permetti codice remoto da HuggingFace |
|
||||
| `--enforce-eager` | Modalità eager (debug/stability) |
|
||||
| `--gpu-memory-utilization 0.90` | Usa il 90% della memoria GPU |
|
||||
|
||||
## 4. Monitorare il container
|
||||
|
||||
Se lanciato in background (`-d`):
|
||||
|
||||
```bash
|
||||
# Visualizza i log
|
||||
podman logs -f vllm-server
|
||||
|
||||
# Visualizza le ultime 50 righe dei log
|
||||
podman logs --tail 50 vllm-server
|
||||
|
||||
# Controlla lo stato
|
||||
podman ps | grep vllm-server
|
||||
|
||||
# Entra nel container
|
||||
podman exec -it vllm-server /bin/bash
|
||||
```
|
||||
|
||||
## 5. Testare l'API
|
||||
|
||||
Una volta che il server è up, puoi testare con cURL:
|
||||
|
||||
### Chat Completion
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
|
||||
"messages": [{"role": "user", "content": "Write a Python function to sort a list"}],
|
||||
"max_tokens": 200,
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
### Completamento testo
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit",
|
||||
"prompt": "def fibonacci(",
|
||||
"max_tokens": 100
|
||||
}'
|
||||
```
|
||||
|
||||
### Listare modelli disponibili
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models
|
||||
```
|
||||
|
||||
## 6. Usare da un altro host (SSH Port Forwarding)
|
||||
|
||||
Se vLLM è su un server remoto:
|
||||
|
||||
```bash
|
||||
ssh -L 0.0.0.0:8000:localhost:8000 user@remote-host
|
||||
```
|
||||
|
||||
Poi da client locale:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models
|
||||
```
|
||||
|
||||
## 7. Stoppare il container
|
||||
|
||||
```bash
|
||||
# Se lanciato in background
|
||||
podman stop vllm-server
|
||||
|
||||
# Rimuovere il container
|
||||
podman rm vllm-server
|
||||
|
||||
# Se in shell interattiva, usa Ctrl+C e poi
|
||||
podman stop <container-id>
|
||||
```
|
||||
|
||||
## 8. Usare con systemd (Quadlet)
|
||||
|
||||
Se hai già usato il file `vllm-rocm.container` generato:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.config/containers/systemd/
|
||||
cp vllm-rocm.container ~/.config/containers/systemd/
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user start vllm-rocm
|
||||
systemctl --user status vllm-rocm
|
||||
```
|
||||
|
||||
Visualizza i log:
|
||||
|
||||
```bash
|
||||
journalctl --user -u vllm-rocm -n 50 -f
|
||||
```
|
||||
|
||||
## Modello: bullpoint/Qwen3-Coder-Next-AWQ-4bit
|
||||
|
||||
### Caratteristiche
|
||||
|
||||
- **Quantizzazione:** AWQ (Activation-aware Weight Quantization) a 4-bit
|
||||
- **Vantaggi:**
|
||||
- Occupa ~15-20GB di memoria (vs 50-60GB full precision)
|
||||
- Esecuzione molto veloce
|
||||
- Qualità vicina a quella del modello full precision
|
||||
- **Caso d'uso:** Sviluppo di codice, task di programmazione
|
||||
|
||||
### Parametri consigliati
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.90 \
|
||||
--max-model-len 4096 \
|
||||
--max-num-seqs 16
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Errore: "Unable to locate package python3.13"
|
||||
|
||||
Il container usa Python 3.13, disponibile in Debian 13 (trixie). Verifica di usare `debian:trixie` o `debian:13-slim` nella base image: Debian 12 (bookworm) fornisce solo Python 3.11.
|
||||
|
||||
### Errore: "No GPU detected"
|
||||
|
||||
Verifica che i device siano accessibili:
|
||||
|
||||
```bash
|
||||
ls -la /dev/kfd /dev/dri
|
||||
```
|
||||
|
||||
Se non ci sono, potrebbe essere un problema di driver. Su Strix Halo:
|
||||
|
||||
```bash
|
||||
rocm-smi
|
||||
```
|
||||
|
||||
### Errore: "Out of memory"
|
||||
|
||||
Riduci `--gpu-memory-utilization` oppure `--max-model-len`:
|
||||
|
||||
```bash
|
||||
vllm serve bullpoint/Qwen3-Coder-Next-AWQ-4bit \
|
||||
--gpu-memory-utilization 0.80 \
|
||||
--max-model-len 2048
|
||||
```
|
||||
|
||||
### Il container si ferma subito
|
||||
|
||||
Controlla i log:
|
||||
|
||||
```bash
|
||||
podman logs vllm-server
|
||||
```
|
||||
|
||||
Se vedi errori di compilazione, il build potrebbe non essere completato correttamente. Riprova:
|
||||
|
||||
```bash
|
||||
podman build --no-cache -t vllm:rocm .
|
||||
```
|
||||
|
||||
## Link Utili
|
||||
|
||||
- [vLLM Documentation](https://docs.vllm.ai/)
|
||||
- [HuggingFace Qwen3 Models](https://huggingface.co/collections/Qwen/qwen3-coder-67a2e625ef1d5c6ba5a9c14c)
|
||||
- [ROCm Documentation](https://rocmdocs.amd.com/)
|
||||
|
||||
## Domande Frequenti
|
||||
|
||||
**D: Posso usare più GPU con Tensor Parallelism?**
|
||||
R: Sì, imposta `--tensor-parallel-size 2` se hai 2 GPU. Su Strix Halo single-GPU, usa `--tensor-parallel-size 1`.
|
||||
|
||||
**D: Come cambio modello senza riavviare il container?**
|
||||
R: Devi stoppare e riavviare il container con un modello diverso.
|
||||
|
||||
**D: Posso usare questo con una Web UI?**
|
||||
R: Sì, usa HuggingFace Chat UI o altre app che supportano endpoint OpenAI-compatible.
|
||||
|
||||
**D: Il modello viene scaricato ogni volta?**
|
||||
R: No, viene cachato in `~/.cache/huggingface`. La prima volta richiede il download, le volte successive usa la cache.
|
||||
+17
-7
@@ -1,18 +1,28 @@
|
||||
# AMD Strix Halo (gfx1151) — vLLM Toolbox/Container
|
||||
# AMD Strix Halo (gfx1150) — vLLM Toolbox/Container
|
||||
|
||||
A **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the **TheRock nightly builds** for ROCm.
|
||||
A **Fedora 43** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1150)**. Built on the **TheRock nightly builds** for ROCm.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🚀 High-Performance Clustering Support (New!)
|
||||
|
||||
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1151)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
|
||||
**Update:** This toolbox now ships with a **custom build of ROCm/RCCL** that enables **native RDMA/RoCE v2 support for Strix Halo (gfx1150)**. This allows you to connect two nodes via a low-latency interconnect (e.g., Intel E810) and run vLLM with Tensor Parallelism (TP=2) effectively acting as a single 256GB Unified Memory GPU.
|
||||
|
||||
👉 **[Read the Full RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)** for hardware requirements and configuration instructions.
|
||||
|
||||
---
|
||||
|
||||
### 📦 Project Context
|
||||
|
||||
This repository is part of the **[Strix Halo AI Toolboxes](https://strix-halo-toolboxes.com)** project. Check out the website for an overview of all toolboxes, tutorials, and host configuration guides.
|
||||
|
||||
### ❤️ Support
|
||||
|
||||
This is a hobby project maintained in my spare time. If you find these toolboxes and tutorials useful, you can **[buy me a coffee](https://buymeacoffee.com/dcapitella)** to support the work! ☕
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
* [Tested Models (Benchmarks)](#tested-models-benchmarks)
|
||||
@@ -48,7 +58,7 @@ View full benchmarks at: [https://kyuz0.github.io/amd-strix-halo-vllm-toolboxes/
|
||||
|
||||
## 1) Toolbx vs Docker/Podman
|
||||
|
||||
The `kyuz0/vllm-therock-gfx1151:latest` image can be used both as:
|
||||
The `kyuz0/vllm-therock-gfx1150:latest` image can be used both as:
|
||||
|
||||
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
|
||||
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
|
||||
@@ -71,7 +81,7 @@ To manually create a toolbox that exposes the GPU and relaxes seccomp:
|
||||
|
||||
```bash
|
||||
toolbox create vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
-- --device /dev/dri --device /dev/kfd \
|
||||
--group-add video --group-add render --security-opt seccomp=unconfined
|
||||
```
|
||||
@@ -102,7 +112,7 @@ Ubuntu’s toolbox package still breaks GPU access, so use Distrobox instead:
|
||||
|
||||
```bash
|
||||
distrobox create -n vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
--additional-flags "--device /dev/kfd --device /dev/dri --group-add video --group-add render --security-opt seccomp=unconfined"
|
||||
|
||||
distrobox enter vllm
|
||||
@@ -208,6 +218,6 @@ This toolbox supports high-performance clustering of multiple Strix Halo nodes u
|
||||
**Detailed Documentation:** [RDMA Cluster Setup Guide](rdma_cluster/setup_guide.md)
|
||||
|
||||
**Key Features:**
|
||||
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1151`.
|
||||
* **Custom RCCL Patch:** Use of a custom-built `librccl.so` to support RDMA on `gfx1150`.
|
||||
* **Easy Setup:** `refresh_toolbox.sh` automatically detects and exposes RDMA devices.
|
||||
* **Cluster Management:** Included `start-vllm-cluster` TUI for managing Ray and vLLM.
|
||||
@@ -0,0 +1,14 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
def run_dialog(args):
|
||||
"""Runs dialog and returns stderr (selection line). Returns None if user cancelled."""
|
||||
with tempfile.NamedTemporaryFile(mode="w+") as tf:
|
||||
cmd = ["dialog"] + args
|
||||
try:
|
||||
# We don't trap stdout since dialog renders to TTY and writes choice to stderr
|
||||
subprocess.run(cmd, stderr=tf, check=True)
|
||||
tf.seek(0)
|
||||
return tf.read().strip()
|
||||
except subprocess.CalledProcessError:
|
||||
return None # User cancelled/pressed ESC
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 524.2037815230142,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3815310134141399,
|
||||
"tokens_per_second": 280.05330212131406
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 485.412814248004,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4120204373051785,
|
||||
"tokens_per_second": 302.43330149293365
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 421.75657659699937,
|
||||
"elapsed_time": 424.04632396099623,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4742071875054738,
|
||||
"tokens_per_second": 348.0799308087054
|
||||
"requests_per_second": 0.4716465836369236,
|
||||
"tokens_per_second": 346.2003835540928
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 868.8101008250001,
|
||||
"elapsed_time": 918.187000697013,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.2301999019234296,
|
||||
"tokens_per_second": 168.9724830093454
|
||||
"requests_per_second": 0.21782055272855774,
|
||||
"tokens_per_second": 159.8857312165796
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 456.08530166203855,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4385144604993234,
|
||||
"tokens_per_second": 321.88057686801585
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 503.28860085096676,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.397386310084984,
|
||||
"tokens_per_second": 291.6914862601304
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 457.7749735690013,
|
||||
"elapsed_time": 458.737264430034,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4368958801760569,
|
||||
"tokens_per_second": 320.69249844623016
|
||||
"requests_per_second": 0.4359794058773347,
|
||||
"tokens_per_second": 320.0197833991106
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 644.1538858940003,
|
||||
"elapsed_time": 686.8188757880125,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3104848148551126,
|
||||
"tokens_per_second": 227.90361622402403
|
||||
"requests_per_second": 0.29119758796747197,
|
||||
"tokens_per_second": 213.74630950782364
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 534.8865945799625,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3739110346503573,
|
||||
"tokens_per_second": 274.46004720922855
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 571.4193902639672,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.35000562355367393,
|
||||
"tokens_per_second": 256.91287782898553
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 534.4193308840004,
|
||||
"elapsed_time": 524.8208868440124,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3742379596733028,
|
||||
"tokens_per_second": 274.7000183491961
|
||||
"requests_per_second": 0.38108239403864297,
|
||||
"tokens_per_second": 279.7240042842149
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 733.5017090729998,
|
||||
"elapsed_time": 789.1420173590304,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.2726646680247824,
|
||||
"tokens_per_second": 200.1426829468909
|
||||
"requests_per_second": 0.2534398062712803,
|
||||
"tokens_per_second": 186.03115379827653
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 805.9022228560061,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.24816906360082697,
|
||||
"tokens_per_second": 182.16229690959702
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 824.4905019259895,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.2425740497104635,
|
||||
"tokens_per_second": 178.05541683872298
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 879.0596038709991,
|
||||
"elapsed_time": 748.1414223780157,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.22751585799106944,
|
||||
"tokens_per_second": 167.00232766189475
|
||||
"requests_per_second": 0.2673291359329993,
|
||||
"tokens_per_second": 196.2262690032198
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1109.9732099440007,
|
||||
"elapsed_time": 1168.3619703819859,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.18018452896722634,
|
||||
"tokens_per_second": 132.2599488751683
|
||||
"requests_per_second": 0.17117982703135376,
|
||||
"tokens_per_second": 125.65027253668944
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 510.63144373201067,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.391671923958063,
|
||||
"tokens_per_second": 291.5155379231269
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 572.292031740013,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.3494719285046033,
|
||||
"tokens_per_second": 260.10671430704866
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 504.69023761399876,
|
||||
"elapsed_time": 520.7929677469656,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.39628268013570256,
|
||||
"tokens_per_second": 294.9472545848014
|
||||
"requests_per_second": 0.3840297630462106,
|
||||
"tokens_per_second": 285.8275921888489
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 876.911706677,
|
||||
"elapsed_time": 930.6109793490032,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.22807313265081958,
|
||||
"tokens_per_second": 169.75141153501525
|
||||
"requests_per_second": 0.2149125729635249,
|
||||
"tokens_per_second": 159.95620436815713
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 237.61095946098794,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.8417120172137385,
|
||||
"tokens_per_second": 613.9321196754427
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 284.23000320699066,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.7036554823325597,
|
||||
"tokens_per_second": 513.235753981134
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 244.51837097500174,
|
||||
"elapsed_time": 247.22850671299966,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.8179344529513773,
|
||||
"tokens_per_second": 596.5891209659404
|
||||
"requests_per_second": 0.8089681997399035,
|
||||
"tokens_per_second": 590.0492703672895
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 380.55349342600005,
|
||||
"elapsed_time": 395.08209386101225,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.5255502930730307,
|
||||
"tokens_per_second": 383.3285005130725
|
||||
"requests_per_second": 0.5062239041143659,
|
||||
"tokens_per_second": 369.23212230245684
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1361.426551499986,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146523,
|
||||
"requests_per_second": 0.14690473002722398,
|
||||
"tokens_per_second": 107.62460878889469
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1474.255295659008,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146523,
|
||||
"requests_per_second": 0.13566171380825723,
|
||||
"tokens_per_second": 99.38780646163637
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1482.2689266130328,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146523,
|
||||
"requests_per_second": 0.13492828218223374,
|
||||
"tokens_per_second": 98.85048345093716
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1724.1368565150187,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.11600007229371459,
|
||||
"tokens_per_second": 85.2809331488931
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1338.8605944840237,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.1493807501871223,
|
||||
"tokens_per_second": 109.82173992256857
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1307.2402118169994,
|
||||
"elapsed_time": 1199.1163451180328,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.15299406963775225,
|
||||
"tokens_per_second": 112.4781801162827
|
||||
"requests_per_second": 0.16678948695367285,
|
||||
"tokens_per_second": 122.62029501860121
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1886.751298176,
|
||||
"elapsed_time": 1959.4152568069985,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.10600231211890418,
|
||||
"tokens_per_second": 77.93077982357597
|
||||
"requests_per_second": 0.10207126810164463,
|
||||
"tokens_per_second": 75.0407548829671
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 243.98866786801955,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.819710201082723,
|
||||
"tokens_per_second": 602.6345456319963
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 282.0738571010297,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.7090341588386423,
|
||||
"tokens_per_second": 521.2677328949931
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 247.62527259899798,
|
||||
"elapsed_time": 242.14750060701044,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.8076720033495051,
|
||||
"tokens_per_second": 593.7843034224891
|
||||
"requests_per_second": 0.825942863331829,
|
||||
"tokens_per_second": 607.216674264294
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 341.2666312900001,
|
||||
"elapsed_time": 357.72086531698005,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.5860520240258851,
|
||||
"tokens_per_second": 430.8537270233502
|
||||
"requests_per_second": 0.5590951476167821,
|
||||
"tokens_per_second": 411.03557062490586
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 486.3392907420057,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146278,
|
||||
"requests_per_second": 0.41123553824915293,
|
||||
"tokens_per_second": 300.773560320048
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 455.7690629530116,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146278,
|
||||
"requests_per_second": 0.4388187269758136,
|
||||
"tokens_per_second": 320.9476287228403
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 422.7612150579989,
|
||||
"elapsed_time": 398.827027003048,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146278,
|
||||
"requests_per_second": 0.47308029420949094,
|
||||
"tokens_per_second": 346.0061963818796
|
||||
"requests_per_second": 0.5014705284716613,
|
||||
"tokens_per_second": 366.77052981888835
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 594.5536415039987,
|
||||
"elapsed_time": 610.5734472059994,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146278,
|
||||
"requests_per_second": 0.33638680522429343,
|
||||
"tokens_per_second": 246.02994547299596
|
||||
"requests_per_second": 0.32756091984544267,
|
||||
"tokens_per_second": 239.57478116575834
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 497.111974740983,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.40232384284085837,
|
||||
"tokens_per_second": 295.31575874126105
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 471.133652363962,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4245079904534079,
|
||||
"tokens_per_second": 311.59947769256274
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 395.26841144900027,
|
||||
"elapsed_time": 399.3928133630543,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.5059852854591319,
|
||||
"tokens_per_second": 371.4058491591393
|
||||
"requests_per_second": 0.5007601371589951,
|
||||
"tokens_per_second": 367.5704596781314
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 769.1666062429999,
|
||||
"elapsed_time": 813.6141017450136,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.260021688898978,
|
||||
"tokens_per_second": 190.86242019407229
|
||||
"requests_per_second": 0.24581678165489804,
|
||||
"tokens_per_second": 180.43566315423652
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 456.45958357997006,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.4381548929949473,
|
||||
"tokens_per_second": 321.6166453306162
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 490.5911466999678,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.40767144157681784,
|
||||
"tokens_per_second": 299.24102990342374
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 464.71097393700256,
|
||||
"elapsed_time": 440.66104900900973,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.43037503139986644,
|
||||
"tokens_per_second": 315.906032423287
|
||||
"requests_per_second": 0.4538635771184551,
|
||||
"tokens_per_second": 333.147212194374
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 638.3282979609994,
|
||||
"elapsed_time": 683.9224744850071,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.31331839844615444,
|
||||
"tokens_per_second": 229.9835374194385
|
||||
"requests_per_second": 0.29243080533447857,
|
||||
"tokens_per_second": 214.65152188564062
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 517.5916094129789,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.38640502736670695,
|
||||
"tokens_per_second": 283.6309502128471
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 548.4156070559984,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3646869225214776,
|
||||
"tokens_per_second": 267.6893183038276
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 502.6907218439992,
|
||||
"elapsed_time": 497.59323585999664,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.3978589444944367,
|
||||
"tokens_per_second": 292.0384117325289
|
||||
"requests_per_second": 0.4019347241614679,
|
||||
"tokens_per_second": 295.0301359026215
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 721.7994779089986,
|
||||
"elapsed_time": 780.1687226030044,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.2770852655357769,
|
||||
"tokens_per_second": 203.38751203489863
|
||||
"requests_per_second": 0.2563548040386794,
|
||||
"tokens_per_second": 188.17083503449163
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 802.5698999410379,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.24919947784572202,
|
||||
"tokens_per_second": 182.9186467257061
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 839.8958681730437,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.23812475757863116,
|
||||
"tokens_per_second": 174.78952518165474
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 886.8526372269989,
|
||||
"elapsed_time": 757.2171181479935,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.2255166096425645,
|
||||
"tokens_per_second": 165.5348293928834
|
||||
"requests_per_second": 0.2641250378612165,
|
||||
"tokens_per_second": 193.87438091607942
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1084.3601952080007,
|
||||
"elapsed_time": 1144.2253085140255,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146805,
|
||||
"requests_per_second": 0.18444055848217136,
|
||||
"tokens_per_second": 135.3839809398758
|
||||
"requests_per_second": 0.1747907501362075,
|
||||
"tokens_per_second": 128.30078036872973
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 373.92354663898004,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.5348686965496139,
|
||||
"tokens_per_second": 398.09474781142933
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 434.26602390100015,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.46054719686197254,
|
||||
"tokens_per_second": 342.7783704164132
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 369.2837602610016,
|
||||
"elapsed_time": 374.03978066996206,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.5415889392445647,
|
||||
"tokens_per_second": 403.09652364564084
|
||||
"requests_per_second": 0.5347024844303181,
|
||||
"tokens_per_second": 397.9710386242193
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 509.0738683320001,
|
||||
"elapsed_time": 555.4390292470343,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 148857,
|
||||
"requests_per_second": 0.39287029337276264,
|
||||
"tokens_per_second": 292.4074663029466
|
||||
"requests_per_second": 0.36007552488906747,
|
||||
"tokens_per_second": 267.99881204205957
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 213.75922767800512,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.9356321229849724,
|
||||
"tokens_per_second": 682.4360360233941
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 264.80451649799943,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.7552741269105618,
|
||||
"tokens_per_second": 550.8856190566602
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 224.76228898300178,
|
||||
"elapsed_time": 224.3753512299736,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.8898289873490544,
|
||||
"tokens_per_second": 649.02791593759
|
||||
"requests_per_second": 0.8913635071929533,
|
||||
"tokens_per_second": 650.1471716939323
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 322.171811016,
|
||||
"elapsed_time": 336.45260514499387,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 145877,
|
||||
"requests_per_second": 0.620786776376495,
|
||||
"tokens_per_second": 452.7925628873698
|
||||
"requests_per_second": 0.5944373648520577,
|
||||
"tokens_per_second": 433.5736973626181
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1484.8385301349917,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146523,
|
||||
"requests_per_second": 0.1346947805710681,
|
||||
"tokens_per_second": 98.67941666807306
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1493.5249834020506,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 146523,
|
||||
"requests_per_second": 0.13391138562973798,
|
||||
"tokens_per_second": 98.10548978313048
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1707.9124416089617,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.11710202181769186,
|
||||
"tokens_per_second": 86.0910643999307
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1320.7432732739835,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.1514298834959962,
|
||||
"tokens_per_second": 111.32822174858649
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1315.035868578001,
|
||||
"elapsed_time": 1242.463667072996,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.15208710635115047,
|
||||
"tokens_per_second": 111.8113988472388
|
||||
"requests_per_second": 0.16097050183460196,
|
||||
"tokens_per_second": 118.34229353876268
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 1923.4690410719995,
|
||||
"elapsed_time": 1966.935257990961,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.10397879858182421,
|
||||
"tokens_per_second": 76.44313314138553
|
||||
"requests_per_second": 0.10168102848706935,
|
||||
"tokens_per_second": 74.75385852312364
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 299.5004001749912,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.6677787404729495,
|
||||
"tokens_per_second": 490.93757442090305
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 282.87595599895576,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.707023682142636,
|
||||
"tokens_per_second": 519.7896706376232
|
||||
}
|
||||
+3
-3
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 246.0529060009976,
|
||||
"elapsed_time": 244.54776988498634,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.8128333180474167,
|
||||
"tokens_per_second": 597.5787987620997
|
||||
"requests_per_second": 0.8178361229548825,
|
||||
"tokens_per_second": 601.2567608739705
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"elapsed_time": 333.59849170300004,
|
||||
"elapsed_time": 362.9645123449736,
|
||||
"num_requests": 200,
|
||||
"total_num_tokens": 147036,
|
||||
"requests_per_second": 0.5995230943012126,
|
||||
"tokens_per_second": 440.75738846836555
|
||||
"requests_per_second": 0.5510180560294371,
|
||||
"tokens_per_second": 405.0974544317216
|
||||
}
|
||||
@@ -15,18 +15,21 @@ except ImportError:
|
||||
print("Error: 'transformers' not found. Please install it or run in vLLM environment.")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Import path handling for scripts/models.py
|
||||
try:
|
||||
import sys, os
|
||||
sys.path.append(str(Path(__file__).parent.parent / "scripts"))
|
||||
import models
|
||||
import cluster_manager # Import shared cluster logic
|
||||
except ImportError:
|
||||
print("Error: Could not import scripts/models.py.")
|
||||
print("Error: Could not import scripts/models.py or cluster_manager.py.")
|
||||
sys.exit(1)
|
||||
|
||||
# Import Utils from run_vllm_bench (keep utils shared)
|
||||
try:
|
||||
from run_vllm_bench import get_gpu_count, kill_vllm
|
||||
from run_vllm_bench import kill_vllm
|
||||
# We do NOT import get_gpu_count because we are overriding it for cluster awareness
|
||||
except ImportError:
|
||||
print("Error: Could not import run_vllm_bench.py.")
|
||||
sys.exit(1)
|
||||
@@ -65,7 +68,30 @@ CONCURRENCY_STEPS = [1, 4, 8, 16]
|
||||
|
||||
def log(msg): print(f"[MAX-CTX] {msg}", flush=True)
|
||||
|
||||
def get_gpu_count():
|
||||
"""
|
||||
Returns total GPUs.
|
||||
If Ray Cluster is active, returns TOTAL cluster GPUs (e.g., 2).
|
||||
Otherwise returns local AMD GPUs.
|
||||
"""
|
||||
if cluster_manager.check_ray_status():
|
||||
# Ideally we'd query Ray for total resources, but for this specific 2-node setup:
|
||||
# If cluster is up, we assume 2 nodes x 1 GPU = 2 GPUs.
|
||||
# Constructing a Ray client just to count is slow/complex here.
|
||||
log("Ray Cluster Detected: Assuming 2 GPUs available.")
|
||||
return 2
|
||||
|
||||
# Local Fallback
|
||||
try:
|
||||
res = subprocess.run("rocm-smi --showid", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if res.returncode == 0:
|
||||
return res.stdout.count("GPU")
|
||||
except: pass
|
||||
return 1
|
||||
|
||||
|
||||
def get_hf_context_limit(model_name, trust_remote=False):
|
||||
# ... (Keep existing implementation)
|
||||
try:
|
||||
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote)
|
||||
|
||||
@@ -95,6 +121,7 @@ def get_hf_context_limit(model_name, trust_remote=False):
|
||||
def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
|
||||
"""
|
||||
Constructs the vLLM serve command.
|
||||
Using Ray Backend if tp_size > 1 (Cluster Mode).
|
||||
"""
|
||||
config = MODEL_TABLE[model]
|
||||
|
||||
@@ -105,16 +132,47 @@ def get_vllm_server_cmd(model, tp_size, util, max_len, max_seqs):
|
||||
"--tensor-parallel-size", str(tp_size),
|
||||
"--max-num-seqs", str(max_seqs),
|
||||
"--dtype", "auto",
|
||||
# "--disable-log-stats" # Cleaner output, but user managed without it
|
||||
# "--disable-log-stats"
|
||||
]
|
||||
|
||||
# Env Setup
|
||||
env = os.environ.copy()
|
||||
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
|
||||
env.update(config.get("env", {}))
|
||||
|
||||
# CLUSTER / RAY LOGIC
|
||||
# Only if we need more than 1 GPU do we engage the cluster machinery
|
||||
if tp_size > 1:
|
||||
log(f"TP={tp_size} > 1: Using Ray Distributed Backend")
|
||||
cmd.extend(["--distributed-executor-backend", "ray"])
|
||||
|
||||
# Inject Cluster Env Vars (similar to start_vllm_cluster.py)
|
||||
# We need to know Head IP and RDMA Interface
|
||||
rdma_iface = cluster_manager.get_net_iface()
|
||||
head_ip = cluster_manager.get_local_ip(rdma_iface) # Assuming we run this ON HEAD
|
||||
|
||||
# IMPORTANT: vLLM needs to bind to the Head IP for Ray workers to reach it?
|
||||
# Or at least we should be explicit.
|
||||
cmd.extend(["--host", head_ip])
|
||||
|
||||
# Update our own process env so verify_context knows where to look?
|
||||
# No, verify_context runs in THIS process. We need to export it or pass it.
|
||||
# Simplest is to set it in os.environ for OUR process too, but that might be messy.
|
||||
# Better: We rely on standard PORT.
|
||||
|
||||
env["RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"] = "1"
|
||||
env["VLLM_HOST_IP"] = head_ip
|
||||
env["NCCL_SOCKET_IFNAME"] = rdma_iface
|
||||
env["NCCL_IB_GID_INDEX"] = "1"
|
||||
env["NCCL_IB_DISABLE"] = "0"
|
||||
env["NCCL_NET_GDR_LEVEL"] = "0"
|
||||
else:
|
||||
# Default Localhost bind for single node safety
|
||||
cmd.extend(["--host", "127.0.0.1"])
|
||||
|
||||
if config.get("trust_remote"): cmd.append("--trust-remote-code")
|
||||
if config.get("enforce_eager"): cmd.append("--enforce-eager")
|
||||
|
||||
# Add model specific env vars
|
||||
env = os.environ.copy()
|
||||
env.update(config.get("env", {}))
|
||||
|
||||
return cmd, env
|
||||
|
||||
def is_port_free(port):
|
||||
@@ -300,7 +358,14 @@ def verify_context(model, context_len):
|
||||
"""
|
||||
Sends a request to the server with length ~context_len to verify stability.
|
||||
"""
|
||||
url = f"http://{HOST}:{PORT}/v1/completions"
|
||||
# Use dynamic host if set (by cluster logic), else localhost
|
||||
# But wait, the env var is set for the SERVER process, not necessarily us?
|
||||
# Actually, we (the client script) need to know where to send requests.
|
||||
# If we are on Head, localhost is fine for Head-based server.
|
||||
# But if we use Ray, vLLM head usually binds to HOST IP.
|
||||
|
||||
target_host = os.getenv("VLLM_HOST_IP", "127.0.0.1")
|
||||
url = f"http://{target_host}:{PORT}/v1/completions"
|
||||
|
||||
# We use a simple "A " * N prompt.
|
||||
# Llama 3 tokenizer: "A" is usually 1 token.
|
||||
@@ -529,9 +594,22 @@ def main():
|
||||
continue
|
||||
|
||||
config = MODEL_TABLE[model]
|
||||
valid_tps = [t for t in config["valid_tp"] if t <= gpu_count]
|
||||
|
||||
for tp in valid_tps:
|
||||
# KEY CHANGES:
|
||||
# We only want to test the MINIMUM required TP.
|
||||
# If model supports 1 and 2, we ONLY test 1 (local is faster/easier).
|
||||
# We only test 2 if model VALID_TP *starts* with 2 (or higher).
|
||||
|
||||
valid_tps = config.get("valid_tp", [1])
|
||||
min_tp = min(valid_tps)
|
||||
|
||||
if min_tp > gpu_count:
|
||||
log(f"Skipping {model}: Requires TP={min_tp} but only {gpu_count} GPUs available.")
|
||||
continue
|
||||
|
||||
tps_to_test = [min_tp]
|
||||
|
||||
for tp in tps_to_test:
|
||||
# Track successful seqs for this TP to skip lower utils
|
||||
# effectively: {seqs_count: max_working_util}
|
||||
# Since we iterate high-util -> low-util, if we succeeded already for this 'seqs', we skip.
|
||||
|
||||
@@ -2,6 +2,12 @@
|
||||
import subprocess, time, json, sys, os, requests, argparse
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import bench_utils
|
||||
except ImportError:
|
||||
sys.path.append(str(Path(__file__).parent))
|
||||
import bench_utils
|
||||
|
||||
|
||||
# =========================
|
||||
# ⚙️ GLOBAL SETTINGS
|
||||
@@ -89,38 +95,43 @@ def get_dataset():
|
||||
|
||||
|
||||
|
||||
def get_model_args(model, tp_size):
|
||||
def get_model_args(model, tp_size, overrides=None):
|
||||
config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
|
||||
overrides = overrides or {}
|
||||
|
||||
# Allow per-model GPU utilization override
|
||||
util = config.get("gpu_util", GPU_UTIL)
|
||||
util = overrides.get("gpu_util", config.get("gpu_util", GPU_UTIL))
|
||||
max_seq_override = overrides.get("max_num_seqs", config.get("max_num_seqs", "32"))
|
||||
|
||||
cmd = [
|
||||
"--model", model,
|
||||
"--gpu-memory-utilization", util,
|
||||
"--gpu-memory-utilization", str(util),
|
||||
"--dtype", "auto",
|
||||
"--tensor-parallel-size", str(tp_size),
|
||||
"--max-num-seqs", config["max_num_seqs"]
|
||||
"--max-num-seqs", str(max_seq_override)
|
||||
]
|
||||
|
||||
# Optional: if a model really needs a hard limit, we can still support "ctx" in config,
|
||||
# but by default we rely on auto.
|
||||
if "ctx" in config:
|
||||
cmd.extend(["--max-model-len", config["ctx"]])
|
||||
if "ctx" in overrides or "ctx" in config:
|
||||
cmd.extend(["--max-model-len", str(overrides.get("ctx", config.get("ctx")))])
|
||||
|
||||
if config.get("trust_remote"): cmd.append("--trust-remote-code")
|
||||
if config.get("enforce_eager"): cmd.append("--enforce-eager")
|
||||
|
||||
return cmd
|
||||
|
||||
def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None):
|
||||
def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DIR, extra_env=None, overrides=None):
|
||||
if tp_size not in MODEL_TABLE[model]["valid_tp"]: return
|
||||
overrides = overrides or {}
|
||||
|
||||
model_safe = model.replace("/", "_")
|
||||
output_dir_path = Path(output_dir)
|
||||
output_dir_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_file = output_dir_path / f"{model_safe}_tp{tp_size}_throughput.json"
|
||||
tag = overrides.get("tag", "").strip()
|
||||
tag_suffix = f"_{tag}" if tag else ""
|
||||
output_file = output_dir_path / f"{model_safe}_tp{tp_size}{tag_suffix}_throughput.json"
|
||||
|
||||
if output_file.exists():
|
||||
log(f"SKIP {model} (TP={tp_size} | {backend_name})")
|
||||
@@ -130,13 +141,13 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
|
||||
dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"]
|
||||
|
||||
# Retrieve Model-Specific Batch Tokens
|
||||
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
batch_tokens = str(overrides.get("max_tokens", MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)))
|
||||
|
||||
log(f"START {model} (TP={tp_size} | {backend_name}) [Batch: {batch_tokens}]...")
|
||||
kill_vllm()
|
||||
nuke_vllm_cache()
|
||||
|
||||
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size)
|
||||
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, tp_size, overrides)
|
||||
cmd.extend([
|
||||
"--num-prompts", str(OFF_NUM_PROMPTS),
|
||||
"--max-num-batched-tokens", batch_tokens,
|
||||
@@ -152,6 +163,7 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
|
||||
|
||||
# ENV Setup: Global + Model Specific
|
||||
env = os.environ.copy()
|
||||
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
|
||||
|
||||
# Inject model specific env vars (e.g. for AWQ)
|
||||
model_env = MODEL_TABLE[model].get("env", {})
|
||||
@@ -168,35 +180,64 @@ def run_throughput(model, tp_size, backend_name="Default", output_dir=RESULTS_DI
|
||||
|
||||
|
||||
def print_summary(tps):
|
||||
print(f"\n{'MODEL':<40} | {'TP':<2} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 75)
|
||||
print(f"\n{'MODEL':<40} | {'TP':<2} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 92)
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
msafe = m.replace("/", "_")
|
||||
name_cell = m.split('/')[-1]
|
||||
|
||||
for tp in tps:
|
||||
if tp not in MODEL_TABLE[m]["valid_tp"]: continue
|
||||
|
||||
# Default
|
||||
try:
|
||||
p1 = RESULTS_DIR / f"{msafe}_tp{tp}_throughput.json"
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
except: val1 = "N/A"
|
||||
prefix = f"{msafe}_tp{tp}"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm") / f"{msafe}_tp{tp}_throughput.json"
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
except: val2 = "N/A"
|
||||
tags = set()
|
||||
for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
|
||||
name_part = p.name[len(prefix):-len("_throughput.json")]
|
||||
tag = name_part.lstrip("_")
|
||||
tags.add(tag)
|
||||
|
||||
for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
|
||||
name_part = p.name[len(prefix):-len("_throughput.json")]
|
||||
tag = name_part.lstrip("_")
|
||||
tags.add(tag)
|
||||
|
||||
if not tags:
|
||||
tags.add("") # Default empty tag if no files found
|
||||
|
||||
for tag in sorted(list(tags)):
|
||||
tag_suffix = f"_{tag}" if tag else ""
|
||||
|
||||
# Default
|
||||
try:
|
||||
p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
|
||||
if p1.exists():
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
else:
|
||||
val1 = "N/A"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
|
||||
if p2.exists():
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
else:
|
||||
val2 = "N/A"
|
||||
except: val2 = "N/A"
|
||||
|
||||
name_cell = m.split('/')[-1]
|
||||
print(f"{name_cell:<40} | {tp:<2} | {val1:<8} | {val2:<8}")
|
||||
print("-" * 75)
|
||||
display_tag = tag if tag else "(Default)"
|
||||
print(f"{name_cell:<40} | {tp:<2} | {display_tag:<15} | {val1:<8} | {val2:<8}")
|
||||
|
||||
print("-" * 92)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--tp", type=int, nargs="+", default=[1])
|
||||
parser.add_argument("--tui", action="store_true", help="Launch interactive configuration UI")
|
||||
args = parser.parse_args()
|
||||
|
||||
gpu_count = get_gpu_count()
|
||||
@@ -207,17 +248,86 @@ if __name__ == "__main__":
|
||||
log(f"Requested TP={args.tp} but only {gpu_count} GPU(s) detected. Nothing to run.")
|
||||
sys.exit(0)
|
||||
|
||||
selected_models = MODELS_TO_RUN
|
||||
|
||||
if args.tui:
|
||||
# TUI Model Selection
|
||||
checklist_args = [
|
||||
"--clear", "--backtitle", "AMD vLLM Benchmark Launcher",
|
||||
"--title", "Model Selection",
|
||||
"--checklist", "Select models to benchmark:", "20", "65", "10"
|
||||
]
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
m_name = m.split("/")[-1]
|
||||
# All selected "on" by default
|
||||
checklist_args.extend([m, m_name, "on"])
|
||||
|
||||
choice = bench_utils.run_dialog(checklist_args)
|
||||
|
||||
if choice is None:
|
||||
subprocess.run(["clear"])
|
||||
print("Cancelled by user.")
|
||||
sys.exit(0)
|
||||
|
||||
# Parse space-separated quoted output from dialog checklist
|
||||
import shlex
|
||||
selected_models = [m for m in shlex.split(choice)]
|
||||
|
||||
if not selected_models:
|
||||
subprocess.run(["clear"])
|
||||
print("No models selected. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
kill_vllm()
|
||||
for tp in valid_tp_args:
|
||||
for m in MODELS_TO_RUN:
|
||||
for m in selected_models:
|
||||
overrides = {}
|
||||
if args.tui:
|
||||
config = MODEL_TABLE.get(m, {})
|
||||
default_seqs = config.get("max_num_seqs", "32")
|
||||
default_tokens = config.get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
default_util = config.get("gpu_util", GPU_UTIL)
|
||||
default_ctx = config.get("ctx", "auto")
|
||||
|
||||
form_args = [
|
||||
"--clear", "--backtitle", f"AMD vLLM Benchmark Configuration (TP: {tp})",
|
||||
"--title", f"Tune Parameters: {m.split('/')[-1]}",
|
||||
"--form", "Edit the options below. Leave tag empty for no suffix.",
|
||||
"15", "70", "5",
|
||||
"Max Concurrent Seqs:", "1", "1", str(default_seqs), "1", "25", "15", "0",
|
||||
"Max Batched Tokens:", "2", "1", str(default_tokens), "2", "25", "15", "0",
|
||||
"GPU Utilization (0-1):", "3", "1", str(default_util), "3", "25", "15", "0",
|
||||
"Max Context Length:", "4", "1", str(default_ctx), "4", "25", "15", "0",
|
||||
"Filename Tag (Optional):", "5", "1", "", "5", "25", "15", "0"
|
||||
]
|
||||
|
||||
form_res = bench_utils.run_dialog(form_args)
|
||||
if form_res is None:
|
||||
subprocess.run(["clear"])
|
||||
print(f"Skipping {m} (TP={tp}) due to user cancellation.")
|
||||
continue
|
||||
|
||||
lines = form_res.splitlines()
|
||||
if len(lines) >= 5:
|
||||
overrides["max_num_seqs"] = lines[0].strip()
|
||||
overrides["max_tokens"] = lines[1].strip()
|
||||
overrides["gpu_util"] = lines[2].strip()
|
||||
|
||||
ctx_val = lines[3].strip()
|
||||
if ctx_val and ctx_val.lower() != "auto":
|
||||
overrides["ctx"] = ctx_val
|
||||
|
||||
overrides["tag"] = lines[4].strip()
|
||||
|
||||
# 1. Default (Triton)
|
||||
run_throughput(m, tp, "Default", RESULTS_DIR)
|
||||
run_throughput(m, tp, "Default", RESULTS_DIR, overrides=overrides)
|
||||
|
||||
# 2. ROCm Attention
|
||||
# We force this via CLI argument --attention-backend ROCM_ATTN below
|
||||
# No specific env vars needed if forcing backend.
|
||||
rocm_env = {}
|
||||
print(f"[DEBUG] Forcing ROCm Env: {rocm_env} + CLI: --attention-backend ROCM_ATTN")
|
||||
run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm", rocm_env)
|
||||
run_throughput(m, tp, "ROCm-Attn", "benchmark_results_rocm", rocm_env, overrides=overrides)
|
||||
|
||||
print_summary(valid_tp_args)
|
||||
|
||||
@@ -2,6 +2,12 @@
|
||||
import subprocess, time, json, sys, os, requests, argparse, re
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import bench_utils
|
||||
except ImportError:
|
||||
sys.path.append(str(Path(__file__).parent))
|
||||
import bench_utils
|
||||
|
||||
# Import models immediately to access globals
|
||||
try:
|
||||
import models
|
||||
@@ -23,6 +29,8 @@ except ImportError:
|
||||
# User requested specifically to test with TP=2 on the cluster.
|
||||
CLUSTER_TP = 2
|
||||
GPU_UTIL = "0.90"
|
||||
FORCE_ETH = False
|
||||
FORCE_DEBUG_NCCL = False
|
||||
|
||||
# THROUGHPUT CONFIG (Imported from models.py)
|
||||
OFF_NUM_PROMPTS = models.OFF_NUM_PROMPTS
|
||||
@@ -66,6 +74,15 @@ def log(msg): print(f"\n[CLUSTER-BENCH] {msg}")
|
||||
def restart_cluster():
|
||||
log("Restarting Ray Cluster (Clean State)...")
|
||||
|
||||
# Push config to env so cluster_manager picks it up for daemon injection
|
||||
os.environ["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
|
||||
if FORCE_DEBUG_NCCL:
|
||||
os.environ["NCCL_DEBUG"] = "INFO"
|
||||
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
|
||||
else:
|
||||
os.environ.pop("NCCL_DEBUG", None)
|
||||
os.environ.pop("NCCL_DEBUG_SUBSYS", None)
|
||||
|
||||
# 1. Stop Cluster (Best Effort)
|
||||
cluster_manager.stop_cluster()
|
||||
|
||||
@@ -89,7 +106,8 @@ def restart_cluster():
|
||||
log("Cluster Ready.")
|
||||
|
||||
def get_net_iface():
|
||||
return cluster_manager.get_net_iface()
|
||||
prefix = ".".join(HEAD_IP.split('.')[:3])
|
||||
return cluster_manager.get_net_iface(prefix)
|
||||
|
||||
def get_local_ip(iface):
|
||||
return cluster_manager.get_local_ip(iface)
|
||||
@@ -122,6 +140,7 @@ def get_cluster_env():
|
||||
host_ip = get_local_ip(rdma_iface)
|
||||
|
||||
env = os.environ.copy()
|
||||
env["VLLM_DISABLE_COMPILE_CACHE"] = "1"
|
||||
|
||||
# Critical Cluster Envs (Match start_vllm_cluster.py)
|
||||
env["RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"] = "1"
|
||||
@@ -130,31 +149,37 @@ def get_cluster_env():
|
||||
env["GLOO_SOCKET_IFNAME"] = rdma_iface
|
||||
# RCCL specific
|
||||
env["NCCL_IB_GID_INDEX"] = "1"
|
||||
env["NCCL_IB_DISABLE"] = "0"
|
||||
env["NCCL_IB_DISABLE"] = "1" if FORCE_ETH else "0"
|
||||
env["NCCL_NET_GDR_LEVEL"] = "0"
|
||||
|
||||
# Stability for RDMA (Fix for high-throughput models like Gemma 3)
|
||||
env["NCCL_IB_TIMEOUT"] = "23" # ~32 seconds (default is 18/~1s)
|
||||
env["NCCL_IB_RETRY_CNT"] = "7" # Default is 3, increase for lossy networks
|
||||
|
||||
if FORCE_DEBUG_NCCL:
|
||||
env["NCCL_DEBUG"] = "INFO"
|
||||
env["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
|
||||
|
||||
return env
|
||||
|
||||
def get_model_args(model):
|
||||
def get_model_args(model, overrides=None):
|
||||
config = MODEL_TABLE.get(model, {"max_num_seqs": "32"})
|
||||
util = config.get("gpu_util", GPU_UTIL)
|
||||
overrides = overrides or {}
|
||||
util = overrides.get("gpu_util", config.get("gpu_util", GPU_UTIL))
|
||||
max_seq_override = overrides.get("max_num_seqs", config.get("max_num_seqs", "32"))
|
||||
|
||||
cmd = [
|
||||
"--model", model,
|
||||
"--gpu-memory-utilization", util,
|
||||
"--gpu-memory-utilization", str(util),
|
||||
"--dtype", "auto",
|
||||
"--tensor-parallel-size", str(CLUSTER_TP),
|
||||
"--max-num-seqs", config["max_num_seqs"],
|
||||
"--max-num-seqs", str(max_seq_override),
|
||||
"--distributed-executor-backend", "ray"
|
||||
]
|
||||
|
||||
# Optional ctx
|
||||
if "ctx" in config:
|
||||
cmd.extend(["--max-model-len", config["ctx"]])
|
||||
if "ctx" in overrides or "ctx" in config:
|
||||
cmd.extend(["--max-model-len", str(overrides.get("ctx", config.get("ctx")))])
|
||||
|
||||
if config.get("trust_remote"): cmd.append("--trust-remote-code")
|
||||
|
||||
@@ -163,16 +188,20 @@ def get_model_args(model):
|
||||
|
||||
return cmd
|
||||
|
||||
def get_benchmark_output_file(model, output_dir):
|
||||
def get_benchmark_output_file(model, output_dir, tag=""):
|
||||
model_safe = model.replace("/", "_")
|
||||
output_dir_path = Path(output_dir)
|
||||
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
eth_suffix = "_eth" if FORCE_ETH else ""
|
||||
tag_suffix = f"_{tag}" if tag else ""
|
||||
return output_dir_path / f"{model_safe}_cluster_tp{CLUSTER_TP}{eth_suffix}{tag_suffix}_throughput.json"
|
||||
|
||||
def run_bench_set(model, backend_name, output_dir, extra_env=None):
|
||||
def run_bench_set(model, backend_name, output_dir, extra_env=None, overrides=None):
|
||||
output_dir_path = Path(output_dir)
|
||||
output_dir_path.mkdir(parents=True, exist_ok=True)
|
||||
overrides = overrides or {}
|
||||
|
||||
output_file = get_benchmark_output_file(model, output_dir)
|
||||
tag = overrides.get("tag", "").strip()
|
||||
output_file = get_benchmark_output_file(model, output_dir, tag)
|
||||
|
||||
if output_file.exists():
|
||||
log(f"SKIP {model} [{backend_name}] (Result exists)")
|
||||
@@ -181,13 +210,13 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None):
|
||||
dataset_path = get_dataset()
|
||||
dataset_args = ["--dataset-name", "sharegpt", "--dataset-path", dataset_path] if dataset_path else ["--input-len", "1024"]
|
||||
|
||||
batch_tokens = MODEL_TABLE[model].get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
batch_tokens = str(overrides.get("max_tokens", MODEL_TABLE.get(model, {}).get("max_tokens", DEFAULT_BATCH_TOKENS)))
|
||||
|
||||
log(f"START {model} [TP={CLUSTER_TP} | {backend_name}]...")
|
||||
|
||||
nuke_vllm_cache()
|
||||
|
||||
cmd = ["vllm", "bench", "throughput"] + get_model_args(model)
|
||||
cmd = ["vllm", "bench", "throughput"] + get_model_args(model, overrides)
|
||||
cmd.extend([
|
||||
"--num-prompts", str(OFF_NUM_PROMPTS),
|
||||
"--max-num-batched-tokens", batch_tokens,
|
||||
@@ -218,20 +247,24 @@ def run_bench_set(model, backend_name, output_dir, extra_env=None):
|
||||
except Exception as e:
|
||||
log(f"ERROR: System error: {e}")
|
||||
|
||||
def run_cluster_throughput(model):
|
||||
def run_cluster_throughput(model, overrides=None):
|
||||
overrides = overrides or {}
|
||||
tag = overrides.get("tag", "").strip()
|
||||
|
||||
# 1. Default Run (Triton)
|
||||
if get_benchmark_output_file(model, RESULTS_DIR).exists():
|
||||
if get_benchmark_output_file(model, RESULTS_DIR, tag).exists():
|
||||
log(f"SKIP {model} [Default] (Result exists)")
|
||||
else:
|
||||
restart_cluster()
|
||||
run_bench_set(
|
||||
model,
|
||||
"Default",
|
||||
RESULTS_DIR
|
||||
RESULTS_DIR,
|
||||
overrides=overrides
|
||||
)
|
||||
|
||||
# 2. ROCm Attention Run
|
||||
if get_benchmark_output_file(model, "benchmark_results_rocm").exists():
|
||||
if get_benchmark_output_file(model, "benchmark_results_rocm", tag).exists():
|
||||
log(f"SKIP {model} [ROCm-Attn] (Result exists)")
|
||||
else:
|
||||
restart_cluster()
|
||||
@@ -239,47 +272,186 @@ def run_cluster_throughput(model):
|
||||
model,
|
||||
"ROCm-Attn",
|
||||
"benchmark_results_rocm",
|
||||
extra_env={}
|
||||
extra_env={},
|
||||
overrides=overrides
|
||||
)
|
||||
|
||||
|
||||
def print_summary():
|
||||
print(f"\n{'MODEL (TP=2)':<50} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 75)
|
||||
eth_suffix = "_eth" if FORCE_ETH else ""
|
||||
title_suffix = " (Ethernet ONLY)" if FORCE_ETH else ""
|
||||
print(f"\n{f'MODEL (TP={CLUSTER_TP}){title_suffix}':<50} | {'Tag':<15} | {'Triton':<8} | {'ROCm':<8}")
|
||||
print("-" * 92)
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
msafe = m.replace("/", "_")
|
||||
|
||||
# Default
|
||||
try:
|
||||
p1 = RESULTS_DIR / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm") / f"{msafe}_cluster_tp{CLUSTER_TP}_throughput.json"
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
except: val2 = "N/A"
|
||||
|
||||
name_cell = m.split('/')[-1]
|
||||
print(f"{name_cell:<50} | {val1:<8} | {val2:<8}")
|
||||
print("-" * 75)
|
||||
|
||||
# Find all tags used for this model by looking at the files in RESULTS_DIR
|
||||
prefix = f"{msafe}_cluster_tp{CLUSTER_TP}{eth_suffix}"
|
||||
|
||||
# Gather all unique tags from both directories
|
||||
tags = set()
|
||||
for p in RESULTS_DIR.glob(f"{prefix}*_throughput.json"):
|
||||
# Extract tag: {prefix}_{tag}_throughput.json or {prefix}_throughput.json
|
||||
name_part = p.name[len(prefix):-len("_throughput.json")]
|
||||
tag = name_part.lstrip("_")
|
||||
tags.add(tag)
|
||||
|
||||
for p in Path("benchmark_results_rocm").glob(f"{prefix}*_throughput.json"):
|
||||
name_part = p.name[len(prefix):-len("_throughput.json")]
|
||||
tag = name_part.lstrip("_")
|
||||
tags.add(tag)
|
||||
|
||||
if not tags:
|
||||
tags.add("") # Default empty tag if no files found
|
||||
|
||||
# Sort so empty tag (Default) comes first
|
||||
for tag in sorted(list(tags)):
|
||||
tag_suffix = f"_{tag}" if tag else ""
|
||||
|
||||
# Default (Triton)
|
||||
try:
|
||||
p1 = RESULTS_DIR / f"{prefix}{tag_suffix}_throughput.json"
|
||||
if p1.exists():
|
||||
d1 = json.loads(p1.read_text())
|
||||
val1 = f"{d1.get('tokens_per_second', 0):.1f}"
|
||||
else:
|
||||
val1 = "N/A"
|
||||
except: val1 = "N/A"
|
||||
|
||||
# ROCm
|
||||
try:
|
||||
p2 = Path("benchmark_results_rocm") / f"{prefix}{tag_suffix}_throughput.json"
|
||||
if p2.exists():
|
||||
d2 = json.loads(p2.read_text())
|
||||
val2 = f"{d2.get('tokens_per_second', 0):.1f}"
|
||||
else:
|
||||
val2 = "N/A"
|
||||
except: val2 = "N/A"
|
||||
|
||||
display_tag = tag if tag else "(Default)"
|
||||
print(f"{name_cell:<50} | {display_tag:<15} | {val1:<8} | {val2:<8}")
|
||||
|
||||
print("-" * 92)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# if not check_ray_status():
|
||||
# log("ERROR: Ray Cluster not ready. Please start it with 'start-vllm-cluster' first.")
|
||||
# sys.exit(1)
|
||||
# We now handle this by restarting the cluster ourselves.
|
||||
pass
|
||||
parser = argparse.ArgumentParser(description="VLLM Cluster Benchmark")
|
||||
parser.add_argument("--eth-only", action="store_true", help="Run benchmark using only Ethernet (disable RDMA/RoCE)")
|
||||
parser.add_argument("--debug-nccl", action="store_true", help="Enable NCCL Debug logging (INFO level for Transport tracking)")
|
||||
parser.add_argument("--tui", action="store_true", help="Launch interactive configuration UI")
|
||||
args = parser.parse_args()
|
||||
|
||||
FORCE_ETH = args.eth_only
|
||||
FORCE_DEBUG_NCCL = args.debug_nccl
|
||||
|
||||
selected_models = MODELS_TO_RUN
|
||||
|
||||
if args.tui:
|
||||
# 1. Cluster IPs Configuration
|
||||
form_args = [
|
||||
"--clear", "--backtitle", "AMD VLLM Cluster Configuration",
|
||||
"--title", "Cluster Network Details",
|
||||
"--form", "Verify Head and Worker IPs for this run:",
|
||||
"10", "60", "2",
|
||||
"Head Node IP:", "1", "1", HEAD_IP, "1", "20", "20", "0",
|
||||
"Worker Node IP:", "2", "1", WORKER_IP, "2", "20", "20", "0"
|
||||
]
|
||||
res = bench_utils.run_dialog(form_args)
|
||||
if res is None:
|
||||
subprocess.run(["clear"])
|
||||
print("Cancelled by user.")
|
||||
sys.exit(0)
|
||||
|
||||
lines = res.splitlines()
|
||||
if len(lines) >= 2:
|
||||
HEAD_IP = lines[0].strip()
|
||||
WORKER_IP = lines[1].strip()
|
||||
os.environ["VLLM_HEAD_IP"] = HEAD_IP
|
||||
os.environ["VLLM_WORKER_IP"] = WORKER_IP
|
||||
|
||||
# 2. Network Options (ETH / Debug)
|
||||
eth_status = "on" if FORCE_ETH else "off"
|
||||
debug_status = "on" if FORCE_DEBUG_NCCL else "off"
|
||||
check_args = [
|
||||
"--title", "Network Overrides",
|
||||
"--checklist", "Select custom backend flags:", "10", "60", "2",
|
||||
"ETH_ONLY", "Force Ethernet (Disable RDMA/RoCE)", eth_status,
|
||||
"DEBUG_NCCL", "Enable NCCL debug logs", debug_status
|
||||
]
|
||||
flags_res = bench_utils.run_dialog(check_args)
|
||||
if flags_res is not None:
|
||||
FORCE_ETH = "ETH_ONLY" in flags_res
|
||||
FORCE_DEBUG_NCCL = "DEBUG_NCCL" in flags_res
|
||||
|
||||
# 3. Model Selection
|
||||
checklist_args = [
|
||||
"--title", "Model Selection",
|
||||
"--checklist", "Select models to benchmark:", "20", "65", "10"
|
||||
]
|
||||
for m in MODELS_TO_RUN:
|
||||
m_name = m.split("/")[-1]
|
||||
checklist_args.extend([m, m_name, "on"])
|
||||
|
||||
choice = bench_utils.run_dialog(checklist_args)
|
||||
if choice is None:
|
||||
subprocess.run(["clear"])
|
||||
print("Cancelled by user.")
|
||||
sys.exit(0)
|
||||
|
||||
import shlex
|
||||
selected_models = [m for m in shlex.split(choice)]
|
||||
if not selected_models:
|
||||
subprocess.run(["clear"])
|
||||
print("No models selected. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
log("Ray Cluster Detected. Starting Benchmarks (Dual Backend)...")
|
||||
if FORCE_ETH:
|
||||
log("Note: Ethernet ONLY mode enabled. RDMA/RoCE disabled.")
|
||||
if FORCE_DEBUG_NCCL:
|
||||
log("Note: NCCL Debug mode enabled (Transport Logging).")
|
||||
log("Note: Eager Mode (--enforce-eager) is ENABLED for cluster stability.")
|
||||
|
||||
for m in MODELS_TO_RUN:
|
||||
run_cluster_throughput(m)
|
||||
for m in selected_models:
|
||||
overrides = {}
|
||||
if args.tui:
|
||||
config = MODEL_TABLE.get(m, {})
|
||||
default_seqs = config.get("max_num_seqs", "32")
|
||||
default_tokens = config.get("max_tokens", DEFAULT_BATCH_TOKENS)
|
||||
default_util = config.get("gpu_util", GPU_UTIL)
|
||||
default_ctx = config.get("ctx", "auto")
|
||||
|
||||
form_args = [
|
||||
"--clear", "--backtitle", f"AMD VLLM Cluster Benchmark Configuration (TP: {CLUSTER_TP})",
|
||||
"--title", f"Tune Parameters: {m.split('/')[-1]}",
|
||||
"--form", "Edit cluster model options. Leave tag empty for no suffix.",
|
||||
"15", "70", "5",
|
||||
"Max Concurrent Seqs:", "1", "1", str(default_seqs), "1", "25", "15", "0",
|
||||
"Max Batched Tokens:", "2", "1", str(default_tokens), "2", "25", "15", "0",
|
||||
"GPU Utilization (0-1):", "3", "1", str(default_util), "3", "25", "15", "0",
|
||||
"Max Context Length:", "4", "1", str(default_ctx), "4", "25", "15", "0",
|
||||
"Filename Tag (Optional):", "5", "1", "", "5", "25", "15", "0"
|
||||
]
|
||||
|
||||
form_res = bench_utils.run_dialog(form_args)
|
||||
if form_res is None:
|
||||
subprocess.run(["clear"])
|
||||
print(f"Skipping {m} due to user cancellation.")
|
||||
continue
|
||||
|
||||
lines = form_res.splitlines()
|
||||
if len(lines) >= 5:
|
||||
overrides["max_num_seqs"] = lines[0].strip()
|
||||
overrides["max_tokens"] = lines[1].strip()
|
||||
overrides["gpu_util"] = lines[2].strip()
|
||||
|
||||
ctx_val = lines[3].strip()
|
||||
if ctx_val and ctx_val.lower() != "auto":
|
||||
overrides["ctx"] = ctx_val
|
||||
|
||||
overrides["tag"] = lines[4].strip()
|
||||
|
||||
run_cluster_throughput(m, overrides=overrides)
|
||||
|
||||
print_summary()
|
||||
|
||||
+41
-11
@@ -4,7 +4,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>AMD Strix Halo (gfx1151) vLLM Benchmarks</title>
|
||||
<title>AMD Strix Halo (gfx1150) vLLM Benchmarks</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg-body: #f9fafb;
|
||||
@@ -445,7 +445,7 @@
|
||||
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1>AMD Strix Halo (gfx1151) vLLM Benchmarks</h1>
|
||||
<h1>AMD Strix Halo (gfx1150) vLLM Benchmarks</h1>
|
||||
<p style="margin: 4px 0 0 0; font-size: 0.9rem;">
|
||||
<a href="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes/" target="_blank"
|
||||
style="color: var(--primary); text-decoration: none;">View on GitHub →</a>
|
||||
@@ -469,6 +469,14 @@
|
||||
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
|
||||
<input type="checkbox" id="toggleTP2" checked> TP2
|
||||
</label>
|
||||
<label
|
||||
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
|
||||
<input type="checkbox" id="toggleTP2Eth"> TP2 (Eth)
|
||||
</label>
|
||||
<label
|
||||
style="font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 4px; cursor: pointer;">
|
||||
<input type="checkbox" id="toggleTP2Usb"> TP2 (Thunderbolt)
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<!-- Attention Group -->
|
||||
@@ -544,6 +552,8 @@
|
||||
activeTab: "Throughput",
|
||||
showTP1: true,
|
||||
showTP2: true,
|
||||
showTP2Eth: false,
|
||||
showTP2Usb: false,
|
||||
showTriton: true,
|
||||
showRocm: false
|
||||
};
|
||||
@@ -615,6 +625,8 @@
|
||||
// Toggles
|
||||
$('toggleTP1').addEventListener('change', e => { state.showTP1 = e.target.checked; render(); });
|
||||
$('toggleTP2').addEventListener('change', e => { state.showTP2 = e.target.checked; render(); });
|
||||
$('toggleTP2Eth').addEventListener('change', e => { state.showTP2Eth = e.target.checked; render(); });
|
||||
$('toggleTP2Usb').addEventListener('change', e => { state.showTP2Usb = e.target.checked; render(); });
|
||||
$('toggleTriton').addEventListener('change', e => { state.showTriton = e.target.checked; render(); });
|
||||
$('toggleRocm').addEventListener('change', e => { state.showRocm = e.target.checked; render(); });
|
||||
}
|
||||
@@ -636,13 +648,23 @@
|
||||
params: run.params_b || run.name_params_b,
|
||||
results: {
|
||||
1: { triton: null, rocm: null },
|
||||
2: { triton: null, rocm: null }
|
||||
2: { triton: null, rocm: null },
|
||||
"2_eth": { triton: null, rocm: null },
|
||||
"2_usb": { triton: null, rocm: null }
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
const m = testGroups[testName].models[modelName];
|
||||
const tp = run.tp || 1;
|
||||
let tp = run.tp || 1;
|
||||
if (tp === 2) {
|
||||
if (run.network === "Ethernet") {
|
||||
if (run.tag === "usb") tp = "2_usb";
|
||||
else tp = "2_eth";
|
||||
} else if (run.tag === "usb") {
|
||||
tp = "2_usb";
|
||||
}
|
||||
}
|
||||
|
||||
if (!m.results[tp]) m.results[tp] = { triton: null, rocm: null };
|
||||
|
||||
@@ -749,8 +771,16 @@
|
||||
if (state.showRocm) cols.push({ id: "tp1_rocm", label: "TP1 ROCm" });
|
||||
}
|
||||
if (state.showTP2) {
|
||||
if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 Triton" });
|
||||
if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 ROCm" });
|
||||
if (state.showTriton) cols.push({ id: "tp2_triton", label: "TP2 RoCE Triton" });
|
||||
if (state.showRocm) cols.push({ id: "tp2_rocm", label: "TP2 RoCE ROCm" });
|
||||
}
|
||||
if (state.showTP2Eth) {
|
||||
if (state.showTriton) cols.push({ id: "tp2_eth_triton", label: "TP2 Eth Triton" });
|
||||
if (state.showRocm) cols.push({ id: "tp2_eth_rocm", label: "TP2 Eth ROCm" });
|
||||
}
|
||||
if (state.showTP2Usb) {
|
||||
if (state.showTriton) cols.push({ id: "tp2_usb_triton", label: "TP2 TB Triton" });
|
||||
if (state.showRocm) cols.push({ id: "tp2_usb_rocm", label: "TP2 TB ROCm" });
|
||||
}
|
||||
|
||||
// Thead
|
||||
@@ -790,11 +820,7 @@
|
||||
|
||||
// Data Cells
|
||||
cols.forEach(c => {
|
||||
let val = null;
|
||||
if (c.id === "tp1_triton") val = m.results[1]?.triton;
|
||||
if (c.id === "tp1_rocm") val = m.results[1]?.rocm;
|
||||
if (c.id === "tp2_triton") val = m.results[2]?.triton;
|
||||
if (c.id === "tp2_rocm") val = m.results[2]?.rocm;
|
||||
let val = getVal(m, c.id);
|
||||
|
||||
const bg = c.id.startsWith("tp2") ? 'style="background:#fbfdff;"' : "";
|
||||
rowHtml += `<td class="col-data" ${bg}>${formatVal(val, unit)}</td>`;
|
||||
@@ -823,6 +849,10 @@
|
||||
if (colId === "tp1_rocm") return m.results[1]?.rocm;
|
||||
if (colId === "tp2_triton") return m.results[2]?.triton;
|
||||
if (colId === "tp2_rocm") return m.results[2]?.rocm;
|
||||
if (colId === "tp2_eth_triton") return m.results["2_eth"]?.triton;
|
||||
if (colId === "tp2_eth_rocm") return m.results["2_eth"]?.rocm;
|
||||
if (colId === "tp2_usb_triton") return m.results["2_usb"]?.triton;
|
||||
if (colId === "tp2_usb_rocm") return m.results["2_usb"]?.rocm;
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +66,30 @@ def parse_logs():
|
||||
if not tp_match: continue
|
||||
tp = int(tp_match.group(1))
|
||||
|
||||
# Network
|
||||
network = "RoCE"
|
||||
network_prefix = ""
|
||||
if "_eth" in rest:
|
||||
network = "Ethernet"
|
||||
network_prefix = "_eth"
|
||||
|
||||
# Tag Extraction
|
||||
tag = ""
|
||||
test_type_str = ""
|
||||
if "throughput" in fname:
|
||||
test_type_str = "_throughput.json"
|
||||
elif "latency" in fname:
|
||||
qps_match = re.search(r"(_qps[\d\.]+)_latency\.json$", rest)
|
||||
if qps_match:
|
||||
test_type_str = qps_match.group(0)
|
||||
else:
|
||||
test_type_str = "_latency.json"
|
||||
|
||||
raw_prefix = f"{tp}{network_prefix}"
|
||||
if rest.endswith(test_type_str):
|
||||
tag_part = rest[len(raw_prefix):-len(test_type_str)]
|
||||
tag = tag_part.lstrip("_")
|
||||
|
||||
# Model Name
|
||||
if "_" in model_part:
|
||||
model_display = model_part.replace("_", "/", 1)
|
||||
@@ -87,6 +111,8 @@ def parse_logs():
|
||||
"params_b": params_b,
|
||||
"name_params_b": params_b,
|
||||
"backend": backend_name, # "Triton" or "ROCm"
|
||||
"network": network,
|
||||
"tag": tag,
|
||||
"error": False
|
||||
}
|
||||
|
||||
|
||||
+863
-171
Rozdílový obsah nebyl zobrazen, protože je příliš veliký
Načíst rozdílové porovnání
Binární soubor nebyl zobrazen.
|
Za Šířka: | Výška: | Velikost: 584 KiB |
@@ -3,62 +3,140 @@
|
||||
# -------- dynamic config --------
|
||||
HOST_ROCE="192.168.100.2"
|
||||
HOST_ETH="192.168.1.127"
|
||||
HOST_TB="192.168.2.2"
|
||||
|
||||
# Automatically detect local and remote RDMA device names
|
||||
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
|
||||
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
|
||||
# Parse args
|
||||
RUN_ETH=true
|
||||
RUN_ROCE=true
|
||||
RUN_TB=true
|
||||
RUN_RDMA=true
|
||||
|
||||
# If any flags are provided, turn off defaults and only run requested
|
||||
if [ "$#" -gt 0 ]; then
|
||||
RUN_ETH=false
|
||||
RUN_ROCE=false
|
||||
RUN_TB=false
|
||||
RUN_RDMA=false
|
||||
fi
|
||||
|
||||
while getopts "ertih" opt; do
|
||||
case ${opt} in
|
||||
e ) RUN_ETH=true ;;
|
||||
r ) RUN_ROCE=true ;;
|
||||
t ) RUN_TB=true ;;
|
||||
i ) RUN_RDMA=true ;;
|
||||
h ) echo "Usage: $0 [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)]"
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -e Run benchmarking for standard Ethernet (1G LAN)."
|
||||
echo " -r Run benchmarking for RoCE NIC (via Ethernet/TCP)."
|
||||
echo " -t Run benchmarking for Thunderbolt link."
|
||||
echo " -i Run benchmarking for RDMA (RoCE v2)."
|
||||
echo " -h Print this help message and exit."
|
||||
echo
|
||||
echo "If no arguments are provided, all benchmarks are executed."
|
||||
exit 0
|
||||
;;
|
||||
\? ) echo "Usage: cmd [-e (Ethernet LAN)] [-r (RoCE Ethernet/TCP)] [-t (Thunderbolt)] [-i (RDMA/Infiniband)] [-h (Help)]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Automatically detect local and remote RDMA device names if needed
|
||||
if [ "$RUN_RDMA" = true ]; then
|
||||
RDMA_DEV_LOCAL=$(ibv_devices | awk 'NR==3 {print $1}')
|
||||
RDMA_DEV_REMOTE=$(ssh "$HOST_ROCE" "toolbox run -c vllm -- ibv_devices | awk 'NR==3 {print \$1}'")
|
||||
fi
|
||||
|
||||
WORKDIR="/tmp/rdma_bench"
|
||||
mkdir -p "$WORKDIR"
|
||||
|
||||
# -------- helpers --------
|
||||
parse_ping_avg() {
|
||||
grep rtt "$1" | awk -F'/' '{print $5}'
|
||||
if [ -f "$1" ]; then
|
||||
grep rtt "$1" | awk -F'/' '{print $5}'
|
||||
else
|
||||
echo "0"
|
||||
fi
|
||||
}
|
||||
|
||||
parse_iperf_gbps() {
|
||||
grep receiver "$1" | tail -n1 | awk '
|
||||
{
|
||||
val=$(NF-2);
|
||||
unit=$(NF-1);
|
||||
if (unit=="Mbits/sec") printf "%.2f", val/1000;
|
||||
else if (unit=="Gbits/sec") printf "%.2f", val;
|
||||
else print "N/A";
|
||||
}'
|
||||
if [ -f "$1" ]; then
|
||||
grep receiver "$1" | tail -n1 | awk '
|
||||
{
|
||||
val=$(NF-2);
|
||||
unit=$(NF-1);
|
||||
if (unit=="Mbits/sec") printf "%.2f", val/1000;
|
||||
else if (unit=="Gbits/sec") printf "%.2f", val;
|
||||
else print "0.00";
|
||||
}'
|
||||
else
|
||||
echo "0.00"
|
||||
fi
|
||||
}
|
||||
|
||||
parse_rdma_lat_us() {
|
||||
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
|
||||
echo "${val:-0}"
|
||||
if [ -f "$1" ]; then
|
||||
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $6}')
|
||||
echo "${val:-0}"
|
||||
else
|
||||
echo "0"
|
||||
fi
|
||||
}
|
||||
|
||||
parse_rdma_bw_mib() {
|
||||
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
|
||||
echo "${val:-0}"
|
||||
if [ -f "$1" ]; then
|
||||
val=$(grep -E '^[[:space:]]*[0-9]+' "$1" | tail -n1 | awk '{print $4}')
|
||||
echo "${val:-0}"
|
||||
else
|
||||
echo "0"
|
||||
fi
|
||||
}
|
||||
|
||||
# -------- normal ethernet --------
|
||||
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
|
||||
# Clear old results
|
||||
rm -f "$WORKDIR"/*.txt
|
||||
|
||||
# -------- roce ethernet (tcp) --------
|
||||
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
|
||||
if [ "$RUN_ETH" = true ]; then
|
||||
# -------- normal ethernet --------
|
||||
echo "[*] Benchmarking Ethernet (1G LAN)..."
|
||||
ping -c 10 "$HOST_ETH" > "$WORKDIR/ping_eth.txt"
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ETH" -P 8 -t 10 > "$WORKDIR/iperf_eth.txt"
|
||||
fi
|
||||
|
||||
# -------- rdma latency --------
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
|
||||
if [ "$RUN_ROCE" = true ]; then
|
||||
# -------- roce ethernet (tcp) --------
|
||||
echo "[*] Benchmarking RoCE NIC (Ethernet/TCP)..."
|
||||
ping -c 10 "$HOST_ROCE" > "$WORKDIR/ping_roce.txt"
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_ROCE" -P 8 -t 10 > "$WORKDIR/iperf_roce.txt"
|
||||
fi
|
||||
|
||||
# -------- rdma bandwidth (maximized) --------
|
||||
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
|
||||
if [ "$RUN_TB" = true ]; then
|
||||
# -------- thunderbolt ethernet (tcp) --------
|
||||
echo "[*] Benchmarking Thunderbolt..."
|
||||
ping -c 10 "$HOST_TB" > "$WORKDIR/ping_tb.txt"
|
||||
ssh "$HOST_TB" "toolbox run -c vllm -- iperf3 -s -1" >/dev/null 2>&1 &
|
||||
sleep 1
|
||||
iperf3 -c "$HOST_TB" -P 8 -t 10 > "$WORKDIR/iperf_tb.txt"
|
||||
fi
|
||||
|
||||
if [ "$RUN_RDMA" = true ]; then
|
||||
# -------- rdma latency --------
|
||||
echo "[*] Benchmarking RDMA (RoCE v2)..."
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_send_lat --rdma_cm -d $RDMA_DEV_REMOTE" > "$WORKDIR/rdma_lat_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_send_lat --rdma_cm -d "$RDMA_DEV_LOCAL" "$HOST_ROCE" > "$WORKDIR/rdma_lat_cli.txt" 2>&1
|
||||
|
||||
# -------- rdma bandwidth (maximized) --------
|
||||
# We use -x 1 because show_gids confirmed RoCE v2 is at Index 1
|
||||
ssh "$HOST_ROCE" "toolbox run -c vllm -- ib_write_bw -a -x 1 -q 8 -m 4096" > "$WORKDIR/rdma_bw_srv.txt" 2>&1 &
|
||||
sleep 2
|
||||
ib_write_bw -a -x 1 -q 8 -m 4096 "$HOST_ROCE" > "$WORKDIR/rdma_bw_cli.txt" 2>&1
|
||||
fi
|
||||
|
||||
# -------- parse --------
|
||||
ETH_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_eth.txt")
|
||||
@@ -67,13 +145,17 @@ ETH_BW=$(parse_iperf_gbps "$WORKDIR/iperf_eth.txt")
|
||||
ROCE_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_roce.txt")
|
||||
ROCE_BW=$(parse_iperf_gbps "$WORKDIR/iperf_roce.txt")
|
||||
|
||||
TB_LAT_MS=$(parse_ping_avg "$WORKDIR/ping_tb.txt")
|
||||
TB_BW=$(parse_iperf_gbps "$WORKDIR/iperf_tb.txt")
|
||||
|
||||
RDMA_LAT_US=$(parse_rdma_lat_us "$WORKDIR/rdma_lat_cli.txt")
|
||||
RDMA_BW_MIB=$(parse_rdma_bw_mib "$WORKDIR/rdma_bw_cli.txt")
|
||||
|
||||
# Convert units for dual display
|
||||
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')")
|
||||
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')")
|
||||
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')")
|
||||
ETH_LAT_US=$(python3 -c "print(f'{float(${ETH_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
|
||||
ROCE_LAT_US=$(python3 -c "print(f'{float(${ROCE_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
|
||||
TB_LAT_US=$(python3 -c "print(f'{float(${TB_LAT_MS:-0}) * 1000:.2f}')" 2>/dev/null || echo "0.00")
|
||||
RDMA_LAT_MS=$(python3 -c "print(f'{float(${RDMA_LAT_US:-0}) / 1000:.3f}')" 2>/dev/null || echo "0.00")
|
||||
|
||||
RDMA_BW_GBPS=$(python3 - <<EOF
|
||||
import sys
|
||||
@@ -88,9 +170,18 @@ EOF
|
||||
echo
|
||||
echo "=== Network Comparison ==="
|
||||
echo
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
|
||||
echo "----------------------------------------------------------------"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS} ms" "${ETH_LAT_US} us" "${ETH_BW} Gbps"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS} ms" "${ROCE_LAT_US} us" "${ROCE_BW} Gbps"
|
||||
printf "%-20s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS} ms" "${RDMA_LAT_US} us" "${RDMA_BW_GBPS} Gbps"
|
||||
printf "%-25s %-15s %-15s %-12s\n" "Path" "Latency (ms)" "Latency (us)" "Bandwidth"
|
||||
echo "-----------------------------------------------------------------------"
|
||||
if [ "$RUN_ETH" = true ]; then
|
||||
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (1G LAN)" "${ETH_LAT_MS:-0.00} ms" "${ETH_LAT_US:-0.00} us" "${ETH_BW:-0.00} Gbps"
|
||||
fi
|
||||
if [ "$RUN_ROCE" = true ]; then
|
||||
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (RoCE NIC)" "${ROCE_LAT_MS:-0.00} ms" "${ROCE_LAT_US:-0.00} us" "${ROCE_BW:-0.00} Gbps"
|
||||
fi
|
||||
if [ "$RUN_TB" = true ]; then
|
||||
printf "%-25s %-15s %-15s %-12s\n" "Ethernet (Thunderbolt)" "${TB_LAT_MS:-0.00} ms" "${TB_LAT_US:-0.00} us" "${TB_BW:-0.00} Gbps"
|
||||
fi
|
||||
if [ "$RUN_RDMA" = true ]; then
|
||||
printf "%-25s %-15s %-15s %-12s\n" "RDMA (RoCE)" "${RDMA_LAT_MS:-0.00} ms" "${RDMA_LAT_US:-0.00} us" "${RDMA_BW_GBPS:-0.00} Gbps"
|
||||
fi
|
||||
echo
|
||||
|
||||
Binární soubor nebyl zobrazen.
|
Za Šířka: | Výška: | Velikost: 6.5 MiB |
@@ -45,6 +45,8 @@ This guide details how to configure a two-node **AMD Strix Halo** cluster linked
|
||||
|
||||
## 2. Concepts & Architecture
|
||||
|
||||

|
||||
|
||||
To fully utilize the Strix Halo cluster, it is helpful to understand the technologies involved:
|
||||
|
||||
* **vLLM**: A high-performance inference engine. To run models larger than a single GPU (or APU) can handle, it splits the model using **Tensor Parallelism (TP)**.
|
||||
@@ -55,15 +57,20 @@ To fully utilize the Strix Halo cluster, it is helpful to understand the technol
|
||||
* **With RDMA**: Latency is ~5µs.
|
||||
* **Why it matters**: For interactive token generation, high latency kills performance. RoCE makes the two nodes feel like a single machine.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 3. Hardware Prerequisites
|
||||
|
||||

|
||||
|
||||
|
||||
* **Nodes**: 2x [Framework Desktop Mainboards](https://frame.work/gb/en/products/framework-desktop-mainboard-amd-ryzen-ai-max-300-series?v=FRAFMK0006) with AMD Ryzen AI MAX+ "Strix Halo", 128GB of Unified Memory.
|
||||
* **Network Cards**: [Intel Ethernet Controller E810-CQDA1](https://www.intel.com/content/www/us/en/products/sku/192558/intel-ethernet-network-adapter-e810cqda1/specifications.html) (or similar 100GbE QSFP28).
|
||||
* **Connection**: Direct Attach Copper (DAC) cable (e.g., [QSFPTEK 100G QSFP28 DAC](https://www.amazon.co.uk/dp/B09F32F7VK)). No switch required for 2 nodes.
|
||||
* **PCIe Note**: The Framework motherboard PCIe slot is physically **x4**, so a riser is required to plug in a 16x card (e.g., [CY PCI-E Express 4x to 16x Extender](https://www.amazon.co.uk/dp/B0837FZFJ6)). **Test Setup Note:** One of the boards in this setup has a modified PCIe slot (cut by Framework using an ultrasonic knife) to accept x16 cards directly. **This is not recommended for users.** Risers are the cheaper, safer, and easier solution. Performance is identical (~50Gbps bandwidth, ~5µs latency).
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 4. Host Configuration (Fedora)
|
||||
@@ -214,7 +221,7 @@ The cluster management and verification scripts rely on SSH to execute commands
|
||||
|
||||
### 5.2 Installation
|
||||
|
||||
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1151` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1151-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
|
||||
The toolbox container provided in this repo includes a **critical patch**: a custom-built `librccl.so` that enables `gfx1150` (Strix Halo) support for RDMA (https://github.com/kyuz0/rocm-systems/tree/gfx1150-rccl), which is currently missing in upstream ROCm packages. This library is automatically compiled using the [`build-rccl`](../.github/workflows/build-rccl.yml) GitHub Action in this repository, which generates the artifact that is then bundled into the Docker container.
|
||||
|
||||
To install the toolbox on **both nodes**, run:
|
||||
|
||||
@@ -223,7 +230,7 @@ To install the toolbox on **both nodes**, run:
|
||||
```
|
||||
|
||||
**What this does:**
|
||||
1. Pulls the latest `kyuz0/vllm-therock-gfx1151` image.
|
||||
1. Pulls the latest `kyuz0/vllm-therock-gfx1150` image.
|
||||
2. Detects if `/dev/infiniband` exists on your host.
|
||||
3. Creates the toolbox with flags to expose:
|
||||
* **iGPU Access**: `/dev/dri`, `/dev/kfd` (Required for ROCm)
|
||||
@@ -325,4 +332,62 @@ If you see link issues, ensure your Intel E810 firmware is up to date using the
|
||||
## 8. References & Acknowledgements
|
||||
|
||||
* **Reddit - Strix Halo Batching with Tensor Parallel**: [Thread by Hungry_Elk_3276](https://www.reddit.com/r/LocalLLaMA/comments/1p8nped/strix_halo_batching_with_tensor_parallel_and/)
|
||||
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1151` support in upstream RCCL.
|
||||
* Special thanks to user **Hungry_Elk_3276** for their initial experiments with vLLM RDMA, which highlighted the missing `gfx1150` support in upstream RCCL.
|
||||
|
||||
---
|
||||
|
||||
## 9. Alternative: Thunderbolt Networking
|
||||
|
||||
If you do not have dedicated 100GbE RDMA network cards, you can directly connect the two nodes using a high-quality **Thunderbolt 4 / USB4 cable**. This will create a `thunderbolt0` network interface.
|
||||
|
||||
While it lacks the ultra-low, microsecond-level latency of RDMA, it provides significantly more bandwidth than standard 1GbE/5GbE Ethernet and is easier to configure.
|
||||
|
||||
> **Note**: `thunderbolt-net` relies on the standard OS kernel TCP/IP stack.
|
||||
|
||||
### 9.1 Thunderbolt Configuration
|
||||
|
||||
**1. Establish Connection:**
|
||||
Connect the nodes directly using a certified Thunderbolt 4 or USB4 cable. Verify the link is active:
|
||||
```bash
|
||||
ip link show thunderbolt0
|
||||
```
|
||||
|
||||
**2. Network Configuration (Head - Node 1):**
|
||||
Configure a persistent connection using `nmcli` with a static IP and Jumbo Frames (reduces CPU overhead).
|
||||
*Note: Jumbo Frames may be unsupported on some Thunderbolt host controllers.*
|
||||
```bash
|
||||
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.1/24 mtu 9000
|
||||
sudo nmcli connection up thunderbolt0
|
||||
```
|
||||
|
||||
**3. Network Configuration (Worker - Node 2):**
|
||||
```bash
|
||||
sudo nmcli connection add type ethernet ifname thunderbolt0 con-name thunderbolt0 ipv4.method manual ipv4.addresses 192.168.2.2/24 mtu 9000
|
||||
sudo nmcli connection up thunderbolt0
|
||||
```
|
||||
|
||||
**4. Firewall Rules:**
|
||||
To ensure Ray and NCCL can communicate freely over this link:
|
||||
```bash
|
||||
# Assign the interface to the trusted zone permanently
|
||||
sudo firewall-cmd --permanent --zone=trusted --add-interface=thunderbolt0
|
||||
sudo firewall-cmd --reload
|
||||
```
|
||||
|
||||
### 9.2 Running vLLM over Thunderbolt
|
||||
|
||||
Our cluster scripts dynamically detect the network interface based on the provided IPs. There is no need to manually export environment variables!
|
||||
|
||||
1. Open the Toolbox: `toolbox enter vllm`
|
||||
2. Launch the cluster manager: `start-vllm-cluster`
|
||||
3. Select **Option 1 (Configure IPs)**.
|
||||
4. Set the **Head IP** explicitly to `192.168.2.1` and the **Worker IP** to `192.168.2.2`.
|
||||
5. Start the cluster normally (Option 2). The script will automatically discover and utilize `thunderbolt0` as the backend network for Ray orchestration and GPU synchronization.
|
||||
|
||||
### 9.3 Validating the Link
|
||||
I have added Thunderbolt support to the `compare_eth_vs_rdma.sh` script. Run it from inside the toolbox to see the latency and bandwidth of your Thunderbolt link compared to your other network interfaces.
|
||||
|
||||
You can use the `-t` flag to ONLY benchmark the Thunderbolt connection (or `-e`, `-r`, `-i` for the others):
|
||||
```bash
|
||||
/opt/compare_eth_vs_rdma.sh -t
|
||||
```
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Issue Report: vLLM Tensor Parallelism over RDMA on AMD Strix Halo
|
||||
|
||||
> **✅ RESOLVED (Feb 2, 2026)**
|
||||
> This issue is **SOLVED**. The root cause was indeed missing `gfx1151` support in the upstream RCCL library.
|
||||
> This issue is **SOLVED**. The root cause was indeed missing `gfx1150` support in the upstream RCCL library.
|
||||
>
|
||||
> I have patched and built a custom version of RCCL with native `gfx1151` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1151`).
|
||||
> I have patched and built a custom version of RCCL with native `gfx1150` support. This patched library is **now included** in the toolbox container provided by this repository (`kyuz0/vllm-therock-gfx1150`).
|
||||
>
|
||||
> See the [RDMA Cluster Setup Guide](setup_guide.md) for instructions on how to run the cluster using the fixed container.
|
||||
|
||||
@@ -12,8 +12,8 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
|
||||
|
||||
- **Current Status:** RDMA communication is verified (low latency ~5µs). Ray cluster is operational and can allocate tensors on both nodes.
|
||||
- **Blocker:** vLLM fails with `HIP error: invalid kernel file` when initializing the distributed environment.
|
||||
- **Suspected Cause:** Possible missing support for `gfx1151` in the RCCL library included with the ROCm nightly build.
|
||||
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1151` support is indeed missing/required in RCCL.
|
||||
- **Suspected Cause:** Possible missing support for `gfx1150` in the RCCL library included with the ROCm nightly build.
|
||||
- **Goal:** Solicit troubleshooting advice or confirmation if `gfx1150` support is indeed missing/required in RCCL.
|
||||
|
||||
## Table of Contents
|
||||
1. [Context & Goal](#1-context--goal)
|
||||
@@ -24,7 +24,7 @@ I am attempting to run vLLM with Tensor Parallelism across two AMD Strix Halo (R
|
||||
4. [The Issue: Invalid Kernel File](#4-the-issue-invalid-kernel-file)
|
||||
- [4.1 Command & Configuration](#41-command--configuration)
|
||||
- [4.2 Error Logs](#42-error-logs)
|
||||
- [4.3 Hypothesis: RCCL Support for gfx1151](#43-hypothesis-rccl-support-for-gfx1151)
|
||||
- [4.3 Hypothesis: RCCL Support for gfx1150](#43-hypothesis-rccl-support-for-gfx1150)
|
||||
5. [Request for Help](#5-request-for-help)
|
||||
|
||||
## 1. Context & Goal
|
||||
@@ -70,7 +70,7 @@ The environment is created using `toolbox` (wrapping Podman) with specific flags
|
||||
|
||||
```bash
|
||||
toolbox create vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151:latest \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1150:latest \
|
||||
-- \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
@@ -751,7 +751,7 @@ This results in an `HIP error: invalid kernel file` immediately upon engine init
|
||||
|
||||
### 4.1 - Possible reasons
|
||||
|
||||
This invalid kernel file might be related to RCCL not supporting gfx1151. There was a PR that was never merged:
|
||||
This `invalid kernel file` error might be related to RCCL not supporting gfx1150. There was a PR that was never merged:
|
||||
|
||||
https://github.com/ROCm/rccl/pull/2075
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
set -e
|
||||
|
||||
TOOLBOX_NAME="vllm"
|
||||
IMAGE="docker.io/kyuz0/vllm-therock-gfx1151:latest"
|
||||
IMAGE="docker.io/kyuz0/vllm-therock-gfx1150:latest"
|
||||
|
||||
# Base options
|
||||
OPTIONS="--device /dev/dri --device /dev/kfd --group-add video --group-add render --security-opt seccomp=unconfined"
|
||||
|
||||
@@ -83,13 +83,13 @@ cat <<'ASCII'
|
||||
v L L M
|
||||
ASCII
|
||||
echo
|
||||
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1151, ROCm via TheRock)\n'
|
||||
printf 'AMD STRIX HALO — vLLM Toolbox (gfx1150, ROCm via TheRock)\n'
|
||||
[[ -n "$ROCM_VER" ]] && printf 'ROCm nightly: %s\n' "$ROCM_VER"
|
||||
echo
|
||||
printf 'Machine: %s\n' "$MACHINE"
|
||||
printf 'GPU : %s\n\n' "$GPU"
|
||||
printf 'Repo : https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1151:latest\n\n'
|
||||
printf 'Image : docker.io/kyuz0/vllm-therock-gfx1150:latest\n\n'
|
||||
printf 'Included:\n'
|
||||
printf ' - %-16s → %s\n' "start-vllm (TUI)" "Interactive launcher: Model select, Multi-GPU & Cache handling"
|
||||
printf ' - %-16s → %s\n' "start-vllm-cluster" "Cluster launcher: Setup Ray Head/Worker & Launch vLLM RCCL"
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
# Configuration
|
||||
REPO_URL="https://github.com/kyuz0/rocm-systems.git"
|
||||
BRANCH="gfx1151-rccl"
|
||||
BUILD_DIR="build_gfx1151"
|
||||
REPO_URL="https://code.badstorm.xyz/AI/rocm-systems.git"
|
||||
BRANCH="gfx1150-rccl"
|
||||
BUILD_DIR="build_gfx1150"
|
||||
ROCM_PATH=${ROCM_PATH:-/opt/rocm}
|
||||
# Project sub-directory
|
||||
PROJECT_DIR="projects/rccl"
|
||||
echo "=== Building RCCL for gfx1151 ==="
|
||||
echo "=== Building RCCL for gfx1150 ==="
|
||||
echo "Repo: $REPO_URL"
|
||||
echo "Branch: $BRANCH"
|
||||
echo "ROCm Path: $ROCM_PATH"
|
||||
@@ -28,14 +28,14 @@ echo "Entering project directory..."
|
||||
cd $PROJECT_DIR
|
||||
mkdir -p $BUILD_DIR
|
||||
cd $BUILD_DIR
|
||||
echo "Configuring CMake for gfx1151..."
|
||||
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
|
||||
echo "Configuring CMake for gfx1150..."
|
||||
# We explicitly set GPU_TARGETS to gfx1150 to override the default list.
|
||||
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
|
||||
CXX=$ROCM_PATH/bin/hipcc cmake .. \
|
||||
-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc \
|
||||
-DDEFAULT_GPUS="gfx1151" \
|
||||
-DGPU_TARGETS="gfx1151" \
|
||||
-DAMDGPU_TARGETS="gfx1151" \
|
||||
-DDEFAULT_GPUS="gfx1150" \
|
||||
-DGPU_TARGETS="gfx1150" \
|
||||
-DAMDGPU_TARGETS="gfx1150" \
|
||||
-DCMAKE_INSTALL_PREFIX=./install \
|
||||
-DBUILD_TESTS=OFF \
|
||||
-DGENERATE_SYM_KERNELS=OFF \
|
||||
@@ -44,6 +44,15 @@ CXX=$ROCM_PATH/bin/hipcc cmake .. \
|
||||
# 3. Build
|
||||
echo "Building librccl.so..."
|
||||
make -j$(nproc)
|
||||
|
||||
# Compress the real file (not the symlink)
|
||||
cd /home/badstorm/Source/ai/amd-strix-halo-vllm-toolboxes
|
||||
gzip -k rocm-systems/projects/rccl/build_gfx1150/librccl.so.1.0
|
||||
|
||||
# Copy the .gz file into custom_libs/
|
||||
mkdir -p custom_libs/
|
||||
cp rocm-systems/projects/rccl/build_gfx1150/librccl.so.1.0.gz custom_libs/librccl.so.1.gz
|
||||
|
||||
echo "=== Build Complete ==="
|
||||
echo "Libraries are located in:"
|
||||
echo " $(pwd)/librccl.so"
|
||||
|
||||
@@ -2,13 +2,17 @@ import subprocess
|
||||
import time
|
||||
import os
|
||||
|
||||
def get_net_iface(ip_prefix="192.168.100"):
|
||||
def get_net_iface(ip_prefix=None):
|
||||
"""
|
||||
Auto-detects the interface that serves the cluster network.
|
||||
Assumes standard 192.168.100.x setup from start_vllm_cluster.py
|
||||
Assumes standard 192.168.100.x setup from start_vllm_cluster.py, but parameterizable.
|
||||
"""
|
||||
if ip_prefix is None:
|
||||
head_ip = os.getenv("VLLM_HEAD_IP", "192.168.100.1")
|
||||
ip_prefix = ".".join(head_ip.split('.')[:3])
|
||||
|
||||
try:
|
||||
# ip -o addr show | grep 192.168.100
|
||||
# ip -o addr show | grep <ip_prefix>
|
||||
cmd = f"ip -o addr show | grep {ip_prefix}"
|
||||
res = subprocess.check_output(cmd, shell=True, text=True).strip()
|
||||
# Output format: 2: eth0 inet 192.168.100.1/24 ...
|
||||
@@ -31,35 +35,77 @@ def get_subnet_from_ip(ip):
|
||||
parts = ip.split('.')
|
||||
return f"{parts[0]}.{parts[1]}.{parts[2]}.0/24"
|
||||
|
||||
def stop_cluster(nodes=None):
|
||||
def stop_cluster(worker_ip=None):
|
||||
"""
|
||||
Stops Ray on the given nodes (list of IPs).
|
||||
If nodes is None, does nothing (caller should identify nodes first if needed,
|
||||
but typically for a clean start we might just rely on 'ray stop' on each setup).
|
||||
Actually, to be safe, we can try to stop local ray.
|
||||
Stops Ray locally and on the worker node if provided.
|
||||
"""
|
||||
print("Stopping Ray cluster locally...")
|
||||
subprocess.run(["ray", "stop", "--force"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
if worker_ip:
|
||||
print(f"Stopping Ray cluster on worker ({worker_ip})...")
|
||||
ssh_cmd = [
|
||||
"ssh", "-o", "StrictHostKeyChecking=no", worker_ip,
|
||||
"toolbox", "run", "-c", "vllm", "--", "ray", "stop", "--force"
|
||||
]
|
||||
try:
|
||||
subprocess.run(ssh_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Warning: Failed to stop worker node completely: {e}")
|
||||
|
||||
def setup_worker_node(worker_ip, head_ip):
|
||||
subnet = get_subnet_from_ip(worker_ip)
|
||||
|
||||
# Script to run on worker
|
||||
# Read overrides from current env
|
||||
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
|
||||
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
|
||||
|
||||
script = f"""
|
||||
source /etc/profile
|
||||
# Silece the kill command
|
||||
# Silence the kill command
|
||||
ray stop --force > /dev/null 2>&1 || true
|
||||
|
||||
# Calculate Interface dynamically
|
||||
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
|
||||
echo "\\n--- Ray Worker Environment ({worker_ip}) ---"
|
||||
echo "export RAY_DISABLE_METRICS=1"
|
||||
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
|
||||
echo "export RAY_memory_monitor_refresh_ms=0"
|
||||
echo "export VLLM_HOST_IP={worker_ip}"
|
||||
echo "export RDMA_IFACE=$RDMA_IFACE"
|
||||
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export NCCL_IB_TIMEOUT=23"
|
||||
echo "export NCCL_IB_RETRY_CNT=7"
|
||||
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
|
||||
|
||||
export RAY_DISABLE_METRICS=1
|
||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
export RAY_memory_monitor_refresh_ms=0
|
||||
export VLLM_HOST_IP={worker_ip}
|
||||
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
export RDMA_IFACE=$RDMA_IFACE
|
||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
||||
# Stability for RDMA
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Worker on {worker_ip} connecting to {head_ip}..."
|
||||
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
|
||||
export NCCL_IB_DISABLE={nccl_disable_val}
|
||||
"""
|
||||
if nccl_debug_val:
|
||||
script += f"""
|
||||
echo "export NCCL_DEBUG={nccl_debug_val}"
|
||||
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
|
||||
export NCCL_DEBUG={nccl_debug_val}
|
||||
export NCCL_DEBUG_SUBSYS=INIT,NET
|
||||
"""
|
||||
|
||||
script += f"""
|
||||
echo "\\nStarting Ray Worker on {worker_ip} connecting to {head_ip}..."
|
||||
if [ "{nccl_disable_val}" = "1" ]; then
|
||||
echo "Note: Worker is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
|
||||
fi
|
||||
ray start --address='{head_ip}:6379' --num-gpus=1 --num-cpus=8 --disable-usage-stats
|
||||
"""
|
||||
|
||||
print(f"Setting up Worker Node ({worker_ip})...")
|
||||
@@ -83,20 +129,55 @@ def setup_head_node(head_ip):
|
||||
|
||||
print(f"Setting up Head Node ({head_ip})...")
|
||||
|
||||
# Read overrides from current env
|
||||
nccl_disable_val = os.getenv("NCCL_IB_DISABLE", "0")
|
||||
nccl_debug_val = os.getenv("NCCL_DEBUG", "")
|
||||
|
||||
script = f"""
|
||||
# Silence the kill command
|
||||
ray stop --force > /dev/null 2>&1 || true
|
||||
|
||||
# Calculate Interface dynamically
|
||||
RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
|
||||
echo "\\n--- Ray Head Environment ({head_ip}) ---"
|
||||
echo "export RAY_DISABLE_METRICS=1"
|
||||
echo "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1"
|
||||
echo "export RAY_memory_monitor_refresh_ms=0"
|
||||
echo "export VLLM_HOST_IP={head_ip}"
|
||||
echo "export RDMA_IFACE=$RDMA_IFACE"
|
||||
echo "export NCCL_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export GLOO_SOCKET_IFNAME=$RDMA_IFACE"
|
||||
echo "export NCCL_IB_TIMEOUT=23"
|
||||
echo "export NCCL_IB_RETRY_CNT=7"
|
||||
echo "export NCCL_IB_DISABLE={nccl_disable_val}"
|
||||
|
||||
export RAY_DISABLE_METRICS=1
|
||||
export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
export RAY_memory_monitor_refresh_ms=0
|
||||
export VLLM_HOST_IP={head_ip}
|
||||
export RDMA_IFACE=$(ip -o addr show to {subnet} | awk '{{print $2}}' | head -n1)
|
||||
export RDMA_IFACE=$RDMA_IFACE
|
||||
export NCCL_SOCKET_IFNAME=$RDMA_IFACE
|
||||
export GLOO_SOCKET_IFNAME=$RDMA_IFACE
|
||||
# Stability for RDMA
|
||||
export NCCL_IB_TIMEOUT=23
|
||||
export NCCL_IB_RETRY_CNT=7
|
||||
echo "Starting Ray Head on {head_ip}..."
|
||||
export NCCL_IB_DISABLE={nccl_disable_val}
|
||||
"""
|
||||
|
||||
if nccl_debug_val:
|
||||
script += f"""
|
||||
echo "export NCCL_DEBUG={nccl_debug_val}"
|
||||
echo "export NCCL_DEBUG_SUBSYS=INIT,NET"
|
||||
export NCCL_DEBUG={nccl_debug_val}
|
||||
export NCCL_DEBUG_SUBSYS=INIT,NET
|
||||
"""
|
||||
|
||||
script += f"""
|
||||
echo "\\nStarting Ray Head on {head_ip}..."
|
||||
if [ "{nccl_disable_val}" = "1" ]; then
|
||||
echo "Note: Head is configured with NCCL_IB_DISABLE=1 (Ethernet Forced)"
|
||||
fi
|
||||
ray start --head --port=6379 --node-ip-address={head_ip} --num-gpus=1 --num-cpus=8 --disable-usage-stats --include-dashboard=false
|
||||
"""
|
||||
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
set -e
|
||||
|
||||
# 1. System Base & Build Tools
|
||||
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
|
||||
dnf -y install --setopt=install_weak_deps=False --nodocs \
|
||||
python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl \
|
||||
gcc gcc-c++ binutils make ffmpeg-free \
|
||||
cmake ninja-build aria2c tar xz vim nano dialog \
|
||||
libdrm-devel zlib-devel openssl-devel pgrep \
|
||||
numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \
|
||||
&& dnf clean all && rm -rf /var/cache/dnf/*
|
||||
# Added 'libgoogle-perftools4' for tcmalloc (fixes double-free)
|
||||
apt-get update
|
||||
apt-get install -y --no-install-recommends \
|
||||
python3.11 python3.11-dev python3.11-venv git rsync bash ca-certificates curl \
|
||||
gcc g++ binutils make ffmpeg \
|
||||
cmake ninja-build aria2 tar xz-utils vim nano dialog \
|
||||
libdrm-dev zlib1g-dev libssl-dev procps \
|
||||
libnuma-dev libgoogle-perftools4 iproute2 ibverbs-utils patch perftest iputils-ping iperf3 infiniband-diags \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@@ -3,7 +3,7 @@ set -euo pipefail
|
||||
|
||||
# Configuration with defaults matching Dockerfile ARGs
|
||||
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
|
||||
GFX="${GFX:-gfx1151}"
|
||||
GFX="${GFX:-gfx1150}"
|
||||
|
||||
echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="
|
||||
|
||||
@@ -51,8 +51,9 @@ printf '%s\n' \
|
||||
"export VLLM_TARGET_DEVICE=rocm" \
|
||||
"export HIP_FORCE_DEV_KERNARG=1" \
|
||||
"export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
|
||||
"export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
|
||||
"export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/rocm/lib/librocm_smi64.so.1.0" \
|
||||
> /etc/profile.d/rocm-sdk.sh
|
||||
|
||||
chmod 0644 /etc/profile.d/rocm-sdk.sh
|
||||
echo "=== ROCm SDK Installation Complete ==="
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ set -e
|
||||
# Configuration
|
||||
# Paths identified from your environment
|
||||
ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
|
||||
VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
|
||||
VENV_LIB_PATH="/opt/venv/lib/python3.11/site-packages/_rocm_sdk_libraries_gfx1150/lib/librccl.so.1"
|
||||
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
|
||||
# Files to replace
|
||||
# We assume the new library is named 'librccl.so' or 'librccl.so.1' in the current directory or provided as arg
|
||||
@@ -20,7 +20,7 @@ do_install() {
|
||||
echo "Please provide the path to the newly built librccl.so.1"
|
||||
exit 1
|
||||
fi
|
||||
echo "=== Installing Custom RCCL (gfx1151) ==="
|
||||
echo "=== Installing Custom RCCL (gfx1150) ==="
|
||||
echo "Creating backup directory: $BACKUP_DIR"
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
# 1. Backup /opt/rocm location
|
||||
|
||||
Spustitelný soubor
+18
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
while true; do
|
||||
A_IN=$(rdma statistic | awk '/ip4InOctets/ {print $2}')
|
||||
A_OUT=$(rdma statistic | awk '/ip4OutOctets/ {print $2}')
|
||||
sleep 1
|
||||
B_IN=$(rdma statistic | awk '/ip4InOctets/ {print $2}')
|
||||
B_OUT=$(rdma statistic | awk '/ip4OutOctets/ {print $2}')
|
||||
|
||||
RX=$(( (B_IN - A_IN) * 8 ))
|
||||
TX=$(( (B_OUT - A_OUT) * 8 ))
|
||||
|
||||
printf "%s RDMA RX: %7sbit/s TX: %7sbit/s SUM: %7sbit/s\n" \
|
||||
"$(date +%T)" \
|
||||
"$(numfmt --to=iec $RX)" \
|
||||
"$(numfmt --to=iec $TX)" \
|
||||
"$(numfmt --to=iec $((RX+TX)))"
|
||||
done
|
||||
+12
-1
@@ -10,6 +10,7 @@ MODEL_TABLE = {
|
||||
|
||||
"google/gemma-3-12b-it": {
|
||||
"trust_remote": False,
|
||||
"enforce_eager": True,
|
||||
"valid_tp": [1, 2],
|
||||
"max_num_seqs": "64",
|
||||
"max_tokens": "32768"
|
||||
@@ -68,7 +69,7 @@ MODEL_TABLE = {
|
||||
# 5. Qwen 80B AWQ
|
||||
# Size: ~48GB. Fits on 2x32GB (64GB). Leftover for Cache: ~16GB.
|
||||
# Config: 20k ctx fits in that cache. Eager mode required for stability.
|
||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [1], # Too big for single GPU
|
||||
"max_num_seqs": "64", # Large Model / Bandwidth Constrained
|
||||
@@ -77,6 +78,15 @@ MODEL_TABLE = {
|
||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||
},
|
||||
|
||||
"mratsim/MiniMax-M2.5-BF16-INT4-AWQ": {
|
||||
"trust_remote": True,
|
||||
"valid_tp": [2],
|
||||
"max_num_seqs": "64",
|
||||
"max_tokens": "16384",
|
||||
"enforce_eager": False,
|
||||
"env": {"VLLM_USE_TRITON_AWQ": "1"} # Fixes "Unsupported Hardware" error
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
MODELS_TO_RUN = [
|
||||
@@ -89,6 +99,7 @@ MODELS_TO_RUN = [
|
||||
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit",
|
||||
"btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit",
|
||||
"dazipe/Qwen3-Next-80B-A3B-Instruct-GPTQ-Int4A16",
|
||||
"mratsim/MiniMax-M2.5-BF16-INT4-AWQ",
|
||||
]
|
||||
|
||||
# Hardware / Global Defaults
|
||||
|
||||
Některé soubory nejsou zobrazny, neboť je v této revizi změněno mnoho souborů Zobrazit více
Odkázat v novém úkolu
Zablokovat Uživatele