first commit
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
FROM kyuz0/pytorch-therock-gfx1151-aotriton-builder:latest AS vllm-builder
|
||||
|
||||
# Clone the vLLM sources (shallow clone, no history).
# VLLM_REF selects the branch or tag to build and defaults to "main". Pass
#   --build-arg VLLM_REF=<tag>
# for a reproducible build: the sed patches applied later in this stage match
# exact upstream text and can stop applying when the tip of main moves.
# `git clone --branch` accepts branch and tag names only, not commit SHAs.
ARG VLLM_REF=main
RUN git clone --depth 1 --branch "${VLLM_REF}" https://github.com/vllm-project/vllm.git
|
||||
|
||||
# Install vLLM build dependencies into the builder's virtualenv (the build itself
# happens in a later RUN).
#  - use_existing_torch.py is vLLM's own helper script; it is run before the
#    requirements are installed.
#  - the amdsmi, pytorch-triton-rocm and triton pins are deleted from
#    requirements/rocm-build.txt before that file is installed.
# "huggingface-hub[cli]" is quoted so the shell never treats [cli] as a glob pattern.
RUN source .venv/bin/activate && \
    cd vllm && \
    uv pip install ninja cmake wheel pybind11 && \
    uv pip install --upgrade numba scipy "huggingface-hub[cli]" "numpy<2" && \
    python use_existing_torch.py && \
    sed -i \
        -e '/amdsmi==/d' \
        -e '/pytorch-triton-rocm/d' \
        -e '/triton==/d' \
        requirements/rocm-build.txt && \
    uv pip install -r requirements/rocm-build.txt
|
||||
|
||||
# Apply gfx1151 fixes to the vLLM sources:
#  - CMakeLists.txt: add gfx1151 in front of the "gfx1200;gfx1201" architecture list.
#  - pyproject.toml: delete the line containing "torch == 2.8.0,".
#  - setup.py: wrap "import torch" in try/except (setting TORCH_AVAILABLE) and guard the
#    torch.version.cuda / torch.version.hip checks with it, so setup.py still runs when
#    torch cannot be imported.
# NOTE(review): every sed here matches exact upstream text and silently does nothing if
# that text changes - verify the patterns against the vLLM revision being built.
|
||||
RUN cd vllm && \
|
||||
sed -i 's/gfx1200;gfx1201/gfx1151;gfx1200;gfx1201/' CMakeLists.txt && \
|
||||
sed -i '/torch == 2.8.0,/d' pyproject.toml && \
|
||||
sed -i 's/import torch/try:\n import torch\n from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME\n TORCH_AVAILABLE = True\nexcept ImportError:\n torch = None\n CUDA_HOME = None\n ROCM_HOME = None\n TORCH_AVAILABLE = False/' setup.py && \
|
||||
sed -i 's/from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME/# Moved to try block above/' setup.py && \
|
||||
sed -i 's/torch.version.cuda is None/TORCH_AVAILABLE and torch.version.cuda is None/' setup.py && \
|
||||
sed -i 's/has_cuda = torch.version.cuda is not None/has_cuda = TORCH_AVAILABLE and torch.version.cuda is not None/' setup.py && \
|
||||
sed -i 's/torch.version.hip is not None/TORCH_AVAILABLE and torch.version.hip is not None/' setup.py && \
|
||||
sed -i 's/rocm_version = get_rocm_version() or torch.version.hip/rocm_version = get_rocm_version() or (torch.version.hip if TORCH_AVAILABLE else None)/' setup.py && \
|
||||
sed -i 's/cuda_major, cuda_minor = torch.version.cuda.split(".")/cuda_major, cuda_minor = torch.version.cuda.split(".") if TORCH_AVAILABLE else ("0", "0")/' setup.py
|
||||
|
||||
# Fix ROCm platform detection: patch rocm_platform_plugin() in vllm/platforms/__init__.py
# so it does not call amdsmi.amdsmi_init() and sets is_rocm = True when the check raises.
# NOTE(review): sed works on one line at a time, so the expressions below whose search
# pattern contains "\n" cannot match and leave the file unchanged; the "is_rocm = False"
# substitution replaces the text with itself. Only the logger.debug("Checking ..."),
# amdsmi.amdsmi_init() and "not available because: %s" substitutions can take effect -
# confirm the resulting function is what is intended.
|
||||
RUN cd vllm && \
|
||||
git checkout HEAD -- vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/is_rocm = False/is_rocm = False/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/logger.debug("Checking if ROCm platform is available.")/logger.debug("Checking if ROCm platform is available.")\n \n # Skip amdsmi check due to segfault issues - default to ROCm for AMD systems/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/try:\n import amdsmi/try:\n import torch/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/amdsmi.amdsmi_init()/# amdsmi disabled - using torch detection/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/try:\n if len(amdsmi.amdsmi_get_processor_handles()) > 0:/if hasattr(torch, '\''version'\'') and hasattr(torch.version, '\''hip'\'') and torch.version.hip is not None:/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/is_rocm = True\n logger.debug("Confirmed ROCm platform is available.")/is_rocm = True\n logger.debug("ROCm platform detected via torch.version.hip")/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/else:\n logger.debug("ROCm platform is not available because"\n " no GPU is found.")/else:\n # Fallback: assume ROCm if we'\''re not CUDA and not other platforms\n logger.debug("Defaulting to ROCm platform (amdsmi disabled due to segfault)")\n is_rocm = True/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/finally:\n amdsmi.amdsmi_shut_down()/finally:\n # amdsmi disabled\n pass/' vllm/platforms/__init__.py && \
|
||||
sed -i '/def rocm_platform_plugin/,/return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None/s/logger.debug("ROCm platform is not available because: %s", str(e))/logger.debug("ROCm platform check failed: %s", str(e))\n # Still default to ROCm as fallback\n is_rocm = True/' vllm/platforms/__init__.py
|
||||
|
||||
# Build vLLM from source for gfx1151 and install it into /torch-therock/.venv.
#  - amdsmi is uninstalled on a best-effort basis.
#  - a stub `amdgpu-arch` script that always prints "gfx1151" is written to
#    /usr/local/bin, /usr/bin and /bin.
#  - /tmp/constraints.txt pins torch (and triton, when it can be imported) to the
#    versions already present in the venv; it is passed to pip via --constraint.
# The two best-effort "cmd || fallback" steps are wrapped in { ...; } on purpose:
# in sh, `&&` and `||` have equal precedence and associate left to right, so an
# unguarded `||` would also swallow a failure of ANY earlier command in the chain
# (e.g. `cd vllm` or the torch constraint line) and let the build carry on.
RUN source .venv/bin/activate && \
    cd vllm && \
    { uv pip uninstall amdsmi || echo "amdsmi not installed"; } && \
    printf '#!/bin/bash\necho "gfx1151"\n' > /usr/local/bin/amdgpu-arch && \
    chmod +x /usr/local/bin/amdgpu-arch && \
    printf '#!/bin/bash\necho "gfx1151"\n' > /usr/bin/amdgpu-arch && \
    chmod +x /usr/bin/amdgpu-arch && \
    printf '#!/bin/bash\necho "gfx1151"\n' > /bin/amdgpu-arch && \
    chmod +x /bin/amdgpu-arch && \
    export PYTORCH_ROCM_ARCH="gfx1151" && \
    /torch-therock/.venv/bin/python -c "import torch; print('torch==' + torch.__version__)" > /tmp/constraints.txt && \
    { /torch-therock/.venv/bin/python -c "import triton; print('pytorch-triton-rocm==' + getattr(triton, '__version__', 'unknown'))" >> /tmp/constraints.txt || echo "# triton version not found" >> /tmp/constraints.txt; } && \
    TORCH_CMAKE_PATH=$(/torch-therock/.venv/bin/python -c "import torch; print(torch.utils.cmake_prefix_path)") && \
    VLLM_TARGET_DEVICE=rocm CMAKE_PREFIX_PATH="$TORCH_CMAKE_PATH" Torch_DIR="$TORCH_CMAKE_PATH/Torch" CMAKE_ARGS="-DGPU_TARGETS=gfx1151 -DHIP_TARGETS=gfx1151 -DAMDGPU_TARGETS=gfx1151" /torch-therock/.venv/bin/pip install . --no-build-isolation --constraint /tmp/constraints.txt
|
||||
|
||||
# Runtime stage
|
||||
FROM archlinux:latest
|
||||
|
||||
# Install runtime dependencies + compilation tools, then build Python 3.12.9 with
# pyenv under /opt/pyenv and make it the global pyenv version.
# NOTE(review): /opt/pyenv is copied over from the builder stage right after this
# step, so the Python built here may be redundant - confirm before removing.
|
||||
RUN pacman -Syu --noconfirm && \
|
||||
pacman -S --noconfirm ca-certificates gcc make cmake ninja git && \
|
||||
pacman -Scc --noconfirm && \
|
||||
git clone --depth 1 https://github.com/pyenv/pyenv.git /opt/pyenv && \
|
||||
export PYENV_ROOT=/opt/pyenv && \
|
||||
export PATH=$PYENV_ROOT/bin:$PATH && \
|
||||
eval "$(pyenv init -)" && \
|
||||
pyenv install 3.12.9 && \
|
||||
pyenv global 3.12.9
|
||||
|
||||
# Copy complete environment from builder
|
||||
COPY --from=vllm-builder /opt/pyenv /opt/pyenv
|
||||
COPY --from=vllm-builder /torch-therock/.venv /torch-therock/.venv
|
||||
COPY --from=vllm-builder /torch-therock/*.sh /torch-therock/
|
||||
|
||||
# Runtime environment: pyenv location and version, the pyenv interpreter and the
# virtualenv on PATH, and the ROCm target architecture for PyTorch.
ENV PYENV_ROOT=/opt/pyenv \
    PYENV_VERSION=3.12.9 \
    PATH="/opt/pyenv/versions/3.12.9/bin:/torch-therock/.venv/bin:$PATH" \
    PYTORCH_ROCM_ARCH=gfx1151
|
||||
|
||||
WORKDIR /torch-therock
|
||||
|
||||
# Test installation
|
||||
RUN /torch-therock/.venv/bin/python -c "import torch; print('PyTorch version:', torch.__version__)" && \
|
||||
/torch-therock/.venv/bin/python -c "import vllm; print('vLLM version:', vllm.__version__)"
|
||||
|
||||
# Toolbx compatibility - fix permissions and add environment setup
|
||||
RUN chmod -R a+rwX /torch-therock
|
||||
|
||||
# Copy toolbx scripts
|
||||
COPY scripts/vllm-env.sh /etc/profile.d/vllm-env.sh
|
||||
COPY scripts/vllm-banner.sh /etc/profile.d/vllm-banner.sh
|
||||
RUN chmod 644 /etc/profile.d/vllm-env.sh /etc/profile.d/vllm-banner.sh
|
||||
|
||||
CMD ["bash", "-c", "source .venv/bin/activate && bash"]
|
||||
@@ -0,0 +1,164 @@
|
||||
# AMD Strix Halo — vLLM Toolbox/Container (gfx1151, PyTorch + AOTriton)
|
||||
|
||||
An **Arch-based** Docker/Podman container that is **Toolbx-compatible** (usable as a Fedora toolbox) for serving LLMs with **vLLM** on **AMD Ryzen AI Max “Strix Halo” (gfx1151)**. Built on the PyTorch + AOTriton base to make ROCm on Strix Halo practical for day‑to‑day use.
|
||||
|
||||
> **Built on:** [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton)
|
||||
> **Credits:** **lhl** (build tools/scripts), **ssweens** (Arch‑based Dockerfiles), and the **AMD Strix Halo Home Lab Discord** for testing/support.
|
||||
|
||||
---
|
||||
|
||||
## 1) Toolbx vs Docker/Podman
|
||||
|
||||
The `kyuz0/vllm-therock-gfx1151-aotriton` image can be used in two ways:
|
||||
|
||||
* **Fedora Toolbx (recommended for development):** Toolbx shares your **HOME** and user, so models/configs live on the host. Great for iterating quickly while keeping the host clean.
|
||||
* **Docker/Podman (recommended for deployment/perf):** Use for running vLLM as a service (host networking, IPC tuning, etc.). Always **mount a host directory** for model weights so they stay outside the container.
|
||||
|
||||
---
|
||||
|
||||
## 2) Quickstart — Fedora Toolbx (development)
|
||||
|
||||
Create a toolbox that exposes the GPU and relaxes seccomp to avoid ROCm syscall issues:
|
||||
|
||||
```bash
|
||||
toolbox create vllm \
|
||||
--image docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \
|
||||
-- --device /dev/dri --device /dev/kfd \
|
||||
--group-add video --group-add render --security-opt seccomp=unconfined
|
||||
```
|
||||
|
||||
Enter it:
|
||||
|
||||
```bash
|
||||
toolbox enter vllm
|
||||
```
|
||||
|
||||
**Model storage (Toolbx):** keep weights **outside** the toolbox under your HOME so they persist. Recommended path:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/vllm-models
|
||||
```
|
||||
|
||||
Serve a model with vLLM (downloads to `~/vllm-models`; if the model isn't present, it will be fetched from Hugging Face automatically):
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-7B-Instruct \
|
||||
--host 0.0.0.0 --port 8000 \
|
||||
--download-dir ~/vllm-models
|
||||
```
|
||||
|
||||
> Toolbx shares HOME by design, so `~/vllm-models` stays on the host and survives toolbox updates.
|
||||
>
|
||||
> **Cache note (Toolbx):** vLLM will also write compiled kernels to `~/.cache/vllm/torch_compile_cache/` in your HOME. For example:
|
||||
>
|
||||
> ```bash
|
||||
> du -sh ~/.cache/vllm/torch_compile_cache/
|
||||
> # e.g., 138M /home/kyuz0/.cache/vllm/torch_compile_cache/
|
||||
> ```
|
||||
|
||||
---
|
||||
|
||||
## 3) Testing the API
|
||||
|
||||
Once the server is up (started as in section 2 or section 4), query the OpenAI‑compatible endpoint:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model":"Qwen/Qwen2.5-7B-Instruct","messages":[{"role":"user","content":"Hello! Test the performance."}]}'
|
||||
```
|
||||
|
||||
You should receive a JSON response with a `choices[0].message.content` reply.
|
||||
|
||||
---
|
||||
|
||||
## 4) Quickstart — Podman/Docker
|
||||
|
||||
Prefer this for persistent services. **Always mount a host directory for weights** so they live outside the container. If the model isn't present, vLLM will fetch it from **Hugging Face** into the mapped directory.
|
||||
|
||||
```bash
|
||||
podman run \
|
||||
-d \
|
||||
--name vllm \
|
||||
--network host \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
--group-add video \
|
||||
--group-add render \
|
||||
-v ~/vllm-models:/models \
|
||||
-v ~/.cache/vllm:/root/.cache/vllm \
|
||||
docker.io/kyuz0/vllm-therock-gfx1151-aotriton:latest \
|
||||
bash -lc 'source /torch-therock/.venv/bin/activate; \
|
||||
TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \
|
||||
vllm serve Qwen/Qwen2.5-7B-Instruct --dtype float16 \
|
||||
--host 0.0.0.0 --port 8000 --download-dir /models'
|
||||
```
|
||||
|
||||
> Not using `--network host`? Map a port instead: `-p 8000:8000`.
|
||||
|
||||
---
|
||||
|
||||
## 5) Models, dtypes & storage
|
||||
|
||||
* Start with **Qwen/Qwen2.5-7B-Instruct**; larger models may work but are less forgiving on unified memory.
|
||||
* Use `--dtype float16` unless you have a reason to change.
|
||||
* **Storage discipline:**
|
||||
|
||||
* **Toolbx:** `--download-dir ~/vllm-models` (lives in your HOME on the host).
|
||||
* **Podman/Docker:** `-v ~/vllm-models:/models` and `--download-dir /models`.
|
||||
|
||||
---
|
||||
|
||||
## 6) Performance notes (short)
|
||||
|
||||
* The image is built on the PyTorch + **AOTriton** base; enabling `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` can improve startup/throughput on some models.
|
||||
* vLLM flags you might tune later: `--gpu-memory-utilization`, `--max-num-seqs`, `--max-model-len`. Start simple; add knobs only if needed.
|
||||
|
||||
---
|
||||
|
||||
## 7) Requirements (host)
|
||||
|
||||
**Hardware & drivers**
|
||||
|
||||
* AMD Strix Halo APU (gfx1151).
|
||||
* Working amdgpu stack with `/dev/kfd` (ROCm compute) and `/dev/dri` (graphics).
|
||||
* Your user in the **video** and **render** groups.
|
||||
|
||||
**Unified memory setup (HIGHLY recommended)**
|
||||
Enable large GTT/unified memory so the iGPU can borrow system RAM for bigger models:
|
||||
|
||||
1. **Kernel parameters** (append to your GRUB cmdline):
|
||||
|
||||
```
|
||||
amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=33554432
|
||||
```
|
||||
|
||||
| Parameter | Purpose |
|
||||
| -------------------------- | ---------------------------- |
|
||||
| `amd_iommu=off` | Reduces latency |
|
||||
| `amdgpu.gttsize=131072` | 128 GiB GTT (unified memory) |
|
||||
| `ttm.pages_limit=33554432` | Large pinned allocations |
|
||||
|
||||
2. **BIOS**: allocate **minimal VRAM** to the iGPU (e.g., **512 MB**) and rely on unified memory.
|
||||
|
||||
3. **Fedora example** (GRUB): edit `/etc/default/grub` → `GRUB_CMDLINE_LINUX=...` then:
|
||||
|
||||
```bash
|
||||
sudo grub2-mkconfig -o /boot/grub2/grub.cfg
|
||||
sudo reboot
|
||||
```
|
||||
|
||||
**Container runtime**
|
||||
|
||||
* Podman or Docker installed (examples use Podman; replace with Docker if preferred).
|
||||
|
||||
---
|
||||
|
||||
## 8) Acknowledgements & Links
|
||||
|
||||
* Base images & docs: [https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton](https://github.com/kyuz0/amd-strix-halo-pytorch-gfx1151-aotriton)
|
||||
* Upstreams: [vLLM](https://github.com/vllm-project/vllm), [ROCm/TheRock](https://github.com/ROCm/TheRock), [AOTriton](https://github.com/ROCm/aotriton)
|
||||
* Community: **AMD Strix Halo Home Lab Discord** — [https://discord.gg/pnPRyucNrG](https://discord.gg/pnPRyucNrG)
|
||||
* Big thanks to **lhl** and **ssweens** for prior art and inspiration.
|
||||
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
|
||||
# vLLM Toolbox banner
|
||||
|
||||
# Print a human-readable GPU name on stdout.
# Asks rocm-smi (when it is on PATH) for the product name, first via its CSV output
# and then via its plain-text output; prints "Unknown AMD GPU" when neither yields a name.
gpu_name() {
  local label=""

  if command -v rocm-smi >/dev/null 2>&1; then
    # CSV form: last row, second column.
    label=$(rocm-smi --showproductname --csv 2>/dev/null | tail -n1 | cut -d, -f2)
    if [[ -z "$label" ]]; then
      # Plain-text form: first "Product Name" / "Card series" line, text after ": ".
      label=$(rocm-smi --showproductname 2>/dev/null | grep -m1 -E 'Product Name|Card series' | sed 's/.*: //')
    fi
  fi

  printf '%s\n' "${label:-Unknown AMD GPU}"
}
|
||||
|
||||
# Print the installed vLLM version, or "unknown" when `python` cannot import vllm.
vllm_version() {
  local ver

  if ver=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null); then
    printf '%s\n' "$ver"
  else
    echo "unknown"
  fi
}
|
||||
|
||||
# Simple model selector.
# Prints a numbered menu, reads the choice from stdin and runs `vllm serve` in the
# foreground for the selected model on 0.0.0.0:8000, downloading weights to ~/models.
# Returns 1 without starting anything when the choice is not 1-3.
vllm_start() {
  local choice model
  local -a extra_args=()

  echo
  echo "Select a model to serve:"
  echo "1) Qwen2.5-7B-Instruct (recommended, ~14GB VRAM)"
  echo "2) Llama-3.1-8B-Instruct (~16GB VRAM)"
  echo "3) Qwen3-8B (~16GB VRAM, latest with thinking mode)"
  echo
  # -r keeps backslashes in the typed input literal.
  read -r -p "Choose [1-3]: " choice

  case "$choice" in
    1) model="Qwen/Qwen2.5-7B-Instruct" ;;
    2) model="meta-llama/Llama-3.1-8B-Instruct" ;;
    3)
      model="Qwen/Qwen3-8B"
      # --reasoning-parser on its own enables reasoning output; the separate
      # --enable-reasoning switch is deprecated in current vLLM, so it is not passed.
      extra_args=(--reasoning-parser qwen3)
      ;;
    *)
      echo "Invalid choice."
      return 1
      ;;
  esac

  # Flags shared by every model live in one place; per-model extras are appended.
  vllm serve "$model" --host 0.0.0.0 --port 8000 --download-dir ~/models \
    --dtype float16 --max-model-len 32768 "${extra_args[@]}"
}
|
||||
|
||||
GPU="$(gpu_name)"
|
||||
VLLM_VER="$(vllm_version)"
|
||||
|
||||
echo
|
||||
echo "vLLM Toolbox - AMD STRIX HALO (gfx1151)"
|
||||
echo "GPU: $GPU"
|
||||
echo "vLLM: $VLLM_VER"
|
||||
echo
|
||||
echo "Commands:"
|
||||
echo " vllm_start - Start model server"
|
||||
echo " vllm_test - Test API"
|
||||
echo " ls ~/models - List downloaded models"
|
||||
echo
|
||||
echo "Server will be available at: http://localhost:8000"
|
||||
echo
|
||||
|
||||
# Test helper: send a one-message chat completion request to the local server.
# Usage: vllm_test [model]
# The request must name a model the server is actually serving, so when no model is
# given the first id reported by /v1/models is used. Returns 1 when that lookup fails.
# Drop any alias of the same name left over from an earlier version of this script;
# otherwise the function definition below would be alias-expanded.
unalias vllm_test 2>/dev/null || true
vllm_test() {
  local base_url="http://localhost:8000"
  local model="${1:-}"

  if [[ -z "$model" ]]; then
    model=$(curl -fsS "$base_url/v1/models" 2>/dev/null \
      | python -c 'import json, sys; print(json.load(sys.stdin)["data"][0]["id"])' 2>/dev/null)
    if [[ -z "$model" ]]; then
      echo "Could not read a model name from $base_url/v1/models - is the server running?" >&2
      return 1
    fi
  fi

  curl -X POST "$base_url/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{\"model\":\"$model\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello!\"}]}"
}
|
||||
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
# Auto-activate vLLM environment for toolbx
|
||||
|
||||
# Activate PyTorch + vLLM environment
|
||||
source /torch-therock/.venv/bin/activate
|
||||
|
||||
# ROCm and performance environment variables
# NOTE(review): TORCH_COMPILE_DEBUG=1 looks like a debugging switch rather than a
# performance setting - confirm it is meant to be enabled for every login shell.
|
||||
export PYTORCH_ROCM_ARCH=gfx1151
|
||||
export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
|
||||
export VLLM_USE_TRITON_FLASH_ATTN=0
|
||||
export TORCH_COMPILE_DEBUG=1
|
||||
export VLLM_COMPILE_LEVEL=3
|
||||
|
||||
# Detect and export ROCm toolchain paths.
# Python locates the _rocm_sdk_core package and prints `export` statements that are
# eval'ed into the current shell: TRITON_HIP_LLD_PATH and TRITON_HIP_CLANG_PATH point
# into <package>/lib/llvm/bin, and <package>/lib is prepended to LD_LIBRARY_PATH.
# Best effort: if _rocm_sdk_core cannot be imported, nothing is exported.
eval "$(
python3 - <<'PY'
try:
    import pathlib, _rocm_sdk_core as r
    lib = pathlib.Path(r.__file__).parent / "lib"
    base = lib / "llvm" / "bin"
    print(f'export TRITON_HIP_LLD_PATH="{base / "ld.lld"}"')
    print(f'export TRITON_HIP_CLANG_PATH="{base / "clang++"}"')
    # Append the previous value only when it is non-empty: a trailing ":" would add
    # an empty entry, which the dynamic loader treats as the current directory.
    print(f'export LD_LIBRARY_PATH="{lib}${{LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}}"')
except ImportError:
    pass
PY
)" 2>/dev/null || true
|
||||
|
||||
# Enable flash attention
|
||||
export FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE
|
||||
Reference in New Issue
Block a user