feat: Modularize Dockerfile dependency and ROCm SDK installations into dedicated scripts and add a GitHub Actions workflow to build and consume a custom RCCL library.

This commit is contained in:
Donato Capitella
2026-02-01 14:50:37 +00:00
vanhempi a8added616
commit b10aa50745
5 muutettua tiedostoa jossa 127 lisäystä ja 40 poistoa
@@ -59,6 +59,14 @@ jobs:
with:
buildkitd-flags: --config /tmp/buildkitd.toml
# Fetch the `librccl-gfx1151` artifact produced by the build-rccl.yml workflow
# and place its contents in the custom_libs directory.
- name: Download custom RCCL artifact
uses: dawidd6/action-download-artifact@v6
with:
workflow: build-rccl.yml
name: librccl-gfx1151
path: custom_libs
# A missing artifact only produces a warning, so this step does not fail the job.
# NOTE(review): later steps presumably have to cope with custom_libs being
# absent or empty — confirm.
if_no_artifact_found: warn
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
+40
Näytä tiedosto
@@ -0,0 +1,40 @@
# Manually triggered workflow that builds a custom RCCL library inside a
# Fedora 43 container and publishes the gzip-compressed result as the
# `librccl-gfx1151` artifact.
name: build-rccl

on:
  workflow_dispatch:

# Least privilege: the job only needs to read the repository for checkout.
permissions:
  contents: read

env:
  # Both values are read by scripts/install_rocm_sdk.sh to select the SDK tarball.
  ROCM_MAJOR_VER: 7
  GFX: gfx1151

jobs:
  build-rccl:
    runs-on: ubuntu-latest
    container: registry.fedoraproject.org/fedora:43
    steps:
      - uses: actions/checkout@v4

      - name: Install Dependencies
        run: bash scripts/install_deps.sh

      - name: Install ROCm SDK
        run: bash scripts/install_rocm_sdk.sh

      - name: Build RCCL
        shell: bash
        run: |
          # Load the ROCm environment written by install_rocm_sdk.sh.
          source /etc/profile.d/rocm-sdk.sh
          bash scripts/build_rccl_gfx1151.sh

      - name: Compress Artifact
        env:
          # Path determined from script logic; kept in one place so the
          # listing and the compression cannot drift apart.
          RCCL_LIB: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
        run: |
          ls -lh "$RCCL_LIB"
          gzip -c "$RCCL_LIB" > librccl.so.1.gz
          ls -lh librccl.so.1.gz

      - name: Upload Artifact
        uses: actions/upload-artifact@v4
        with:
          name: librccl-gfx1151
          path: librccl.so.1.gz
          # Fail instead of warning if the compressed library is missing.
          if-no-files-found: error
+9 -40
Näytä tiedosto
@@ -2,51 +2,20 @@ FROM registry.fedoraproject.org/fedora:43
# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
# Install the toolchain, Python 3.13 and the networking/diagnostic utilities in
# a single layer and drop the package-manager caches in that same layer.
# Fedora 43 uses DNF5, whose cache lives in /var/cache/libdnf5, so that
# directory is cleared alongside the legacy /var/cache/dnf path.
RUN dnf -y install --setopt=install_weak_deps=False --nodocs \
      aria2c \
      bash \
      binutils \
      ca-certificates \
      cmake \
      curl \
      dialog \
      ffmpeg-free \
      gcc \
      gcc-c++ \
      git \
      gperftools-libs \
      iperf3 \
      iproute \
      libatomic \
      libdrm-devel \
      libibverbs-utils \
      make \
      nano \
      ninja-build \
      numactl-devel \
      openssl-devel \
      patch \
      perftest \
      pgrep \
      ping \
      python3.13 \
      python3.13-devel \
      rsync \
      tar \
      vim \
      xz \
      zlib-devel \
    && dnf clean all \
    && rm -rf /var/cache/dnf/* /var/cache/libdnf5/*
# Install the system packages through the shared script. The script declares a
# bash shebang and the build-rccl workflow runs it with bash, so use bash here
# as well instead of sh.
COPY scripts/install_deps.sh /tmp/install_deps.sh
RUN bash /tmp/install_deps.sh
# 2. Install "TheRock" ROCm SDK (Tarball Method)
WORKDIR /tmp
ARG ROCM_MAJOR_VER=7
ARG GFX=gfx1151
# Lists the bucket for keys starting with therock-dist-linux-<GFX>-<MAJOR>,
# picks the highest version (sort -V), downloads it and unpacks it into
# /opt/rocm. curl runs with -f so an HTTP error is reported instead of being
# parsed as a listing, and "|| true" keeps a failed or empty lookup from
# aborting under "set -e" before the explicit error message can be printed.
RUN set -euo pipefail; \
    BASE="https://therock-nightly-tarball.s3.amazonaws.com"; \
    PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"; \
    KEY="$(curl -fsS "${BASE}?list-type=2&prefix=${PREFIX}" \
      | tr '<' '\n' \
      | grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
      | sort -V | tail -n1 || true)"; \
    if [ -z "${KEY}" ]; then \
      echo "Error: Could not find tarball key for ${PREFIX}" >&2; \
      exit 1; \
    fi; \
    echo "Downloading Latest Tarball: ${KEY}"; \
    aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz; \
    mkdir -p /opt/rocm; \
    tar xzf therock.tar.gz -C /opt/rocm --strip-components=1; \
    rm therock.tar.gz
# 3. Configure Global ROCm Environment
# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
# Writes /etc/profile.d/rocm-sdk.sh. Values built from $ROCM_PATH and
# $BITCODE_PATH are expanded now; the escaped \$ references are expanded when
# the profile script is sourced.
# LD_LIBRARY_PATH appends the caller's value only when one is set: an
# unconditional ":$LD_LIBRARY_PATH" leaves a trailing empty entry, which the
# dynamic loader treats as the current working directory.
RUN export ROCM_PATH=/opt/rocm && \
    BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
    printf '%s\n' \
      "export ROCM_PATH=/opt/rocm" \
      "export HIP_PLATFORM=amd" \
      "export HIP_PATH=/opt/rocm" \
      "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
      "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
      "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
      "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" \
      "export ROCBLAS_USE_HIPBLASLT=1" \
      "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
      "export VLLM_TARGET_DEVICE=rocm" \
      "export HIP_FORCE_DEV_KERNARG=1" \
      "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
      "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
      > /etc/profile.d/rocm-sdk.sh && \
    chmod 0644 /etc/profile.d/rocm-sdk.sh
# Run the ROCm SDK installer script, handing it the ROCM_MAJOR_VER and GFX
# build arguments explicitly as environment variables for that one command.
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
RUN chmod +x /tmp/install_rocm_sdk.sh \
    && ROCM_MAJOR_VER="${ROCM_MAJOR_VER}" GFX="${GFX}" /tmp/install_rocm_sdk.sh
# 4. Python Venv Setup
# Create a virtual environment at /opt/venv using the Python 3.13 interpreter.
RUN /usr/bin/python3.13 -m venv /opt/venv
+12
Näytä tiedosto
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Installs the system packages needed by the image and by the RCCL build:
# compilers and build tools, Python 3.13, and networking/diagnostic utilities.
# Takes no arguments; exits non-zero if the installation fails.
set -e

# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
dnf -y install --setopt=install_weak_deps=False --nodocs \
    aria2c \
    bash \
    binutils \
    ca-certificates \
    cmake \
    curl \
    dialog \
    ffmpeg-free \
    gcc \
    gcc-c++ \
    git \
    gperftools-libs \
    iperf3 \
    iproute \
    libatomic \
    libdrm-devel \
    libibverbs-utils \
    make \
    nano \
    ninja-build \
    numactl-devel \
    openssl-devel \
    patch \
    perftest \
    pgrep \
    ping \
    python3.13 \
    python3.13-devel \
    rsync \
    tar \
    vim \
    xz \
    zlib-devel

# Drop package-manager caches. Fedora 43 uses DNF5, whose cache lives in
# /var/cache/libdnf5, so clear it alongside the legacy /var/cache/dnf path.
dnf clean all
rm -rf /var/cache/dnf/* /var/cache/libdnf5/*
+58
Näytä tiedosto
@@ -0,0 +1,58 @@
#!/bin/bash
#
# Installs the newest "TheRock" nightly ROCm SDK tarball into /opt/rocm and
# writes the matching shell environment to /etc/profile.d/rocm-sdk.sh.
#
# Environment (both optional):
#   ROCM_MAJOR_VER  ROCm major version used to select the tarball (default: 7)
#   GFX             GPU target used to select the tarball (default: gfx1151)
#
# Exits non-zero when no matching tarball can be found or when the download or
# extraction fails.
set -euo pipefail

# Configuration with defaults matching Dockerfile ARGs
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
GFX="${GFX:-gfx1151}"

echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="

# 2. Install "TheRock" ROCm SDK (Tarball Method)
# We work in /tmp as per Dockerfile WORKDIR
cd /tmp

BASE="https://therock-nightly-tarball.s3.amazonaws.com"
PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"

# Fetch the Key
# -f makes curl report HTTP errors instead of handing an error document to the
# parser. The trailing "|| true" matters: with errexit + pipefail, a grep that
# matches nothing would otherwise abort the script right here, before the
# explicit check below can print a useful message.
KEY="$(curl -fsS "${BASE}?list-type=2&prefix=${PREFIX}" \
    | tr '<' '\n' \
    | grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
    | sort -V | tail -n1 || true)"

if [ -z "$KEY" ]; then
    echo "Error: Could not find tarball key for $PREFIX" >&2
    exit 1
fi

echo "Downloading Latest Tarball: ${KEY}"
aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz

mkdir -p /opt/rocm
tar xzf therock.tar.gz -C /opt/rocm --strip-components=1
rm therock.tar.gz

# 3. Configure Global ROCm Environment
# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
export ROCM_PATH=/opt/rocm
BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit)
if [ -z "$BITCODE_PATH" ]; then
    # find exits 0 even without a match; surface the problem instead of
    # silently writing an empty HIP_DEVICE_LIB_PATH.
    echo "Warning: no 'bitcode' directory found under /opt/rocm; HIP_DEVICE_LIB_PATH will be empty" >&2
fi

echo "Generating /etc/profile.d/rocm-sdk.sh..."
# Values built from $ROCM_PATH / $BITCODE_PATH are expanded now; the escaped
# \$ references are expanded when the profile script is sourced.
# LD_LIBRARY_PATH appends the caller's value only when one is set: an
# unconditional ":$LD_LIBRARY_PATH" leaves a trailing empty entry, which the
# dynamic loader treats as the current working directory.
printf '%s\n' \
    "export ROCM_PATH=/opt/rocm" \
    "export HIP_PLATFORM=amd" \
    "export HIP_PATH=/opt/rocm" \
    "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
    "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
    "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
    "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" \
    "export ROCBLAS_USE_HIPBLASLT=1" \
    "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
    "export VLLM_TARGET_DEVICE=rocm" \
    "export HIP_FORCE_DEV_KERNARG=1" \
    "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
    "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
    > /etc/profile.d/rocm-sdk.sh

chmod 0644 /etc/profile.d/rocm-sdk.sh

echo "=== ROCm SDK Installation Complete ==="