diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
index aa09692..707ff31 100644
--- a/.github/workflows/build-and-publish.yml
+++ b/.github/workflows/build-and-publish.yml
@@ -59,6 +59,14 @@ jobs:
         with:
           buildkitd-flags: --config /tmp/buildkitd.toml
 
+      - name: Download custom RCCL artifact
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: build-rccl.yml
+          name: librccl-gfx1151
+          path: custom_libs
+          if_no_artifact_found: warn
+
       - name: Log in to Docker Hub
         uses: docker/login-action@v3
         with:
diff --git a/.github/workflows/build-rccl.yml b/.github/workflows/build-rccl.yml
new file mode 100644
index 0000000..0b1c679
--- /dev/null
+++ b/.github/workflows/build-rccl.yml
@@ -0,0 +1,42 @@
+# Manually triggered build of the custom RCCL library; the result is uploaded
+# as the 'librccl-gfx1151' artifact that build-and-publish.yml downloads.
+name: build-rccl
+
+on:
+  workflow_dispatch:
+
+env:
+  ROCM_MAJOR_VER: 7
+  GFX: gfx1151
+
+jobs:
+  build-rccl:
+    runs-on: ubuntu-latest
+    container: registry.fedoraproject.org/fedora:43
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Dependencies
+        run: bash scripts/install_deps.sh
+
+      - name: Install ROCm SDK
+        run: bash scripts/install_rocm_sdk.sh
+
+      - name: Build RCCL
+        shell: bash
+        run: |
+          source /etc/profile.d/rocm-sdk.sh
+          bash scripts/build_rccl_gfx1151.sh
+
+      - name: Compress Artifact
+        run: |
+          # Path determined from script logic: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
+          ls -lh rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
+          gzip -c rocm-systems/projects/rccl/build_gfx1151/librccl.so.1 > librccl.so.1.gz
+          ls -lh librccl.so.1.gz
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: librccl-gfx1151
+          path: librccl.so.1.gz
diff --git a/Dockerfile b/Dockerfile
index 3508521..1fcd18d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,51 +2,20 @@ FROM registry.fedoraproject.org/fedora:43
 
 # 1. System Base & Build Tools
 # Added 'gperftools-libs' for tcmalloc (fixes double-free)
-RUN dnf -y install --setopt=install_weak_deps=False --nodocs \
-    python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl \
-    gcc gcc-c++ binutils make ffmpeg-free \
-    cmake ninja-build aria2c tar xz vim nano dialog \
-    libdrm-devel zlib-devel openssl-devel pgrep \
-    numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \
-    && dnf clean all && rm -rf /var/cache/dnf/*
+COPY scripts/install_deps.sh /tmp/install_deps.sh
+RUN bash /tmp/install_deps.sh
 
 # 2. Install "TheRock" ROCm SDK (Tarball Method)
 WORKDIR /tmp
 ARG ROCM_MAJOR_VER=7
 ARG GFX=gfx1151
-RUN set -euo pipefail; \
-    BASE="https://therock-nightly-tarball.s3.amazonaws.com"; \
-    PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"; \
-    KEY="$(curl -s "${BASE}?list-type=2&prefix=${PREFIX}" \
-      | tr '<' '\n' \
-      | grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
-      | sort -V | tail -n1)"; \
-    echo "Downloading Latest Tarball: ${KEY}"; \
-    aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz; \
-    mkdir -p /opt/rocm; \
-    tar xzf therock.tar.gz -C /opt/rocm --strip-components=1; \
-    rm therock.tar.gz
-
-# 3. Configure Global ROCm Environment
-# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
-RUN export ROCM_PATH=/opt/rocm && \
-    BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
-    printf '%s\n' \
-      "export ROCM_PATH=/opt/rocm" \
-      "export HIP_PLATFORM=amd" \
-      "export HIP_PATH=/opt/rocm" \
-      "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
-      "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
-      "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
-      "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib:\$LD_LIBRARY_PATH" \
-      "export ROCBLAS_USE_HIPBLASLT=1" \
-      "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
-      "export VLLM_TARGET_DEVICE=rocm" \
-      "export HIP_FORCE_DEV_KERNARG=1" \
-      "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
-      "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
-      > /etc/profile.d/rocm-sdk.sh && \
-    chmod 0644 /etc/profile.d/rocm-sdk.sh
+# We pass ARGs to the script via ENV or rely on defaults.
+# But let's be explicit and export them for the RUN command.
+COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
+RUN chmod +x /tmp/install_rocm_sdk.sh && \
+    export ROCM_MAJOR_VER=$ROCM_MAJOR_VER && \
+    export GFX=$GFX && \
+    /tmp/install_rocm_sdk.sh
 
 # 4. Python Venv Setup
 RUN /usr/bin/python3.13 -m venv /opt/venv
diff --git a/scripts/install_deps.sh b/scripts/install_deps.sh
new file mode 100755
index 0000000..b7ec3e2
--- /dev/null
+++ b/scripts/install_deps.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+# 1. System Base & Build Tools
+# Added 'gperftools-libs' for tcmalloc (fixes double-free)
+dnf -y install --setopt=install_weak_deps=False --nodocs \
+    python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl \
+    gcc gcc-c++ binutils make ffmpeg-free \
+    cmake ninja-build aria2c tar xz vim nano dialog \
+    libdrm-devel zlib-devel openssl-devel pgrep \
+    numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \
+    && dnf clean all && rm -rf /var/cache/dnf/*
diff --git a/scripts/install_rocm_sdk.sh b/scripts/install_rocm_sdk.sh
new file mode 100755
index 0000000..49b9ef1
--- /dev/null
+++ b/scripts/install_rocm_sdk.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Install the "TheRock" nightly ROCm SDK tarball into /opt/rocm and write the
+# global environment file /etc/profile.d/rocm-sdk.sh.
+#
+# Inputs (environment variables, both optional):
+#   ROCM_MAJOR_VER  ROCm major version used in the tarball prefix (default: 7)
+#   GFX             GPU target used in the tarball prefix (default: gfx1151)
+set -euo pipefail
+
+# Configuration with defaults matching Dockerfile ARGs
+ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
+GFX="${GFX:-gfx1151}"
+
+echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="
+
+# 2. Install "TheRock" ROCm SDK (Tarball Method)
+# We work in /tmp as per Dockerfile WORKDIR
+cd /tmp
+
+BASE="https://therock-nightly-tarball.s3.amazonaws.com"
+PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"
+
+# Fetch the key of the highest-versioned matching tarball from the bucket listing.
+# '|| true' keeps 'set -e' / 'pipefail' from aborting on a failed request or a
+# no-match grep, so the explicit empty-KEY check below can report the problem.
+KEY="$(curl -fsS "${BASE}?list-type=2&prefix=${PREFIX}" \
+    | tr '<' '\n' \
+    | grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
+    | sort -V | tail -n1 || true)"
+
+if [ -z "$KEY" ]; then
+    echo "Error: Could not find tarball key for $PREFIX" >&2
+    exit 1
+fi
+
+echo "Downloading Latest Tarball: ${KEY}"
+aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz
+
+mkdir -p /opt/rocm
+tar xzf therock.tar.gz -C /opt/rocm --strip-components=1
+rm therock.tar.gz
+
+# 3. Configure Global ROCm Environment
+# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
+export ROCM_PATH=/opt/rocm
+BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit)
+if [ -z "$BITCODE_PATH" ]; then
+    echo "Warning: no 'bitcode' directory found under /opt/rocm; HIP_DEVICE_LIB_PATH will be empty" >&2
+fi
+
+echo "Generating /etc/profile.d/rocm-sdk.sh..."
+# Variables escaped as \$ are expanded when the profile is sourced, not now.
+# The ${LD_LIBRARY_PATH:+...} form avoids a trailing ':' (an empty entry, which
+# the loader treats as the current directory) when the variable is unset.
+printf '%s\n' \
+    "export ROCM_PATH=/opt/rocm" \
+    "export HIP_PLATFORM=amd" \
+    "export HIP_PATH=/opt/rocm" \
+    "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
+    "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
+    "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
+    "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" \
+    "export ROCBLAS_USE_HIPBLASLT=1" \
+    "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
+    "export VLLM_TARGET_DEVICE=rocm" \
+    "export HIP_FORCE_DEV_KERNARG=1" \
+    "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
+    "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
+    > /etc/profile.d/rocm-sdk.sh
+
+chmod 0644 /etc/profile.d/rocm-sdk.sh
+echo "=== ROCm SDK Installation Complete ==="