feat: Modularize Dockerfile dependency and ROCm SDK installations into dedicated scripts and add a GitHub Actions workflow to build and consume a custom RCCL library.
This commit is contained in:
@@ -59,6 +59,14 @@ jobs:
|
||||
with:
|
||||
buildkitd-flags: --config /tmp/buildkitd.toml
|
||||
|
||||
- name: Download custom RCCL artifact
|
||||
uses: dawidd6/action-download-artifact@v6
|
||||
with:
|
||||
workflow: build-rccl.yml
|
||||
name: librccl-gfx1151
|
||||
path: custom_libs
|
||||
if_no_artifact_found: warn
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
# Manually triggered workflow: builds RCCL for one GPU target inside a Fedora
# container and publishes the gzip-compressed shared library as an artifact.
name: build-rccl

on:
  workflow_dispatch:

# scripts/install_rocm_sdk.sh reads both values to pick the SDK tarball.
env:
  ROCM_MAJOR_VER: 7
  GFX: gfx1151

jobs:
  build-rccl:
    runs-on: ubuntu-latest
    container: registry.fedoraproject.org/fedora:43
    steps:
      - uses: actions/checkout@v4

      - name: Install Dependencies
        run: bash scripts/install_deps.sh

      - name: Install ROCm SDK
        run: bash scripts/install_rocm_sdk.sh

      - name: Build RCCL
        shell: bash
        run: |
          # Load the ROCm environment written by the SDK install step.
          source /etc/profile.d/rocm-sdk.sh
          bash scripts/build_rccl_gfx1151.sh

      - name: Compress Artifact
        env:
          # Output location determined from the build script's logic.
          RCCL_LIB: rocm-systems/projects/rccl/build_gfx1151/librccl.so.1
        run: |
          ls -lh "$RCCL_LIB"
          gzip -c "$RCCL_LIB" > librccl.so.1.gz
          ls -lh librccl.so.1.gz

      - name: Upload Artifact
        uses: actions/upload-artifact@v4
        with:
          name: librccl-gfx1151
          path: librccl.so.1.gz
|
||||
+9
-40
@@ -2,51 +2,20 @@ FROM registry.fedoraproject.org/fedora:43
|
||||
|
||||
# 1. System Base & Build Tools
|
||||
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
|
||||
RUN dnf -y install --setopt=install_weak_deps=False --nodocs \
|
||||
python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl \
|
||||
gcc gcc-c++ binutils make ffmpeg-free \
|
||||
cmake ninja-build aria2c tar xz vim nano dialog \
|
||||
libdrm-devel zlib-devel openssl-devel pgrep \
|
||||
numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3 \
|
||||
&& dnf clean all && rm -rf /var/cache/dnf/*
|
||||
COPY scripts/install_deps.sh /tmp/install_deps.sh
|
||||
RUN sh /tmp/install_deps.sh
|
||||
|
||||
# 2. Install "TheRock" ROCm SDK (Tarball Method)
|
||||
WORKDIR /tmp
|
||||
ARG ROCM_MAJOR_VER=7
|
||||
ARG GFX=gfx1151
|
||||
RUN set -euo pipefail; \
|
||||
BASE="https://therock-nightly-tarball.s3.amazonaws.com"; \
|
||||
PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"; \
|
||||
KEY="$(curl -s "${BASE}?list-type=2&prefix=${PREFIX}" \
|
||||
| tr '<' '\n' \
|
||||
| grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
|
||||
| sort -V | tail -n1)"; \
|
||||
echo "Downloading Latest Tarball: ${KEY}"; \
|
||||
aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz; \
|
||||
mkdir -p /opt/rocm; \
|
||||
tar xzf therock.tar.gz -C /opt/rocm --strip-components=1; \
|
||||
rm therock.tar.gz
|
||||
|
||||
# 3. Configure Global ROCm Environment
|
||||
# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
|
||||
RUN export ROCM_PATH=/opt/rocm && \
|
||||
BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
|
||||
printf '%s\n' \
|
||||
"export ROCM_PATH=/opt/rocm" \
|
||||
"export HIP_PLATFORM=amd" \
|
||||
"export HIP_PATH=/opt/rocm" \
|
||||
"export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
|
||||
"export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
|
||||
"export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
|
||||
"export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib:\$LD_LIBRARY_PATH" \
|
||||
"export ROCBLAS_USE_HIPBLASLT=1" \
|
||||
"export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
|
||||
"export VLLM_TARGET_DEVICE=rocm" \
|
||||
"export HIP_FORCE_DEV_KERNARG=1" \
|
||||
"export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
|
||||
"export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
|
||||
> /etc/profile.d/rocm-sdk.sh && \
|
||||
chmod 0644 /etc/profile.d/rocm-sdk.sh
|
||||
# Forward the build ARGs to the script explicitly as environment variables;
# the script falls back to its own defaults when they are unset or empty.
|
||||
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
|
||||
RUN chmod +x /tmp/install_rocm_sdk.sh && \
|
||||
export ROCM_MAJOR_VER=$ROCM_MAJOR_VER && \
|
||||
export GFX=$GFX && \
|
||||
/tmp/install_rocm_sdk.sh
|
||||
|
||||
# 4. Python Venv Setup
|
||||
RUN /usr/bin/python3.13 -m venv /opt/venv
|
||||
|
||||
Executable
+12
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Installs the system base and build tools with dnf, then drops the dnf
# caches. Any failing command aborts the script with a non-zero status.
set -e

# 1. System Base & Build Tools
# 'gperftools-libs' is included for tcmalloc (fixes double-free).
# One package per line, sorted, so future diffs stay small.
dnf -y install --setopt=install_weak_deps=False --nodocs \
    aria2c \
    bash \
    binutils \
    ca-certificates \
    cmake \
    curl \
    dialog \
    ffmpeg-free \
    gcc \
    gcc-c++ \
    git \
    gperftools-libs \
    iperf3 \
    iproute \
    libatomic \
    libdrm-devel \
    libibverbs-utils \
    make \
    nano \
    ninja-build \
    numactl-devel \
    openssl-devel \
    patch \
    perftest \
    pgrep \
    ping \
    python3.13 \
    python3.13-devel \
    rsync \
    tar \
    vim \
    xz \
    zlib-devel

# Remove package metadata and cached downloads.
dnf clean all
rm -rf /var/cache/dnf/*
|
||||
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
# Installs the newest "TheRock" nightly ROCm SDK tarball into /opt/rocm and
# writes the global ROCm environment to /etc/profile.d/rocm-sdk.sh.
#
# Environment (optional):
#   ROCM_MAJOR_VER  ROCm major version used in the tarball name (default: 7)
#   GFX             GPU target used in the tarball name (default: gfx1151)
#
# Exits non-zero if the bucket listing cannot be fetched, no matching tarball
# is found, or any download/extract step fails.
set -euo pipefail

# Configuration with defaults matching Dockerfile ARGs
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
GFX="${GFX:-gfx1151}"

echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="

# 2. Install "TheRock" ROCm SDK (Tarball Method)
# We work in /tmp as per Dockerfile WORKDIR
cd /tmp

BASE="https://therock-nightly-tarball.s3.amazonaws.com"
PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"

# Fetch the bucket listing. -f turns HTTP errors into a non-zero exit instead
# of handing an error document to the parser; -S keeps curl's error message
# visible even though progress output is silenced.
if ! LISTING="$(curl -fsS "${BASE}?list-type=2&prefix=${PREFIX}")"; then
    echo "Error: Could not fetch tarball listing for $PREFIX" >&2
    exit 1
fi

# Pick the highest-versioned key. grep exits 1 when nothing matches, which
# under 'set -e' + 'pipefail' would abort the script right here and skip the
# diagnostic below; '|| true' lets the empty result reach that check.
KEY="$(printf '%s\n' "$LISTING" \
    | tr '<' '\n' \
    | grep -o "${PREFIX}\..*\.tar\.gz" \
    | sort -V | tail -n1 || true)"

if [ -z "$KEY" ]; then
    echo "Error: Could not find tarball key for $PREFIX" >&2
    exit 1
fi

echo "Downloading Latest Tarball: ${KEY}"
aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz

mkdir -p /opt/rocm
tar xzf therock.tar.gz -C /opt/rocm --strip-components=1
rm therock.tar.gz

# 3. Configure Global ROCm Environment
# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
export ROCM_PATH=/opt/rocm
# First directory named 'bitcode' under the SDK tree (empty if none exists).
BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit)

echo "Generating /etc/profile.d/rocm-sdk.sh..."
# $ROCM_PATH and $BITCODE_PATH are expanded now; \$PATH and \$LD_LIBRARY_PATH
# stay literal so they are resolved when the profile script is sourced.
printf '%s\n' \
    "export ROCM_PATH=/opt/rocm" \
    "export HIP_PLATFORM=amd" \
    "export HIP_PATH=/opt/rocm" \
    "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
    "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
    "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
    "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib:\$LD_LIBRARY_PATH" \
    "export ROCBLAS_USE_HIPBLASLT=1" \
    "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
    "export VLLM_TARGET_DEVICE=rocm" \
    "export HIP_FORCE_DEV_KERNARG=1" \
    "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
    "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
    > /etc/profile.d/rocm-sdk.sh

chmod 0644 /etc/profile.d/rocm-sdk.sh
echo "=== ROCm SDK Installation Complete ==="
|
||||
مرجع در شماره جدید
Block a user