feat: Modularize Dockerfile dependency and ROCm SDK installations into dedicated scripts and add a GitHub Actions workflow to build and consume a custom RCCL library.
This commit is contained in:
Executable
+12
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
#
# Install the system base packages and build tools with dnf, then remove
# the dnf caches so they are not left behind on disk.
#
# Takes no arguments and reads no environment variables.
# Strict mode: abort on any failing command, unset variable, or failing
# pipeline stage.
set -euo pipefail

# 1. System Base & Build Tools
# 'gperftools-libs' was added for tcmalloc (fixes double-free).
# NOTE(review): 'aria2c', 'pgrep' and 'ping' look like command names rather
# than package names; presumably dnf resolves them through its
# provides/binary matching -- confirm against the target distribution.
packages=(
  python3.13 python3.13-devel git rsync libatomic bash ca-certificates curl
  gcc gcc-c++ binutils make ffmpeg-free
  cmake ninja-build aria2c tar xz vim nano dialog
  libdrm-devel zlib-devel openssl-devel pgrep
  numactl-devel gperftools-libs iproute libibverbs-utils patch perftest ping iperf3
)

# Weak dependencies and documentation are skipped via the dnf options below.
dnf -y install --setopt=install_weak_deps=False --nodocs "${packages[@]}"

# Drop package metadata and cached downloads.
dnf clean all
rm -rf /var/cache/dnf/*
|
||||
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
#
# Download the newest "TheRock" nightly ROCm SDK tarball that matches the
# requested GPU target and ROCm major version, unpack it into /opt/rocm,
# and generate /etc/profile.d/rocm-sdk.sh with the ROCm environment.
#
# Environment (optional):
#   ROCM_MAJOR_VER  ROCm major version matched in tarball names (default: 7)
#   GFX             GPU target matched in tarball names (default: gfx1151)
#
# Exits non-zero when the bucket listing cannot be fetched, when no
# matching tarball is found, or when any later step fails.
set -euo pipefail

# Configuration with defaults matching Dockerfile ARGs
ROCM_MAJOR_VER="${ROCM_MAJOR_VER:-7}"
GFX="${GFX:-gfx1151}"

echo "=== Installing ROCm SDK ($GFX / $ROCM_MAJOR_VER) ==="

# 2. Install "TheRock" ROCm SDK (Tarball Method)
# We work in /tmp as per Dockerfile WORKDIR
cd /tmp

BASE="https://therock-nightly-tarball.s3.amazonaws.com"
PREFIX="therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}"

# Fetch the bucket listing. -f makes curl fail on HTTP errors instead of
# handing an error document to the parser below; -S keeps curl's own error
# message visible despite -s.
# NOTE(review): a single list request may not return every key if the
# listing is paginated -- confirm whether continuation tokens are needed.
if ! LISTING="$(curl -fsS "${BASE}?list-type=2&prefix=${PREFIX}")"; then
  echo "Error: Could not fetch tarball listing for $PREFIX" >&2
  exit 1
fi

# Pick the highest version-sorted key. The trailing '|| true' matters:
# with 'set -e -o pipefail', a grep that matches nothing would otherwise
# abort the script right here, before the explicit check below can print
# its error message.
KEY="$(printf '%s\n' "$LISTING" \
  | tr '<' '\n' \
  | grep -o "therock-dist-linux-${GFX}-${ROCM_MAJOR_VER}\..*\.tar\.gz" \
  | sort -V \
  | tail -n1 || true)"

if [ -z "$KEY" ]; then
  echo "Error: Could not find tarball key for $PREFIX" >&2
  exit 1
fi

echo "Downloading Latest Tarball: ${KEY}"
aria2c -x 16 -s 16 -j 16 --file-allocation=none "${BASE}/${KEY}" -o therock.tar.gz

mkdir -p /opt/rocm
tar xzf therock.tar.gz -C /opt/rocm --strip-components=1
rm therock.tar.gz

# 3. Configure Global ROCm Environment
# We add LD_PRELOAD for tcmalloc here to fix the shutdown crash
export ROCM_PATH=/opt/rocm

# First directory named 'bitcode' under the SDK tree.
BITCODE_PATH=$(find /opt/rocm -type d -name bitcode -print -quit)
if [ -z "$BITCODE_PATH" ]; then
  # Best effort: the profile is still written, but flag the empty value.
  echo "Warning: no 'bitcode' directory found under /opt/rocm; HIP_DEVICE_LIB_PATH will be empty" >&2
fi

echo "Generating /etc/profile.d/rocm-sdk.sh..."

# $ROCM_PATH and $BITCODE_PATH are expanded now; the escaped \$PATH and
# \${LD_LIBRARY_PATH...} are written literally so they expand when the
# profile is sourced. LD_LIBRARY_PATH uses the ':+' form so that an unset
# variable does not leave a trailing ':' (an empty search-path entry, which
# the dynamic loader treats as the current directory).
# NOTE(review): the LD_PRELOAD path is presumably provided by the
# 'gperftools-libs' package -- confirm it exists in the final image.
printf '%s\n' \
  "export ROCM_PATH=/opt/rocm" \
  "export HIP_PLATFORM=amd" \
  "export HIP_PATH=/opt/rocm" \
  "export HIP_CLANG_PATH=/opt/rocm/llvm/bin" \
  "export HIP_DEVICE_LIB_PATH=$BITCODE_PATH" \
  "export PATH=$ROCM_PATH/bin:$ROCM_PATH/llvm/bin:\$PATH" \
  "export LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" \
  "export ROCBLAS_USE_HIPBLASLT=1" \
  "export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1" \
  "export VLLM_TARGET_DEVICE=rocm" \
  "export HIP_FORCE_DEV_KERNARG=1" \
  "export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1" \
  "export LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so.4" \
  > /etc/profile.d/rocm-sdk.sh

chmod 0644 /etc/profile.d/rocm-sdk.sh
echo "=== ROCm SDK Installation Complete ==="
|
||||
Reference in New Issue
Block a user