diff --git a/Dockerfile b/Dockerfile
index c209bb2..3508521 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -163,4 +163,18 @@ RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmo
 RUN chmod 0644 /etc/profile.d/*.sh
 RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
 
+# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
+COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
+RUN echo "Installing Custom RCCL..." && \
+    gzip -d /tmp/librccl.so.1.gz && \
+    chmod 755 /tmp/librccl.so.1 && \
+    # Replace /opt/rocm library strictly as manage_rccl_install.sh does
+    cp -fv /tmp/librccl.so.1 /opt/rocm/lib/librccl.so.1.0 && \
+    # Replace every copy under /opt/venv, one cp per match. A plain
+    # -exec cp SRC {} + would pass several matches to a single cp call,
+    # which then fails because the last match is not a directory.
+    find /opt/venv -name "librccl.so.1" \
+        -exec sh -c 'for f; do cp -fv /tmp/librccl.so.1 "$f" || exit 1; done' sh {} + && \
+    rm /tmp/librccl.so.1
+
 CMD ["/bin/bash"]
diff --git a/scripts/build_rccl_gfx1151.sh b/scripts/build_rccl_gfx1151.sh
new file mode 100755
index 0000000..eba9195
--- /dev/null
+++ b/scripts/build_rccl_gfx1151.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#
+# Builds librccl.so for the gfx1151 GPU target.
+#
+# Clones (or updates) branch BRANCH of REPO_URL into ./rocm-systems, configures
+# projects/rccl with CMake using hipcc from ROCM_PATH, then runs make.
+#
+# Usage: build_rccl_gfx1151.sh
+# Environment:
+#   ROCM_PATH  ROCm installation prefix (default: /opt/rocm)
+set -euo pipefail
+
+# Configuration
+REPO_URL="https://github.com/kyuz0/rocm-systems.git"
+BRANCH="gfx1151-rccl"
+BUILD_DIR="build_gfx1151"
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+# Project sub-directory
+PROJECT_DIR="projects/rccl"
+
+echo "=== Building RCCL for gfx1151 ==="
+echo "Repo: $REPO_URL"
+echo "Branch: $BRANCH"
+echo "ROCm Path: $ROCM_PATH"
+
+# 1. Clone/Fetch
+if [ -d "rocm-systems" ]; then
+  echo "Directory 'rocm-systems' exists. Updating..."
+  cd rocm-systems
+  git fetch origin
+  git checkout "$BRANCH"
+  git pull origin "$BRANCH"
+else
+  echo "Cloning repository..."
+  git clone -b "$BRANCH" "$REPO_URL"
+  cd rocm-systems
+fi
+
+# 2. Setup Build Directory
+echo "Entering project directory..."
+cd "$PROJECT_DIR"
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+echo "Configuring CMake for gfx1151..."
+# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
+# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
+CXX="$ROCM_PATH/bin/hipcc" cmake .. \
+  -DCMAKE_CXX_COMPILER="$ROCM_PATH/bin/hipcc" \
+  -DDEFAULT_GPUS="gfx1151" \
+  -DGPU_TARGETS="gfx1151" \
+  -DAMDGPU_TARGETS="gfx1151" \
+  -DCMAKE_INSTALL_PREFIX=./install \
+  -DBUILD_TESTS=OFF \
+  -DGENERATE_SYM_KERNELS=OFF \
+  -DENABLE_AMDSMI=OFF \
+  -DCMAKE_BUILD_TYPE=Release
+
+# 3. Build
+echo "Building librccl.so..."
+make -j"$(nproc)"
+
+echo "=== Build Complete ==="
+echo "Libraries are located in:"
+echo " $(pwd)/librccl.so"
+echo " $(pwd)/librccl.so.1"
diff --git a/scripts/manage_rccl_install.sh b/scripts/manage_rccl_install.sh
new file mode 100755
index 0000000..99c0509
--- /dev/null
+++ b/scripts/manage_rccl_install.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+#
+# Installs a custom-built RCCL library over the existing copies, or restores
+# those copies from the most recent backup.
+#
+# Usage: manage_rccl_install.sh install [NEW_LIB]
+#        manage_rccl_install.sh restore
+#
+# NEW_LIB defaults to ./librccl.so.1. Backups are written to
+# ./rccl_backups_<timestamp>; that directory name is recorded in
+# ./.last_rccl_backup so that "restore" can find it.
+set -euo pipefail
+
+# Configuration
+# Paths identified from your environment
+ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
+VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
+BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
+
+# Library to install. Only a default: the "install" command overrides it with
+# its own argument. (It must not be read from $1 here, which is the command.)
+NEW_LIB="librccl.so.1"
+
+# Prints the usage text and exits with status 1.
+usage() {
+  echo "Usage: $0 [install [NEW_LIB] | restore]"
+  echo " install: Backs up existing libs and installs the new one."
+  echo " restore: Restores libraries from the most recent backup directory."
+  exit 1
+}
+
+# Backs up the current libraries into BACKUP_DIR, then copies NEW_LIB over
+# ROCM_LIB_PATH and, when its directory exists, over VENV_LIB_PATH.
+# Exits with status 1 if NEW_LIB is not a regular file.
+do_install() {
+  if [ ! -f "$NEW_LIB" ]; then
+    echo "Error: New library file '$NEW_LIB' not found."
+    echo "Please provide the path to the newly built librccl.so.1"
+    exit 1
+  fi
+
+  echo "=== Installing Custom RCCL (gfx1151) ==="
+  echo "Creating backup directory: $BACKUP_DIR"
+  mkdir -p "$BACKUP_DIR"
+
+  # 1. Backup /opt/rocm location
+  if [ -f "$ROCM_LIB_PATH" ]; then
+    echo "Backing up $ROCM_LIB_PATH..."
+    cp -v "$ROCM_LIB_PATH" "$BACKUP_DIR/librccl.so.1.0.rocm.bak"
+  else
+    echo "Warning: $ROCM_LIB_PATH not found, skipping backup."
+  fi
+
+  # 2. Backup /opt/venv location
+  if [ -f "$VENV_LIB_PATH" ]; then
+    echo "Backing up $VENV_LIB_PATH..."
+    cp -v "$VENV_LIB_PATH" "$BACKUP_DIR/librccl.so.1.venv.bak"
+  else
+    echo "Warning: $VENV_LIB_PATH not found, skipping backup."
+  fi
+
+  # Save backup dir name for restore
+  echo "$BACKUP_DIR" > .last_rccl_backup
+
+  # 3. Install to /opt/rocm
+  echo "Installing to $ROCM_LIB_PATH..."
+  # We use sudo assuming root ownership of the target
+  sudo cp -v "$NEW_LIB" "$ROCM_LIB_PATH"
+
+  # 4. Install to /opt/venv
+  if [ -d "$(dirname "$VENV_LIB_PATH")" ]; then
+    echo "Installing to $VENV_LIB_PATH..."
+    sudo cp -v "$NEW_LIB" "$VENV_LIB_PATH"
+  else
+    echo "Skipping venv install (directory not found)."
+  fi
+
+  echo "=== Installation Complete ==="
+}
+
+# Restores the libraries from the backup directory named in .last_rccl_backup.
+# Exits with status 1 if that record or the directory it names is missing.
+do_restore() {
+  if [ ! -f .last_rccl_backup ]; then
+    echo "Error: No previous backup record found (.last_rccl_backup)."
+    echo "Please manually restore from your backup directories."
+    exit 1
+  fi
+
+  local last_backup
+  last_backup=$(cat .last_rccl_backup)
+  echo "=== Restoring RCCL from $last_backup ==="
+
+  if [ ! -d "$last_backup" ]; then
+    echo "Error: Backup directory $last_backup does not exist."
+    exit 1
+  fi
+
+  # Restore ROCm lib
+  if [ -f "$last_backup/librccl.so.1.0.rocm.bak" ]; then
+    echo "Restoring $ROCM_LIB_PATH..."
+    sudo cp -v "$last_backup/librccl.so.1.0.rocm.bak" "$ROCM_LIB_PATH"
+  fi
+
+  # Restore Venv lib
+  if [ -f "$last_backup/librccl.so.1.venv.bak" ]; then
+    echo "Restoring $VENV_LIB_PATH..."
+    sudo cp -v "$last_backup/librccl.so.1.venv.bak" "$VENV_LIB_PATH"
+  fi
+
+  echo "=== Restore Complete ==="
+}
+
+# Dispatch on the first argument. "shift" is skipped when no argument was
+# given: a failing shift under "set -e" would exit before usage is printed.
+COMMAND="${1:-}"
+if [ "$#" -gt 0 ]; then
+  shift
+fi
+
+case "$COMMAND" in
+  install)
+    NEW_LIB="${1:-librccl.so.1}"
+    do_install
+    ;;
+  restore)
+    do_restore
+    ;;
+  *)
+    usage
+    ;;
+esac