feat: Introduce custom RCCL library management for gfx1151, including build scripts, Docker integration, and VLLM benchmarks.

This commit is contained in:
Donato Capitella
2026-02-01 13:23:10 +00:00
parent 13caab0634
commit a8added616
3 ha cambiato i file con 154 aggiunte e 0 eliminazioni
+11
Vedi File
@@ -163,4 +163,15 @@ RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmo
RUN chmod 0644 /etc/profile.d/*.sh
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
# The gzipped library is copied into the image, unpacked, written over the ROCm
# copy and over every librccl.so.1 found under /opt/venv, then the temporary
# file is removed.
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
RUN echo "Installing Custom RCCL..." && \
    gzip -d /tmp/librccl.so.1.gz && \
    chmod 755 /tmp/librccl.so.1 && \
    # Replace /opt/rocm library strictly as managed_rccl_install.sh does
    cp -fv /tmp/librccl.so.1 /opt/rocm/lib/librccl.so.1.0 && \
    # Replace /opt/venv library.
    # Each match is copied individually: passing all matches to a single
    # "cp SRC {} +" would make cp treat the last match as a target directory
    # and fail as soon as more than one librccl.so.1 exists. The "+" form is
    # kept around the sh loop so a failing cp still fails the build.
    find /opt/venv -name "librccl.so.1" \
        -exec sh -c 'for dest; do cp -fv /tmp/librccl.so.1 "$dest" || exit 1; done' sh {} + && \
    rm /tmp/librccl.so.1
CMD ["/bin/bash"]
+50
Vedi File
@@ -0,0 +1,50 @@
#!/bin/bash
# Builds RCCL for the gfx1151 GPU target.
#
# Clones the configured branch of the repository into ./rocm-systems (or
# updates an existing checkout), configures CMake in
# projects/rccl/build_gfx1151 and runs make there.
#
# Environment:
#   ROCM_PATH  ROCm installation prefix (default: /opt/rocm). hipcc is taken
#              from "$ROCM_PATH/bin/hipcc".
set -euo pipefail

# Configuration
REPO_URL="https://github.com/kyuz0/rocm-systems.git"
BRANCH="gfx1151-rccl"
BUILD_DIR="build_gfx1151"
ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
# Project sub-directory
PROJECT_DIR="projects/rccl"

echo "=== Building RCCL for gfx1151 ==="
echo "Repo: $REPO_URL"
echo "Branch: $BRANCH"
echo "ROCm Path: $ROCM_PATH"

# 1. Clone/Fetch
if [ -d "rocm-systems" ]; then
  echo "Directory 'rocm-systems' exists. Updating..."
  cd rocm-systems
  git fetch origin
  git checkout "$BRANCH"
  git pull origin "$BRANCH"
else
  echo "Cloning repository..."
  git clone -b "$BRANCH" "$REPO_URL"
  cd rocm-systems
fi

# 2. Setup Build Directory
echo "Entering project directory..."
cd "$PROJECT_DIR"
mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"

echo "Configuring CMake for gfx1151..."
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
# All expansions are quoted so a ROCM_PATH containing spaces stays one argument.
CXX="$ROCM_PATH/bin/hipcc" cmake .. \
  -DCMAKE_CXX_COMPILER="$ROCM_PATH/bin/hipcc" \
  -DDEFAULT_GPUS="gfx1151" \
  -DGPU_TARGETS="gfx1151" \
  -DAMDGPU_TARGETS="gfx1151" \
  -DCMAKE_INSTALL_PREFIX=./install \
  -DBUILD_TESTS=OFF \
  -DGENERATE_SYM_KERNELS=OFF \
  -DENABLE_AMDSMI=OFF \
  -DCMAKE_BUILD_TYPE=Release

# 3. Build
echo "Building librccl.so..."
make -j"$(nproc)"

echo "=== Build Complete ==="
echo "Libraries are located in:"
echo " $(pwd)/librccl.so"
echo " $(pwd)/librccl.so.1"
+93
Vedi File
@@ -0,0 +1,93 @@
#!/bin/bash
# Backs up the installed RCCL libraries and replaces them with a custom build,
# or restores them from the most recent backup.
set -e

# Configuration
# Paths identified from your environment
ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"
# Timestamped so repeated installs never overwrite an earlier backup.
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"

# Files to replace
# Default name of the new library, looked up relative to the current directory.
# It is deliberately NOT derived from "$1": at this point "$1" is still the
# subcommand word (install/restore), not a library path.
NEW_LIB="librccl.so.1"
# Prints the command-line synopsis to stdout and terminates with status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [install <path_to_new_lib> | restore]" \
    " install: Backs up existing libs and installs the new one." \
    " restore: Restores libraries from the most recent backup directory."
  exit 1
}
# Backs up the currently installed RCCL libraries, then copies NEW_LIB over them.
# Globals read: NEW_LIB, BACKUP_DIR, ROCM_LIB_PATH, VENV_LIB_PATH
# Files written: $BACKUP_DIR/*.bak, .last_rccl_backup (in the current directory),
#                $ROCM_LIB_PATH and, when its directory exists, $VENV_LIB_PATH
# Exits with status 1 when NEW_LIB is not a regular file.
do_install() {
  [ -f "$NEW_LIB" ] || {
    echo "Error: New library file '$NEW_LIB' not found."
    echo "Please provide the path to the newly built librccl.so.1"
    exit 1
  }

  echo "=== Installing Custom RCCL (gfx1151) ==="
  echo "Creating backup directory: $BACKUP_DIR"
  mkdir -p "$BACKUP_DIR"

  # Steps 1 and 2: back up the /opt/rocm copy, then the /opt/venv copy.
  # The positional parameters hold (installed path, backup file name) pairs.
  set -- \
    "$ROCM_LIB_PATH" "librccl.so.1.0.rocm.bak" \
    "$VENV_LIB_PATH" "librccl.so.1.venv.bak"
  while [ "$#" -gt 0 ]; do
    if [ ! -f "$1" ]; then
      echo "Warning: $1 not found, skipping backup."
    else
      echo "Backing up $1..."
      cp -v "$1" "$BACKUP_DIR/$2"
    fi
    shift 2
  done

  # Remember which backup directory the restore command should use.
  echo "$BACKUP_DIR" > .last_rccl_backup

  # Step 3: overwrite the /opt/rocm copy (sudo: target is expected to be root-owned).
  echo "Installing to $ROCM_LIB_PATH..."
  sudo cp -v "$NEW_LIB" "$ROCM_LIB_PATH"

  # Step 4: overwrite the venv copy, but only when its directory is present.
  if [ ! -d "$(dirname "$VENV_LIB_PATH")" ]; then
    echo "Skipping venv install (directory not found)."
  else
    echo "Installing to $VENV_LIB_PATH..."
    sudo cp -v "$NEW_LIB" "$VENV_LIB_PATH"
  fi

  echo "=== Installation Complete ==="
}
# Copies the backed-up libraries from the directory named in .last_rccl_backup
# back to their installed locations.
# Globals read: ROCM_LIB_PATH, VENV_LIB_PATH
# Globals written: LAST_BACKUP
# Exits with status 1 when the record file or the recorded directory is missing.
do_restore() {
  [ -f .last_rccl_backup ] || {
    echo "Error: No previous backup record found (.last_rccl_backup)."
    echo "Please manually restore from your backup directories."
    exit 1
  }

  LAST_BACKUP=$(<.last_rccl_backup)
  echo "=== Restoring RCCL from $LAST_BACKUP ==="

  [ -d "$LAST_BACKUP" ] || {
    echo "Error: Backup directory $LAST_BACKUP does not exist."
    exit 1
  }

  # (backup file name, destination) pairs: ROCm library first, then venv library.
  # A backup file that is absent is skipped without a message.
  set -- \
    "librccl.so.1.0.rocm.bak" "$ROCM_LIB_PATH" \
    "librccl.so.1.venv.bak" "$VENV_LIB_PATH"
  while [ "$#" -gt 0 ]; do
    if [ -f "$LAST_BACKUP/$1" ]; then
      echo "Restoring $2..."
      sudo cp -v "$LAST_BACKUP/$1" "$2"
    fi
    shift 2
  done

  echo "=== Restore Complete ==="
}
# Entry point: dispatches on the first command-line argument.
#   install [path]  sets NEW_LIB to path (default: librccl.so.1) and installs it
#   restore         restores the libraries from the last recorded backup
#   anything else   (including no argument at all) prints the usage text
main() {
  # Arguments are read with defaults instead of "$1" followed by a bare
  # `shift`: with no arguments `shift` returns non-zero, which under `set -e`
  # terminates the script before the usage text can be printed.
  local subcommand="${1:-}"
  case "$subcommand" in
    install)
      NEW_LIB="${2:-librccl.so.1}"
      do_install
      ;;
    restore)
      do_restore
      ;;
    *)
      usage
      ;;
  esac
}

main "$@"