feat: Introduce custom RCCL library management for gfx1151, including build scripts, Docker integration, and VLLM benchmarks.
Этот коммит содержится в:
@@ -163,4 +163,15 @@ RUN chmod 0644 /etc/profile.d/*.sh && chmod +x /usr/local/bin/start-vllm && chmo
|
||||
RUN chmod 0644 /etc/profile.d/*.sh
|
||||
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
|
||||
|
||||
# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
# Unpack the prebuilt library, then overwrite every copy that may be loaded:
#   - /opt/rocm/lib/librccl.so.1.0 (the same target managed_rccl_install.sh replaces)
#   - each librccl.so.1 found under /opt/venv
# The venv copies are written one destination at a time. Passing all matches to a
# single `cp SRC {} +` makes cp treat the last match as a destination directory,
# which fails as soon as more than one librccl.so.1 exists. The `sh -c` loop exits
# non-zero on the first failed copy so that find (and therefore the build) fails too.
RUN echo "Installing Custom RCCL..." && \
    gzip -d /tmp/librccl.so.1.gz && \
    chmod 755 /tmp/librccl.so.1 && \
    cp -fv /tmp/librccl.so.1 /opt/rocm/lib/librccl.so.1.0 && \
    find /opt/venv -name "librccl.so.1" -exec sh -c \
        'for dst in "$@"; do cp -fv /tmp/librccl.so.1 "$dst" || exit 1; done' sh {} + && \
    rm /tmp/librccl.so.1
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
Исполняемый файл
+50
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
#
# Builds RCCL for gfx1151 from the gfx1151-rccl branch of kyuz0/rocm-systems.
#
# Clones the repository into ./rocm-systems (or updates an existing checkout),
# configures projects/rccl with CMake using hipcc as the C++ compiler, and runs
# make inside rocm-systems/projects/rccl/build_gfx1151.
#
# Environment:
#   ROCM_PATH  ROCm installation prefix (default: /opt/rocm).
set -euo pipefail

# Configuration
readonly REPO_URL="https://github.com/kyuz0/rocm-systems.git"
readonly BRANCH="gfx1151-rccl"
readonly BUILD_DIR="build_gfx1151"
ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
# Directory the repository is cloned into (the name git derives from REPO_URL).
readonly CHECKOUT_DIR="rocm-systems"
# Project sub-directory
readonly PROJECT_DIR="projects/rccl"
readonly HIPCC="$ROCM_PATH/bin/hipcc"

echo "=== Building RCCL for gfx1151 ==="
echo "Repo: $REPO_URL"
echo "Branch: $BRANCH"
echo "ROCm Path: $ROCM_PATH"

# Fail early with a clear message instead of deep inside the CMake configure step.
if [[ ! -x "$HIPCC" ]]; then
  echo "Error: hipcc not found at $HIPCC (set ROCM_PATH to your ROCm installation)." >&2
  exit 1
fi

# 1. Clone/Fetch
if [[ -d "$CHECKOUT_DIR" ]]; then
  echo "Directory '$CHECKOUT_DIR' exists. Updating..."
  cd "$CHECKOUT_DIR"
  git fetch origin
  git checkout "$BRANCH"
  git pull origin "$BRANCH"
else
  echo "Cloning repository..."
  git clone -b "$BRANCH" "$REPO_URL" "$CHECKOUT_DIR"
  cd "$CHECKOUT_DIR"
fi

# 2. Setup Build Directory
echo "Entering project directory..."
cd "$PROJECT_DIR"
mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"

echo "Configuring CMake for gfx1151..."
# We explicitly set GPU_TARGETS to gfx1151 to override the default list.
# We also set AMDGPU_TARGETS for standard rocm-cmake compliance.
CXX="$HIPCC" cmake .. \
  -DCMAKE_CXX_COMPILER="$HIPCC" \
  -DDEFAULT_GPUS="gfx1151" \
  -DGPU_TARGETS="gfx1151" \
  -DAMDGPU_TARGETS="gfx1151" \
  -DCMAKE_INSTALL_PREFIX=./install \
  -DBUILD_TESTS=OFF \
  -DGENERATE_SYM_KERNELS=OFF \
  -DENABLE_AMDSMI=OFF \
  -DCMAKE_BUILD_TYPE=Release

# 3. Build
echo "Building librccl.so..."
make -j"$(nproc)"

echo "=== Build Complete ==="
echo "Libraries are located in:"
echo " $(pwd)/librccl.so"
echo " $(pwd)/librccl.so.1"
|
||||
Исполняемый файл
+93
@@ -0,0 +1,93 @@
|
||||
#!/bin/bash
#
# Installs a custom-built RCCL library (gfx1151) over the existing copies, or
# restores those copies from the most recent backup.
#
# Usage:
#   <script> install [path_to_new_lib]   (default library: ./librccl.so.1)
#   <script> restore
set -e

# Configuration
# Library locations that get backed up and overwritten.
readonly ROCM_LIB_PATH="/opt/rocm/lib/librccl.so.1.0"
readonly VENV_LIB_PATH="/opt/venv/lib/python3.13/site-packages/_rocm_sdk_libraries_gfx1151/lib/librccl.so.1"

# Timestamped so that each install run gets its own backup directory.
BACKUP_DIR="./rccl_backups_$(date +%Y%m%d_%H%M%S)"
readonly BACKUP_DIR

# Library to install. This is only the default: at this point $1 is the
# sub-command word (install/restore), not a path, so it must not be read here.
# The `install` branch of the command dispatch sets the real value.
NEW_LIB="librccl.so.1"
|
||||
#######################################
# Prints the command-line synopsis and terminates the script.
# Outputs:
#   Usage text on stderr; this is a diagnostic for an unrecognised or missing
#   sub-command, so it must not pollute stdout.
# Returns:
#   Never returns; exits the shell with status 1.
#######################################
usage() {
  {
    echo "Usage: $0 [install <path_to_new_lib> | restore]"
    echo " install: Backs up existing libs and installs the new one."
    echo " restore: Restores libraries from the most recent backup directory."
  } >&2
  exit 1
}
|
||||
#######################################
# Backs up the current RCCL libraries, then overwrites them with $NEW_LIB.
# Globals:
#   NEW_LIB        (read) path of the replacement library
#   BACKUP_DIR     (read) directory created to hold the backups
#   ROCM_LIB_PATH  (read) ROCm copy that is backed up and overwritten
#   VENV_LIB_PATH  (read) venv copy that is backed up and overwritten
# Outputs:
#   Progress on stdout, errors on stderr. Writes the backup directory name to
#   .last_rccl_backup in the current directory for a later restore.
# Returns:
#   Exits the shell with status 1 if $NEW_LIB is not a regular file.
#######################################
do_install() {
  # Elevate with sudo only when not already root: inside containers the script
  # typically runs as root and sudo is often not installed at all.
  local -a as_root=()
  if (( EUID != 0 )); then
    as_root=(sudo)
  fi

  if [[ ! -f "$NEW_LIB" ]]; then
    echo "Error: New library file '$NEW_LIB' not found." >&2
    echo "Please provide the path to the newly built librccl.so.1" >&2
    exit 1
  fi

  echo "=== Installing Custom RCCL (gfx1151) ==="
  echo "Creating backup directory: $BACKUP_DIR"
  mkdir -p -- "$BACKUP_DIR"

  # 1. Backup /opt/rocm location
  if [[ -f "$ROCM_LIB_PATH" ]]; then
    echo "Backing up $ROCM_LIB_PATH..."
    cp -v -- "$ROCM_LIB_PATH" "$BACKUP_DIR/librccl.so.1.0.rocm.bak"
  else
    echo "Warning: $ROCM_LIB_PATH not found, skipping backup."
  fi

  # 2. Backup /opt/venv location
  if [[ -f "$VENV_LIB_PATH" ]]; then
    echo "Backing up $VENV_LIB_PATH..."
    cp -v -- "$VENV_LIB_PATH" "$BACKUP_DIR/librccl.so.1.venv.bak"
  else
    echo "Warning: $VENV_LIB_PATH not found, skipping backup."
  fi

  # Save backup dir name for restore
  echo "$BACKUP_DIR" > .last_rccl_backup

  # 3. Install to /opt/rocm
  echo "Installing to $ROCM_LIB_PATH..."
  "${as_root[@]}" cp -v -- "$NEW_LIB" "$ROCM_LIB_PATH"

  # 4. Install to /opt/venv (only when its directory exists)
  if [[ -d "$(dirname -- "$VENV_LIB_PATH")" ]]; then
    echo "Installing to $VENV_LIB_PATH..."
    "${as_root[@]}" cp -v -- "$NEW_LIB" "$VENV_LIB_PATH"
  else
    echo "Skipping venv install (directory not found)."
  fi

  echo "=== Installation Complete ==="
}
|
||||
#######################################
# Restores the RCCL libraries from the backup recorded in .last_rccl_backup.
# Globals:
#   ROCM_LIB_PATH  (read) destination for librccl.so.1.0.rocm.bak
#   VENV_LIB_PATH  (read) destination for librccl.so.1.venv.bak
# Outputs:
#   Progress on stdout; errors and warnings on stderr.
# Returns:
#   Exits the shell with status 1 if .last_rccl_backup (read from the current
#   directory) or the directory it names is missing. Returns 0 with a warning
#   when the backup directory holds neither backup file.
#######################################
do_restore() {
  local last_backup
  local restored=0
  # Elevate with sudo only when not already root: inside containers the script
  # typically runs as root and sudo is often not installed at all.
  local -a as_root=()
  if (( EUID != 0 )); then
    as_root=(sudo)
  fi

  if [[ ! -f .last_rccl_backup ]]; then
    echo "Error: No previous backup record found (.last_rccl_backup)." >&2
    echo "Please manually restore from your backup directories." >&2
    exit 1
  fi

  last_backup=$(<.last_rccl_backup)
  echo "=== Restoring RCCL from $last_backup ==="

  if [[ ! -d "$last_backup" ]]; then
    echo "Error: Backup directory $last_backup does not exist." >&2
    exit 1
  fi

  # Restore ROCm lib
  if [[ -f "$last_backup/librccl.so.1.0.rocm.bak" ]]; then
    echo "Restoring $ROCM_LIB_PATH..."
    "${as_root[@]}" cp -v -- "$last_backup/librccl.so.1.0.rocm.bak" "$ROCM_LIB_PATH"
    restored=1
  fi

  # Restore Venv lib
  if [[ -f "$last_backup/librccl.so.1.venv.bak" ]]; then
    echo "Restoring $VENV_LIB_PATH..."
    "${as_root[@]}" cp -v -- "$last_backup/librccl.so.1.venv.bak" "$VENV_LIB_PATH"
    restored=1
  fi

  # Do not claim success when the backup directory contained nothing to restore.
  if [[ "$restored" -eq 0 ]]; then
    echo "Warning: no backup files found in $last_backup; nothing was restored." >&2
    return 0
  fi

  echo "=== Restore Complete ==="
}
|
||||
# Entry point: dispatch on the first argument (install | restore).
# Default to an empty command and shift only when an argument is present. A bare
# `shift` with no positional parameters returns non-zero, which under `set -e`
# aborts the script silently before the usage text can be printed.
COMMAND="${1:-}"
if [ "$#" -gt 0 ]; then
  shift
fi

case "$COMMAND" in
  install)
    # Optional next argument: path of the library to install.
    NEW_LIB="${1:-librccl.so.1}"
    do_install
    ;;
  restore)
    do_restore
    ;;
  *)
    usage
    ;;
esac
|
||||
Ссылка в новой задаче
Block a user