[Azure CI] rccl nightly pipeline that runs on slurm (#1723)

* [Azure CI] rccl nightly pipeline that runs on slurm
- Login node will be set up as a self-hosted agent on Azure Pipelines.
- Login node will run this job nightly.
- Login node will check out the latest develop source, run the build and tests through sbatch calls, and wait for the jobs to complete. When the jobs are complete, it prints out the logs.

[ROCm/rccl commit: 12315c259a]
このコミットが含まれているのは:
Joseph Macaranas
2025-06-19 11:41:40 -04:00
committed by GitHub
コミット b37518663a
6個のファイルの変更298行の追加0行の削除
+43
ファイルの表示
@@ -0,0 +1,43 @@
# Nightly RCCL pipeline. Runs on a self-hosted agent in the Slurm pool, checks
# out the develop branch, and runs the build and test step templates.
resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm

variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo

# Scheduled runs only: no CI or PR triggers.
trigger: none
pr: none

# Two schedules keep the run at roughly 11 PM US Central time across the
# daylight-saving switch. The winter months are written as an explicit list
# rather than the wrap-around range "11-3": a range whose start is greater than
# its end is not portable cron syntax, and a parser that swaps the bounds would
# turn it into March-November (overlapping the summer schedule and skipping
# December-February).
schedules:
- cron: '0 5 * 11,12,1,2,3 *'  # 11 PM CST (November - March)
  displayName: 'Nightly Build (CST)'
  branches:
    include:
    - develop
  always: false
- cron: '0 4 * 4-10 *'  # 11 PM CDT (April - October)
  displayName: 'Nightly Build (CDT)'
  branches:
    include:
    - develop
  always: false

jobs:
- job: rccl
  timeoutInMinutes: 180
  pool: rocm-ci_rccl_slurm_pool
  workspace:
    clean: all
  steps:
  # Remove any leftover files in the working directory before checkout.
  - task: DeleteFiles@1
    inputs:
      Contents: '**/*'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
    parameters:
      submoduleBehaviour: recursive
  - template: templates/build.yml
  - template: templates/test.yml
+48
ファイルの表示
@@ -0,0 +1,48 @@
# RCCL pull-request pipeline. Runs for non-draft PRs targeting develop and
# executes the build and test step templates on the Slurm agent pool.
resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm

variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo

# No CI trigger; this pipeline is driven by pull requests only.
trigger: none

pr:
  autoCancel: true
  branches:
    include:
    - develop
  # Changes limited to these paths do not start a run.
  paths:
    exclude:
    - .github
    - .jenkins
    - docs
    - '*.md'
    - LICENSE.txt
    - NOTICES.txt
  drafts: false

stages:
- stage: rcclStage
  displayName: 'RCCL develop PR'
  jobs:
  # Deployment job bound to the "rccl" environment.
  # NOTE(review): the approval itself is presumably configured as a check on
  # that environment - confirm in the Azure DevOps environment settings.
  - deployment: rccl_pr_approval
    displayName: 'CI Run Requires Approval'
    environment: rccl
  - job: rccl
    timeoutInMinutes: 180
    pool: rocm-ci_rccl_slurm_pool
    workspace:
      clean: all
    steps:
    # Remove any leftover files in the working directory before checkout.
    - task: DeleteFiles@1
      inputs:
        Contents: '**/*'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
      parameters:
        submoduleBehaviour: recursive
    - template: templates/build.yml
    - template: templates/test.yml
+36
ファイルの表示
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that configures, builds and installs RCCL.
#
# Environment read by this script:
#   BINARIES_DIR      install prefix (required)
#   ROCM_PATH         passed to CMake as -DROCM_PATH
#   SLURM_SUBMIT_DIR  source checkout to build in (falls back to $PWD)

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

# Fail fast from here on so a broken download, configure or build step ends the
# job with a non-zero status instead of running the remaining commands.
# Enabled only after the module setup so the sourced site scripts are unaffected.
set -eo pipefail

# An unset prefix would make CMake install to an empty path; stop early instead.
: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"

# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$HOME/.local/bin"

# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
  echo "Ninja not found. Installing locally..."
  wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
  # -o overwrites without prompting; a batch job has no terminal to answer on.
  unzip -q -o /tmp/ninja.zip -d "$HOME/.local/bin"
  chmod +x "$HOME/.local/bin/ninja"
fi
echo "Using Ninja at: $(command -v ninja)"
ninja --version

cd "${SLURM_SUBMIT_DIR:-$PWD}"
mkdir -p build
cd build

cmake -G Ninja \
  -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" \
  -DCMAKE_BUILD_TYPE=Release \
  -DGPU_TARGETS=gfx942 \
  -DBUILD_TESTS=ON \
  -DROCM_PATH="$ROCM_PATH" \
  ..
cmake --build .
cmake --build . --target install
+16
ファイルの表示
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --job-name=rccl-test
#SBATCH --output=rccl-test-%j.out
#SBATCH --error=rccl-test-%j.out
#SBATCH --time=120
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that runs the installed rccl-UnitTests binary and writes a
# gtest XML report.
#
# Environment read by this script:
#   BINARIES_DIR        install prefix containing bin/ and lib/ (required)
#   PIPELINE_WORKSPACE  directory that receives test_output.xml (required)

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"
: "${PIPELINE_WORKSPACE:?PIPELINE_WORKSPACE must be set to the report directory}"

# Stop if the install directory is missing rather than looking for the test
# binary in whatever directory the job happened to start in.
cd "$BINARIES_DIR/bin" || exit 1

# Only append the inherited LD_LIBRARY_PATH when it is non-empty: a trailing
# ':' would add the current directory to the loader search path.
# The script's exit status is that of the test binary (last command).
LD_LIBRARY_PATH="$BINARIES_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
NCCL_DEBUG=INFO \
RCCL_ENABLE_SIGNALHANDLER=1 \
  ./rccl-UnitTests "--gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml" --gtest_color=yes
+86
ファイルの表示
@@ -0,0 +1,86 @@
# Build step template: submits the RCCL build to Slurm via sbatch, waits for
# the job to finish, then verifies the install and prints the job log.
#
# Parameters:
#   expectedInstallFiles  small subset of files to check for in the install
#                         directory to determine pass/fail
parameters:
- name: expectedInstallFiles
  type: object
  default:
  - bin/rccl-UnitTests
  - include/rccl/rccl.h
  - lib/cmake/rccl/rccl-config.cmake
  - lib/librccl.so
  - share/doc/rccl/LICENSE.txt
  - share/rccl/msccl-algorithms
  - share/rccl/msccl-unit-test-algorithms

steps:
- task: Bash@3
  displayName: Build Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
  inputs:
    targetType: inline
    script: |
      echo "##[section]Starting build job..."
      # ${VAR:?} aborts instead of expanding to "/*" if the variable is empty.
      rm -rf "${BINARIES_DIR:?}"/*
      echo "Submitting build job..."
      mkdir -p "$BINARIES_DIR"
      BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh")
      # --parsable may print "jobid;cluster"; keep only the job ID.
      BUILD_JOB_ID=${BUILD_JOB_ID%%;*}
      if [ -z "$BUILD_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID"
        exit 1
      fi
      echo "Submitted build job: $BUILD_JOB_ID"
      # Expose the job ID to later steps (used by the log step below).
      echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"

      echo "Waiting for build job to start..."
      # The job is listed by squeue while it is pending or running.
      while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
        echo "##[section]Build job $BUILD_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
      while true; do
        # Read the state of the job's ".batch" step.
        STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Build job state: $STATE"
        case "$STATE" in
          COMPLETED)
            break
            ;;
          # Glob patterns because sacct truncates long states to the column
          # width (e.g. "CANCELLED+", "OUT_OF_ME+").
          FAILED|CANCELLED*|TIMEOUT|OUT_OF_ME*|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED)
            echo "Build failed with state $STATE"
            echo "##vso[task.logissue type=error]Build job $BUILD_JOB_ID ended in state $STATE"
            exit 1
            ;;
        esac
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking for expected installed files..."
      MISSING_FILES=0
      # The template expression expands to a space-separated list of paths.
      expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
      # Intentionally unquoted: split the list on whitespace.
      for relpath in $expectedFiles; do
        fullpath="$BINARIES_DIR/$relpath"
        if [ ! -e "$fullpath" ]; then
          echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
          MISSING_FILES=1
        fi
      done
      if [ "$MISSING_FILES" -eq 1 ]; then
        echo "One or more expected files are missing from the install directory."
        exit 1
      else
        echo "All expected files are present in the install directory."
      fi
- task: Bash@3
  displayName: Build Logs
  # Print the job log even when the build step failed.
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
+69
ファイルの表示
@@ -0,0 +1,69 @@
# Test step template: submits the RCCL unit tests to Slurm via sbatch, waits
# for the job to finish, checks the gtest XML report, prints the job log and
# publishes the results.
steps:
- task: Bash@3
  displayName: Test Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
    PIPELINE_WORKSPACE: $(Pipeline.Workspace)
  inputs:
    targetType: inline
    script: |
      echo "Submitting test job..."
      TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test.sh")
      # --parsable may print "jobid;cluster"; keep only the job ID.
      TEST_JOB_ID=${TEST_JOB_ID%%;*}
      if [ -z "$TEST_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID"
        exit 1
      fi
      echo "Submitted test job: $TEST_JOB_ID"
      # Expose the job ID to later steps (used by the log step below).
      echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

      echo "Waiting for test job to start..."
      # The job is listed by squeue while it is pending or running.
      while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
        echo "##[section]Test job $TEST_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
      JOB_FAILED=0
      while true; do
        # Read the state of the job's ".batch" step.
        STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Test job state: $STATE"
        case "$STATE" in
          COMPLETED)
            break
            ;;
          # Glob patterns because sacct truncates long states to the column
          # width (e.g. "CANCELLED+", "OUT_OF_ME+").
          FAILED|CANCELLED*|TIMEOUT|OUT_OF_ME*|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED)
            echo "Test failed with state $STATE"
            echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
            # Remember the failure but still inspect the XML report below so
            # failing tests are reported when a report was written.
            JOB_FAILED=1
            break
            ;;
        esac
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking test result XML for failures..."
      TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
      if [ -z "$TEST_XML" ]; then
        echo "##vso[task.logissue type=error]No test_output.xml file found"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      # Any failures="N" or errors="N" attribute with a non-zero count.
      if grep -Eq '(failures|errors)="[1-9]' "$TEST_XML"; then
        echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      else
        echo "No test failures detected."
      fi
      # A job that did not complete cleanly fails the step even if the report
      # itself shows no failing tests.
      if [ "$JOB_FAILED" -eq 1 ]; then
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
- task: Bash@3
  displayName: Test Logs
  # Print the job log even when the test step failed.
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-test-${TEST_JOB_ID}.out" || echo "No log found"
- task: PublishTestResults@2
  displayName: 'Publish Results'
  condition: succeededOrFailed()
  inputs:
    searchFolder: $(Pipeline.Workspace)
    testResultsFormat: JUnit
    testResultsFiles: '**/test_output.xml'