[Azure CI] rccl nightly pipeline that runs on slurm (#1723)

* [Azure CI] rccl nightly pipeline that runs on slurm
- Login node will be set up as a self-hosted agent on Azure Pipelines.
- Login node will run this job nightly.
- Login node will check out the latest develop source, run build and test through sbatch calls, and wait for the jobs to complete. When the jobs are complete, it prints out the logs.
Tento commit je obsažen v:
Joseph Macaranas
2025-06-19 11:41:40 -04:00
odevzdal GitHub
rodič 92a5d225d9
revize 12315c259a
6 změnil soubory, kde provedl 298 přidání a 0 odebrání
+43
Zobrazit soubor
@@ -0,0 +1,43 @@
# Nightly RCCL pipeline: checks out the develop branch on a self-hosted agent
# and runs the build and test step templates, which submit Slurm jobs.
resources:
  repositories:
    - repository: pipelines_repo
      type: github
      endpoint: ROCm
      name: ROCm/ROCm

variables:
  - group: common
  - template: /.azuredevops/variables-global.yml@pipelines_repo

# Scheduled runs only: CI and PR triggers are disabled.
trigger: none
pr: none

# Cron expressions are evaluated in UTC, so two schedules are used to keep the
# run at roughly 11 PM US Central time across standard and daylight time.
schedules:
  # The months are listed explicitly instead of the wrap-around range "11-3";
  # a range whose start is greater than its end is not documented cron syntax
  # and may be rejected or reinterpreted by the scheduler.
  - cron: "0 5 * 11,12,1,2,3 *"  # 11 PM CST (November - March)
    displayName: "Nightly Build (CST)"
    branches:
      include:
        - develop
    # Skip the run when nothing changed since the last scheduled run.
    always: false
  - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
    displayName: "Nightly Build (CDT)"
    branches:
      include:
        - develop
    always: false

jobs:
  - job: rccl
    timeoutInMinutes: 180
    pool: rocm-ci_rccl_slurm_pool
    workspace:
      clean: all
    steps:
      # Remove anything left in the working directory before checkout.
      - task: DeleteFiles@1
        inputs:
          Contents: '**/*'
      # NOTE(review): CI_TEMPLATE_PATH is presumably defined by
      # variables-global.yml in pipelines_repo - confirm.
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
        parameters:
          submoduleBehaviour: recursive
      - template: templates/build.yml
      - template: templates/test.yml
+48
Zobrazit soubor
@@ -0,0 +1,48 @@
# RCCL pull-request pipeline: runs the Slurm build and test templates for PRs
# that target develop.
resources:
  repositories:
    - repository: pipelines_repo
      type: github
      endpoint: ROCm
      name: ROCm/ROCm

variables:
  - group: common
  - template: /.azuredevops/variables-global.yml@pipelines_repo

# No CI trigger on push; only the PR trigger below starts this pipeline.
trigger: none

pr:
  # A new push to the PR cancels the run already in progress.
  autoCancel: true
  branches:
    include:
      - develop
  # Changes limited to these paths do not start a run.
  paths:
    exclude:
      - .github
      - .jenkins
      - docs
      - '*.md'
      - LICENSE.txt
      - NOTICES.txt
  # Draft pull requests are not built.
  drafts: false

stages:
  - stage: rcclStage
    displayName: 'RCCL develop PR'
    jobs:
      # Deployment job bound to the "rccl" environment.
      # NOTE(review): presumably the environment carries an approval check
      # that gates this stage - confirm in the environment settings.
      - deployment: rccl_pr_approval
        displayName: 'CI Run Requires Approval'
        environment: rccl
      - job: rccl
        timeoutInMinutes: 180
        pool: rocm-ci_rccl_slurm_pool
        workspace:
          clean: all
        steps:
          # Remove anything left in the working directory before checkout.
          - task: DeleteFiles@1
            inputs:
              Contents: '**/*'
          - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
            parameters:
              submoduleBehaviour: recursive
          - template: templates/build.yml
          - template: templates/test.yml
+36
Zobrazit soubor
@@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that configures, builds and installs RCCL with
# CMake + Ninja.
#
# Environment read by this script:
#   BINARIES_DIR      install prefix (required; exported by the submitter)
#   ROCM_PATH         forwarded to CMake as -DROCM_PATH
#                     NOTE(review): presumably set by the rocm module - confirm
#   SLURM_SUBMIT_DIR  source tree root; falls back to the current directory

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

# Fail fast from here on, so a failed configure or build step ends the job
# immediately instead of running the remaining steps on a broken tree.
# Enabled only after the module setup, which is not written for errexit.
set -eo pipefail

# Refuse to run with an empty install prefix.
: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"

# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$HOME/.local/bin"

# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
  echo "Ninja not found. Installing locally..."
  # NOTE(review): the download is not checksum-verified.
  wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
  # -o overwrites without prompting; a batch job has no terminal to answer.
  unzip -q -o /tmp/ninja.zip -d "$HOME/.local/bin"
  chmod +x "$HOME/.local/bin/ninja"
fi

echo "Using Ninja at: $(command -v ninja)"
ninja --version

cd "${SLURM_SUBMIT_DIR:-$PWD}"
mkdir -p build
cd build

cmake -G Ninja \
  -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" \
  -DCMAKE_BUILD_TYPE=Release \
  -DGPU_TARGETS=gfx942 \
  -DBUILD_TESTS=ON \
  -DROCM_PATH="$ROCM_PATH" \
  ..
cmake --build .
cmake --build . --target install
+16
Zobrazit soubor
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --job-name=rccl-test
#SBATCH --output=rccl-test-%j.out
#SBATCH --error=rccl-test-%j.out
#SBATCH --time=120
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that runs the installed RCCL unit tests and writes a
# gtest XML report.
#
# Environment read by this script (exported by the submitter):
#   BINARIES_DIR        install prefix containing bin/rccl-UnitTests and lib/
#   PIPELINE_WORKSPACE  directory that receives test_output.xml
#
# The exit status of the script is the exit status of rccl-UnitTests.

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

# Without these the script would cd to /bin and write the report to the
# filesystem root.
: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"
: "${PIPELINE_WORKSPACE:?PIPELINE_WORKSPACE must be set}"

cd "$BINARIES_DIR/bin" || exit 1

# Append the inherited LD_LIBRARY_PATH only when it is non-empty: a trailing
# ':' would add the current directory to the loader search path.
LD_LIBRARY_PATH="$BINARIES_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
NCCL_DEBUG=INFO \
RCCL_ENABLE_SIGNALHANDLER=1 \
  ./rccl-UnitTests "--gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml" --gtest_color=yes
+86
Zobrazit soubor
@@ -0,0 +1,86 @@
# Step template: submits the RCCL build to Slurm, waits for the job to finish,
# fails the step when the job did not complete successfully or when expected
# install files are missing, and always prints the job log afterwards.
parameters:
  # small subset of files to check for install to determine pass/fail
  - name: expectedInstallFiles
    type: object
    default:
      - bin/rccl-UnitTests
      - include/rccl/rccl.h
      - lib/cmake/rccl/rccl-config.cmake
      - lib/librccl.so
      - share/doc/rccl/LICENSE.txt
      - share/rccl/msccl-algorithms
      - share/rccl/msccl-unit-test-algorithms

steps:
  - task: Bash@3
    displayName: Build Job
    env:
      # Exported to the Slurm job through "sbatch --export=ALL".
      BINARIES_DIR: $(Build.BinariesDirectory)
    inputs:
      targetType: inline
      script: |
        echo "##[section]Starting build job..."
        # ":?" aborts rather than expanding to "rm -rf /*" if the variable
        # is ever empty.
        rm -rf "${BINARIES_DIR:?}"/*

        echo "Submitting build job..."
        mkdir -p "$BINARIES_DIR"
        if ! BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh"); then
          echo "##vso[task.logissue type=error]sbatch failed to submit the build job"
          exit 1
        fi
        # --parsable prints "jobid" or "jobid;cluster"; keep only the id.
        BUILD_JOB_ID=${BUILD_JOB_ID%%;*}
        if [ -z "$BUILD_JOB_ID" ]; then
          echo "##vso[task.logissue type=error]sbatch returned no job id"
          exit 1
        fi
        echo "Submitted build job: $BUILD_JOB_ID"
        # Publish the id so the log step below can locate the output file.
        echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"

        # Poll until the job is no longer listed by squeue.
        echo "Waiting for build job to start..."
        while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
          echo "##[section]Build job $BUILD_JOB_ID is still running..."
          sleep 60
        done

        echo "Waiting for final status via sacct..."
        LOOP_COUNT=0
        MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
        while true; do
          # --parsable2 yields untruncated, '|'-separated fields, so states
          # are not shortened to e.g. "CANCELLED+". Prefer the .batch step;
          # fall back to the job row, which is the only row for a job that
          # never started.
          STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader --parsable2 |
            awk -F'|' -v id="$BUILD_JOB_ID" '
              $1 == id ".batch" { print $2; found = 1; exit }
              $1 == id          { job = $2 }
              END               { if (!found) print job }')
          echo "##[section]Build job state: $STATE"
          case "$STATE" in
            COMPLETED)
              break
              ;;
            FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_MEMORY*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*|PREEMPTED*)
              # Prefix match because CANCELLED is reported as
              # "CANCELLED by <uid>".
              echo "##vso[task.logissue type=error]Build failed with state $STATE"
              exit 1
              ;;
          esac
          sleep 60
          LOOP_COUNT=$((LOOP_COUNT + 1))
          if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
            echo "Time limit reached while waiting for final status."
            exit 1 # Exit with an error code if time limit is reached
          fi
        done

        echo "Checking for expected installed files..."
        MISSING_FILES=0
        expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
        # Intentionally unquoted: the list is split on whitespace.
        for relpath in $expectedFiles; do
          fullpath="$BINARIES_DIR/$relpath"
          if [ ! -e "$fullpath" ]; then
            echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
            MISSING_FILES=1
          fi
        done
        if [ "$MISSING_FILES" -eq 1 ]; then
          echo "One or more expected files are missing from the install directory."
          exit 1
        else
          echo "All expected files are present in the install directory."
        fi

  # Runs even when the build step failed so the Slurm log is always visible.
  - task: Bash@3
    displayName: Build Logs
    condition: always()
    inputs:
      targetType: inline
      script: |
        cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
+69
Zobrazit soubor
@@ -0,0 +1,69 @@
# Step template: submits the RCCL unit tests to Slurm, waits for the job to
# finish, fails the step on test failures or an unsuccessful job state, then
# prints the job log and publishes the JUnit results.
steps:
  - task: Bash@3
    displayName: Test Job
    env:
      # Exported to the Slurm job through "sbatch --export=ALL".
      BINARIES_DIR: $(Build.BinariesDirectory)
      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
    inputs:
      targetType: inline
      script: |
        echo "Submitting test job..."
        if ! TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test.sh"); then
          echo "##vso[task.logissue type=error]sbatch failed to submit the test job"
          exit 1
        fi
        # --parsable prints "jobid" or "jobid;cluster"; keep only the id.
        TEST_JOB_ID=${TEST_JOB_ID%%;*}
        if [ -z "$TEST_JOB_ID" ]; then
          echo "##vso[task.logissue type=error]sbatch returned no job id"
          exit 1
        fi
        echo "Submitted test job: $TEST_JOB_ID"
        # Publish the id so the log step below can locate the output file.
        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

        # Poll until the job is no longer listed by squeue.
        echo "Waiting for test job to start..."
        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
          echo "##[section]Test job $TEST_JOB_ID is still running..."
          sleep 60
        done

        echo "Waiting for final status via sacct..."
        LOOP_COUNT=0
        MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
        # Non-empty when the job ended in a terminal failure state.
        FAILED_STATE=""
        while true; do
          # --parsable2 yields untruncated, '|'-separated fields, so states
          # are not shortened to e.g. "CANCELLED+". Prefer the .batch step;
          # fall back to the job row, which is the only row for a job that
          # never started.
          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader --parsable2 |
            awk -F'|' -v id="$TEST_JOB_ID" '
              $1 == id ".batch" { print $2; found = 1; exit }
              $1 == id          { job = $2 }
              END               { if (!found) print job }')
          echo "##[section]Test job state: $STATE"
          case "$STATE" in
            COMPLETED)
              break
              ;;
            FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_MEMORY*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*|PREEMPTED*)
              # Keep going so the XML report, if any, is still inspected
              # and its failures reported; the step is failed further down.
              echo "Test failed with state $STATE"
              FAILED_STATE=$STATE
              break
              ;;
          esac
          sleep 60
          LOOP_COUNT=$((LOOP_COUNT + 1))
          if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
            echo "Time limit reached while waiting for final status."
            exit 1 # Exit with an error code if time limit is reached
          fi
        done

        echo "Checking test result XML for failures..."
        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
        if [ -z "$TEST_XML" ]; then
          echo "##vso[task.logissue type=error]No test_output.xml file found"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi
        # Any failures="N" attribute whose value does not start with 0.
        if grep -q 'failures="[^0]' "$TEST_XML"; then
          echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi
        # A clean report must not mask a job that was cancelled, timed out
        # or otherwise ended unsuccessfully.
        if [ -n "$FAILED_STATE" ]; then
          echo "##vso[task.logissue type=error]Test job ended with state $FAILED_STATE"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi
        echo "No test failures detected."

  # Runs even when the test step failed so the Slurm log is always visible.
  - task: Bash@3
    displayName: Test Logs
    condition: always()
    inputs:
      targetType: inline
      script: |
        cat "rccl-test-${TEST_JOB_ID}.out" || echo "No log found"

  - task: PublishTestResults@2
    displayName: 'Publish Results'
    condition: succeededOrFailed()
    inputs:
      searchFolder: $(Pipeline.Workspace)
      testResultsFormat: JUnit
      testResultsFiles: '**/test_output.xml'