[Azure CI] rccl nightly pipeline that runs on slurm (#1723)
* [Azure CI] rccl nightly pipeline that runs on slurm - The login node will be set up as a self-hosted agent on Azure Pipelines. - The login node will run this job nightly. - The login node will check out the latest develop source, run the build and tests through sbatch calls, and then wait for the jobs to complete. When the jobs are complete, it prints out the logs.
Tento commit je obsažen v:
@@ -0,0 +1,43 @@
|
||||
# Nightly RCCL pipeline. It runs on the rocm-ci_rccl_slurm_pool agent pool and
# delegates the actual build and test work to templates/build.yml and
# templates/test.yml, which submit Slurm jobs through sbatch.
resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm

variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo

# Scheduled runs only: neither pushes nor pull requests start this pipeline.
trigger: none
pr: none

# Cron schedules are evaluated in UTC, so two entries are used to stay close to
# 11 PM US Central time on both sides of the daylight-saving switch.
schedules:
# November-March is written as two ascending ranges; a descending range such
# as "11-3" does not reliably wrap around the end of the year.
- cron: "0 5 * 11-12,1-3 *" # 11 PM CST (November - March)
  displayName: "Nightly Build (CST)"
  branches:
    include:
    - develop
  # Skip the scheduled run when develop has not changed since the last one.
  always: false

- cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
  displayName: "Nightly Build (CDT)"
  branches:
    include:
    - develop
  always: false

jobs:
- job: rccl
  timeoutInMinutes: 180
  pool: rocm-ci_rccl_slurm_pool
  workspace:
    clean: all
  steps:
  # Remove leftovers from the default working directory before checkout.
  - task: DeleteFiles@1
    inputs:
      Contents: '**/*'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
    parameters:
      submoduleBehaviour: recursive
  - template: templates/build.yml
  - template: templates/test.yml
||||
@@ -0,0 +1,48 @@
|
||||
# Pull-request RCCL pipeline. It uses the same Slurm-backed build and test
# templates as the nightly pipeline, and additionally declares a deployment
# job against the "rccl" environment ("CI Run Requires Approval").
resources:
  repositories:
    - repository: pipelines_repo
      name: ROCm/ROCm
      type: github
      endpoint: ROCm

variables:
  - group: common
  - template: /.azuredevops/variables-global.yml@pipelines_repo

# No CI trigger on pushes; only pull requests start this pipeline.
trigger: none

pr:
  autoCancel: true
  drafts: false
  branches:
    include:
      - develop
  # Changes limited to these paths do not start a run.
  paths:
    exclude:
      - .github
      - .jenkins
      - docs
      - '*.md'
      - LICENSE.txt
      - NOTICES.txt

stages:
  - stage: rcclStage
    displayName: 'RCCL develop PR'
    jobs:
      - deployment: rccl_pr_approval
        displayName: "CI Run Requires Approval"
        environment: rccl

      - job: rccl
        pool: rocm-ci_rccl_slurm_pool
        timeoutInMinutes: 180
        workspace:
          clean: all
        steps:
          # Remove leftovers from the default working directory before checkout.
          - task: DeleteFiles@1
            inputs:
              Contents: '**/*'
          - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
            parameters:
              submoduleBehaviour: recursive
          - template: templates/build.yml
          - template: templates/test.yml
|
||||
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that configures, builds and installs RCCL with CMake and
# Ninja on a single exclusive node.
#
# Required environment:
#   BINARIES_DIR  install prefix passed to CMake as CMAKE_INSTALL_PREFIX.
# Also read:
#   ROCM_PATH     forwarded to CMake; presumably exported by the rocm module
#                 loaded below - TODO confirm.
#   SLURM_SUBMIT_DIR  source tree root; falls back to the current directory.

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

# From here on, stop at the first failing command so that a broken download,
# configure or build step makes the batch job itself fail. Enabled only after
# the module setup because those sourced scripts are not under our control.
set -eo pipefail

: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"

# Setup local binary path
mkdir -p "$HOME/.local/bin"
export PATH="$HOME/.local/bin:$PATH"

# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
  echo "Ninja not found. Installing locally..."
  # Download into a private temporary directory instead of a fixed /tmp name.
  ninja_tmp=$(mktemp -d)
  trap 'rm -rf -- "$ninja_tmp"' EXIT
  wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O "$ninja_tmp/ninja.zip"
  # -o: overwrite a stale, non-executable copy without prompting.
  unzip -q -o "$ninja_tmp/ninja.zip" -d "$HOME/.local/bin"
  chmod +x "$HOME/.local/bin/ninja"
fi

echo "Using Ninja at: $(command -v ninja)"
ninja --version

cd "${SLURM_SUBMIT_DIR:-$PWD}"
mkdir -p build
cd build

cmake -G Ninja \
  -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" \
  -DCMAKE_BUILD_TYPE=Release \
  -DGPU_TARGETS=gfx942 \
  -DBUILD_TESTS=ON \
  -DROCM_PATH="$ROCM_PATH" \
  ..
cmake --build .
cmake --build . --target install
|
||||
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
#SBATCH --job-name=rccl-test
#SBATCH --output=rccl-test-%j.out
#SBATCH --error=rccl-test-%j.out
#SBATCH --time=120
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that runs the installed rccl-UnitTests binary and writes
# a gtest XML report.
#
# Required environment:
#   BINARIES_DIR        install prefix containing bin/rccl-UnitTests and lib/.
#   PIPELINE_WORKSPACE  directory that receives test_output.xml.
# The exit status of the test binary becomes the exit status of the job.

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.0

# Fail early with a clear message; an empty BINARIES_DIR would otherwise make
# the cd below end up in /bin.
: "${BINARIES_DIR:?BINARIES_DIR must be set to the install prefix}"
: "${PIPELINE_WORKSPACE:?PIPELINE_WORKSPACE must be set to the report directory}"

cd "$BINARIES_DIR/bin" || exit 1

# Only append the previous LD_LIBRARY_PATH when it is non-empty, so that no
# empty entry (which the loader treats as the current directory) is added.
LD_LIBRARY_PATH="$BINARIES_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
NCCL_DEBUG=INFO \
RCCL_ENABLE_SIGNALHANDLER=1 \
  ./rccl-UnitTests "--gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml" --gtest_color=yes
|
||||
@@ -0,0 +1,86 @@
|
||||
# small subset of files to check for install to determine pass/fail
parameters:
- name: expectedInstallFiles
  type: object
  default:
  - bin/rccl-UnitTests
  - include/rccl/rccl.h
  - lib/cmake/rccl/rccl-config.cmake
  - lib/librccl.so
  - share/doc/rccl/LICENSE.txt
  - share/rccl/msccl-algorithms
  - share/rccl/msccl-unit-test-algorithms

steps:
# Submits .azuredevops/slurm/build.sh through sbatch, waits for the job to
# finish, fails on any unsuccessful job state, and finally verifies that the
# expected files were installed into the binaries directory.
- task: Bash@3
  displayName: Build Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
  inputs:
    targetType: inline
    script: |
      echo "##[section]Starting build job..."

      # Start from an empty install prefix so stale files cannot satisfy the
      # install check below. ${VAR:?} aborts rather than expanding to "/*".
      mkdir -p "${BINARIES_DIR:?BINARIES_DIR is not set}"
      rm -rf -- "${BINARIES_DIR:?}"/*

      echo "Submitting build job..."
      BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh")
      # --parsable may print "jobid;cluster"; keep only the job id.
      BUILD_JOB_ID=${BUILD_JOB_ID%%;*}
      if [[ ! "$BUILD_JOB_ID" =~ ^[0-9]+$ ]]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job id (got '$BUILD_JOB_ID')"
        exit 1
      fi
      echo "Submitted build job: $BUILD_JOB_ID"
      echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"

      # Poll until squeue no longer lists the job.
      echo "Waiting for build job to leave the queue..."
      while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -qw -- "$BUILD_JOB_ID"; do
        echo "##[section]Build job $BUILD_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
      while true; do
        STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Build job state: $STATE"
        # Prefix patterns are used because sacct truncates long state names
        # in its default column width (for example "CANCELLED+").
        case "$STATE" in
          COMPLETED)
            break
            ;;
          FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_ME*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*|PREEMPTED*)
            echo "##vso[task.logissue type=error]Build failed with state $STATE"
            exit 1
            ;;
        esac
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking for expected installed files..."
      MISSING_FILES=0

      # The template expression yields a space-separated list; this relies on
      # the paths themselves containing no whitespace (true for the defaults).
      expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
      read -r -a expected_files <<< "$expectedFiles"
      for relpath in "${expected_files[@]}"; do
        fullpath="$BINARIES_DIR/$relpath"
        if [ ! -e "$fullpath" ]; then
          echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
          MISSING_FILES=1
        fi
      done

      if [ "$MISSING_FILES" -eq 1 ]; then
        echo "One or more expected files are missing from the install directory."
        exit 1
      else
        echo "All expected files are present in the install directory."
      fi
|
||||
# Prints the Slurm output file of the build job. Runs even when earlier steps
# failed so that the build log is always visible in the pipeline run.
- task: Bash@3
  displayName: Build Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      # BUILD_JOB_ID is published by the "Build Job" step; it is empty when
      # that step never got a job id back from sbatch.
      if [ -z "${BUILD_JOB_ID:-}" ]; then
        echo "No log found (no build job id available)"
      else
        cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
      fi
|
||||
@@ -0,0 +1,69 @@
|
||||
steps:
# Submits .azuredevops/slurm/test.sh through sbatch, waits for the job to
# finish, then inspects the gtest XML report. The step fails when the report
# is missing, when it records failures or errors, or when the Slurm job itself
# ended in an unsuccessful state.
- task: Bash@3
  displayName: Test Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
    PIPELINE_WORKSPACE: $(Pipeline.Workspace)
  inputs:
    targetType: inline
    script: |
      echo "Submitting test job..."
      TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test.sh")
      # --parsable may print "jobid;cluster"; keep only the job id.
      TEST_JOB_ID=${TEST_JOB_ID%%;*}
      if [[ ! "$TEST_JOB_ID" =~ ^[0-9]+$ ]]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job id (got '$TEST_JOB_ID')"
        exit 1
      fi
      echo "Submitted test job: $TEST_JOB_ID"
      echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

      # Poll until squeue no longer lists the job.
      echo "Waiting for test job to leave the queue..."
      while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -qw -- "$TEST_JOB_ID"; do
        echo "##[section]Test job $TEST_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      JOB_FAILED=0
      LOOP_COUNT=0
      MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
      while true; do
        STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Test job state: $STATE"
        # Prefix patterns are used because sacct truncates long state names
        # in its default column width (for example "CANCELLED+").
        case "$STATE" in
          COMPLETED)
            break
            ;;
          FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_ME*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*|PREEMPTED*)
            echo "Test failed with state $STATE"
            # Keep going so the XML report can still be inspected for a more
            # specific error message; the step is failed at the end.
            JOB_FAILED=1
            break
            ;;
        esac
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking test result XML for failures..."
      TEST_XML=$(find "$PIPELINE_WORKSPACE" -name 'test_output.xml' | head -n1)
      if [ -z "$TEST_XML" ]; then
        echo "##vso[task.logissue type=error]No test_output.xml file found"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi

      # A non-zero failures= or errors= attribute anywhere in the report
      # marks the run as failed.
      if grep -Eq '(failures|errors)="[1-9]' "$TEST_XML"; then
        echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      else
        echo "No test failures detected."
      fi

      # A clean report does not override an unsuccessful Slurm job state.
      if [ "$JOB_FAILED" -eq 1 ]; then
        echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
|
||||
# Prints the Slurm output file of the test job. Runs even when earlier steps
# failed so that the test log is always visible in the pipeline run.
- task: Bash@3
  displayName: Test Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      # TEST_JOB_ID is published by the "Test Job" step; it is empty when
      # that step never got a job id back from sbatch.
      if [ -z "${TEST_JOB_ID:-}" ]; then
        echo "No log found (no test job id available)"
      else
        cat "rccl-test-${TEST_JOB_ID}.out" || echo "No log found"
      fi
|
||||
# Uploads every test_output.xml found below the pipeline workspace as JUnit
# results. Runs after both successful and failed test steps.
- task: PublishTestResults@2
  condition: succeededOrFailed()
  displayName: 'Publish Results'
  inputs:
    testResultsFormat: JUnit
    testResultsFiles: '**/test_output.xml'
    searchFolder: $(Pipeline.Workspace)
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele