From 12315c259ac4e24ef3709d37966a69dfdff17f6f Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com>
Date: Thu, 19 Jun 2025 11:41:40 -0400
Subject: [PATCH] [Azure CI] rccl nightly pipeline that runs on slurm (#1723)

* [Azure CI] rccl nightly pipeline that runs on slurm

- Login node will be set up as a self-hosted agent on Azure Pipelines.
- Login node will run this job nightly.
- Login node will checkout the latest develop source, and then run build and test through sbatch calls, and then waiting for the jobs to complete. When the jobs are complete, print out the logs.
---
 .azuredevops/multinode-ci-slurm-nightly.yml | 43 +++++++++++
 .azuredevops/multinode-ci-slurm-pr.yml      | 48 ++++++++++++
 .azuredevops/slurm/build.sh                 | 36 +++++++++
 .azuredevops/slurm/test.sh                  | 16 ++++
 .azuredevops/templates/build.yml            | 86 +++++++++++++++++++++
 .azuredevops/templates/test.yml             | 69 +++++++++++++++++
 6 files changed, 298 insertions(+)
 create mode 100644 .azuredevops/multinode-ci-slurm-nightly.yml
 create mode 100644 .azuredevops/multinode-ci-slurm-pr.yml
 create mode 100644 .azuredevops/slurm/build.sh
 create mode 100644 .azuredevops/slurm/test.sh
 create mode 100644 .azuredevops/templates/build.yml
 create mode 100644 .azuredevops/templates/test.yml

diff --git a/.azuredevops/multinode-ci-slurm-nightly.yml b/.azuredevops/multinode-ci-slurm-nightly.yml
new file mode 100644
index 0000000000..6523887294
--- /dev/null
+++ b/.azuredevops/multinode-ci-slurm-nightly.yml
@@ -0,0 +1,43 @@
+resources:
+  repositories:
+  - repository: pipelines_repo  # shared templates and variables are pulled from ROCm/ROCm
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr: none
+schedules:  # two entries so the run stays near 11 PM Central across the DST switch
+  - cron: "0 5 * 11,12,1,2,3 *" # 11 PM CST (November - March); months listed explicitly, wrap-around ranges such as 11-3 are not portable
+    displayName: "Nightly Build (CST)"
+    branches:
+      include:
+        - develop
+    always: false
+
+  - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
+    displayName: "Nightly Build (CDT)"
+    branches:
+      include:
+        - develop
+    always: false
+
+jobs:
+- job: rccl
+  timeoutInMinutes: 180
+  pool: rocm-ci_rccl_slurm_pool
+  workspace:
+    clean: all
+  steps:
+  - task: DeleteFiles@1
+    inputs:
+      Contents: '**/*'
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+    parameters:
+      submoduleBehaviour: recursive
+  - template: templates/build.yml  # submits slurm/build.sh via sbatch and verifies the install tree
+  - template: templates/test.yml  # submits slurm/test.sh via sbatch and checks the gtest XML
diff --git a/.azuredevops/multinode-ci-slurm-pr.yml b/.azuredevops/multinode-ci-slurm-pr.yml
new file mode 100644
index 0000000000..3fc1465425
--- /dev/null
+++ b/.azuredevops/multinode-ci-slurm-pr.yml
@@ -0,0 +1,48 @@
+resources:
+  repositories:
+  - repository: pipelines_repo  # shared templates and variables are pulled from ROCm/ROCm
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - develop
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+  drafts: false
+
+stages:
+- stage: rcclStage
+  displayName: 'RCCL develop PR'
+  jobs:
+  - deployment: rccl_pr_approval  # NOTE(review): presumably the 'rccl' environment carries the approval check - confirm in project settings
+    displayName: "CI Run Requires Approval"
+    environment: rccl
+  - job: rccl
+    timeoutInMinutes: 180
+    pool: rocm-ci_rccl_slurm_pool
+    workspace:
+      clean: all
+    steps:
+    - task: DeleteFiles@1
+      inputs:
+        Contents: '**/*'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+      parameters:
+        submoduleBehaviour: recursive
+    - template: templates/build.yml  # submits slurm/build.sh via sbatch and verifies the install tree
+    - template: templates/test.yml  # submits slurm/test.sh via sbatch and checks the gtest XML
diff --git a/.azuredevops/slurm/build.sh b/.azuredevops/slurm/build.sh
new file mode 100644
index 0000000000..e2889ee053
--- /dev/null
+++ b/.azuredevops/slurm/build.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-build
+#SBATCH --output=rccl-build-%j.out
+#SBATCH --error=rccl-build-%j.out
+#SBATCH --time=60
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)  # host name without domain, from the third dash-separated field on
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.0
+
+# Put the per-user bin directory first on PATH so a locally installed ninja is found
+export PATH="$HOME/.local/bin:$PATH"
+mkdir -p "$HOME/.local/bin"
+
+# Download a pinned Ninja release into the per-user bin directory when none is on PATH
+if ! command -v ninja &>/dev/null; then
+  echo "Ninja not found. Installing locally..."
+  wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
+  unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
+  chmod +x "$HOME/.local/bin/ninja"
+fi
+
+echo "Using Ninja at: $(which ninja)"
+ninja --version
+
+cd "${SLURM_SUBMIT_DIR:-$PWD}"  # build from the directory sbatch was invoked in
+mkdir -p build
+cd build
+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=gfx942 -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
+cmake --build .
+cmake --build . --target install
diff --git a/.azuredevops/slurm/test.sh b/.azuredevops/slurm/test.sh
new file mode 100644
index 0000000000..541c5efc3b
--- /dev/null
+++ b/.azuredevops/slurm/test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-test
+#SBATCH --output=rccl-test-%j.out
+#SBATCH --error=rccl-test-%j.out
+#SBATCH --time=120
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)  # host name without domain, from the third dash-separated field on
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.0
+cd "$BINARIES_DIR/bin"  # run the unit tests from the install tree written by build.sh
+LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests --gtest_output="xml:$PIPELINE_WORKSPACE/test_output.xml" --gtest_color=yes
diff --git a/.azuredevops/templates/build.yml b/.azuredevops/templates/build.yml
new file mode 100644
index 0000000000..fc671a7890
--- /dev/null
+++ b/.azuredevops/templates/build.yml
@@ -0,0 +1,86 @@
+# Small subset of installed files whose presence decides whether the build step passes
+parameters:
+- name: expectedInstallFiles
+  type: object
+  default:
+    - bin/rccl-UnitTests
+    - include/rccl/rccl.h
+    - lib/cmake/rccl/rccl-config.cmake
+    - lib/librccl.so
+    - share/doc/rccl/LICENSE.txt
+    - share/rccl/msccl-algorithms
+    - share/rccl/msccl-unit-test-algorithms
+
+steps:
+  - task: Bash@3
+    displayName: Build Job
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+    inputs:
+      targetType: inline
+      script: |
+        echo "##[section]Starting build job..."
+
+        rm -rf "${BINARIES_DIR:?}"/*  # :? aborts the step rather than expanding to /* if the variable is empty
+
+        echo "Submitting build job..."
+        mkdir -p "$BINARIES_DIR"
+        BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh") || { echo "##vso[task.logissue type=error]sbatch submission failed"; exit 1; }
+        echo "Submitted build job: $BUILD_JOB_ID"
+        echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"
+
+        echo "Waiting for build job to finish..."
+        while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do  # job is still listed by squeue
+          echo "##[section]Build job $BUILD_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=30  # Maximum of 30 loops (30 minutes)
+        while true; do
+          STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader --parsable2 | awk -F'|' '$1 ~ /\.batch$/ { print $2; exit }' | xargs)  # parsable2 avoids truncated state names
+          echo "##[section]Build job state: $STATE"
+          if [[ "$STATE" == "COMPLETED" ]]; then
+            break
+          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED) ]]; then  # prefix match also covers "CANCELLED by <uid>"
+            echo "##vso[task.logissue type=error]Build failed with state $STATE"
+            exit 1  # fail here instead of relying on the installed-file check below
+          fi
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking for expected installed files..."
+        MISSING_FILES=0
+
+        expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
+        i=1
+        total=$(echo "$expectedFiles" | wc -w)
+        while [ $i -le $total ]; do  # walk the space-separated list one field at a time
+          relpath=$(echo "$expectedFiles" | cut -d ' ' -f"$i")
+          fullpath="$BINARIES_DIR/$relpath"
+          if [ ! -e "$fullpath" ]; then
+            echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
+            MISSING_FILES=1
+          fi
+          i=$((i + 1))
+        done
+
+        if [ "$MISSING_FILES" -eq 1 ]; then
+          echo "One or more expected files are missing from the install directory."
+          exit 1
+        else
+          echo "All expected files are present in the install directory."
+        fi
+  - task: Bash@3
+    displayName: Build Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
diff --git a/.azuredevops/templates/test.yml b/.azuredevops/templates/test.yml
new file mode 100644
index 0000000000..c72ae743e8
--- /dev/null
+++ b/.azuredevops/templates/test.yml
@@ -0,0 +1,69 @@
+steps:
+  - task: Bash@3
+    displayName: Test Job
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
+    inputs:
+      targetType: inline
+      script: |
+        echo "Submitting test job..."
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test.sh") || { echo "##vso[task.logissue type=error]sbatch submission failed"; exit 1; }
+        echo "Submitted test job: $TEST_JOB_ID"
+        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
+
+        echo "Waiting for test job to finish..."
+        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do  # job is still listed by squeue
+          echo "##[section]Test job $TEST_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=120  # Maximum of 120 loops (120 minutes)
+        while true; do
+          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader --parsable2 | awk -F'|' '$1 ~ /\.batch$/ { print $2; exit }' | xargs)  # parsable2 avoids truncated state names
+          echo "##[section]Test job state: $STATE"
+          if [[ "$STATE" == "COMPLETED" ]]; then
+            break
+          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED) ]]; then  # prefix match also covers "CANCELLED by <uid>"
+            echo "##vso[task.logissue type=error]Test failed with state $STATE"
+            exit 1  # fail here; the publish step still uploads any XML that was written
+          fi
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking test result XML for failures..."
+        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
+        if [ -z "$TEST_XML" ]; then
+          echo "##vso[task.logissue type=error]No test_output.xml file found"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        if grep -Eq '(failures|errors)="[1-9]' "$TEST_XML"; then  # any non-zero failures= or errors= count
+          echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        else
+          echo "No test failures detected."
+        fi
+  - task: Bash@3
+    displayName: Test Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat "rccl-test-${TEST_JOB_ID}.out" || echo "No log found"
+  - task: PublishTestResults@2
+    displayName: 'Publish Results'
+    condition: succeededOrFailed()
+    inputs:
+      searchFolder: $(Pipeline.Workspace)
+      testResultsFormat: JUnit
+      testResultsFiles: '**/test_output.xml'