diff --git a/.azuredevops/multinode-ci-slurm-nightly.yml b/.azuredevops/multinode-ci-slurm-nightly.yml
new file mode 100644
index 0000000000..6523887294
--- /dev/null
+++ b/.azuredevops/multinode-ci-slurm-nightly.yml
@@ -0,0 +1,47 @@
+# Nightly RCCL build-and-test pipeline for the Slurm agent pool.
+# Push and PR triggers are disabled; only the schedules below start a run.
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr: none
+# Cron expressions are evaluated in UTC; the two entries keep the run at
+# roughly 11 PM US Central time in both standard and daylight time.
+schedules:
+  - cron: "0 5 * 11-3 *" # 11 PM CST (November - March)
+    displayName: "Nightly Build (CST)"
+    branches:
+      include:
+        - develop
+    always: false
+
+  - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
+    displayName: "Nightly Build (CDT)"
+    branches:
+      include:
+        - develop
+    always: false
+
+jobs:
+- job: rccl
+  timeoutInMinutes: 180
+  pool: rocm-ci_rccl_slurm_pool
+  workspace:
+    clean: all
+  steps:
+  - task: DeleteFiles@1
+    inputs:
+      Contents: '**/*'
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+    parameters:
+      submoduleBehaviour: recursive
+  - template: templates/build.yml
+  - template: templates/test.yml
diff --git a/.azuredevops/multinode-ci-slurm-pr.yml b/.azuredevops/multinode-ci-slurm-pr.yml
new file mode 100644
index 0000000000..3fc1465425
--- /dev/null
+++ b/.azuredevops/multinode-ci-slurm-pr.yml
@@ -0,0 +1,52 @@
+# RCCL pull-request pipeline for the Slurm agent pool.
+# Runs for non-draft PRs targeting develop; the paths listed under exclude are ignored.
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - develop
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+  drafts: false
+
+stages:
+- stage: rcclStage
+  displayName: 'RCCL develop PR'
+  jobs:
+  # Targets the 'rccl' environment; presumably an approval check configured
+  # on that environment gates this stage - confirm in the project settings.
+  - deployment: rccl_pr_approval
+    displayName: "CI Run Requires Approval"
+    environment: rccl
+  - job: rccl
+    timeoutInMinutes: 180
+    pool: rocm-ci_rccl_slurm_pool
+    workspace:
+      clean: all
+    steps:
+    - task: DeleteFiles@1
+      inputs:
+        Contents: '**/*'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+      parameters:
+        submoduleBehaviour: recursive
+    - template: templates/build.yml
+    - template: templates/test.yml
diff --git a/.azuredevops/slurm/build.sh b/.azuredevops/slurm/build.sh
new file mode 100644
index 0000000000..e2889ee053
--- /dev/null
+++ b/.azuredevops/slurm/build.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-build
+#SBATCH --output=rccl-build-%j.out
+#SBATCH --error=rccl-build-%j.out
+#SBATCH --time=60
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+# Slurm batch script: configures, builds, and installs RCCL into $BINARIES_DIR.
+# BINARIES_DIR must be present in the environment of the submitted job.
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.0
+
+# Setup local binary path
+export PATH="$HOME/.local/bin:$PATH"
+mkdir -p "$HOME/.local/bin"
+
+# Install Ninja if not already available
+if ! command -v ninja &>/dev/null; then
+    echo "Ninja not found. Installing locally..."
+    wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
+    unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
+    chmod +x "$HOME/.local/bin/ninja"
+fi
+
+echo "Using Ninja at: $(which ninja)"
+ninja --version
+
+cd "${SLURM_SUBMIT_DIR:-$PWD}" || exit 1
+mkdir -p build
+cd build || exit 1
+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=gfx942 -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" .. || exit 1
+cmake --build . || exit 1
+cmake --build . --target install
diff --git a/.azuredevops/slurm/test.sh b/.azuredevops/slurm/test.sh
new file mode 100644
index 0000000000..541c5efc3b
--- /dev/null
+++ b/.azuredevops/slurm/test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-test
+#SBATCH --output=rccl-test-%j.out
+#SBATCH --error=rccl-test-%j.out
+#SBATCH --time=120
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+# Slurm batch script: runs the installed rccl-UnitTests binary and writes a
+# gtest XML report to $PIPELINE_WORKSPACE/test_output.xml.
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.0
+cd "$BINARIES_DIR/bin" || exit 1
+LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests --gtest_output="xml:$PIPELINE_WORKSPACE/test_output.xml" --gtest_color=yes
diff --git a/.azuredevops/templates/build.yml b/.azuredevops/templates/build.yml
new file mode 100644
index 0000000000..fc671a7890
--- /dev/null
+++ b/.azuredevops/templates/build.yml
@@ -0,0 +1,96 @@
+# small subset of files to check for install to determine pass/fail
+parameters:
+- name: expectedInstallFiles
+  type: object
+  default:
+    - bin/rccl-UnitTests
+    - include/rccl/rccl.h
+    - lib/cmake/rccl/rccl-config.cmake
+    - lib/librccl.so
+    - share/doc/rccl/LICENSE.txt
+    - share/rccl/msccl-algorithms
+    - share/rccl/msccl-unit-test-algorithms
+
+steps:
+  # Submits slurm/build.sh via sbatch, waits for the job to finish, then
+  # verifies that the expected files were installed into the binaries directory.
+  - task: Bash@3
+    displayName: Build Job
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+    inputs:
+      targetType: inline
+      script: |
+        echo "##[section]Starting build job..."
+
+        rm -rf "$(Build.BinariesDirectory)"/*
+
+        echo "Submitting build job..."
+        mkdir -p "$(Build.BinariesDirectory)"
+        BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh")
+        # Without a job ID the polling loops below cannot work; fail right away.
+        if [ -z "$BUILD_JOB_ID" ]; then
+          echo "##vso[task.logissue type=error]sbatch did not return a job ID"
+          exit 1
+        fi
+        echo "Submitted build job: $BUILD_JOB_ID"
+        echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"
+
+        # squeue lists the job while it is pending or running.
+        echo "Waiting for build job to leave the queue..."
+        while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
+          echo "##[section]Build job $BUILD_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
+        while true; do
+          STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Build job state: $STATE"
+          # sacct truncates long states with a trailing '+', hence the globs.
+          case "$STATE" in
+            COMPLETED)
+              break
+              ;;
+            FAILED|CANCELLED*|TIMEOUT|NODE_FAIL|OUT_OF_ME*)
+              echo "##vso[task.logissue type=error]Build failed with state $STATE"
+              exit 1
+              ;;
+          esac
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1 # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking for expected installed files..."
+        MISSING_FILES=0
+
+        expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
+        # Intentionally unquoted: split the space-separated list into paths.
+        for relpath in $expectedFiles; do
+          fullpath="$BINARIES_DIR/$relpath"
+          if [ ! -e "$fullpath" ]; then
+            echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
+            MISSING_FILES=1
+          fi
+        done
+
+        if [ "$MISSING_FILES" -eq 1 ]; then
+          echo "One or more expected files are missing from the install directory."
+          exit 1
+        else
+          echo "All expected files are present in the install directory."
+        fi
+  # Runs even when the build step failed so the Slurm log is always printed.
+  - task: Bash@3
+    displayName: Build Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
diff --git a/.azuredevops/templates/test.yml b/.azuredevops/templates/test.yml
new file mode 100644
index 0000000000..c72ae743e8
--- /dev/null
+++ b/.azuredevops/templates/test.yml
@@ -0,0 +1,89 @@
+steps:
+  # Submits slurm/test.sh via sbatch, waits for the job to finish, then checks
+  # both the Slurm job state and the gtest XML report.
+  - task: Bash@3
+    displayName: Test Job
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
+    inputs:
+      targetType: inline
+      script: |
+        echo "Submitting test job..."
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test.sh")
+        # Without a job ID the polling loops below cannot work; fail right away.
+        if [ -z "$TEST_JOB_ID" ]; then
+          echo "##vso[task.logissue type=error]sbatch did not return a job ID"
+          exit 1
+        fi
+        echo "Submitted test job: $TEST_JOB_ID"
+        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
+
+        # squeue lists the job while it is pending or running.
+        echo "Waiting for test job to leave the queue..."
+        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
+          echo "##[section]Test job $TEST_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
+        JOB_FAILED=0
+        while true; do
+          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Test job state: $STATE"
+          # sacct truncates long states with a trailing '+', hence the globs.
+          case "$STATE" in
+            COMPLETED)
+              break
+              ;;
+            FAILED|CANCELLED*|TIMEOUT|NODE_FAIL|OUT_OF_ME*)
+              # Remember the failure but still inspect the XML report below.
+              echo "##vso[task.logissue type=error]Test failed with state $STATE"
+              JOB_FAILED=1
+              break
+              ;;
+          esac
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1 # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking test result XML for failures..."
+        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
+        if [ -z "$TEST_XML" ]; then
+          echo "##vso[task.logissue type=error]No test_output.xml file found"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        if grep -q 'failures="[^0]' "$TEST_XML"; then
+          echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        elif [ "$JOB_FAILED" -eq 1 ]; then
+          # A clean XML report must not hide a Slurm job that did not complete.
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        else
+          echo "No test failures detected."
+        fi
+  # Runs even when the test step failed so the Slurm log is always printed.
+  - task: Bash@3
+    displayName: Test Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat "rccl-test-${TEST_JOB_ID}.out" || echo "No log found"
+  - task: PublishTestResults@2
+    displayName: 'Publish Results'
+    condition: succeededOrFailed()
+    inputs:
+      searchFolder: $(Pipeline.Workspace)
+      testResultsFormat: JUnit
+      testResultsFiles: '**/test_output.xml'