From d73cee7588450c6b6f64dab48e8bccb38b9845d6 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi
Date: Wed, 27 Aug 2025 21:07:53 -0500
Subject: [PATCH] [AzureCI] Switch to ROCm 6.4.1 and add rccl-tests (#1782)

* Use ROCm 6.4.1 for testing

* Extend RCCL-Tests to multi-node

* Add HSA_NO_SCRATCH_RECLAIM to UT runs

* Limit to single-node rccl-tests for now
---
Review notes (everything between '---' and the first diff is ignored by git-am):
  * Line breaks and indentation of this patch were reconstructed. If context
    lines do not match the tree, apply with `git apply --ignore-space-change`.
  * Blob ids on the "index" lines still refer to the originally generated
    patch, not to the amended content below.
  * test_rccl-tests.sh: the log directory is created before cd'ing into
    TestResults; expansions passed to tee/eval are quoted.
  * test_rccl-UnitTests.yml / test_rccl-tests.yml: the "not found" messages
    name the file pattern, because the variable they printed is empty in
    that branch.
  * test_rccl-tests.yml: a FAILED/CANCELLED/TIMEOUT Slurm state now fails the
    step instead of falling through to the JSON existence check.

 .azuredevops/multinode-ci-slurm-nightly.yml   |  3 +-
 .azuredevops/multinode-ci-slurm-pr.yml        |  3 +-
 .azuredevops/slurm/build.sh                   | 19 ++++-
 .../slurm/{test.sh => test_rccl-UnitTests.sh} | 10 +--
 .azuredevops/slurm/test_rccl-tests.sh         | 62 +++++++++++++++
 .../{test.yml => test_rccl-UnitTests.yml}     | 12 +--
 .azuredevops/templates/test_rccl-tests.yml    | 77 +++++++++++++++++++
 7 files changed, 171 insertions(+), 15 deletions(-)
 rename .azuredevops/slurm/{test.sh => test_rccl-UnitTests.sh} (52%)
 create mode 100644 .azuredevops/slurm/test_rccl-tests.sh
 rename .azuredevops/templates/{test.yml => test_rccl-UnitTests.yml} (85%)
 create mode 100644 .azuredevops/templates/test_rccl-tests.yml

diff --git a/.azuredevops/multinode-ci-slurm-nightly.yml b/.azuredevops/multinode-ci-slurm-nightly.yml
index 6523887294..b7438f3885 100644
--- a/.azuredevops/multinode-ci-slurm-nightly.yml
+++ b/.azuredevops/multinode-ci-slurm-nightly.yml
@@ -40,4 +40,5 @@ jobs:
         parameters:
           submoduleBehaviour: recursive
     - template: templates/build.yml
-    - template: templates/test.yml
+    - template: templates/test_rccl-UnitTests.yml
+    - template: templates/test_rccl-tests.yml
diff --git a/.azuredevops/multinode-ci-slurm-pr.yml b/.azuredevops/multinode-ci-slurm-pr.yml
index 3fc1465425..c3d89ff43d 100644
--- a/.azuredevops/multinode-ci-slurm-pr.yml
+++ b/.azuredevops/multinode-ci-slurm-pr.yml
@@ -45,4 +45,5 @@ stages:
             parameters:
               submoduleBehaviour: recursive
         - template: templates/build.yml
-        - template: templates/test.yml
+        - template: templates/test_rccl-UnitTests.yml
+        - template: templates/test_rccl-tests.yml
diff --git a/.azuredevops/slurm/build.sh b/.azuredevops/slurm/build.sh
index e2889ee053..9ebcefd807 100644
--- a/.azuredevops/slurm/build.sh
+++ b/.azuredevops/slurm/build.sh
@@ -11,7 +11,7 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
 echo "Node identifier: $short_id"
 
 source /etc/profile.d/lmod.sh
-module load rocm/6.4.0
+module load rocm/6.4.1
 
 # Setup local binary path
 export PATH="$HOME/.local/bin:$PATH"
@@ -28,9 +28,24 @@ fi
 echo "Using Ninja at: $(which ninja)"
 ninja --version
 
+# Define GPU target
+export GPU_TARGETS="gfx942"
+
 cd "${SLURM_SUBMIT_DIR:-$PWD}"
+## Building RCCL
 mkdir -p build
 cd build
-cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=gfx942 -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
+cmake --build .
+cmake --build . --target install
+
+
+cd "${SLURM_SUBMIT_DIR:-$PWD}"
+## Building RCCL-Tests
+git clone https://github.com/ROCm/rccl-tests
+cd rccl-tests
+mkdir -p build
+cd build
+cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DROCM_PATH="$ROCM_PATH" ..
 cmake --build .
 cmake --build . --target install
diff --git a/.azuredevops/slurm/test.sh b/.azuredevops/slurm/test_rccl-UnitTests.sh
similarity index 52%
rename from .azuredevops/slurm/test.sh
rename to .azuredevops/slurm/test_rccl-UnitTests.sh
index 541c5efc3b..f397fab76e 100644
--- a/.azuredevops/slurm/test.sh
+++ b/.azuredevops/slurm/test_rccl-UnitTests.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
-#SBATCH --job-name=rccl-test
-#SBATCH --output=rccl-test-%j.out
-#SBATCH --error=rccl-test-%j.out
+#SBATCH --job-name=rccl-UnitTests
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.out
 #SBATCH --time=120
 #SBATCH --nodes=1
 #SBATCH --exclusive
@@ -11,6 +11,6 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
 echo "Node identifier: $short_id"
 
 source /etc/profile.d/lmod.sh
-module load rocm/6.4.0
+module load rocm/6.4.1
 cd "$BINARIES_DIR/bin"
-LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml --gtest_color=yes
+LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes
diff --git a/.azuredevops/slurm/test_rccl-tests.sh b/.azuredevops/slurm/test_rccl-tests.sh
new file mode 100644
index 0000000000..a4509ce5d6
--- /dev/null
+++ b/.azuredevops/slurm/test_rccl-tests.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-tests
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.out
+#SBATCH --time=60
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.1
+
+export WORKDIR="${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs"
+mkdir -p "${WORKDIR}"
+cd "${PIPELINE_WORKSPACE}/TestResults" || exit 1
+
+export PATH="$BINARIES_DIR/bin:$PATH"
+export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"
+
+### create hostlist
+#nodelist=($(scontrol show hostnames))
+#echo "SLURM nodes:"
+#echo ${nodelist[@]}
+#echo ""
+#
+#hosts_8ppn=()
+#for node in "${nodelist[@]}"
+#do
+#    hosts_8ppn+=("${node}:8")
+#done
+#echo ${hosts_8ppn[@]}
+
+### Run multi- and single-node RCCL-Tests
+## Run single-node RCCL-Tests
+for n in 1
+do
+    total=$((n*8))
+    #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
+
+    for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
+    do
+        for dtype in float bfloat16 half fp8_e5m2
+        do
+            out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
+            #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
+            cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
+
+            echo "Running ${coll}" 2>&1 | tee "${out_filename}"
+            echo "Run cmd: ${cmd}" 2>&1 | tee -a "${out_filename}"
+            eval "${cmd}" 2>&1 | tee -a "${out_filename}"
+
+            sleep 2
+        done
+    done
+done
+
+## To add
+### Summarize results
+### Convert to junit
diff --git a/.azuredevops/templates/test.yml b/.azuredevops/templates/test_rccl-UnitTests.yml
similarity index 85%
rename from .azuredevops/templates/test.yml
rename to .azuredevops/templates/test_rccl-UnitTests.yml
index c72ae743e8..8d195b109f 100644
--- a/.azuredevops/templates/test.yml
+++ b/.azuredevops/templates/test_rccl-UnitTests.yml
@@ -1,6 +1,6 @@
 steps:
   - task: Bash@3
-    displayName: Test Job
+    displayName: RCCL UnitTests
     env:
       BINARIES_DIR: $(Build.BinariesDirectory)
       PIPELINE_WORKSPACE: $(Pipeline.Workspace)
@@ -8,7 +8,7 @@ steps:
       targetType: inline
       script: |
         echo "Submitting test job..."
-        TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test.sh)
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
         echo "Submitted test job: $TEST_JOB_ID"
         echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
 
@@ -39,9 +39,9 @@ steps:
         done
 
         echo "Checking test result XML for failures..."
-        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
+        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
         if [ -z "$TEST_XML" ]; then
-          echo "##vso[task.logissue type=error]No test_output.xml file found"
+          echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found"
           echo "##vso[task.complete result=Failed;]DONE"
           exit 1
         fi
@@ -59,11 +59,11 @@ steps:
     inputs:
       targetType: inline
       script: |
-        cat rccl-test-${TEST_JOB_ID}.out || echo "No log found"
+        cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
   - task: PublishTestResults@2
     displayName: 'Publish Results'
     condition: succeededOrFailed()
     inputs:
       searchFolder: $(Pipeline.Workspace)
       testResultsFormat: JUnit
-      testResultsFiles: '**/test_output.xml'
+      testResultsFiles: '**/rccl-UnitTests_output.xml'
diff --git a/.azuredevops/templates/test_rccl-tests.yml b/.azuredevops/templates/test_rccl-tests.yml
new file mode 100644
index 0000000000..9b047c4428
--- /dev/null
+++ b/.azuredevops/templates/test_rccl-tests.yml
@@ -0,0 +1,77 @@
+steps:
+  - task: Bash@3
+    displayName: RCCL-Tests
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
+    inputs:
+      targetType: inline
+      script: |
+        echo "Submitting test job..."
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
+        echo "Submitted test job: $TEST_JOB_ID"
+        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
+
+        echo "Waiting for test job to start..."
+        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
+          echo "##[section]Test job $TEST_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=120  # Maximum of 120 loops (120 minutes)
+        while true; do
+          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Test job state: $STATE"
+          if [[ "$STATE" == "COMPLETED" ]]; then
+            break
+          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
+            echo "##vso[task.logissue type=error]Test failed with state $STATE"
+            exit 1  # Partial JSON output must not let an aborted job pass
+          fi
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking test result json for failures..."
+        TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
+        if [ -z "$TEST_JSON" ]; then
+          echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        #echo "Checking test result XML for failures..."
+        #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
+        #if [ -z "$TEST_XML" ]; then
+        #  echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found"
+        #  echo "##vso[task.complete result=Failed;]DONE"
+        #  exit 1
+        #fi
+
+        #if grep -q 'failures="[^0]' "$TEST_XML"; then
+        #  echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
+        #  echo "##vso[task.complete result=Failed;]DONE"
+        #  exit 1
+        #else
+        #  echo "No test failures detected."
+        #fi
+  - task: Bash@3
+    displayName: Test Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found"
+#  - task: PublishTestResults@2
+#    displayName: 'Publish Results'
+#    condition: succeededOrFailed()
+#    inputs:
+#      searchFolder: $(Pipeline.Workspace)
+#      testResultsFormat: JUnit
+#      testResultsFiles: '**/rccl-tests_output.xml'