[AzureCI] Switch to ROCm 6.4.1 and add rccl-tests (#1782)

* Use ROCm 6.4.1 for testing
* Extend RCCL-Tests to multi-node
* Add HSA_NO_SCRATCH_RECLAIM to UT runs
* Limit to single-node rccl-tests for now
This commit is contained in:
Nilesh M Negi
2025-08-27 21:07:53 -05:00
zatwierdzone przez GitHub
rodzic 4699bff790
commit d73cee7588
7 zmienionych plików z 171 dodań i 15 usunięć
@@ -40,4 +40,5 @@ jobs:
parameters:
submoduleBehaviour: recursive
- template: templates/build.yml
- template: templates/test.yml
- template: templates/test_rccl-UnitTests.yml
- template: templates/test_rccl-tests.yml
+2 -1
Wyświetl plik
@@ -45,4 +45,5 @@ stages:
parameters:
submoduleBehaviour: recursive
- template: templates/build.yml
- template: templates/test.yml
- template: templates/test_rccl-UnitTests.yml
- template: templates/test_rccl-tests.yml
+17 -2
Wyświetl plik
@@ -11,7 +11,7 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"
source /etc/profile.d/lmod.sh
module load rocm/6.4.0
module load rocm/6.4.1
# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
@@ -28,9 +28,24 @@ fi
echo "Using Ninja at: $(which ninja)"
ninja --version
# Define GPU target
export GPU_TARGETS="gfx942"
cd "${SLURM_SUBMIT_DIR:-$PWD}"
## Building RCCL
mkdir -p build
cd build
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=gfx942 -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install
cd "${SLURM_SUBMIT_DIR:-$PWD}"
## Building RCCL-Tests
# Reuse an existing checkout so a re-run in the same workspace does not trip
# over "destination path already exists".
if [[ ! -d rccl-tests/.git ]]; then
  git clone https://github.com/ROCm/rccl-tests || exit 1
fi
# Stop if the directory changes fail: falling through would run the rccl-tests
# configure step inside RCCL's own build directory.
cd rccl-tests || exit 1
mkdir -p build
cd build || exit 1
cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --job-name=rccl-test
#SBATCH --output=rccl-test-%j.out
#SBATCH --error=rccl-test-%j.out
#SBATCH --job-name=rccl-UnitTests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=120
#SBATCH --nodes=1
#SBATCH --exclusive
@@ -11,6 +11,6 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"
source /etc/profile.d/lmod.sh
module load rocm/6.4.0
module load rocm/6.4.1
cd "$BINARIES_DIR/bin"
LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml --gtest_color=yes
LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes
@@ -0,0 +1,62 @@
#!/bin/bash
#SBATCH --job-name=rccl-tests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Runs each <collective>_perf binary for several data types through mpirun on
# a single node (8 ranks), writing one .log and one .json per run into
# ${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs.
# Reads from the environment: PIPELINE_WORKSPACE, BINARIES_DIR, MPI_HOME.
# Exits non-zero if any mpirun invocation returned a non-zero status.

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Without a workspace the results would be written under /TestResults.
: "${PIPELINE_WORKSPACE:?PIPELINE_WORKSPACE must be set}"

export WORKDIR="${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs"
# Create the results tree before entering it so the cd cannot fail on a
# fresh workspace.
mkdir -p "${WORKDIR}" || exit 1
cd "${PIPELINE_WORKSPACE}/TestResults" || exit 1

export PATH="$BINARIES_DIR/bin:$PATH"
export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"

### create hostlist
#nodelist=($(scontrol show hostnames))
#echo "SLURM nodes:"
#echo ${nodelist[@]}
#echo ""
#
#hosts_8ppn=()
#for node in "${nodelist[@]}"
#do
#    hosts_8ppn+=("${node}:8")
#done
#echo ${hosts_8ppn[@]}

### Run RCCL-Tests (single-node only for now; the multi-node host list above is disabled)
failed_runs=0
for n in 1
do
  total=$((n*8))
  #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
  for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
  do
    for dtype in float bfloat16 half fp8_e5m2
    do
      out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
      #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
      # Keep the command as an array: it runs without eval and paths with
      # unusual characters stay intact.
      cmd=(
        "${MPI_HOME}/bin/mpirun" -np "${total}"
        -mca pml ^ucx -mca osc ^ucx -mca btl ^openib
        -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo
        -x PATH -x LD_LIBRARY_PATH
        -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1
        "${BINARIES_DIR}/bin/${coll}_perf" -b 1K -e 16G -f 2 -g 1 -n 100 -w 50
        -d "${dtype}" -Z json
        -x "${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
      )
      echo "Running ${coll}" 2>&1 | tee "${out_filename}"
      echo "Run cmd: ${cmd[*]}" 2>&1 | tee -a "${out_filename}"
      "${cmd[@]}" 2>&1 | tee -a "${out_filename}"
      # tee always succeeds, so read mpirun's status from PIPESTATUS.
      rc=${PIPESTATUS[0]}
      if (( rc != 0 )); then
        echo "FAILED (exit ${rc}): ${coll} ${dtype}" 2>&1 | tee -a "${out_filename}"
        failed_runs=$((failed_runs + 1))
      fi
      sleep 2
    done
  done
done

## To add
### Summarize results
### Convert to junit

# Every combination is still attempted; the failure is reported once at the end.
if (( failed_runs > 0 )); then
  echo "${failed_runs} rccl-tests run(s) exited with a non-zero status" >&2
  exit 1
fi
@@ -1,6 +1,6 @@
steps:
- task: Bash@3
displayName: Test Job
displayName: RCCL UnitTests
env:
BINARIES_DIR: $(Build.BinariesDirectory)
PIPELINE_WORKSPACE: $(Pipeline.Workspace)
@@ -8,7 +8,7 @@ steps:
targetType: inline
script: |
echo "Submitting test job..."
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test.sh)
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
echo "Submitted test job: $TEST_JOB_ID"
echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
@@ -39,9 +39,9 @@ steps:
done
echo "Checking test result XML for failures..."
TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
if [ -z "$TEST_XML" ]; then
echo "##vso[task.logissue type=error]No test_output.xml file found"
# TEST_XML is empty on this branch, so name the expected file explicitly.
echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found"
echo "##vso[task.complete result=Failed;]DONE"
exit 1
fi
@@ -59,11 +59,11 @@ steps:
inputs:
targetType: inline
script: |
cat rccl-test-${TEST_JOB_ID}.out || echo "No log found"
cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
- task: PublishTestResults@2
displayName: 'Publish Results'
condition: succeededOrFailed()
inputs:
searchFolder: $(Pipeline.Workspace)
testResultsFormat: JUnit
testResultsFiles: '**/test_output.xml'
testResultsFiles: '**/rccl-UnitTests_output.xml'
@@ -0,0 +1,77 @@
# Step template: submits the rccl-tests SLURM job, waits for it to leave the
# queue and reach a final accounting state, checks that JSON result files were
# produced, and prints the job log.
steps:
- task: Bash@3
  displayName: RCCL-Tests
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
    PIPELINE_WORKSPACE: $(Pipeline.Workspace)
  inputs:
    targetType: inline
    script: |
      echo "Submitting test job..."
      TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
      # Without a job ID the squeue/sacct polling below has nothing to query
      # and would only end when MAX_LOOPS is reached, so fail right away.
      if [ -z "$TEST_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      echo "Submitted test job: $TEST_JOB_ID"
      echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

      echo "Waiting for test job to start..."
      # Poll until the job no longer appears in the queue.
      while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
        echo "##[section]Test job $TEST_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=120  # Maximum of 120 loops (120 minutes)
      while true; do
        # Take the state of the ".batch" step of the job.
        STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Test job state: $STATE"
        if [[ "$STATE" == "COMPLETED" ]]; then
          break
        elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
          echo "Test failed with state $STATE"
          # Surface the abnormal end state in the run summary; the JSON check
          # below still decides whether the step fails.
          echo "##vso[task.logissue type=warning]rccl-tests job $TEST_JOB_ID ended in state $STATE"
          break
        fi
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ "$LOOP_COUNT" -ge "$MAX_LOOPS" ]; then
          echo "Time limit reached while waiting for final status."
          exit 1  # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking test result json for failures..."
      TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
      if [ -z "$TEST_JSON" ]; then
        # TEST_JSON is empty on this branch, so report the search pattern.
        echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi

      #echo "Checking test result XML for failures..."
      #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
      #if [ -z "$TEST_XML" ]; then
      #  echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found"
      #  echo "##vso[task.complete result=Failed;]DONE"
      #  exit 1
      #fi
      #if grep -q 'failures="[^0]' "$TEST_XML"; then
      #  echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
      #  echo "##vso[task.complete result=Failed;]DONE"
      #  exit 1
      #else
      #  echo "No test failures detected."
      #fi

- task: Bash@3
  displayName: Test Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-tests-${TEST_JOB_ID}.out" || echo "No log found"

# - task: PublishTestResults@2
#   displayName: 'Publish Results'
#   condition: succeededOrFailed()
#   inputs:
#     searchFolder: $(Pipeline.Workspace)
#     testResultsFormat: JUnit
#     testResultsFiles: '**/rccl-tests_output.xml'