[AzureCI] Switch to ROCm 6.4.1 and add rccl-tests (#1782)
* Use ROCm 6.4.1 for testing
* Extend RCCL-Tests to multi-node
* Add HSA_NO_SCRATCH_RECLAIM to UT runs
* Limit to single-node rccl-tests for now
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
4699bff790
commit
d73cee7588
@@ -40,4 +40,5 @@ jobs:
|
||||
parameters:
|
||||
submoduleBehaviour: recursive
|
||||
- template: templates/build.yml
|
||||
- template: templates/test.yml
|
||||
- template: templates/test_rccl-UnitTests.yml
|
||||
- template: templates/test_rccl-tests.yml
|
||||
|
||||
@@ -45,4 +45,5 @@ stages:
|
||||
parameters:
|
||||
submoduleBehaviour: recursive
|
||||
- template: templates/build.yml
|
||||
- template: templates/test.yml
|
||||
- template: templates/test_rccl-UnitTests.yml
|
||||
- template: templates/test_rccl-tests.yml
|
||||
|
||||
@@ -11,7 +11,7 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
|
||||
echo "Node identifier: $short_id"
|
||||
|
||||
source /etc/profile.d/lmod.sh
|
||||
module load rocm/6.4.0
|
||||
module load rocm/6.4.1
|
||||
|
||||
# Setup local binary path
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
@@ -28,9 +28,24 @@ fi
|
||||
echo "Using Ninja at: $(which ninja)"
|
||||
ninja --version
|
||||
|
||||
# Define GPU target
|
||||
export GPU_TARGETS="gfx942"
|
||||
|
||||
cd "${SLURM_SUBMIT_DIR:-$PWD}"
|
||||
## Building RCCL
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=gfx942 -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
|
||||
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
|
||||
cmake --build .
|
||||
cmake --build . --target install
|
||||
|
||||
|
||||
cd "${SLURM_SUBMIT_DIR:-$PWD}"
|
||||
## Building RCCL-Tests
|
||||
git clone https://github.com/ROCm/rccl-tests
|
||||
cd rccl-tests
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DROCM_PATH="$ROCM_PATH" ..
|
||||
cmake --build .
|
||||
cmake --build . --target install
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=rccl-test
|
||||
#SBATCH --output=rccl-test-%j.out
|
||||
#SBATCH --error=rccl-test-%j.out
|
||||
#SBATCH --job-name=rccl-UnitTests
|
||||
#SBATCH --output=%x-%j.out
|
||||
#SBATCH --error=%x-%j.out
|
||||
#SBATCH --time=120
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --exclusive
|
||||
@@ -11,6 +11,6 @@ short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
|
||||
echo "Node identifier: $short_id"
|
||||
|
||||
source /etc/profile.d/lmod.sh
|
||||
module load rocm/6.4.0
|
||||
module load rocm/6.4.1
|
||||
cd "$BINARIES_DIR/bin"
|
||||
LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/test_output.xml --gtest_color=yes
|
||||
LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes
|
||||
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=rccl-tests
|
||||
#SBATCH --output=%x-%j.out
|
||||
#SBATCH --error=%x-%j.out
|
||||
#SBATCH --time=60
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --partition=gt
|
||||
|
||||
short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
|
||||
echo "Node identifier: $short_id"
|
||||
|
||||
source /etc/profile.d/lmod.sh
|
||||
module load rocm/6.4.1
|
||||
|
||||
cd ${PIPELINE_WORKSPACE}/TestResults
|
||||
mkdir -p ${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
|
||||
export WORKDIR=${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
|
||||
|
||||
export PATH="$BINARIES_DIR/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"
|
||||
|
||||
### create hostlist
|
||||
#nodelist=($(scontrol show hostnames))
|
||||
#echo "SLURM nodes:"
|
||||
#echo ${nodelist[@]}
|
||||
#echo ""
|
||||
#
|
||||
#hosts_8ppn=()
|
||||
#for node in "${nodelist[@]}"
|
||||
#do
|
||||
# hosts_8ppn+=("${node}:8")
|
||||
#done
|
||||
#echo ${hosts_8ppn[@]}
|
||||
|
||||
### Run multi- and single-node RCCL-Tests
|
||||
## Run single-node RCCL-Tests
|
||||
for n in 1
|
||||
do
|
||||
total=$((n*8))
|
||||
#h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
|
||||
|
||||
for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
|
||||
do
|
||||
for dtype in float bfloat16 half fp8_e5m2
|
||||
do
|
||||
out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
|
||||
#cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
|
||||
cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
|
||||
|
||||
echo "Running ${coll}" 2>&1 | tee ${out_filename}
|
||||
echo "Run cmd: ${cmd}" 2>&1 | tee -a ${out_filename}
|
||||
# Quote both expansions so eval receives the command string verbatim and the log path is a single word (no word-splitting or globbing beforehand).
eval "${cmd}" 2>&1 | tee -a "${out_filename}"
|
||||
|
||||
sleep 2
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
## To add
|
||||
### Summarize results
|
||||
### Convert to junit
|
||||
+6
-6
@@ -1,6 +1,6 @@
|
||||
steps:
|
||||
- task: Bash@3
|
||||
displayName: Test Job
|
||||
displayName: RCCL UnitTests
|
||||
env:
|
||||
BINARIES_DIR: $(Build.BinariesDirectory)
|
||||
PIPELINE_WORKSPACE: $(Pipeline.Workspace)
|
||||
@@ -8,7 +8,7 @@ steps:
|
||||
targetType: inline
|
||||
script: |
|
||||
echo "Submitting test job..."
|
||||
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test.sh)
|
||||
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
|
||||
echo "Submitted test job: $TEST_JOB_ID"
|
||||
echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
|
||||
|
||||
@@ -39,9 +39,9 @@ steps:
|
||||
done
|
||||
|
||||
echo "Checking test result XML for failures..."
|
||||
TEST_XML=$(find "$(Pipeline.Workspace)" -name 'test_output.xml' | head -n1)
|
||||
TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
|
||||
if [ -z "$TEST_XML" ]; then
|
||||
echo "##vso[task.logissue type=error]No test_output.xml file found"
|
||||
# TEST_XML is empty in this branch, so name the expected result file literally.
echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found"
|
||||
echo "##vso[task.complete result=Failed;]DONE"
|
||||
exit 1
|
||||
fi
|
||||
@@ -59,11 +59,11 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
cat rccl-test-${TEST_JOB_ID}.out || echo "No log found"
|
||||
cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
|
||||
- task: PublishTestResults@2
|
||||
displayName: 'Publish Results'
|
||||
condition: succeededOrFailed()
|
||||
inputs:
|
||||
searchFolder: $(Pipeline.Workspace)
|
||||
testResultsFormat: JUnit
|
||||
testResultsFiles: '**/test_output.xml'
|
||||
testResultsFiles: '**/rccl-UnitTests_output.xml'
|
||||
@@ -0,0 +1,77 @@
|
||||
steps:
|
||||
- task: Bash@3
|
||||
displayName: RCCL-Tests
|
||||
env:
|
||||
BINARIES_DIR: $(Build.BinariesDirectory)
|
||||
PIPELINE_WORKSPACE: $(Pipeline.Workspace)
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
echo "Submitting test job..."
|
||||
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
|
||||
echo "Submitted test job: $TEST_JOB_ID"
|
||||
echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
|
||||
|
||||
echo "Waiting for test job to start..."
|
||||
while squeue -j $TEST_JOB_ID 2>/dev/null | grep -q $TEST_JOB_ID; do
|
||||
echo "##[section]Test job $TEST_JOB_ID is still running..."
|
||||
sleep 60
|
||||
done
|
||||
|
||||
echo "Waiting for final status via sacct..."
|
||||
LOOP_COUNT=0
|
||||
MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
|
||||
while true; do
|
||||
STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
|
||||
echo "##[section]Test job state: $STATE"
|
||||
if [[ "$STATE" == "COMPLETED" ]]; then
|
||||
break
|
||||
elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
|
||||
echo "Test failed with state $STATE"
|
||||
break
|
||||
fi
|
||||
sleep 60
|
||||
LOOP_COUNT=$((LOOP_COUNT + 1))
|
||||
if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
|
||||
echo "Time limit reached while waiting for final status."
|
||||
exit 1 # Exit with an error code if time limit is reached
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Checking test result json for failures..."
|
||||
TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
|
||||
if [ -z "$TEST_JSON" ]; then
|
||||
# TEST_JSON is empty in this branch, so report the search pattern that produced no match.
echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found"
|
||||
echo "##vso[task.complete result=Failed;]DONE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#echo "Checking test result XML for failures..."
|
||||
#TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
|
||||
#if [ -z "$TEST_XML" ]; then
|
||||
# echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found"
|
||||
# echo "##vso[task.complete result=Failed;]DONE"
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
#if grep -q 'failures="[^0]' "$TEST_XML"; then
|
||||
# echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
|
||||
# echo "##vso[task.complete result=Failed;]DONE"
|
||||
# exit 1
|
||||
#else
|
||||
# echo "No test failures detected."
|
||||
#fi
|
||||
- task: Bash@3
|
||||
displayName: Test Logs
|
||||
condition: always()
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found"
|
||||
# - task: PublishTestResults@2
|
||||
# displayName: 'Publish Results'
|
||||
# condition: succeededOrFailed()
|
||||
# inputs:
|
||||
# searchFolder: $(Pipeline.Workspace)
|
||||
# testResultsFormat: JUnit
|
||||
# testResultsFiles: '**/rccl-tests_output.xml'
|
||||
Reference in New Issue
Block a user