Dosyalar
rocm-systems/projects/rccl/.azuredevops/slurm/test_rccl-tests.sh
T
systems-assistant[bot] 4358cad858 [AzureCI] Add all_reduce_bias to rccl-tests CI (#2768)
* [AzureCI] Add all_reduce_bias to rccl-tests CI

* Increase rccl-tests timeout to 2 hours

---------

Co-authored-by: nileshnegi <Nilesh.Negi@amd.com>
Co-authored-by: systems-assistant[bot] <systems-assistant[bot]@users.noreply.github.com>
Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com>
2026-01-30 14:29:06 -07:00

63 satır
2.5 KiB
Bash

#!/bin/bash
#SBATCH --job-name=rccl-tests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=120
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch job that runs the rccl-tests performance binaries on one node.
#
# The directives above request a single exclusive node on the "gt" partition
# with a 120-minute limit, and send stdout and stderr to the same
# <job-name>-<job-id>.out file.
#
# Required environment (inherited from the submitting shell):
#   PIPELINE_WORKSPACE  root of the CI workspace; logs go under TestResults/
#   BINARIES_DIR        install prefix providing bin/ and lib/

# Fail fast instead of silently operating on /TestResults or /bin.
: "${PIPELINE_WORKSPACE:?PIPELINE_WORKSPACE must be set}"
: "${BINARIES_DIR:?BINARIES_DIR must be set}"

# Third and later dash-separated fields of the short hostname; logged only.
short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

# Make the `module` command available, then load the ROCm toolchain.
source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Create the log directory first (this also creates TestResults if needed),
# so the following cd cannot fail just because the directory is missing.
export WORKDIR="${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs"
mkdir -p "${WORKDIR}" || exit 1
cd "${PIPELINE_WORKSPACE}/TestResults" || exit 1

export PATH="${BINARIES_DIR}/bin:${PATH}"
# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="${BINARIES_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
### Create hostlist (currently disabled; only the commented-out multi-node mpirun command uses it)
#nodelist=($(scontrol show hostnames))
#echo "SLURM nodes:"
#echo ${nodelist[@]}
#echo ""
#
#hosts_8ppn=()
#for node in "${nodelist[@]}"
#do
# hosts_8ppn+=("${node}:8")
#done
#echo ${hosts_8ppn[@]}
### Run multi- and single-node RCCL-Tests
## Run single-node RCCL-Tests
#
# For every (collective, datatype) pair, launch ${BINARIES_DIR}/bin/<coll>_perf
# under mpirun with 8 ranks per node. Console output is mirrored into a
# per-run log file in ${WORKDIR}; the perf binary is also asked to write a JSON
# result file there (-Z json -x <file>).
#
# A failing run does not stop the sweep: its exit status is recorded, the
# remaining combinations still run, and the script exits non-zero at the end
# if any run failed. Any summary or junit-conversion step added later must be
# placed before that final check, or it will be skipped when a run fails.
failed_runs=()

for n in 1; do
  total=$((n * 8))
  #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
  for coll in all_reduce all_reduce_bias all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv; do
    for dtype in float bfloat16 half fp8_e5m2; do
      out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
      json_filename="${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"

      # Multi-node variant, kept for reference (needs h_8ppn from the hostlist):
      #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"

      # Build the command as an array so each argument stays one word without
      # relying on eval and shell re-parsing.
      cmd=(
        "${MPI_HOME}/bin/mpirun" -np "${total}"
        -mca pml '^ucx' -mca osc '^ucx' -mca btl '^openib'
        -mca oob_tcp_if_exclude docker,lo
        -mca btl_tcp_if_exclude docker,lo
        -x PATH -x LD_LIBRARY_PATH
        -x NCCL_DEBUG=VERSION
        -x NCCL_IGNORE_CPU_AFFINITY=1
        -x HSA_NO_SCRATCH_RECLAIM=1
        "${BINARIES_DIR}/bin/${coll}_perf"
        -b 1K -e 16G -f 2 -g 1 -n 100 -w 50
        -d "${dtype}" -Z json -x "${json_filename}"
      )

      # The first tee truncates the log; later ones append to it.
      echo "Running ${coll}" 2>&1 | tee "${out_filename}"
      echo "Run cmd: ${cmd[*]}" 2>&1 | tee -a "${out_filename}"
      "${cmd[@]}" 2>&1 | tee -a "${out_filename}"
      # The pipeline's own status is tee's; PIPESTATUS[0] is mpirun's.
      rc=${PIPESTATUS[0]}
      if ((rc != 0)); then
        echo "FAILED: ${coll} (${dtype}) exited with status ${rc}" 2>&1 | tee -a "${out_filename}"
        failed_runs+=("${coll}/${dtype}")
      fi
      sleep 2
    done
  done
done

if ((${#failed_runs[@]} > 0)); then
  echo "rccl-tests: ${#failed_runs[@]} run(s) failed: ${failed_runs[*]}" >&2
  exit 1
fi
## To add
### Summarize results
### Convert to junit