From e1f13fac90185dce98d62222c1b757de19d060dc Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 13 Sep 2023 11:15:13 -0700 Subject: [PATCH 1/3] Make the -c option be a datacheck iteration count parameter Default is 1 [ROCm/rccl-tests commit: 6c46206a478203b6453035fe0d40dc6418acd089] --- projects/rccl-tests/README.md | 2 +- projects/rccl-tests/src/common.cu | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index 580996b28d..4281799430 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -62,7 +62,7 @@ All tests support the same set of arguments : * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. - * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. + * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. 
diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 48a629ce10..dcead4ddd4 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -487,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int64_t wrongElts = 0; static __thread int rep = 0; rep++; - if (datacheck) { + for (int c = 0; c < datacheck; c++) { // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); @@ -536,8 +536,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t //aggregate delta from all threads and procs long long wrongElts1 = wrongElts; + //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts); Allreduce(args, &wrongElts1, /*sum*/4); wrongElts = wrongElts1; + if (wrongElts) break; } double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; @@ -809,7 +811,7 @@ int main(int argc, char* argv[]) { "[-m,--agg_iters ] \n\t" "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" + "[-c,--check ] \n\t" #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) "[-o,--op ] \n\t" #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) From 459b52158f118286fb4b2988e4778ee531b7ad87 Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 12 Oct 2023 16:53:32 -0700 Subject: [PATCH 2/3] Added an MPI_Barrier() call after MPI_Bcast() for HCOLL issue [ROCm/rccl-tests commit: 1292b25553bd0384f2faa2965f9d82b99797a348] --- projects/rccl-tests/src/common.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index dcead4ddd4..8588047d78 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -924,6 +924,7 @@ testResult_t run() { } #ifdef MPI_SUPPORT MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm); 
+ MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL #endif int gpus[nGpus*nThreads]; cudaStream_t streams[nGpus*nThreads]; From 5d52f0285cc5e44f9e1558b0a766f29fddd6af40 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 5 Feb 2024 08:53:54 -0800 Subject: [PATCH 3/3] Added missing MPI_Comm_free() call before MPI_Finalize() [ROCm/rccl-tests commit: c6afef0b6f76ffc55d4172d971be6cf5a08a73a4] --- projects/rccl-tests/src/common.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 8588047d78..4ac00fb3d7 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -1056,6 +1056,7 @@ testResult_t run() { PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); PRINT("#\n"); #ifdef MPI_SUPPORT + MPI_Comm_free(&mpi_comm); MPI_Finalize(); #endif