From 1922bd71cbeb9e09c1f3b6272a1097897a39c9a1 Mon Sep 17 00:00:00 2001 From: Kaiming Ouyang Date: Thu, 6 Jun 2024 04:59:28 -0700 Subject: [PATCH 01/11] Change ncclCommRegister size to maxBytes in serial comm init [ROCm/rccl-tests commit: d028efcf35101c6663ae8c5f33ad41bad00efb4d] --- projects/rccl-tests/src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index fc5af1e014..04e81422f0 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -1000,8 +1000,8 @@ testResult_t run() { sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL; recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL; for (int i=0; i Date: Fri, 14 Jun 2024 11:28:55 +0200 Subject: [PATCH 02/11] improve parsing of stepbytes (increment size) argument [ROCm/rccl-tests commit: 746549b28d3b654e0670feca0065f51affdb7db8] --- projects/rccl-tests/src/common.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 04e81422f0..7706dd9a54 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -764,7 +764,12 @@ int main(int argc, char* argv[]) { maxBytes = (size_t)parsed; break; case 'i': - stepBytes = strtol(optarg, NULL, 0); + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'stepBytes'\n"); + return -1; + } + stepBytes = (size_t)parsed; break; case 'f': stepFactor = strtol(optarg, NULL, 0); From 5061074d09775c1fe5b5b6f7c3a92eed2ca9b268 Mon Sep 17 00:00:00 2001 From: Oren <47992694+OrenLeung@users.noreply.github.com> Date: Wed, 24 Jul 2024 22:55:00 -0400 Subject: [PATCH 03/11] doc: add all2all factor [ROCm/rccl-tests commit: c6eb15875f508076f3f26de4f7da3899701bc4db] --- projects/rccl-tests/doc/PERFORMANCE.md | 1 + 1 
file changed, 1 insertion(+) diff --git a/projects/rccl-tests/doc/PERFORMANCE.md b/projects/rccl-tests/doc/PERFORMANCE.md index 21fef609af..942f054968 100644 --- a/projects/rccl-tests/doc/PERFORMANCE.md +++ b/projects/rccl-tests/doc/PERFORMANCE.md @@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_ * AllGather : (_n_-1)/_n_ * Broadcast : 1 * Reduce : 1 +* AlltoAll: (_n_-1)/_n_ The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network. From cf3ffb2f5f4c4d0ccf4f8a050e34bafc2462cfa9 Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 25 Jul 2024 21:47:40 -0700 Subject: [PATCH 04/11] Added -N,--run_cycles option [ROCm/rccl-tests commit: d2d40cc8249378efa4d7e2c949528c15eeb7d8e7] --- projects/rccl-tests/src/common.cu | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 04e81422f0..872a18a1b6 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -69,6 +69,7 @@ static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; +static int run_cycles = 1; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -598,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + long repeat = run_cycles; + do { + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? 
size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); char rootName[100]; sprintf(rootName, "%6i", root); @@ -606,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); - } + } + } while (--repeat); + return testSuccess; } @@ -717,6 +722,7 @@ int main(int argc, char* argv[]) { {"iters", required_argument, 0, 'n'}, {"agg_iters", required_argument, 0, 'm'}, {"warmup_iters", required_argument, 0, 'w'}, + {"run_cycles", required_argument, 0, 'N'}, {"parallel_init", required_argument, 0, 'p'}, {"check", required_argument, 0, 'c'}, {"op", required_argument, 0, 'o'}, @@ -735,7 +741,7 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); if (c == -1) break; @@ -782,6 +788,9 @@ int main(int argc, char* argv[]) { case 'w': warmup_iters = (int)strtol(optarg, NULL, 0); break; + case 'N': + run_cycles = (int)strtol(optarg, NULL, 0); + break; case 'c': datacheck = (int)strtol(optarg, NULL, 0); break; @@ -841,6 +850,7 @@ int main(int argc, char* argv[]) { "[-n,--iters ] \n\t" "[-m,--agg_iters ] \n\t" "[-w,--warmup_iters ] \n\t" + "[-N,--run_cycles run & print each cycle (default: 1; 0=infinite)] \n\t" "[-p,--parallel_init <0/1>] \n\t" "[-c,--check ] \n\t" #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) From 98b958afbda32f34923c5fb06910f41a9bf200a5 Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 30 Jul 2024 14:50:45 -0700 Subject: [PATCH 05/11] Added some missing command line options to README.md Also updated single and multi-node examples. 
[ROCm/rccl-tests commit: 0d86b5a6e755c52be6f23ef3f4792385f5e255b1] --- projects/rccl-tests/README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index 4281799430..44e406a633 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de ### Quick examples -Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : +Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : ```shell $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: +Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes : +(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case) ```shell -$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 +$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 ``` ### Performance @@ -59,14 +60,18 @@ All tests support the same set of arguments : * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-N,--run_cycles ` run & print each cycle. Default : 1; 0=infinite. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. 
* `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. + * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0. + * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0. + * `-T,--timeout