From 1922bd71cbeb9e09c1f3b6272a1097897a39c9a1 Mon Sep 17 00:00:00 2001
From: Kaiming Ouyang <kouyang@nvidia.com>
Date: Thu, 6 Jun 2024 04:59:28 -0700
Subject: [PATCH 01/11] Change ncclCommRegister size to maxBytes in serial comm
 init

[ROCm/rccl-tests commit: d028efcf35101c6663ae8c5f33ad41bad00efb4d]
---
 projects/rccl-tests/src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index fc5af1e014..04e81422f0 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -1000,8 +1000,8 @@ testResult_t run() {
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
      }
 #endif
   }

From a0e06f2133e939775ca22ae377d8fc1a27f68579 Mon Sep 17 00:00:00 2001
From: Stefano Salsano <stefano.salsano@uniroma2.it>
Date: Fri, 14 Jun 2024 11:28:55 +0200
Subject: [PATCH 02/11] improve parsing of stepbytes (increment size) argument

[ROCm/rccl-tests commit: 746549b28d3b654e0670feca0065f51affdb7db8]
---
 projects/rccl-tests/src/common.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index 04e81422f0..7706dd9a54 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -764,7 +764,12 @@ int main(int argc, char* argv[]) {
         maxBytes = (size_t)parsed;
         break;
       case 'i':
-        stepBytes = strtol(optarg, NULL, 0);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'stepBytes'\n");
+          return -1;
+        }
+        stepBytes = (size_t)parsed;
         break;
       case 'f':
         stepFactor = strtol(optarg, NULL, 0);

From 5061074d09775c1fe5b5b6f7c3a92eed2ca9b268 Mon Sep 17 00:00:00 2001
From: Oren <47992694+OrenLeung@users.noreply.github.com>
Date: Wed, 24 Jul 2024 22:55:00 -0400
Subject: [PATCH 03/11] doc: add all2all factor

[ROCm/rccl-tests commit: c6eb15875f508076f3f26de4f7da3899701bc4db]
---
 projects/rccl-tests/doc/PERFORMANCE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/projects/rccl-tests/doc/PERFORMANCE.md b/projects/rccl-tests/doc/PERFORMANCE.md
index 21fef609af..942f054968 100644
--- a/projects/rccl-tests/doc/PERFORMANCE.md
+++ b/projects/rccl-tests/doc/PERFORMANCE.md
@@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
 * AllGather : (_n_-1)/_n_
 * Broadcast : 1
 * Reduce : 1
+* AlltoAll: (_n_-1)/_n_
 
 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.

From cf3ffb2f5f4c4d0ccf4f8a050e34bafc2462cfa9 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 25 Jul 2024 21:47:40 -0700
Subject: [PATCH 04/11] Added -N,--run_cycles option

[ROCm/rccl-tests commit: d2d40cc8249378efa4d7e2c949528c15eeb7d8e7]
---
 projects/rccl-tests/src/common.cu | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index 04e81422f0..872a18a1b6 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -69,6 +69,7 @@ static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
 static int agg_iters = 1;
+static int run_cycles = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
@@ -598,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   TESTCHECK(completeColl(args));
 
   // Benchmark
-  for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
+  long repeat = run_cycles;
+  do {
+    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
       char rootName[100];
       sprintf(rootName, "%6i", root);
@@ -606,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
       TESTCHECK(BenchTime(args, type, op, root, 0));
       TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
-  }
+    }
+  } while (--repeat);
+
   return testSuccess;
 }
 
@@ -717,6 +722,7 @@ int main(int argc, char* argv[]) {
     {"iters", required_argument, 0, 'n'},
     {"agg_iters", required_argument, 0, 'm'},
     {"warmup_iters", required_argument, 0, 'w'},
+    {"run_cycles", required_argument, 0, 'N'},
     {"parallel_init", required_argument, 0, 'p'},
     {"check", required_argument, 0, 'c'},
     {"op", required_argument, 0, 'o'},
@@ -735,7 +741,7 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -782,6 +788,9 @@ int main(int argc, char* argv[]) {
       case 'w':
         warmup_iters = (int)strtol(optarg, NULL, 0);
         break;
+      case 'N':
+        run_cycles = (int)strtol(optarg, NULL, 0);
+        break;
       case 'c':
         datacheck = (int)strtol(optarg, NULL, 0);
         break;
@@ -841,6 +850,7 @@ int main(int argc, char* argv[]) {
             "[-n,--iters <iteration count>] \n\t"
             "[-m,--agg_iters <aggregated iteration count>] \n\t"
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-N,--run_cycles <cycle count> run & print each cycle (default: 1; 0=infinite)] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <check iteration count>] \n\t"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)

From 98b958afbda32f34923c5fb06910f41a9bf200a5 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 30 Jul 2024 14:50:45 -0700
Subject: [PATCH 05/11] Added some missing command line options to README.md
 Also updated single and multi-node examples.

[ROCm/rccl-tests commit: 0d86b5a6e755c52be6f23ef3f4792385f5e255b1]
---
 projects/rccl-tests/README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md
index 4281799430..44e406a633 100644
--- a/projects/rccl-tests/README.md
+++ b/projects/rccl-tests/README.md
@@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de
 
 ### Quick examples
 
-Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
-Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
 ```shell
-$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
+$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
 
 ### Performance
@@ -59,14 +60,18 @@ All tests support the same set of arguments :
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 

From 0a2f08311b3957df39cd9edb26d979b7e3d87eba Mon Sep 17 00:00:00 2001
From: Kamil Iskra <kiskra@nvidia.com>
Date: Thu, 24 Oct 2024 09:21:37 -0700
Subject: [PATCH 06/11] Future-proof ncclstringtotype

Ensure that ncclstringtotype iterates only over data types known to
nccl-tests (as indicated by test_typenum), not over a potentially larger
set of all NCCL types.


[ROCm/rccl-tests commit: 34d6d5391084d30d7698e347497c2ebcc2d82b78]
---
 projects/rccl-tests/src/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/rccl-tests/src/common.h b/projects/rccl-tests/src/common.h
index 20fa4612db..e6762e1c97 100644
--- a/projects/rccl-tests/src/common.h
+++ b/projects/rccl-tests/src/common.h
@@ -254,7 +254,7 @@ extern ncclRedOp_t test_ops[];
 extern const char *test_opnames[];
 
 static int ncclstringtotype(char *str) {
-    for (int t=0; t<ncclNumTypes; t++) {
+    for (int t=0; t<test_typenum; t++) {
       if (strcmp(str, test_typenames[t]) == 0) {
         return t;
       }

From 69b9a05e7182512e4b47a734e92bc3464ebc95db Mon Sep 17 00:00:00 2001
From: John Bachan <jbachan@nvidia.com>
Date: Wed, 18 Dec 2024 11:14:18 -0800
Subject: [PATCH 07/11] Fixes to all tests that divide buffers by nranks so
 that they trim buffer sizes to be multiples of 16 bytes. This ensures
 non-pow2 ranks have buffer addresses aligned suitably for performance.

[ROCm/rccl-tests commit: 29f4114f027fed903649a3c81babc5d52e8d41ae]
---
 projects/rccl-tests/src/all_gather.cu     |  8 +++-----
 projects/rccl-tests/src/all_reduce.cu     |  4 ++--
 projects/rccl-tests/src/alltoall.cu       | 10 +++++-----
 projects/rccl-tests/src/broadcast.cu      |  4 ++--
 projects/rccl-tests/src/common.cu         |  2 +-
 projects/rccl-tests/src/common.h          |  2 +-
 projects/rccl-tests/src/gather.cu         | 12 ++++++------
 projects/rccl-tests/src/hypercube.cu      |  6 +++---
 projects/rccl-tests/src/reduce.cu         |  4 ++--
 projects/rccl-tests/src/reduce_scatter.cu |  8 +++-----
 projects/rccl-tests/src/scatter.cu        | 12 ++++++------
 projects/rccl-tests/src/sendrecv.cu       |  4 ++--
 12 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/projects/rccl-tests/src/all_gather.cu b/projects/rccl-tests/src/all_gather.cu
index 0831207433..6db67e6d00 100644
--- a/projects/rccl-tests/src/all_gather.cu
+++ b/projects/rccl-tests/src/all_gather.cu
@@ -7,10 +7,8 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-#define ALIGN 4
-
-void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base;
   *recvcount = base*nranks;
   *sendInplaceOffset = base;
@@ -60,7 +58,7 @@ struct testColl allGatherTest = {
 
 void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/all_reduce.cu b/projects/rccl-tests/src/all_reduce.cu
index a38eabe057..4aa1feead7 100644
--- a/projects/rccl-tests/src/all_reduce.cu
+++ b/projects/rccl-tests/src/all_reduce.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -55,7 +55,7 @@ struct testColl allReduceTest = {
 
 void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/alltoall.cu b/projects/rccl-tests/src/alltoall.cu
index 41c7c4ae33..dd085e54a9 100644
--- a/projects/rccl-tests/src/alltoall.cu
+++ b/projects/rccl-tests/src/alltoall.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = (count/nranks)*nranks;
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *paramcount = (count/nranks) & -(16/eltSize);
+  *sendcount = nranks*(*paramcount);
+  *recvcount = *sendcount;
   *sendInplaceOffset = 0;
   *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
 }
 
 testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -74,7 +74,7 @@ struct testColl alltoAllTest = {
 
 void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/broadcast.cu b/projects/rccl-tests/src/broadcast.cu
index 903066a2b8..67e9af2f36 100644
--- a/projects/rccl-tests/src/broadcast.cu
+++ b/projects/rccl-tests/src/broadcast.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -64,7 +64,7 @@ struct testColl broadcastTest = {
 
 void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index e1f8a85f16..6d103d797d 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -571,7 +571,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
   size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
 
   count = size / wordSize(type);
-  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks);
 
   args->nbytes = paramCount * wordSize(type);
   args->sendBytes = sendCount * wordSize(type);
diff --git a/projects/rccl-tests/src/common.h b/projects/rccl-tests/src/common.h
index e6762e1c97..478d7fb1c0 100644
--- a/projects/rccl-tests/src/common.h
+++ b/projects/rccl-tests/src/common.h
@@ -87,7 +87,7 @@ struct testColl {
   void (*getCollByteCount)(
       size_t *sendcount, size_t *recvcount, size_t *paramcount,
       size_t *sendInplaceOffset, size_t *recvInplaceOffset,
-      size_t count, int nranks);
+      size_t count, size_t eltSize, int nranks);
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
diff --git a/projects/rccl-tests/src/gather.cu b/projects/rccl-tests/src/gather.cu
index 03ef4d9e3f..a4a7a30bcd 100644
--- a/projects/rccl-tests/src/gather.cu
+++ b/projects/rccl-tests/src/gather.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = count/nranks;
-  *recvcount = (count/nranks)*nranks;
-  *sendInplaceOffset = count/nranks;
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *sendcount = (count/nranks) & -(16/eltSize);
+  *recvcount = (*sendcount)*nranks;
+  *sendInplaceOffset = *sendcount;
   *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
+  *paramcount = *sendcount;
 }
 
 testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -73,7 +73,7 @@ struct testColl gatherTest = {
 
 void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/hypercube.cu b/projects/rccl-tests/src/hypercube.cu
index 5c1456f8c7..b3459c91f4 100644
--- a/projects/rccl-tests/src/hypercube.cu
+++ b/projects/rccl-tests/src/hypercube.cu
@@ -9,8 +9,8 @@
 
 #define ALIGN 4
 
-void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base;
   *recvcount = base*nranks;
   *sendInplaceOffset = base;
@@ -78,7 +78,7 @@ struct testColl hyperCubeTest = {
 
 void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/reduce.cu b/projects/rccl-tests/src/reduce.cu
index f2fa80dd95..731abfa141 100644
--- a/projects/rccl-tests/src/reduce.cu
+++ b/projects/rccl-tests/src/reduce.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -54,7 +54,7 @@ struct testColl reduceTest = {
 
 void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/reduce_scatter.cu b/projects/rccl-tests/src/reduce_scatter.cu
index ed372e3b9a..35cfdd4929 100644
--- a/projects/rccl-tests/src/reduce_scatter.cu
+++ b/projects/rccl-tests/src/reduce_scatter.cu
@@ -7,10 +7,8 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-#define ALIGN 4
-
-void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base*nranks;
   *recvcount = base;
   *sendInplaceOffset = 0;
@@ -59,7 +57,7 @@ struct testColl reduceScatterTest = {
 
 void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/scatter.cu b/projects/rccl-tests/src/scatter.cu
index 49d20e1601..d1eec71282 100644
--- a/projects/rccl-tests/src/scatter.cu
+++ b/projects/rccl-tests/src/scatter.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = count/nranks;
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = (count/nranks) & -(16/eltSize);
+  *sendcount = (*recvcount)*nranks;
   *sendInplaceOffset = 0;
-  *recvInplaceOffset = count/nranks;
-  *paramcount = count/nranks;
+  *recvInplaceOffset = *recvcount;
+  *paramcount = *recvcount;
 }
 
 testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -69,7 +69,7 @@ struct testColl scatterTest = {
 
 void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/projects/rccl-tests/src/sendrecv.cu b/projects/rccl-tests/src/sendrecv.cu
index c9eb5bb427..67a4898b27 100644
--- a/projects/rccl-tests/src/sendrecv.cu
+++ b/projects/rccl-tests/src/sendrecv.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -68,7 +68,7 @@ struct testColl sendRecvTest = {
 
 void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

From 6f2e0f8a21237716e21238c29fe9855806ee2e33 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 23 Jan 2025 12:57:51 -0800
Subject: [PATCH 08/11] Update CUDA gencodes

Add support for Blackwell sm100 and sm120 from CUDA 12.8

Add support for Hopper sm90 from CUDA 12.0


[ROCm/rccl-tests commit: cb6a46fdd677783eec470e3df09aa138891bfebf]
---
 projects/rccl-tests/src/Makefile | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile
index 393de8e41b..5737092a86 100644
--- a/projects/rccl-tests/src/Makefile
+++ b/projects/rccl-tests/src/Makefile
@@ -16,15 +16,30 @@ CUDARTLIB ?= cudart
 
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
 NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
                 -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_80,code=sm_80 \
-                -gencode=arch=compute_80,code=compute_80
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
 else
 NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \

From e2a9cbb3620b594e23f373fccbcaa4db40e9d105 Mon Sep 17 00:00:00 2001
From: Junyu Ma <junyum@nvidia.com>
Date: Thu, 23 Jan 2025 11:09:09 -0800
Subject: [PATCH 09/11] Perftests: Introduce NCCL_TESTS_SPLIT env

`NCCL_TESTS_SPLIT` serves as a new way of computing the color for splitting communicators.

It is overridden by `NCCL_TESTS_SPLIT_MASK` when both are set.

Examples:

NCCL_TESTS_SPLIT_MASK="0x7" # color = rank & 0x7. What we do today to run on a DGX with one GPU per node.
NCCL_TESTS_SPLIT="AND 0x7"  # color = rank & 0x7. New way to run on one GPU per node on a DGX, equivalent to NCCL_TESTS_SPLIT_MASK=0x7
NCCL_TESTS_SPLIT="MOD 72"   # color = rank % 72.  One GPU per NVLink domain on an NVL72 system.
NCCL_TESTS_SPLIT="DIV 72"   # color = rank / 72.  Intra NVLink domain on NVL72.

You can also use: "%" "&" "|" "/" for short.
Extra spaces in the middle will be automatically ignored.
Not case sensitive.

The following are all equivalent:

NCCL_TESTS_SPLIT="&0x7"
NCCL_TESTS_SPLIT="&0b111"
NCCL_TESTS_SPLIT="AND 7"
NCCL_TESTS_SPLIT="and 0x7"


[ROCm/rccl-tests commit: a89cf07fe879e1c0187a4f617f873ae47d69af6b]
---
 projects/rccl-tests/src/common.cu | 51 +++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index 6d103d797d..9277ea2b15 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -10,6 +10,8 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include <string.h>
+#include <ctype.h>
 #include "cuda.h"
 
 #include "../verifiable/verifiable.h"
@@ -892,6 +894,26 @@ int main(int argc, char* argv[]) {
   return 0;
 }
 
+#ifdef MPI_SUPPORT
+// Parse an int from s: "0b" prefix = binary, otherwise strtoul base 0 (0x hex, leading-0 octal, decimal). Skips leading whitespace; returns false if no digits were consumed.
+static bool parseInt(char *s, int *num) {
+  char *p = NULL;
+  if (!s || !num)
+    return false;
+  while (*s && isspace((unsigned char)*s)) ++s;  // cast: isspace() on a negative char value is undefined
+  if (!*s) return false;
+
+  if (strncasecmp(s, "0b", 2) == 0)
+    *num = (int)strtoul(s += 2, &p, 2);  // advance s past "0b" so the no-digits check below also covers this path
+  else
+    *num = (int)strtoul(s, &p, 0);
+
+  if (p == s)  // strtoul stores its start pointer in p when no conversion was performed
+    return false;
+  return true;
+}
+#endif
+
 testResult_t run() {
   int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
   int localRank = 0;
@@ -909,10 +931,33 @@ testResult_t run() {
     if (hostHashs[p] == hostHashs[proc]) localRank++;
   }
 
-  char* str = getenv("NCCL_TESTS_SPLIT_MASK");
-  uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
+  char *splitMaskEnv = NULL;
+  if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK")) {
+    color = proc & strtoul(splitMaskEnv, NULL, 16);
+  } else if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT")) {
+    if (
+      (strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) ||
+      (strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color))
+    )
+        color = proc & color;
+    if (
+      (strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) ||
+      (strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color))
+    )
+        color = proc | color;
+    if (
+      (strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) ||
+      (strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color))
+    )
+        color = proc % color;
+    if (
+      (strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) ||
+      (strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color))
+    )
+        color = proc / color;
+  }
+
   MPI_Comm mpi_comm;
-  color = proc & mask;
   MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
   MPI_Comm_size(mpi_comm, &ncclProcs);
   MPI_Comm_rank(mpi_comm, &ncclProc);

From b740da9a319810bf97dd7dc36e3ba9ecec947404 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 6 Feb 2025 14:10:07 +0100
Subject: [PATCH 10/11] Add NCCL_TESTS_SPLIT documentation in the README

[ROCm/rccl-tests commit: 903918fc545fff518adf5411a8d5b3c99f5aceab]
---
 projects/rccl-tests/README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md
index 44e406a633..957f6afb90 100644
--- a/projects/rccl-tests/README.md
+++ b/projects/rccl-tests/README.md
@@ -71,6 +71,23 @@ All tests support the same set of arguments :
   * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
+### Running multiple operations in parallel
+
+NCCL tests can partition the set of GPUs into smaller sets, each executing the same operation in parallel.
+To split the GPUs, NCCL tests will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
+with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
+
+`NCCL_TESTS_SPLIT` takes the following syntax: `<operation><value>`. Operation can be `AND`, `OR`, `MOD` or `DIV`. The `&`, `|`, `%`, and `/` symbols are also supported. The value can be either decimal, hexadecimal (prefixed by `0x`) or binary (prefixed by `0b`).
+
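+`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`, except that the mask value is always parsed as hexadecimal. If both variables are set, `NCCL_TESTS_SPLIT_MASK` takes precedence.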
+
+Here are a few examples:
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
+ - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+ - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
+
 ## Copyright
 
 NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.

From d516392facd219c4295e38de7bd7e0bf736bb036 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 28 Feb 2025 13:23:26 -0800
Subject: [PATCH 11/11] Add PCI domain and device ID for GPU device BDF display

[ROCm/rccl-tests commit: b4300cc79d05dd35e26f13afcdb4938f29aa31a5]
---
 projects/rccl-tests/src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index 9277ea2b15..0d4dfc1944 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -984,8 +984,8 @@ testResult_t run() {
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, color, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%02x:%02x] %s\n",
+                    rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }