From b188a152997740a84b3ce0da864bb4f1423eb35a Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 8 Aug 2017 16:18:34 -0700
Subject: [PATCH 001/233] Initial commit

---
 LICENSE.txt           |   27 ++
 Makefile              |   20 +
 README.md             |   62 +++
 src/Makefile          |   78 ++++
 src/all_gather.cu     |  106 +++++
 src/all_reduce.cu     |  130 ++++++
 src/broadcast.cu      |  121 +++++
 src/common.cu         | 1036 +++++++++++++++++++++++++++++++++++++++++
 src/common.h          |  158 +++++++
 src/nccl1_compat.h    |   47 ++
 src/reduce.cu         |  159 +++++++
 src/reduce_scatter.cu |  139 ++++++
 12 files changed, 2083 insertions(+)
 create mode 100644 LICENSE.txt
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 src/Makefile
 create mode 100644 src/all_gather.cu
 create mode 100644 src/all_reduce.cu
 create mode 100644 src/broadcast.cu
 create mode 100644 src/common.cu
 create mode 100644 src/common.h
 create mode 100644 src/nccl1_compat.h
 create mode 100644 src/reduce.cu
 create mode 100644 src/reduce_scatter.cu

diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000..4573c07c44
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,27 @@
+
+ Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000..29409a8422
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+#
+# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# default, all and clean name commands rather than files, so all three are declared phony.
+.PHONY : all clean default
+default : src.build
+
+TARGETS=src
+
+all:   ${TARGETS:%=%.build}
+clean: ${TARGETS:%=%.clean}
+
+# <dir>.build and <dir>.clean run the matching goal inside sub-directory <dir>.
+%.build:
+	${MAKE} -C $* build
+%.clean:
+	${MAKE} -C $* clean
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..d70bb1f54c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# NCCL Tests
+
+These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL 1](http://github.com/nvidia/nccl) and [NCCL 2](http://developer.nvidia.com/nccl).
+
+## Build
+
+To build the tests, just type `make`.
+
+If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
+
+```shell
+$ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+```
+
+NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
+
+```shell
+$ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+```
+
+## Usage
+
+NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of gpus per thread).
+
+### Quick examples
+
+Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+```shell
+$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
+```
+
+Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, disabling checks :
+```shell
+$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0
+```
+
+All tests support the same arguments :
+* Number of GPUs
+  * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
+  * `-g,--ngpus <gpus per thread>` number of gpus per thread. Default : 1.
+* Sizes to scan
+  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
+  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
+  * Increments can be either fixed or a multiplication factor. Only one of those should be used.
+  * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
+  * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
+* Performance 
+  * `-n,--iters <iteration count>` number of iterations. Default : 20.
+  * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
+* `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0.
+* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel.
+* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
+* NCCL operations arguments
+  * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations. Default : Sum.
+  * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
+  * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce.
+  * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
+
+## Copyright
+
+NCCL tests are provided under the BSD licence. All source code and accompanying documentation is copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000000..6188d01424
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,78 @@
+#
+# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC = $(CUDA_HOME)/bin/nvcc
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
+		-gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_52,code=sm_52 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_61,code=compute_61
+
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+
+LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3
+CXXFLAGS  += -O3
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+.PHONY: build clean
+BUILDDIR ?= ../build
+# NCCL install prefix: NCCL_HOME as documented in README.md, falling back to the older NCCLDIR. Tested against empty text, since make treats "" literally.
+NCCL_HOME ?= $(NCCLDIR)
+ifneq ($(strip $(NCCL_HOME)),)
+NVCUFLAGS += -I$(NCCL_HOME)/include/
+NVLDFLAGS   += -L$(NCCL_HOME)/lib
+endif
+ifeq ($(MPI), 1)
+NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
+NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi
+endif
+LIBRARIES += curand nccl nvToolsExt
+NVLDFLAGS   += $(LIBRARIES:%=-l%)
+
+DST_DIR := $(BUILDDIR)
+SRC_FILES := $(wildcard *.cu)
+OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+
+build: ${BIN_FILES}
+
+clean:
+	rm -rf ${DST_DIR}
+
+${DST_DIR}/%.o: %.cu
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+
diff --git a/src/all_gather.cu b/src/all_gather.cu
new file mode 100644
index 0000000000..2386842cdd
--- /dev/null
+++ b/src/all_gather.cu
@@ -0,0 +1,106 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", 
+      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12zu  %12zu  %6s", size, count, typeName);  // row prefix: bytes, element count, type; opName and root are not printed. %zu matches size_t.
+}
+
+void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
+    *sendcount = count/nranks;
+    *recvcount = (count/nranks)*nranks;
+    *sameExpected = 1;
+    *procSharedCount = 0;
+    *sendInplaceOffset = count/nranks;
+    *recvInplaceOffset = 0;
+    *paramcount = *sendcount;
+}
+
+// Copy each local GPU's all-gather input into its rank's nBytes-sized slot of args->expectedHost[0], and zero the receive buffers for out-of-place runs.
+void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
+  size_t nBytes = args->nbytes;
+  int proc = args->proc;
+  int nThreads = args->nThreads;
+  int t = args->thread;
+  int nGpus = args->nGpus;
+
+  // Threads take turns in thread-index order: spin until the turn counter reaches this thread.
+  while (args->sync[args->sync_idx] != t) pthread_yield();
+
+  for (int i=0; i<nGpus; i++) {
+    int device;
+    int rank = (proc*nThreads + t)*nGpus + i;  // rank index of GPU i of this thread, used for both offsets below
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    // In-place runs read the input from this rank's offset inside the receive buffer.
+    void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i];
+    CUDACHECK(cudaMemcpy((void *)((uintptr_t)args->expectedHost[0] + rank*nBytes),
+                data,
+                nBytes, cudaMemcpyDeviceToHost));
+
+    if (in_place == 0) {
+      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  args->sync[args->sync_idx] = t + 1;  // pass the turn to the next thread
+
+  if (t+1 == nThreads) {
+#ifdef MPI_SUPPORT
+    // Last thread does the MPI allgather
+    MPI_Allgather(MPI_IN_PLACE, nBytes*nThreads*nGpus, MPI_BYTE,
+        args->expectedHost[0],
+        nBytes*nThreads*nGpus, MPI_BYTE, MPI_COMM_WORLD);
+#endif
+    args->sync[args->sync_idx] = 0;  // resets the counter that the other threads poll below
+  } else {
+    while (args->sync[args->sync_idx]) pthread_yield();
+  }
+
+  args->sync_idx=!args->sync_idx;
+}
+
+// All-gather bandwidth in GB/s: count*typesize*(nranks-1) bytes over sec seconds; busBw equals algBw (factor 1).
+void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double factor = 1;
+  const double gbPerSec = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+  *algBw = gbPerSec;
+  *busBw = gbPerSec * factor;
+}
+
+void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
+}
+
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) { 
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else { 
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) { 
+     TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, 0, 1);
+  }   
+}
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
new file mode 100644
index 0000000000..f41aff5708
--- /dev/null
+++ b/src/all_reduce.cu
@@ -0,0 +1,130 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op",
+      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12zu  %12zu  %6s  %6s", size, count, typeName, opName);  // row prefix: bytes, element count, type, op; root is not printed. %zu matches size_t.
+}
+
+void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
+    *sendcount = count;
+    *recvcount = count;
+    *sameExpected = 1;
+    *procSharedCount = 0;
+    *sendInplaceOffset = 0;
+    *recvInplaceOffset = 0;
+    *paramcount = *sendcount;
+ }
+
+// Build the expected all-reduce result in args->expected[0]: copy or Accumulate each local GPU's input, then (with MPI) Accumulate every other process's data.
+void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
+  size_t count = args->nbytes / wordSize(type);
+
+  while (args->sync[args->sync_idx] != args->thread) pthread_yield();  // wait for this thread's turn
+
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    if (is_first && i == 0) {  // first contribution seeds the accumulator; the others are reduced into it
+      CUDACHECK(cudaMemcpy(args->expected[0], data, count*wordSize(type), cudaMemcpyDeviceToHost));
+    } else {
+      Accumulate(args->expected[0], data, count, type, op);
+    }
+
+    if (in_place == 0) {
+      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  args->sync[args->sync_idx] = args->thread + 1;  // pass the turn to the next thread
+
+  if (args->thread+1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    // Last thread does the MPI reduction
+    if (args->nbytes > 0) {
+      void* remote, *remoteHost = malloc(args->nbytes);
+      void* myInitialData = malloc(args->nbytes);
+      memcpy(myInitialData, args->expectedHost[0], args->nbytes);
+      CUDACHECK(cudaHostRegister(remoteHost, args->nbytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
+      CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));
+      for (int i=0; i<args->nProcs; i++) {
+        if (i == args->proc) {
+          MPI_Bcast(myInitialData, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
+          free(myInitialData);
+        } else {
+          MPI_Bcast(remoteHost, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
+          Accumulate(args->expected[0], remote, count, type, op);
+          CUDACHECK(cudaDeviceSynchronize());  // checked like every other CUDA call here, so a failed Accumulate is not ignored
+        }
+      }
+      CUDACHECK(cudaHostUnregister(remoteHost));
+      free(remoteHost);
+    }
+#endif
+    args->sync[args->sync_idx] = 0;  // resets the counter that the other threads poll below
+  } else {
+    while (args->sync[args->sync_idx]) pthread_yield();
+  }
+
+  args->sync_idx = !args->sync_idx;
+}
+
+// All-reduce bandwidth in GB/s: algBw is count*typesize bytes over sec seconds; busBw scales it by 2*(nranks-1)/nranks.
+void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double factor = (double)(2*(nranks - 1)) / (double)nranks;
+  const double gbPerSec = (double)(count * typesize) / 1.0E9 / sec;
+  *algBw = gbPerSec;
+  *busBw = gbPerSec * factor;
+}
+
+void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
+}
+
+
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) { 
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else { 
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) { 
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else { 
+    op_count = ncclNumOps;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) { 
+      for (int j=0; j<op_count; j++) { 
+          TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], 0, 1);
+      }
+  }   
+}
diff --git a/src/broadcast.cu b/src/broadcast.cu
new file mode 100644
index 0000000000..fe3d26deeb
--- /dev/null
+++ b/src/broadcast.cu
@@ -0,0 +1,121 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include <assert.h>
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s        out-of-place\n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "root", 
+      "time", "algbw", "busbw", "res");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12zu  %12zu  %6s  %6i", size, count, typeName, root);  // row prefix: bytes, element count, type, root; opName is not printed. %zu matches size_t.
+}
+
+void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
+    *sendcount = count;
+    *recvcount = count;
+    *sameExpected = 0;
+    *procSharedCount = count;
+    *sendInplaceOffset = 0;
+    *recvInplaceOffset = 0;
+    *paramcount = *sendcount;
+}
+
+void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
+  int root_proc = root/(args->nThreads*args->nGpus);
+  int root_thread = (root/args->nGpus)%(args->nThreads);
+  int root_gpu = root%args->nGpus;
+
+  assert(args->expectedBytes == args->nbytes);
+
+  if (root_thread == args->thread) {
+      if (root_proc == args->proc) {  
+         CUDACHECK(cudaMemcpy(args->procSharedHost,
+                    args->sendbuffs[root_gpu],
+                    args->nbytes, cudaMemcpyDeviceToHost));
+      }
+#ifdef MPI_SUPPORT 
+      MPI_Bcast(args->procSharedHost, args->nbytes, MPI_BYTE, root_proc, MPI_COMM_WORLD);
+#endif
+
+      args->sync[0] = 0;
+  }
+
+  Barrier(args);
+
+  for (int i=0; i<args->nGpus; i++) {
+     int device;
+     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); 
+     CUDACHECK(cudaSetDevice(device));
+
+     //set expected buf to zero at root, copy over source data at others
+     if ((root_proc == args->proc) 
+         && (root_thread == args->thread) 
+         && (root_gpu == i)) { 
+         memset(args->expectedHost[i], 0, args->nbytes); 
+     } else { 
+         memcpy(args->expectedHost[i], args->procSharedHost, args->nbytes);
+     }
+
+     //reset recvbufs to zero
+     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
+     CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  Barrier(args);
+}
+
+// Broadcast bandwidth in GB/s: count*typesize bytes over sec seconds; busBw equals algBw (factor 1). nranks is unused.
+void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double factor = 1;
+  const double gbPerSec = (double)(count * typesize) / 1.0E9 / sec;
+  *algBw = gbPerSec;
+  *busBw = gbPerSec * factor;
+}
+
+void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int rank; 
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  if (rank == root) { 
+      NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
+  } else { 
+      NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
+  } 
+}
+
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+  int begin_root, end_root; 
+
+  if ((int)type != -1) { 
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else { 
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if (root != -1) { 
+     begin_root = end_root = root;
+  } else { 
+     begin_root = 0;
+     end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  }
+
+  for (int i=0; i<type_count; i++) { 
+       for (int j=begin_root; j<=end_root; j++) {
+          TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, j, 0);
+       }
+  }   
+}
diff --git a/src/common.cu b/src/common.cu
new file mode 100644
index 0000000000..a14c3aac01
--- /dev/null
+++ b/src/common.cu
@@ -0,0 +1,1036 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include <pthread.h>
+#include <cstdio>
+#include <getopt.h>
+#include "cuda.h"
+
+#if NCCL_MAJOR >= 2
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+#else
+ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
+const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
+#endif
+ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
+const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
+
+thread_local int is_main_thread = 0;
+
+static int datacheck = 1;
+static int warmup_iters = 5;
+static int iters = 20;
+static int ncclop = ncclSum;
+static int nccltype = ncclFloat;
+static int ncclroot = 0;
+static int swap_args = 0;
+static int parallel_init = 0;
+static int blocking_coll = 0;
+
+// Convert a size argument such as "8", "64K", "32M" or "1.5G" into a number of bytes.
+// A G, M or K anywhere in the string (either case, checked in that order) selects a power-of-1024 multiplier.
+double parsesize(char *value) {
+    long long int units;
+
+    if (strpbrk(value, "Gg") != NULL) {
+        units=1024*1024*1024;
+    } else if (strpbrk(value, "Mm") != NULL) {
+        units=1024*1024;
+    } else if (strpbrk(value, "Kk") != NULL) {
+        units=1024;
+    } else {
+        units=1;  // no recognized suffix: the number is already in bytes
+    }
+
+    return atof(value)*units;  // atof parses the leading number and stops at the suffix
+}
+
+double DeltaMaxValue(ncclDataType_t type) {
+  switch(type) {
+    case ncclHalf: return 1e-2;
+    case ncclFloat: return 1e-5;
+    case ncclDouble: return 1e-12;
+    case ncclInt:
+#if NCCL_MAJOR >= 2
+    case ncclUint8:
+    //case ncclInt32:
+    case ncclUint32:
+#endif
+    case ncclInt64:
+    case ncclUint64: return 1e-200;
+  }
+  return 1e-200;
+}
+
+template<typename T> __device__
+double absDiff(T a, T b) {
+  return fabs((double)(b - a));
+}
+
+template<> __device__
+double absDiff<half>(half a, half b) {
+  float x = __half2float(a);
+  float y = __half2float(b);
+  return fabs((double)(y-x));
+}
+
+template<typename T> __device__
+float toFloat(T a) {
+  return (float)a;
+}
+template<> __device__ 
+float toFloat(half a) {
+  return __half2float(a);
+}
+
+
+// Writes max_i |A[i]-B[i]| over count elements to *max; expects one block of BSIZE threads (as CheckDelta launches it).
+template<typename T, int BSIZE> __global__
+void deltaKern(void* A_, void* B_, size_t count, double* max) {
+  const T* A = (const T*)A_;
+  const T* B = (const T*)B_;
+  __shared__ double temp[BSIZE];
+  int tid = threadIdx.x;
+  double locmax = 0.0;
+  for(size_t i=tid; i<count; i+=blockDim.x) {  // size_t index: an int would overflow once count exceeds INT_MAX
+    double delta = absDiff(A[i], B[i]);
+    if( delta > locmax ) {
+      locmax = delta;
+#ifdef DEBUG_PRINT
+      if (delta > .1) printf("Error at %llu/%llu : %f != %f\n", (unsigned long long)i, (unsigned long long)count, toFloat(A[i]), toFloat(B[i]));
+#endif
+    }
+  }
+  temp[tid] = locmax;
+  // Pairwise max-reduction in shared memory down to temp[0] and temp[1]; thread 0 folds the last pair.
+  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
+    __syncthreads();
+    if( tid < stride )
+      temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
+  }
+  __syncthreads();
+  if( threadIdx.x == 0)
+    *max = temp[0] > temp[1] ? temp[0] : temp[1];
+}
+
+
+void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+  switch (type) {
+    case ncclHalf:
+      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+    case ncclFloat:
+      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+    case ncclDouble:
+      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+
+    case ncclChar:
+#if NCCL_MAJOR >= 2
+    case ncclUint8:
+#endif
+      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+    case ncclInt:
+#if NCCL_MAJOR >= 2
+    case ncclUint32:
+#endif
+      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+    case ncclInt64:
+    case ncclUint64:
+      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+  }
+}
+
+#define CURAND_CHK(cmd)                                                         \
+    do {                                                                        \
+      curandStatus_t error = (cmd);                                             \
+      if (error != CURAND_STATUS_SUCCESS) {                                     \
+        printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__);       \
+        exit(EXIT_FAILURE);                                                     \
+      }                                                                         \
+    } while (false)
+
+
+template<typename T>
+void GenerateRandom(curandGenerator_t generator, T * const dest,
+    const size_t N);
+
+template<>
+void GenerateRandom<int8_t>(curandGenerator_t generator, int8_t * const dest,
+    const size_t N) {
+  size_t align = (4 - (((size_t)dest) & 3)) % 4;  // bytes to skip so generation starts on a 4-byte boundary
+  size_t words = (N > align) ? (N - align) / sizeof(int) : 0;  // whole 32-bit words that fit in [dest+align, dest+N); N/4 could overrun
+  CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), words));
+  if (words > 0) CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice));  // fill the skipped head from generated bytes
+}
+template<>
+void GenerateRandom<uint8_t>(curandGenerator_t generator, uint8_t * const dest,
+    const size_t N) {
+  size_t align = (4 - (((size_t)dest) & 3)) % 4;  // bytes to skip so generation starts on a 4-byte boundary
+  size_t words = (N > align) ? (N - align) / sizeof(int) : 0;  // whole 32-bit words that fit in [dest+align, dest+N); N/4 could overrun
+  CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), words));
+  if (words > 0) CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice));  // fill the skipped head from generated bytes
+}
+
+template<>
+void GenerateRandom<int32_t>(curandGenerator_t generator, int32_t * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
+}
+
+template<>
+void GenerateRandom<uint32_t>(curandGenerator_t generator, uint32_t * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
+}
+
+template<>
+void GenerateRandom<float>(curandGenerator_t generator, float * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerateUniform(generator, dest, N));
+}
+
+template<>
+void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
+}
+
+template<>
+void GenerateRandom<uint64_t>(curandGenerator_t generator, uint64_t * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2));
+}
+
+template<>
+void GenerateRandom<int64_t>(curandGenerator_t generator, int64_t * const dest,
+    const size_t N) {
+  CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2));
+}
+
+template<typename T>
+void RandomizeType(void* dest, const size_t N, const int randomSeed) {
+  T* ptr = (T*)dest;
+  curandGenerator_t gen;
+  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
+  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
+  GenerateRandom<T>(gen, ptr, N);
+  CURAND_CHK(curandDestroyGenerator(gen));
+  CUDACHECK(cudaDeviceSynchronize());
+}
+
+__global__ void halve(const float * src, half* dest, size_t N) {  // dest[i] = half(src[i]) for i in [0, N)
+  for(size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;  // size_t index: N may exceed INT_MAX
+      tid < N; tid += (size_t)blockDim.x * gridDim.x)
+    dest[tid] = __float2half(src[tid]);
+}
+
+void RandomizeHalf(void* dest, const size_t N, const int randomSeed) {
+  half* ptr = (half*)dest;
+  curandGenerator_t gen;
+  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
+  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
+
+  float* temp;
+  CUDACHECK(cudaMalloc(&temp, N*sizeof(float)));
+  GenerateRandom<float>(gen, temp, N);
+  halve<<<128, 512>>>(temp, ptr, N);
+  CURAND_CHK(curandDestroyGenerator(gen));
+  CUDACHECK(cudaFree(temp));
+  CUDACHECK(cudaDeviceSynchronize());
+}
+
+void Randomize(void* ptr, const size_t count, ncclDataType_t type, const int seed) {
+  switch (type) {
+    case ncclChar:   RandomizeType<int8_t>  (ptr, count, seed); break;
+#if NCCL_MAJOR >= 2
+    case ncclUint8:  RandomizeType<uint8_t> (ptr, count, seed); break;
+#endif
+    case ncclInt:    RandomizeType<int32_t> (ptr, count, seed); break;
+#if NCCL_MAJOR >= 2
+    case ncclUint32: RandomizeType<uint32_t>(ptr, count, seed); break;
+#endif
+    case ncclInt64:  RandomizeType<int64_t> (ptr, count, seed); break;
+    case ncclUint64: RandomizeType<uint64_t>(ptr, count, seed); break;
+    case ncclHalf:   RandomizeHalf          (ptr, count, seed); break;
+    case ncclFloat:  RandomizeType<float>   (ptr, count, seed); break;
+    case ncclDouble: RandomizeType<double>  (ptr, count, seed); break;
+  }
+}
+
+template<typename T, int OP> __global__ static
+void accumKern(T* acum, const T* contrib, size_t N) {  // acum[i] = acum[i] OP contrib[i] for i in [0, N)
+  size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;  // size_t index: N may exceed INT_MAX
+  size_t offset = (size_t)blockDim.x*gridDim.x;
+  for(size_t i=tid; i<N; i+=offset) {
+    T c = contrib[i];
+    T a = acum[i];
+    if(OP == ncclSum) {
+      acum[i] = a+c;
+    } else if(OP == ncclProd) {
+      acum[i] = a*c;
+    } else if(OP == ncclMax) {
+      acum[i] = (a > c) ? a : c;
+    } else if(OP == ncclMin) {
+      acum[i] = (a < c) ? a : c;
+    }
+  }
+}
+
+template<> __global__
+void accumKern<half, ncclSum>(half* acum, const half* contrib, size_t N) {  // half sum, computed in float
+  size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;  // size_t index: N may exceed INT_MAX
+  size_t offset = (size_t)blockDim.x*gridDim.x;
+  for(size_t i=tid; i<N; i+=offset) {
+    float c = __half2float(contrib[i]);
+    float a = __half2float(acum[i]);
+    acum[i] = __float2half( a + c );
+  }
+}
+
+template<> __global__
+void accumKern<half, ncclProd>(half* acum, const half* contrib, size_t N) {  // half product, computed in float
+  size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;  // size_t index: N may exceed INT_MAX
+  size_t offset = (size_t)blockDim.x*gridDim.x;
+  for(size_t i=tid; i<N; i+=offset) {
+    float c = __half2float(contrib[i]);
+    float a = __half2float(acum[i]);
+    acum[i] = __float2half( a * c );
+  }
+}
+
+template<> __global__
+void accumKern<half, ncclMax>(half* acum, const half* contrib, size_t N) {  // half max, compared in float
+  size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;  // size_t index: N may exceed INT_MAX
+  size_t offset = (size_t)blockDim.x*gridDim.x;
+  for(size_t i=tid; i<N; i+=offset) {
+    float c = __half2float(contrib[i]);
+    float a = __half2float(acum[i]);
+    acum[i] = __float2half( (a>c) ? a : c );
+  }
+}
+
+// half/min specialization: operands are compared as float and the smaller one is stored as half.
+// Indices are size_t so that N > INT_MAX does not overflow the loop counter.
+template<> __global__
+void accumKern<half, ncclMin>(half* acum, const half* contrib, size_t N) {
+  const size_t tid = threadIdx.x + (size_t)blockIdx.x*blockDim.x;
+  const size_t stride = (size_t)blockDim.x*gridDim.x;
+  for(size_t i=tid; i<N; i+=stride) {
+    float c = __half2float(contrib[i]);
+    float a = __half2float(acum[i]);
+    acum[i] = __float2half( (a<c) ? a : c );
+  }
+}
+
+// Launches accumKern<T, op> with a fixed <<<256,256>>> configuration to fold `in` into `out`
+// (both interpreted as arrays of n elements of T).
+// Exits the process on an unknown reduction op or when the kernel launch itself fails.
+template<typename T>
+void accVecType(void* out, void* in, size_t n, ncclRedOp_t op) {
+  T* dst = (T*)out;
+  const T* src = (const T*)in;
+  switch(op) {
+    case ncclSum:  accumKern<T, ncclSum> <<<256,256>>>(dst, src, n); break;
+    case ncclProd: accumKern<T, ncclProd><<<256,256>>>(dst, src, n); break;
+    case ncclMax:  accumKern<T, ncclMax> <<<256,256>>>(dst, src, n); break;
+    case ncclMin:  accumKern<T, ncclMin> <<<256,256>>>(dst, src, n); break;
+    default:
+      printf("Unknown reduction operation.\n");
+      exit(EXIT_FAILURE);
+  }
+  // A kernel launch returns no status; query it explicitly so that a bad launch
+  // is reported here rather than surfacing later as a bogus data mismatch.
+  CUDACHECK(cudaGetLastError());
+}
+
+// Folds `in` into `out` element-wise using reduction `op`, dispatching on the NCCL data type
+// to the matching accVecType<T> instantiation. `n` is the element count (not bytes).
+// The unsigned 8/32-bit cases are only compiled when NCCL_MAJOR >= 2.
+// Prints a message and exits the process for an unrecognized type.
+void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op) {
+  switch (type) {
+    case ncclChar:   accVecType<int8_t>   (out, in, n, op); break;
+#if NCCL_MAJOR >= 2
+    case ncclUint8:  accVecType<uint8_t>  (out, in, n, op); break;
+#endif
+    case ncclInt:  accVecType<int32_t>  (out, in, n, op); break;
+#if NCCL_MAJOR >= 2
+    case ncclUint32: accVecType<uint32_t> (out, in, n, op); break;
+#endif
+    case ncclInt64:  accVecType<int64_t>  (out, in, n, op); break;
+    case ncclUint64: accVecType<uint64_t> (out, in, n, op); break;
+    case ncclHalf:   accVecType<half>     (out, in, n, op); break;
+    case ncclFloat:  accVecType<float>    (out, in, n, op); break;
+    case ncclDouble: accVecType<double>   (out, in, n, op); break;
+    default:
+      printf("Unknown reduction type.\n");
+      exit(EXIT_FAILURE);
+  }
+}
+
+// Spin barrier across the nThreads threads of this process (and, with MPI_SUPPORT,
+// across processes via the last thread).
+// Threads check in one after another in thread-index order through the shared counter
+// barrier[barrier_idx]; the last thread resets it to 0, which releases the others.
+void Barrier(struct threadArgs_t* args)
+{
+  // Wait for our turn: the counter equals our thread index once all lower threads arrived.
+  while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
+
+  // Check in, handing the turn to the next thread.
+  args->barrier[args->barrier_idx] = args->thread + 1;
+
+  if (args->thread+1 == args->nThreads) {
+    // Last thread: every local thread has arrived.
+#ifdef MPI_SUPPORT
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    // Reset the counter; this is the release signal for the waiting threads.
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    // Wait until the last thread resets the counter.
+    while (args->barrier[args->barrier_idx]) pthread_yield();
+  }
+
+  // Alternate between the two counters so the next barrier uses the other slot.
+  args->barrier_idx=!args->barrier_idx;
+}
+
+// Regenerates the pseudo-random contribution for `seed` into `data`, then merges it into
+// `accum`: rank 0 seeds the accumulator with a plain copy, every other rank is reduced
+// into it with `op`. `count` is in elements of `type`.
+void RandomizeAccumulate(void* data, void* accum, size_t count, ncclDataType_t type, ncclRedOp_t op, int seed, int rank) {
+  Randomize(data, count, type, seed);
+  if (rank != 0) {
+    Accumulate(accum, data, count, type, op);
+    return;
+  }
+  // First contribution: nothing to reduce with yet, so just copy it over.
+  const size_t nbytes = count*wordSize(type);
+  CUDACHECK(cudaMemcpy(accum, data, nbytes, cudaMemcpyDeviceToHost));
+}
+
+// Compares the received data of every GPU owned by this thread against args->expected[i]
+// and returns the largest delta reported by CheckDelta.
+// `in_place` selects whether the result lives at recvbuffs[i] + recvInplaceOffset*rank
+// or at recvbuffs[i]. `op` and `root` are not used here.
+// Increments args->errors[0] when the delta exceeds DeltaMaxValue(type)*(nranks-1).
+double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+  size_t count = args->expectedBytes/wordSize(type);
+  double maxDelta = 0.0;
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
+    CheckDelta(data , args->expected[i], count, type, args->delta);
+    // The result is read from host memory right below: make sure the device work finished
+    // and did not fail, otherwise *deltaHost would be stale or meaningless.
+    CUDACHECK(cudaDeviceSynchronize());
+    maxDelta = std::max(*(args->deltaHost), maxDelta);
+
+#ifdef DEBUG_PRINT
+    if (rank == 0) {
+       const size_t nInts = args->expectedBytes/sizeof(int);
+       int *temp = (int *)malloc(args->expectedBytes);
+
+       printf("\n Expected: ");
+       for (size_t j=0; j<nInts; j++) {
+         printf("%zu:%d ", j, *((int *)args->expectedHost[0] + j));
+       }
+       printf("\n");
+
+       CUDACHECK(cudaMemcpy(temp, data, args->expectedBytes, cudaMemcpyDeviceToHost));
+       printf("\n Actual: ");
+       for (size_t j=0; j<nInts; j++) {
+         printf("%zu:%d ", j, *((int *)temp + j));
+       }
+       printf("\n");
+       free(temp);
+    }
+#endif
+  }
+  // Tolerance scales with the number of contributions that were combined.
+  double nranks = args->nProcs*args->nThreads*args->nGpus;
+  if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  return maxDelta;
+}
+
+// Fills the send buffer of every GPU owned by this thread with pseudo-random data.
+// The seed is rank + element count + a call counter + in_place, so each rank and each
+// call gets different data. With in_place set, the data is written into recvbuffs[i]
+// at sendInplaceOffset*rank instead of sendbuffs[i]. `op`, `root` and `is_first` are unused here.
+void InitSend(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
+  size_t count = args->sendBytes / wordSize(type);
+  // Call counter folded into the seed. Being a function-local static it is shared by every
+  // thread calling InitSend and is incremented without synchronization.
+  // NOTE(review): whoever computes the expected result must use the same seed - confirm
+  // this stays consistent when nThreads > 1.
+  static int rep = 1;
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i];
+    int seed = rank+count+rep+in_place;
+    Randomize(data, count, type, seed);
+
+#ifdef DEBUG_PRINT
+    // Debug aid: dump rank 2's send buffer, interpreted as ints.
+    if (rank == 2) { 
+       int *temp = (int *)malloc(args->sendBytes);
+       cudaMemcpy(temp, data, args->sendBytes, cudaMemcpyDeviceToHost);
+       printf("\n Send Data at rank %d:", rank);
+       for (int i=0; i<args->sendBytes/sizeof(int); i++) { 
+       	printf("%d:%d ", i, *((int *)temp + i));
+       }
+       printf("\n");
+       free(temp);
+    }
+#endif
+
+    cudaDeviceSynchronize();
+  }
+  rep++;
+}
+
+#define CHECK 1
+
+// Enqueues one collective (RunColl) on every GPU owned by the selected thread arguments.
+// - With swap_args set, the calling thread uses the threadArgs_t of thread
+//   (thread + thread_offset) % nThreads from the per-process array instead of its own.
+// - With in_place set, send and receive pointers both point into recvbuffs[i], at
+//   sendInplaceOffset*rank and recvInplaceOffset*rank respectively.
+// - With swap_args or blocking_coll set, the streams are polled until the operation is
+//   complete before returning; blocking_coll additionally runs a Barrier.
+void startColl(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int thread_offset) {
+  // Element count is taken from the caller's own args, before any swap.
+  size_t count = args->nbytes / wordSize(type);
+
+  if (swap_args) {
+      args = (struct threadArgs_t*)args->proc_args + (args->thread + thread_offset)%args->nThreads;
+  }
+
+  if (args->nGpus == 1) {
+    // Single GPU per thread: no group call needed.
+    int rank = args->proc*args->nThreads + args->thread;
+    RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[0] + args->sendInplaceOffset*rank)) : args->sendbuffs[0]),
+        (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[0] + args->recvInplaceOffset*rank) : args->recvbuffs[0]),
+        count, type, op, root, args->comms[0], args->streams[0]);
+  } else {
+    // Several GPUs driven by one thread: wrap the per-GPU calls in a group.
+    NCCLCHECK(ncclGroupStart());
+    for (int i = 0; i < args->nGpus; i++) {
+#ifndef NCCL_MAJOR
+      // Only compiled when NCCL_MAJOR is undefined (NCCL 1.x): select the device explicitly.
+      int cudaDev;
+      NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
+      CUDACHECK(cudaSetDevice(cudaDev));
+#endif
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+      RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank)) : args->sendbuffs[i]),
+          (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank) : args->recvbuffs[i]),
+          count, type, op, root, args->comms[i], args->streams[i]);
+    }
+    NCCLCHECK(ncclGroupEnd());
+  }
+
+  if (swap_args || blocking_coll) {
+    //if args have been swapped, complete op before returning
+    for (int i = 0; i < args->nGpus; ++i) {
+      cudaError_t err = cudaErrorNotReady;
+      while (err == cudaErrorNotReady) { 
+          err = cudaStreamQuery(args->streams[i]);
+          pthread_yield();	
+      }
+      CUDACHECK(err);
+    }
+  }
+  // NOTE(review): when swap_args is also set, `args` here is the swapped-in struct, so the
+  // barrier runs with another thread's index and barrier_idx - confirm this is intended.
+  if (blocking_coll) Barrier(args);
+}
+
+// Waits for all work queued on this thread's streams by polling each stream and yielding
+// between polls. Any status other than "not ready" / success aborts through CUDACHECK.
+// Nothing to do when swap_args or blocking_coll is set: startColl already waited.
+void completeColl(struct threadArgs_t* args) {
+  const bool alreadyCompleted = swap_args || blocking_coll;
+  if (alreadyCompleted) return;
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    cudaError_t status;
+    do {
+      status = cudaStreamQuery(args->streams[gpu]);
+      pthread_yield();
+    } while (status == cudaErrorNotReady);
+    CUDACHECK(status);
+  }
+}
+
+// Benchmarks one collective at the size currently configured in `args`:
+//  1. one untimed run to synchronize,
+//  2. `iters` timed back-to-back runs, averaged into a per-operation time,
+//  3. when datacheck is set, one extra run on freshly initialized data that is validated,
+//  4. prints time, algorithm/bus bandwidth and the maximum delta (main thread only).
+// Adds the bus bandwidth to args->bw[0] and increments args->bw_count[0].
+void BenchTime(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+  size_t count = args->nbytes / wordSize(type);
+
+  // Sync
+  startColl(args, type, op, root, in_place, 0);
+  completeColl(args);
+
+  Barrier(args);
+
+  // Performance Benchmark
+  auto start = std::chrono::high_resolution_clock::now();
+  for (int iter = 0; iter < iters; iter++) {
+      startColl(args, type, op, root, in_place, iter);
+  }
+  completeColl(args);
+
+  auto delta = std::chrono::high_resolution_clock::now() - start;
+  double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  deltaSec = deltaSec/iters;
+
+  double algBw, busBw;
+  GetBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
+
+  Barrier(args);
+
+  if (datacheck) {
+      InitSend(args, type, op, root, in_place, args->thread == 0 ? 1 : 0);
+      InitRecvResult(args, type, op, root, in_place, args->thread == 0 ? 1 : 0);
+      CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  //test validation in single iteration, should ideally be included into the multi-iteration run
+  startColl(args, type, op, root, in_place, 0);
+  completeColl(args);
+
+  double maxDelta = 0;
+#ifdef CHECK
+  if (datacheck) {
+     maxDelta = CheckData(args, type, op, root, in_place);
+  } else {
+     maxDelta = -1.0;
+  }
+#else
+     maxDelta = -1.0;
+#endif
+
+  // Publish this thread's overall result in its deltaThreads slot. Without this the slot
+  // only holds whatever the last CheckDelta wrote (last GPU only), or is uninitialized
+  // when no check ran.
+  *(args->deltaHost) = maxDelta;
+
+  //aggregate delta from all threads and procs
+  Barrier(args);
+  if (args->thread == 0) {
+      // The value is reported as a maximum (and max-reduced across processes below),
+      // so combine the per-thread values with max rather than adding them up.
+      for (int i=1; i<args->nThreads; i++) {
+          maxDelta = std::max(maxDelta, args->deltaThreads[i]);
+      }
+#ifdef MPI_SUPPORT
+      MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#endif
+  }
+  Barrier(args);
+
+  if (datacheck) {
+     PRINT("  %7.3f  %5.2f  %5.2f  %7.0le", deltaSec * 1.0E3, algBw, busBw,
+         maxDelta);
+  } else {
+     PRINT("  %7.3f  %5.2f  %5.2f  \tN/A", deltaSec * 1.0E3, algBw, busBw);
+  }
+
+  args->bw[0] += busBw;
+  args->bw_count[0]++;
+}
+
+// Translates a requested byte size into the per-collective byte counts and in-place
+// offsets stored in `args`. The collective-specific element counts come from
+// getCollByteCount and are scaled by the element size of `type`.
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) {
+  const size_t elemSize = wordSize(type);
+  const int totalRanks = args->nProcs*args->nGpus*args->nThreads;
+
+  size_t sendElems, recvElems, paramElems;
+  size_t sendOffsetElems, recvOffsetElems;
+  size_t procSharedElems;   // reported by getCollByteCount but not needed here
+  int expectedIsShared;     // likewise unused here
+
+  const size_t elems = size / elemSize;
+  getCollByteCount(&sendElems, &recvElems, &paramElems, &sendOffsetElems, &recvOffsetElems, &procSharedElems, &expectedIsShared, (size_t)elems, (size_t)totalRanks);
+
+  args->nbytes            = paramElems * elemSize;
+  args->sendBytes         = sendElems * elemSize;
+  args->expectedBytes     = recvElems * elemSize;
+  args->sendInplaceOffset = sendOffsetElems * elemSize;
+  args->recvInplaceOffset = recvOffsetElems * elemSize;
+}
+
+// Runs the full measurement for one (type, op, root) combination:
+// warmup_iters out-of-place warm-up runs at maxbytes, then one BenchTime per size from
+// minbytes to maxbytes (out-of-place, plus in-place when inPlace is set), printing one
+// line per size. Sizes grow by stepfactor when it is > 1, otherwise by stepbytes.
+void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace) {
+  // Warm-up
+  setupArgs(args->maxbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+     startColl(args, type, op, root, 0, iter);
+  }
+  completeColl(args);
+
+  // Benchmark
+  size_t size = args->minbytes;
+  while (size <= args->maxbytes) {
+      setupArgs(size, type, args);
+      print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+      BenchTime(args, type, op, root, 0);
+      if (inPlace) BenchTime(args, type, op, root, 1);
+      PRINT("\n");
+
+      const size_t next = (args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes;
+      // Stop when the size does not advance (stepbytes == 0, size == 0 with a factor, or
+      // size_t wrap-around); otherwise the sweep would never terminate.
+      if (next <= size) break;
+      size = next;
+  }
+}
+
+
+// Thread body that runs the selected test. `args` is a threadArgs_t*. Always returns NULL.
+void* threadRunTests(void* args) {
+  struct threadArgs_t* self = (struct threadArgs_t*)args;
+
+  // Make the first GPU owned by this thread current. Otherwise some operations would run
+  // on the default device (0), which fails when the GPUs are in exclusive mode.
+  const int firstGpu = self->localRank*self->nThreads*self->nGpus + self->thread*self->nGpus;
+  CUDACHECK(cudaSetDevice(firstGpu));
+
+  const ncclDataType_t dataType = (ncclDataType_t)nccltype;
+  const ncclRedOp_t redOp = (ncclRedOp_t)ncclop;
+  RunTest(self, ncclroot, dataType, test_typenames[nccltype], redOp, test_opnames[ncclop]);
+
+  return NULL;
+}
+
+// Thread body used when parallel_init is enabled: each thread creates the NCCL
+// communicators for its own GPUs, reports the devices it uses, then runs the tests.
+// `args` is a threadArgs_t*. Always returns NULL.
+void* threadInit(void* args) {
+  struct threadArgs_t* self = (struct threadArgs_t*)args;
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int nranks =  self->nProcs*self->nThreads*self->nGpus;
+
+  //set main thread again
+  is_main_thread = (self->proc == 0 && self->thread == 0) ? 1 : 0;
+
+  // One communicator per GPU owned by this thread, created as a single group.
+  NCCLCHECK(ncclGroupStart());
+  for (int g=0; g<self->nGpus; g++) {
+    int rank = self->proc*self->nThreads*self->nGpus + self->thread*self->nGpus + g;
+    int gpuid = self->localRank*self->nThreads*self->nGpus + self->thread*self->nGpus + g;
+    CUDACHECK(cudaSetDevice(gpuid));
+    NCCLCHECK(ncclCommInitRank(self->comms+g, nranks, self->ncclId, rank));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
+  PRINT("# Using devices\n");
+  // Walk all (proc, thread) slots and act only on our own one.
+  for (int p=0; p<self->nProcs; p++) {
+    if (p != self->proc) continue;
+    for (int t=0; t<self->nThreads; t++) {
+      if (t != self->thread) continue;
+      for (int g=0; g<self->nGpus; g++) {
+        int cudaDev;
+        int rank;
+        cudaDeviceProp prop;
+        NCCLCHECK(ncclCommCuDevice(self->comms[g], &cudaDev));
+        NCCLCHECK(ncclCommUserRank(self->comms[g], &rank));
+        CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+        printf("#   Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev,
+            prop.pciBusID, prop.name);
+        fflush(stdout);
+      }
+      Barrier(self);
+      fflush(stdout);
+    }
+  }
+
+  threadRunTests(args);
+
+  return NULL;
+}
+
+// Allocates the device send/receive buffers for one GPU and provides the buffer holding
+// the expected result (host memory registered and mapped so it also has a device pointer).
+// When sameExpected is set, only the first call allocates the expected buffer; later
+// calls hand out the cached pointers. `nbytes` and `nranks` are not used.
+void AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, void **expectedHost, size_t nbytes, int nranks, int sameExpected) {
+    static int firstCall = 1;
+    static void *sharedDevPtr = NULL;
+    static void *sharedHostPtr = NULL;
+
+    CUDACHECK(cudaMalloc(sendbuff, sendBytes));
+    //work around for inline reduce scatter where recv count is smaller than send count
+    const size_t recvAllocBytes = std::max(sendBytes, recvBytes);
+    CUDACHECK(cudaMalloc(recvbuff, recvAllocBytes));
+
+    const bool reuseCached = !firstCall && sameExpected;
+    if (reuseCached) {
+        *expected = sharedDevPtr;
+        *expectedHost = sharedHostPtr;
+        return;
+    }
+
+    *expectedHost = malloc(recvBytes);
+    CUDACHECK(cudaHostRegister(*expectedHost, recvBytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
+    CUDACHECK(cudaHostGetDevicePointer(expected, *expectedHost, 0));
+    sharedDevPtr = *expected;
+    sharedHostPtr = *expectedHost;
+    firstCall = 0;
+}
+ 
+// Maps a data type name to its index in test_typenames.
+// Returns -1 for "all"; for an unknown name prints a notice and returns the current
+// default (nccltype).
+int ncclstringtotype(char *str) {
+    int idx = 0;
+    while (idx < ncclNumTypes && strcmp(str, test_typenames[idx]) != 0) {
+      idx++;
+    }
+    if (idx < ncclNumTypes) return idx;
+
+    if (strcmp(str, "all") == 0) return -1;
+
+    printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[nccltype]);
+    return nccltype;
+}
+
+// Maps a reduction op name to its index in test_opnames.
+// Returns -1 for "all"; for an unknown name prints a notice and returns the current
+// default (ncclop).
+int ncclstringtoop (char *str) {
+    int idx = 0;
+    while (idx < ncclNumOps && strcmp(str, test_opnames[idx]) != 0) {
+      idx++;
+    }
+    if (idx < ncclNumOps) return idx;
+
+    if (strcmp(str, "all") == 0) return -1;
+
+    printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclop]);
+    return ncclop;
+}
+
+// Test driver entry point.
+//  1. Parses the command line into the global test settings and local sizes.
+//  2. With MPI_SUPPORT: initializes MPI and derives localRank from host name hashes.
+//  3. Allocates buffers and streams for every GPU of this process.
+//  4. Creates the NCCL communicators here, unless parallel_init defers that to the threads.
+//  5. Runs one test thread per -t (thread 0 runs inline unless parallel_init && proc == 0).
+//  6. Aggregates errors and bandwidth, prints the summary and exits with
+//     EXIT_FAILURE on data errors or when the bandwidth is below NCCL_TESTS_MIN_BW*0.9.
+int main(int argc, char* argv[]) {
+  int nThreads = 1, nGpus = 1;
+  size_t minBytes = 32*1024*1024, maxBytes = 32*1024*1024, stepBytes = 1*1024*1024, stepFactor = 1;
+  int longindex;
+  int nProcs = 1, proc = 0;
+  int localRank = 0;
+  char hostname[1024];
+  getHostName(hostname, 1024);
+
+  static struct option longopts[] = {
+    {"nthreads", required_argument, 0, 't'},
+    {"ngpus", required_argument, 0, 'g'},
+    {"minbytes", required_argument, 0, 'b'},
+    {"maxbytes", required_argument, 0, 'e'},
+    {"stepbytes", required_argument, 0, 'i'},
+    {"stepfactor", required_argument, 0, 'f'},
+    {"iters", required_argument, 0, 'n'},
+    {"warmup_iters", required_argument, 0, 'w'},
+    {"swap_comms", required_argument, 0, 's'},
+    // The usage text advertises --swap_args; accept it as well as the historical name.
+    {"swap_args", required_argument, 0, 's'},
+    {"parallel_init", required_argument, 0, 'p'},
+    {"check", required_argument, 0, 'c'},
+    {"blocking", required_argument, 0, 'z'},
+    {"op", required_argument, 0, 'o'},
+    {"datatype", required_argument, 0, 'd'},
+    {"root", required_argument, 0, 'r'},
+    {"help", no_argument, 0, 'h'},
+    // getopt_long requires the array to end with an all-zero entry.
+    {0, 0, 0, 0}
+  };
+
+  while(1) {
+    int c;
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:w:s:p:c:o:d:r:z:h", longopts, &longindex);
+
+    if (c == -1)
+      break;
+
+    switch(c) {
+      case 't':
+        nThreads = strtol(optarg, NULL, 0);
+        break;
+      case 'g':
+        nGpus = strtol(optarg, NULL, 0);
+        break;
+      case 'b':
+        minBytes = (size_t)parsesize(optarg);
+        break;
+      case 'e':
+        maxBytes = (size_t)parsesize(optarg);
+        break;
+      case 'i':
+        stepBytes = strtol(optarg, NULL, 0);
+        break;
+      case 'f':
+        stepFactor = strtol(optarg, NULL, 0);
+        break;
+      case 'n':
+        iters = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'w':
+        warmup_iters = (int)strtol(optarg, NULL, 0);
+        break;
+      case 's':
+        swap_args = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'c':
+        datacheck = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'p':
+        parallel_init = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'o':
+        ncclop = ncclstringtoop(optarg);
+        break;
+      case 'd':
+        nccltype = ncclstringtotype(optarg);
+        break;
+      case 'r':
+        ncclroot = strtol(optarg, NULL, 0);
+        break;
+      case 'z':
+        blocking_coll = strtol(optarg, NULL, 0);
+        break;
+      case 'h':
+      default:
+        // Help and unknown options share the usage text; only the latter is flagged.
+        if (c != 'h') printf("invalid option \n");
+        printf("USAGE: ./test \n\t"
+            "[-t,--nthreads <num threads>] \n\t "
+            "[-g,--ngpus <gpus per thread>] \n\t "
+            "[-b,--minbytes <min size in bytes>] \n\t "
+            "[-e,--maxbytes <max size in bytes>] \n\t "
+            "[-i,--stepbytes <increment size>] \n\t "
+            "[-f,--stepfactor <increment factor>] \n\t "
+            "[-n,--iters <iteration count>] \n\t "
+            "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-s,--swap_args <0/1>] \n\t "
+            "[-p,--parallel_init <0/1>] \n\t "
+            "[-c,--check <0/1>] \n\t "
+            "[-o,--op <sum/prod/min/max/all>] \n\t "
+            "[-d,--datatype <nccltype/all>] \n\t "
+            "[-r,--root <root>] \n\t "
+            "[-z,--blocking <0/1>] \n\t "
+            "[-h,--help]\n");
+        return 0;
+    }
+  }
+
+  // Make sure everyline is flushed so that we see the progress of the test
+  setlinebuf(stdout);
+
+#ifdef MPI_SUPPORT
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &proc);
+  // localRank = number of lower-ranked processes running on the same host.
+  uint64_t hostHashs[nProcs];
+  hostHashs[proc] = getHostHash(hostname);
+  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
+  for (int p=0; p<nProcs; p++) {
+    if (p == proc) break;
+    if (hostHashs[p] == hostHashs[proc]) localRank++;
+  }
+#endif
+  is_main_thread = (proc == 0) ? 1 : 0;
+
+  if (proc == 0) {
+    // size_t values are printed with %zu.
+    printf("nThread %d nGpus %d minBytes %zu maxBytes %zu step: %zu(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
+    if (swap_args) printf("Swap Comms Enabled: swapping communicators among threads for each iteration \n");
+    if (blocking_coll) printf("Blocking Enabled: wait for completion and barrier after each collective \n");
+    if (parallel_init) printf("Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
+  }
+
+  ncclUniqueId ncclId;
+  if (proc == 0) {
+    NCCLCHECK(ncclGetUniqueId(&ncclId));
+  }
+#ifdef MPI_SUPPORT
+  MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
+#endif
+  cudaStream_t streams[nGpus*nThreads];
+  void* sendbuffs[nGpus*nThreads];
+  void* recvbuffs[nGpus*nThreads];
+  void* expected[nGpus*nThreads];
+  void* expectedHost[nGpus*nThreads];
+  // Stay NULL when the collective needs no process-shared buffer (they are copied into
+  // every threadArgs_t below regardless).
+  void *procSharedHost = NULL, *procShared = NULL;
+  size_t sendBytes, recvBytes, paramBytes, procSharedBytes, sendInplaceOffset, recvInplaceOffset;
+  int sameExpected;
+
+  getCollByteCount(&sendBytes, &recvBytes, &paramBytes, &sendInplaceOffset, &recvInplaceOffset, &procSharedBytes, &sameExpected, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
+
+  for (int i=0; i<nGpus*nThreads; i++) {
+    CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+    AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, expectedHost+i, (size_t)maxBytes, nProcs*nThreads*nGpus, sameExpected);
+    CUDACHECK(cudaStreamCreate(streams+i));
+  }
+
+  if (procSharedBytes > 0) {
+    procSharedHost = malloc(procSharedBytes);
+    CUDACHECK(cudaHostRegister(procSharedHost, procSharedBytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
+    CUDACHECK(cudaHostGetDevicePointer(&procShared, procSharedHost, 0));
+  }
+
+  //if parallel init is not selected, use main thread to initialize NCCL
+  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
+  if (!parallel_init) {
+    if (nProcs == 1) {
+      int gpuArray[nGpus*nThreads];
+      for (int i=0; i<nGpus*nThreads; i++) gpuArray[i] = i;
+      NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpuArray));
+    } else {
+      NCCLCHECK(ncclGroupStart());
+      for (int i=0; i<nGpus*nThreads; i++) {
+        CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+        NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
+      }
+      NCCLCHECK(ncclGroupEnd());
+    }
+
+    PRINT("# Using devices\n");
+    // Processes take turns (one MPI barrier per step) so the listing is ordered by proc.
+    for (int p=0; p<nProcs; p++) {
+      if (p == proc) {
+        for (int i=0; i<nThreads*nGpus; i++) {
+          int cudaDev;
+          int rank;
+          cudaDeviceProp prop;
+          NCCLCHECK(ncclCommCuDevice(comms[i], &cudaDev));
+          NCCLCHECK(ncclCommUserRank(comms[i], &rank));
+          CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+          printf("#   Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev,
+              prop.pciBusID, prop.name);
+          fflush(stdout);
+        }
+      }
+#ifdef MPI_SUPPORT
+      MPI_Barrier(MPI_COMM_WORLD);
+#endif
+      fflush(stdout);
+    }
+  }
+
+  int errors[nThreads];
+  double bw[nThreads];
+  double delta[nThreads];
+  int bw_count[nThreads];
+  for (int t=0; t<nThreads; t++) {
+    bw[t] = 0.0;
+    delta[t] = 0.0;
+    errors[t] = bw_count[t] = 0;
+  }
+
+  PRINT("\n");
+  print_header();
+
+  int* sync = (int*)calloc(2, sizeof(int));
+  int* barrier = (int*)calloc(2, sizeof(int));
+
+  pthread_t threads[nThreads];
+  struct threadArgs_t args[nThreads];
+
+  // Threads are started from the highest index down so that thread 0, which may run
+  // inline on this thread, starts last.
+  for (int t=nThreads-1; t>=0; t--) {
+    args[t].proc_args = (void *)args;
+    args[t].minbytes=minBytes;
+    args[t].maxbytes=maxBytes;
+    args[t].stepbytes=stepBytes;
+    args[t].stepfactor=stepFactor;
+    args[t].localRank = localRank;
+
+    args[t].nProcs=nProcs;
+    args[t].proc=proc;
+    args[t].nThreads=nThreads;
+    args[t].thread=t;
+    args[t].nGpus=nGpus;
+    args[t].sendbuffs = sendbuffs+t*nGpus;
+    args[t].recvbuffs = recvbuffs+t*nGpus;
+    args[t].ncclId = ncclId;
+    args[t].comms=comms+t*nGpus;
+    args[t].streams=streams+t*nGpus;
+
+    args[t].expectedHost = expectedHost + t*nGpus;
+    args[t].expected = expected + t*nGpus;
+    args[t].procSharedHost = procSharedHost;
+    args[t].procShared = procShared;
+    args[t].barrier = (volatile int*)barrier;
+    args[t].barrier_idx = 0;
+    args[t].sync = (volatile int*)sync;
+    args[t].sync_idx = 0;
+    args[t].deltaThreads = delta;
+    args[t].deltaHost = (delta + t);
+    CUDACHECK(cudaHostRegister(args[t].deltaHost, sizeof(double), cudaHostRegisterPortable|cudaHostRegisterMapped));
+    CUDACHECK(cudaHostGetDevicePointer(&args[t].delta, args[t].deltaHost, 0));
+    args[t].errors=errors+t;
+    args[t].bw=bw+t;
+    args[t].bw_count=bw_count+t;
+
+    if (!parallel_init) {
+      if (t)
+        pthread_create(threads+t, NULL, threadRunTests, args+t);
+      else
+        threadRunTests(args);
+    } else {
+      if (t || (parallel_init && (proc == 0)))
+        pthread_create(threads+t, NULL, threadInit, args+t);
+      else
+        threadInit(args);
+    }
+  }
+
+  // Wait for other threads
+  for (int t=nThreads-1; t>=0; t--) {
+    if (t || (parallel_init && (proc == 0))) pthread_join(threads[t], NULL);
+  }
+
+  // Aggregate into slot 0 only once every thread is done (thread 0 may itself be a
+  // pthread still updating slot 0), and skip t == 0: adding slot 0 to itself would
+  // double the reported error count.
+  for (int t=1; t<nThreads; t++) {
+    errors[0] += errors[t];
+    bw[0] += bw[t];
+    bw_count[0] += bw_count[t];
+  }
+
+#ifdef MPI_SUPPORT
+  MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+#endif
+
+  for(int i=0; i<nGpus*nThreads; ++i)
+    ncclCommDestroy(comms[i]);
+  free(comms);
+
+  // Optional pass/fail threshold on the average bus bandwidth (10% tolerance).
+  char* str = getenv("NCCL_TESTS_MIN_BW");
+  double check_avg_bw = str ? atof(str) : -1;
+  bw[0] /= bw_count[0];
+
+  PRINT(" Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT(" Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("\n");
+#ifdef MPI_SUPPORT
+  MPI_Finalize();
+#endif
+  if (errors[0] || bw[0] < check_avg_bw*(0.9))
+    exit(EXIT_FAILURE);
+  else
+    exit(EXIT_SUCCESS);
+}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000000..81b0436d75
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,158 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include <stdio.h>
+#include <algorithm>
+#include <curand.h>
+#ifdef MPI_SUPPORT
+#include "mpi.h"
+#endif
+#include <pthread.h>
+#include "nccl1_compat.h"
+
+// Evaluates a CUDA runtime call once; on any result other than cudaSuccess, prints the
+// file, line and CUDA error string, then terminates the process with EXIT_FAILURE.
+// Wrapped in do { } while(0) so it behaves as a single statement.
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t e = cmd;                              \
+  if( e != cudaSuccess ) {                          \
+    printf("Cuda failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,cudaGetErrorString(e));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// Evaluates an NCCL call once; on any result other than ncclSuccess, prints the file,
+// line and NCCL error string, then terminates the process with EXIT_FAILURE.
+// Wrapped in do { } while(0) so it behaves as a single statement.
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t r = cmd;                             \
+  if (r!= ncclSuccess) {                            \
+    printf("NCCL failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,ncclGetErrorString(r));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// Per-thread test context. main() fills one instance per test thread; the size-dependent
+// fields (nbytes, sendBytes, expectedBytes, *InplaceOffset) are refreshed by setupArgs().
+struct threadArgs_t {
+  void *proc_args;           // base of this process's threadArgs_t array (used to swap args between threads)
+  size_t nbytes;             // byte size used to derive the element count passed to the collective
+  size_t minbytes;           // first size of the benchmark sweep
+  size_t maxbytes;           // last size of the sweep; also the warm-up size
+  size_t stepbytes;          // additive increment, used when stepfactor <= 1
+  size_t stepfactor;         // multiplicative increment, used when > 1
+
+  int nProcs;                // number of processes
+  int proc;                  // index of this process
+  int nThreads;              // test threads per process
+  int thread;                // index of this thread within the process
+  int nGpus;                 // GPUs driven by each thread
+  int localRank;             // used to compute CUDA device ids for this process
+  void** sendbuffs;          // this thread's slice of send buffers, one per GPU
+  size_t sendBytes;          // bytes of send data per GPU
+  size_t sendInplaceOffset;  // per-rank byte offset of the send data inside recvbuffs when in-place
+  void** recvbuffs;          // this thread's slice of receive buffers, one per GPU
+  size_t recvInplaceOffset;  // per-rank byte offset of the result inside recvbuffs when in-place
+  ncclUniqueId ncclId;       // id passed to ncclCommInitRank
+  ncclComm_t* comms;         // this thread's slice of communicators, one per GPU
+  cudaStream_t* streams;     // this thread's slice of streams, one per GPU
+
+  void** expectedHost;       // host pointers of the expected-result buffers
+  void** expected;           // device pointers mapped to the same expected-result buffers
+  size_t expectedBytes;      // bytes of expected result per GPU
+  void* procSharedHost;      // host pointer of the process-wide shared buffer (set only if one is needed)
+  void* procShared;          // device pointer mapped to procSharedHost
+  volatile int* sync;        // two ints shared by all threads of the process; NOTE(review): usage not visible in common.cu/common.h
+  int sync_idx;              // initialized to 0 by main()
+  volatile int* barrier;     // two counters shared by all threads, used by Barrier()
+  int barrier_idx;           // which of the two barrier counters the next Barrier() uses
+  int syncRank;              // NOTE(review): not set or read in the code visible here
+  int syncNranks;            // NOTE(review): not set or read in the code visible here
+  double* deltaThreads;      // array with one delta slot per thread of the process
+  double* deltaHost;         // this thread's slot in deltaThreads (registered host memory)
+  double* delta;             // device pointer mapped to deltaHost, passed to CheckDelta
+  int* errors;               // this thread's error counter
+  double* bw;                // this thread's accumulated bus bandwidth
+  int* bw_count;             // number of samples accumulated in bw
+};
+
+#include <chrono>
+
+// Provided by common.cu
+extern void Barrier(struct threadArgs_t* args);
+extern void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root, int inPlace);
+extern void Randomize(void* ptr, size_t count, ncclDataType_t type, int seed);
+extern void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op);
+extern void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax);
+extern double DeltaMaxValue(ncclDataType_t type);
+// The next two prototypes mirror the definitions in common.cu (the previous declarations
+// had shorter parameter lists that no definition matched).
+extern double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place);
+extern void AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, void **expectedHost, size_t nbytes, int nranks, int sameExpected);
+
+// Provided by each coll
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName);
+extern void GetBw(size_t count, int typeSize, double sec, double* algBw, double* busBw, int nranks);
+extern void RunColl(void* sendbuf, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op,  int root, ncclComm_t comm, cudaStream_t stream);
+extern void InitData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,  int in_place, int is_first);
+extern void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,  int root, int in_place, int is_first);
+extern void getCollByteCount(size_t *sendbytes, size_t *recvbytes, size_t *parambytes, size_t *sendInlineOffset, size_t *recvInlineOffset, size_t *procSharedBytes, int *sameexpected, size_t nbytes, int nranks);
+extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
+extern void print_header();
+
+#include <unistd.h>
+
+// Stores the short host name (everything before the first '.') in `hostname`.
+// `maxlen` is the size of the buffer. The result is always NUL-terminated; it is the
+// empty string if the host name cannot be obtained.
+static void getHostName(char* hostname, int maxlen) {
+  if (maxlen <= 0) return;
+  if (gethostname(hostname, maxlen) != 0) hostname[0] = '\0';
+  // gethostname may leave the buffer unterminated when the name is truncated.
+  hostname[maxlen-1] = '\0';
+  // Cut at the first '.', looking only at the name itself, not at the bytes after its end.
+  for (int i=0; hostname[i] != '\0'; i++) {
+    if (hostname[i] == '.') {
+      hostname[i] = '\0';
+      return;
+    }
+  }
+}
+
+#include <stdint.h>
+
+// DJB2 string hash: starts from 5381 and, for every character up to the terminating
+// NUL, computes hash = hash * 33 + character (wrapping in 64 bits).
+static uint64_t getHostHash(const char* string) {
+  uint64_t hash = 5381;
+  const char* p = string;
+  while (*p != '\0') {
+    hash = hash * 33 + *p;
+    p++;
+  }
+  return hash;
+}
+
+// Returns the size in bytes of one element of the given NCCL data type, or 0 for a
+// type that is not listed. The unsigned 8/32-bit cases are only compiled when
+// NCCL_MAJOR >= 2.
+static size_t wordSize(ncclDataType_t type) {
+  switch(type) {
+    case ncclChar:
+#if NCCL_MAJOR >= 2
+    //case ncclInt8:
+    case ncclUint8:
+#endif
+      return 1;
+    case ncclHalf:
+    //case ncclFloat16:
+      return 2;
+    case ncclInt:
+    case ncclFloat:
+#if NCCL_MAJOR >= 2
+    //case ncclInt32:
+    case ncclUint32:
+    //case ncclFloat32:
+#endif
+      return 4;
+    case ncclInt64:
+    case ncclUint64:
+    case ncclDouble:
+    //case ncclFloat64: 
+      return 8;
+    default: return 0;
+  }
+}
+
+// Tables of the data types / reduction ops under test and their printable names,
+// indexed by type or op number.
+extern ncclDataType_t test_types[ncclNumTypes];
+extern const char *test_typenames[ncclNumTypes];
+extern ncclRedOp_t test_ops[ncclNumOps];
+extern const char *test_opnames[ncclNumOps];
+
+// Per-thread flag: non-zero only on the thread allowed to print results.
+extern thread_local int is_main_thread;
+// printf that only prints on the main thread. It expands to a bare `if`, so use it as a
+// full statement and put braces around it when it is the body of an if/else.
+#define PRINT if (is_main_thread) printf
+
+
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
new file mode 100644
index 0000000000..4279789af6
--- /dev/null
+++ b/src/nccl1_compat.h
@@ -0,0 +1,47 @@
+/*************************************************************************
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENCE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL1_COMPAT_H
+#define NCCL1_COMPAT_H
+
+#ifndef NCCL_MAJOR // NCCL 1.x
+#define ncclNumOps nccl_NUM_OPS
+#define ncclNumTypes nccl_NUM_TYPES
+
+static ncclResult_t ncclGroupStart() { return ncclSuccess; }  // stub for NCCL 1.x builds: does nothing, reports success
+static ncclResult_t ncclGroupEnd() { return ncclSuccess; }    // matching stub: does nothing, reports success
+
+#include <limits.h>  // INT_MAX, needed by CHECKCOUNT
+#define CHECKCOUNT(count) do { if ((count) > INT_MAX) return ncclInvalidArgument; } while (0)  // reject counts an int cannot hold
+static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  // The wrapped call takes an int count: reject anything larger instead of truncating it.
+  return (count > INT_MAX) ? ncclInvalidArgument : ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
+}
+static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+  // The wrapped call takes an int count: reject anything larger instead of truncating it.
+  return (count > INT_MAX) ? ncclInvalidArgument : ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
+}
+static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  // The wrapped call takes an int count: reject anything larger instead of truncating it.
+  return (count > INT_MAX) ? ncclInvalidArgument : ncclBcast(buff, (int)count, datatype, root, comm, stream);
+}
+static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream) {
+  // The wrapped call takes an int recvcount: reject anything larger instead of truncating it.
+  return (recvcount > INT_MAX) ? ncclInvalidArgument : ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
+}
+static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  // int-count check as in the other wrappers; note the wrapped call takes recvbuff after datatype.
+  return (sendcount > INT_MAX) ? ncclInvalidArgument : ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
+}
+#endif
+
+#endif
diff --git a/src/reduce.cu b/src/reduce.cu
new file mode 100644
index 0000000000..0bc9a7db83
--- /dev/null
+++ b/src/reduce.cu
@@ -0,0 +1,159 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENCE.txt for license information
+ ************************************************************************/
+
+#include <assert.h>
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {  // prints the two title rows of the Reduce results table
+  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");  // group titles; the four leading columns are left blank
+  PRINT("# %10s  %12s  %6s  %6s  %6s %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op", "root",
+      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");  // the same four result columns twice: out-of-place, then in-place
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // row prefix: bytes, N, type, op, root
+  PRINT("%12zu  %12zu  %6s  %6s  %6i", size, count, typeName, opName, root);  // size and count are size_t, which takes %zu (%li expects a long)
+}
+
+void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
+    // Reduce: every size reported here equals `count` and both in-place offsets are 0; nranks is unused.
+    *sendcount = *recvcount = count;
+    *procSharedCount = count;
+    *paramcount = *sendcount;
+    *sendInplaceOffset = *recvInplaceOffset = 0;
+    // Presumably tells the harness that expected results differ between ranks - confirm in common.cu.
+    *sameExpected = 0;
+}
+
+void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {  // builds the expected Reduce result for this thread's GPUs
+  size_t count = args->expectedBytes / wordSize(type);  // number of elements in the expected buffer
+  int root_gpu = root%args->nGpus;  // index of the root's GPU within one thread's GPU list
+
+  assert(args->expectedBytes == args->nbytes);
+
+  while (args->sync[args->sync_idx] != args->thread) pthread_yield();  // spin until the sync slot holds this thread's index
+
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];  // in-place runs take their input from the receive buffer
+
+    if (is_first && i == 0) {
+      CUDACHECK(cudaMemcpy(args->procSharedHost, data, count*wordSize(type), cudaMemcpyDeviceToHost));  // first contribution seeds the shared accumulator
+    } else {
+      Accumulate(args->procShared, data, count, type, op);  // later contributions are folded in; procShared is presumably the device view of procSharedHost - confirm in common.cu
+    }
+
+    if (in_place == 0) {
+      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));  // out-of-place: start from a zeroed receive buffer
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  args->sync[args->sync_idx] = args->thread + 1;  // hand the turn to the next thread
+
+  if (args->thread+1 == args->nThreads) {  // last thread of this process
+#ifdef MPI_SUPPORT
+    int root_proc = root/(args->nThreads*args->nGpus);  // process that owns the root rank
+    if (args->expectedBytes) {
+      // Last thread does the MPI reduction
+      if (root_proc == args->proc) {  // root process: gather and accumulate every other process's partial result
+        void* temp, *tempHost = malloc(args->expectedBytes);
+        CUDACHECK(cudaHostRegister(tempHost, args->expectedBytes, 0));  // register the staging buffer with CUDA
+        CUDACHECK(cudaHostGetDevicePointer(&temp, tempHost, 0));  // temp is the pointer handed to Accumulate below
+
+        for (int i=0; i<args->nProcs; i++) {
+          if (i == args->proc) continue;  // own contribution is already in the accumulator
+          MPI_Recv(tempHost, args->expectedBytes, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+          Accumulate(args->procShared, temp, count, type, op);
+          CUDACHECK(cudaDeviceSynchronize());  // finish before tempHost is overwritten by the next receive
+        }
+
+        CUDACHECK(cudaHostUnregister(tempHost));
+        free(tempHost);
+      } else {  // every other process sends its partial result to the root process
+        MPI_Send(args->procSharedHost, args->expectedBytes, MPI_BYTE, root_proc, 0, MPI_COMM_WORLD);
+      }
+    }
+#endif
+    args->sync[args->sync_idx] = 0;  // reset the slot, releasing the threads spinning below
+  } else {
+    while (args->sync[args->sync_idx]) pthread_yield();  // wait until the last thread has reset the slot
+  }
+
+  // Root rank: the expected result is the reduced data.
+  // Other ranks: in-place expects the buffer's current contents, out-of-place expects zeros.
+  for (int i=0; i<args->nGpus; i++) {
+      int rank = (args->proc*args->nThreads + args->thread)*args->nGpus + i;  // global rank of this thread's i-th GPU
+      if (rank == root) {
+          memcpy(args->expectedHost[root_gpu], args->procSharedHost, args->expectedBytes);  // rank == root implies i == root_gpu
+      } else {
+         if (in_place == 1) {
+              CUDACHECK(cudaMemcpy(args->expectedHost[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDeviceToHost));
+          } else {
+              memset(args->expectedHost[i], 0, args->expectedBytes);
+          }
+      }
+  }
+
+  args->sync_idx = !args->sync_idx;  // switch to the other sync slot for the next call
+}
+
+void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  // Reduce reports the same value (bytes / 1e9 / seconds) for both bandwidths; nranks is unused.
+  const double gigabytes = (double)(count * typesize) / 1.0E9;
+  *algBw = *busBw = gigabytes / sec;
+}
+
+void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {  // issues one ncclReduce with the given arguments
+  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));  // NCCLCHECK is defined elsewhere; presumably it reports a failing result
+}
+
+
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  // Runs the Reduce timing test over a grid of data types, reduction ops and
+  // roots. Passing -1 for type, op or root sweeps that whole dimension; any
+  // other value restricts the dimension to exactly that value.
+
+  // Default to the full sweep over the global tables and over every rank...
+  ncclDataType_t *types = test_types;
+  const char **typeNames = test_typenames;
+  int numTypes = ncclNumTypes;
+
+  ncclRedOp_t *ops = test_ops;
+  const char **opNames = test_opnames;
+  int numOps = ncclNumOps;
+
+  int firstRoot = 0;
+  int lastRoot = args->nProcs*args->nThreads*args->nGpus-1;  // last rank; the range is inclusive
+
+  // ...then collapse each dimension the caller pinned to a one-entry view of
+  // the corresponding argument.
+  if ((int)type != -1) {
+    types = &type;
+    typeNames = &typeName;
+    numTypes = 1;
+  }
+  if ((int)op != -1) {
+    ops = &op;
+    opNames = &opName;
+    numOps = 1;
+  }
+  if (root != -1) {
+    firstRoot = root;
+    lastRoot = root;
+  }
+
+  for (int t = 0; t < numTypes; ++t) {
+    for (int o = 0; o < numOps; ++o) {
+      for (int r = firstRoot; r <= lastRoot; ++r) {
+        TimeTest(args, types[t], typeNames[t], ops[o], opNames[o], r, 1);  // last argument is always 1 here; its meaning is defined by TimeTest
+      }
+    }
+  }
+}
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
new file mode 100644
index 0000000000..ef2b1b79d3
--- /dev/null
+++ b/src/reduce_scatter.cu
@@ -0,0 +1,139 @@
+/*************************************************************************
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENCE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {  // prints the two title rows of the ReduceScatter results table (no root column)
+  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");  // group titles; the four leading columns are left blank
+  PRINT("# %10s  %12s  %6s  %6s %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op",
+      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");  // the same four result columns twice: out-of-place, then in-place
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // row prefix: bytes, N, type, op (root is not printed)
+  PRINT("%12zu  %12zu  %6s  %6s", size, count, typeName, opName);  // size and count are size_t, which takes %zu (%li expects a long)
+}
+
+void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
+    // ReduceScatter: each rank sends nranks chunks and receives one; any remainder of count/nranks is dropped.
+    const size_t chunk = count/nranks;
+    *recvcount = *paramcount = *recvInplaceOffset = chunk;
+    *sendcount = *procSharedCount = chunk*nranks;
+    *sendInplaceOffset = 0;
+    // Presumably tells the harness that expected results differ between ranks - confirm in common.cu.
+    *sameExpected = 0;
+}
+
+// Builds the expected ReduceScatter result: reduce every rank's input, then give each GPU its own chunk.
+void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
+  size_t recvbytes = args->expectedBytes;  // bytes each rank receives
+  size_t sendbytes = args->sendBytes;      // bytes each rank contributes
+  size_t sendcount = args->sendBytes / wordSize(type);  // elements each rank contributes
+
+  while (args->sync[args->sync_idx] != args->thread) pthread_yield();  // spin until the sync slot holds this thread's index
+
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];  // in-place runs take their input from the receive buffer
+
+    if (is_first && i == 0) {
+      CUDACHECK(cudaMemcpy(args->procSharedHost, data, sendbytes, cudaMemcpyDeviceToHost));  // first contribution seeds the shared accumulator
+    } else {
+      Accumulate(args->procShared, data, sendcount, type, op);  // later contributions are folded in
+    }
+
+    CUDACHECK(cudaDeviceSynchronize());
+    if (in_place == 0) {
+      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, recvbytes));  // out-of-place: start from a zeroed receive buffer
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  args->sync[args->sync_idx] = args->thread + 1;  // hand the turn to the next thread
+
+  if (args->thread+1 == args->nThreads) {  // last thread of this process
+#ifdef MPI_SUPPORT
+    if (sendbytes > 0) {
+      // Last thread does the MPI reduction
+      void* remote, *remoteHost = malloc(sendbytes);
+      void* myInitialData = malloc(sendbytes);
+      memcpy(myInitialData, args->procSharedHost, sendbytes);  // snapshot before remote data is accumulated into it
+      CUDACHECK(cudaHostRegister(remoteHost, sendbytes, 0));
+      CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));  // remote is the pointer handed to Accumulate below
+
+      for (int i=0; i<args->nProcs; i++) {  // each process in turn broadcasts its local partial result
+        if (i == args->proc) {
+          MPI_Bcast(myInitialData, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
+          free(myInitialData);  // reached exactly once, when i is this process
+        } else {
+          MPI_Bcast(remoteHost, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
+          Accumulate(args->procShared, remote, sendcount, type, op);
+          CUDACHECK(cudaDeviceSynchronize());  // checked like every other CUDA call; finish before remoteHost is reused
+        }
+      }
+      CUDACHECK(cudaHostUnregister(remoteHost));
+      free(remoteHost);
+    }
+#endif
+    args->sync[args->sync_idx] = 0;  // reset the slot, releasing the threads spinning below
+  } else {
+    while (args->sync[args->sync_idx]) pthread_yield();  // wait until the last thread has reset the slot
+  }
+
+  for (int i=0; i<args->nGpus; i++) {
+      size_t offset = ((args->proc*args->nThreads + args->thread)*args->nGpus + i)*recvbytes;  // rank * chunk size; size_t because an int overflows past 2 GiB
+      memcpy(args->expectedHost[i], (void *)((uintptr_t)args->procSharedHost + offset), recvbytes);  // this rank's chunk of the reduced data
+  }
+
+  args->sync_idx = !args->sync_idx;  // switch to the other sync slot for the next call
+}
+
+void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  // Both outputs receive count * typesize * (nranks - 1) bytes, divided by 1e9 and by the elapsed seconds;
+  // the bus factor applied here is exactly 1, so busBw equals algBw.
+  // NOTE(review): algBw carries the (nranks - 1) term too - confirm that is the intended definition.
+  const double bytesMoved = (double)(count * typesize * (nranks - 1));
+  *algBw = *busBw = bytesMoved / 1.0E9 / sec;
+}
+
+void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {  // issues one ncclReduceScatter; root is not used
+  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));  // NCCLCHECK is defined elsewhere; presumably it reports a failing result
+}
+
+void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  // Runs the ReduceScatter timing test over data types and reduction ops.
+  // Passing -1 for type or op sweeps that whole dimension. The root argument
+  // is not used: TimeTest always receives 0 in its place.
+
+  // Default to the full sweep over the global tables...
+  ncclDataType_t *types = test_types;
+  const char **typeNames = test_typenames;
+  int numTypes = ncclNumTypes;
+
+  ncclRedOp_t *ops = test_ops;
+  const char **opNames = test_opnames;
+  int numOps = sizeof(test_ops)/sizeof(test_ops[0]);
+
+  // ...then collapse each dimension the caller pinned to a one-entry view.
+  if ((int)type != -1) {
+    types = &type;
+    typeNames = &typeName;
+    numTypes = 1;
+  }
+  if ((int)op != -1) {
+    ops = &op;
+    opNames = &opName;
+    numOps = 1;
+  }
+
+  for (int t = 0; t < numTypes; ++t) {
+    for (int o = 0; o < numOps; ++o) {
+      TimeTest(args, types[t], typeNames[t], ops[o], opNames[o], 0, 1);  // last argument is always 1 here; its meaning is defined by TimeTest
+    }
+  }
+}

From caede2fbd6f2df935beeb079719acc55782e8dfa Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 8 Aug 2017 16:25:07 -0700
Subject: [PATCH 002/233] Improve Readme

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d70bb1f54c..1532a658fb 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,10 @@ Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, d
 $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0
 ```
 
-All tests support the same arguments :
+### Arguments
+
+All tests support the same set of arguments :
+
 * Number of GPUs
   * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
   * `-g,--ngpus <gpus per thread>` number of gpus per process. Default : 1.

From a15599f5cfc6043e3514800c92ac9e55b8dec835 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 8 Aug 2017 16:28:46 -0700
Subject: [PATCH 003/233] Improve Readme

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 1532a658fb..92b122c2f2 100644
--- a/README.md
+++ b/README.md
@@ -40,23 +40,24 @@ All tests support the same set of arguments :
 
 * Number of GPUs
   * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
-  * `-g,--ngpus <gpus per thread>` number of gpus per process. Default : 1.
+  * `-g,--ngpus <gpus per thread>` number of gpus per thread. Default : 1.
 * Sizes to scan
   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
   * Increments can be either fixes of a multiplication factor. Only one of those should be used
-  * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
-  * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
+    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
+    * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
+* NCCL operations arguments
+  * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
+  * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
+  * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
 * Performance 
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
-* `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0.
-* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel.
-* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
-* NCCL operations arguments
-  * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations. Default : Sum.
-  * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
-  * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce.
+* Test operation
+  * `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0.
+  * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
+  * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
 
 ## Copyright

From 9ec3e352769c1ec9900c59755fad98b61404f5a0 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 8 Aug 2017 16:29:25 -0700
Subject: [PATCH 004/233] Fix typo in Readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 92b122c2f2..10c255dae9 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ All tests support the same set of arguments :
 * Sizes to scan
   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
-  * Increments can be either fixes of a multiplication factor. Only one of those should be used
+  * Increments can be either fixed or a multiplication factor. Only one of those should be used
     * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
     * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
 * NCCL operations arguments

From 25016c8eebbf8200208bfce9ebfbc1ea2254e915 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Wed, 9 Aug 2017 10:41:31 -0700
Subject: [PATCH 005/233] Fix NCCL_HOME to be consistent with README

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 6188d01424..45d31d54b0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -43,9 +43,9 @@ endif
 .PHONY: build clean
 
 BUILDDIR ?= ../build
-ifneq ($(NCCLDIR), "")
-NVCUFLAGS += -I$(NCCLDIR)/include/
-NVLDFLAGS   += -L$(NCCLDIR)/lib
+ifneq ($(strip $(NCCL_HOME)),)  # add NCCL paths only when NCCL_HOME is set; comparing against "" (two quote characters) was always true
+NVCUFLAGS += -I$(NCCL_HOME)/include/
+NVLDFLAGS   += -L$(NCCL_HOME)/lib
 endif
 
 ifeq ($(MPI), 1)

From 925a70576e584e77bc930606c59595e9f66b71dd Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 21 Dec 2017 15:10:09 -0800
Subject: [PATCH 006/233] Print NCCL version at start

---
 src/common.cu      | 1 +
 src/nccl1_compat.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index a14c3aac01..f47e0f5da5 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -915,6 +915,7 @@ int main(int argc, char* argv[]) {
        NCCLCHECK(ncclGroupEnd());
      }
 
+     PRINT("# NCCL Tests compiled with NCCL %d.%d\n", NCCL_MAJOR, NCCL_MINOR);
      PRINT("# Using devices\n");
      for (int p=0; p<nProcs; p++) {
        if (p == proc) {
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 4279789af6..1a56b571cf 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -8,6 +8,9 @@
 #define NCCL1_COMPAT_H
 
 #ifndef NCCL_MAJOR // NCCL 1.x
+#define NCCL_MAJOR 1
+#define NCCL_MINOR 0
+
 #define ncclNumOps nccl_NUM_OPS
 #define ncclNumTypes nccl_NUM_TYPES
 

From 222f94f94948ed7e2932850f28380b176886f963 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Mon, 29 Jan 2018 13:40:45 -0800
Subject: [PATCH 007/233] Added explanation about performance numbers

---
 README.md          |   4 ++
 doc/PERFORMANCE.md | 140 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 doc/PERFORMANCE.md

diff --git a/README.md b/README.md
index 10c255dae9..d036c69644 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,10 @@ Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, d
 $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0
 ```
 
+### Performance
+
+See the [doc/PERFORMANCE.md](Performance) page for explanation about numbers, and in particular the "busbw" column.
+
 ### Arguments
 
 All tests support the same set of arguments :
diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
new file mode 100644
index 0000000000..bc01e57b5f
--- /dev/null
+++ b/doc/PERFORMANCE.md
@@ -0,0 +1,140 @@
+# Performance reported by NCCL tests
+
+NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
+
+# Time
+
+Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations.
+
+On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw) and is no longer measuring just the latency but
+also the size divided by the bandwidth.
+
+Therefore, on large sizes, it makes more sense to look at the bandwidth.
+
+# Bandwidth
+
+## Algorithm bandwidth
+
+Algorithm bandwidth is using the most commonly used formula for bandwidth : size (_S_) / time (_t_). It is useful to compute how much time any large operation would take by simply dividing the size of the operation by the algorithm bandwidth.
+
+`algbw = S/t`
+
+## Bus bandwidth
+
+While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks.
+Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Some others also provide the algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
+
+To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
+This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
+Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used.
+
+The formula depends on the collective operation.
+
+### AllReduce
+
+An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), is performing the following operation :
+
+`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
+
+A ring would do that in an order which follows the ring :
+
+`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
+
+A tree would do this hierchically :
+
+`(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
+
+In all cases, we need n-1 additions and n assignations for each element. Since every step is on a different rank except potentially one (the last input and the first output),
+we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation.
+
+Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
+
+ `t = (S*2*(n-1)) / (n*B)`
+
+Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
+Reordering the elements, we find that
+
+ `t = (S/B) * (2*(n-1)/n)`
+
+Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
+
+ `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
+
+### ReduceScatter
+
+The ReduceScatter operation requires only to perform the addition part of the allReduce operation :
+
+ `o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+With K being the rank which is getting the final result (K=offset/recvsize).
+
+The perfect reduceScatter time with a rank bandwidth of B would therefore be :
+
+ `t = S*(n-1) / (B*n)`
+
+And the Bus Bandwidth is therefore computed as :
+
+ `B = S/t * (n-1)/n = algbw * (n-1)/n`
+
+### AllGather
+
+The AllGather operation requires only to perform the assignation part of the allReduce operation :
+
+ `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
+
+With K being the rank where the data originates from (K=offset/sendsize).
+
+The perfect allGather time with a rank bandwidth of B would therefore be :
+
+ `t = S*(n-1) / (B*n)`
+
+And the Bus Bandwidth is therefore computed as :
+
+ `B = S/t * (n-1)/n = algbw * (n-1)/n`
+
+### Broadcast
+
+The broadcast operation representation is similar to allGather :
+
+ `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
+
+R being the root of the operation.
+
+However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all N links to perform the transfer operations.
+Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out :
+
+ `t = S/B`
+
+And :
+
+ `B = S/t`
+
+### Reduce
+
+The reduce operation performs :
+
+ `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+R being the root of the operation.
+
+Similarly to broadcast, all data need to be sent to the root, hence :
+
+ `t = S/B`
+
+And :
+
+ `B = S/t`
+
+### Summary
+
+To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
+
+* AllReduce : 2*(_n_-1)/_n_
+* ReduceScatter : (_n_-1)/_n_
+* AllGather : (_n_-1)/_n_
+* Broadcast : 1
+* Reduce : 1
+
+The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.

From db39a88f8a88730e1d5ca428ee764486d87a5805 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 30 Jan 2018 09:14:49 -0800
Subject: [PATCH 008/233] Fix link to performance page

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d036c69644..0fd7a24bc3 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0
 
 ### Performance
 
-See the [doc/PERFORMANCE.md](Performance) page for explanation about numbers, and in particular the "busbw" column.
+See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.
 
 ### Arguments
 

From e00cb1f1c429eb524f2e0903f986b46fe0d15e1f Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 30 Jan 2018 09:15:58 -0800
Subject: [PATCH 009/233] Typos/Clarifications

---
 doc/PERFORMANCE.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index bc01e57b5f..b9afbb4ecb 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -38,11 +38,11 @@ An allreduce operation, for each element of the N arrays (input i_X and output o
 
 **Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
 
-A ring would do that in an order which follows the ring :
+A ring would do that operation in an order which follows the ring :
 
 `i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
 
-A tree would do this hierchically :
+A tree would do it hierarchically :
 
 `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
 

From eb4c43ff3d37d656efdf2ed75ce49e7f73efa581 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 30 Jan 2018 09:17:29 -0800
Subject: [PATCH 010/233] Clarification

---
 doc/PERFORMANCE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index b9afbb4ecb..97419ecde9 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -54,7 +54,7 @@ Considering that each rank has a bandwidth to the outside world of _B_, the time
  `t = (S*2*(n-1)) / (n*B)`
 
 Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
-Reordering the elements, we find that
+Reordering the equation, we find that
 
  `t = (S/B) * (2*(n-1)/n)`
 

From dcf818955fa6e279e03263c984e95384164c24ad Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Fri, 17 Aug 2018 14:58:44 -0700
Subject: [PATCH 011/233] Added a precision for AllGather and ReduceScatter
 sizes since NCCL uses the size per rank.

---
 doc/PERFORMANCE.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index 97419ecde9..7cc6ecee66 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -78,6 +78,8 @@ And the Bus Bandwidth is therefore computed as :
 
  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 
+Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
+
 ### AllGather
 
 The AllGather operation requires only to perform the assignation part of the allReduce operation :
@@ -94,6 +96,8 @@ And the Bus Bandwidth is therefore computed as :
 
  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 
+Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
+
 ### Broadcast
 
 The broadcast operation representation is similar to allGather :

From cbe7f654001d4b4123d8b104c863d983fa746a02 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 6 Mar 2019 18:17:20 -0800
Subject: [PATCH 012/233] Resync all tests with test code from NCCL 2.4

Major rework to merge most of the changes from the NCCL internal
tests into the public ones

Added "-m <agg_iters>" operation aggregation option.
Data integrity checking is now much more performant at scale.
Startup times at scale are improved.
Test latency units are now displayed in usec.
---
 README.md             |   12 +-
 src/Makefile          |   24 +-
 src/all_gather.cu     |  117 ++---
 src/all_reduce.cu     |  146 +++---
 src/broadcast.cu      |  145 +++---
 src/common.cu         | 1148 ++++++++++++++++++-----------------------
 src/common.h          |  130 ++++-
 src/nccl1_compat.h    |    4 +-
 src/reduce.cu         |  180 +++----
 src/reduce_scatter.cu |  140 +++--
 10 files changed, 949 insertions(+), 1097 deletions(-)

diff --git a/README.md b/README.md
index 0fd7a24bc3..7a4bbbc6ca 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # NCCL Tests
 
-These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL 1](http://github.com/nvidia/nccl) and [NCCL 2](http://developer.nvidia.com/nccl).
+These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl)
 
 ## Build
 
@@ -20,7 +20,7 @@ $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nc
 
 ## Usage
 
-NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of gpus per thread).
+NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
 
 ### Quick examples
 
@@ -44,7 +44,7 @@ All tests support the same set of arguments :
 
 * Number of GPUs
   * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
-  * `-g,--ngpus <gpus per thread>` number of gpus per thread. Default : 1.
+  * `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
 * Sizes to scan
   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
@@ -55,16 +55,16 @@ All tests support the same set of arguments :
   * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
   * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
   * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
-* Performance 
+* Performance
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
+  * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
 * Test operation
-  * `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0.
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
 
 ## Copyright
 
-NCCL tests are provided under the BSD licence. All source code and accompanying documentation is copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 
diff --git a/src/Makefile b/src/Makefile
index 45d31d54b0..034cc672fa 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,7 +1,7 @@
 #
-# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
-# See LICENCE.txt for license information
+# See LICENSE.txt for license information
 #
 
 CUDA_HOME ?= /usr/local/cuda
@@ -18,10 +18,10 @@ NVCC = $(CUDA_HOME)/bin/nvcc
 NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
 		-gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_52,code=sm_52 \
-                -gencode=arch=compute_60,code=sm_60 \
+		-gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_61,code=compute_61
+		-gencode=arch=compute_70,code=compute_70 \
+		-gencode=arch=compute_70,code=sm_70
 
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 
@@ -29,14 +29,16 @@ LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
 
 ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3
-CXXFLAGS  += -O3
+NVCUFLAGS += -O3 -g
+CXXFLAGS  += -O3 -g
 else
 NVCUFLAGS += -O0 -G -g
 CXXFLAGS  += -O0 -g -ggdb3
 endif
 
-ifeq ($(VERBOSE), 0)
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
 .SILENT:
 endif
 
@@ -45,7 +47,7 @@ endif
 BUILDDIR ?= ../build
 ifneq ($(NCCL_HOME), "")
 NVCUFLAGS += -I$(NCCL_HOME)/include/
-NVLDFLAGS   += -L$(NCCL_HOME)/lib
+NVLDFLAGS += -L$(NCCL_HOME)/lib
 endif
 
 ifeq ($(MPI), 1)
@@ -53,7 +55,7 @@ NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
 NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi
 endif
 LIBRARIES += curand nccl nvToolsExt
-NVLDFLAGS   += $(LIBRARIES:%=-l%)
+NVLDFLAGS += $(LIBRARIES:%=-l%)
 
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
@@ -66,7 +68,7 @@ build: ${BIN_FILES}
 clean:
 	rm -rf ${DST_DIR}
 
-${DST_DIR}/%.o: %.cu
+${DST_DIR}/%.o: %.cu common.h
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 2386842cdd..cfb2ec356b 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -1,79 +1,53 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "cuda_runtime.h"
 #include "common.h"
 
-
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", 
-      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+  PRINT("# %10s  %12s  %6s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
   PRINT("%12li  %12li  %6s", size, count, typeName);
 }
 
-void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
-    *sendcount = count/nranks;
-    *recvcount = (count/nranks)*nranks;
-    *sameExpected = 1;
-    *procSharedCount = 0;
-    *sendInplaceOffset = count/nranks;
-    *recvInplaceOffset = 0;
-    *paramcount = *sendcount;
+void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count/nranks;
+  *recvcount = (count/nranks)*nranks;
+  *sendInplaceOffset = count/nranks;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
 }
 
-void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  size_t nBytes = args->nbytes;
-  size_t count = nBytes / wordSize(type);
-  int proc = args->proc;
-  int nThreads = args->nThreads;
-  int t = args->thread;
-  int nGpus = args->nGpus;
+testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
 
-  while (args->sync[args->sync_idx] != t) pthread_yield();
-
-  for (int i=0; i<nGpus; i++) {
-    int device;
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
-
-    void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i];
-
-    CUDACHECK(cudaMemcpy((void *)((uintptr_t)args->expectedHost[0] + ((proc*nThreads + t)*nGpus + i)*nBytes), 
-                data, 
-                nBytes, cudaMemcpyDeviceToHost));
-
-    if (in_place == 0) {
-      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
     }
     CUDACHECK(cudaDeviceSynchronize());
   }
-
-  args->sync[args->sync_idx] = t + 1;
-
-  if (t+1 == nThreads) {
-#ifdef MPI_SUPPORT
-    // Last thread does the MPI allgather
-    MPI_Allgather(MPI_IN_PLACE, nBytes*nThreads*nGpus, MPI_BYTE, 
-        args->expectedHost[0], 
-        nBytes*nThreads*nGpus, MPI_BYTE, MPI_COMM_WORLD);
-#endif
-    args->sync[args->sync_idx] = 0;
-  } else {
-    while (args->sync[args->sync_idx]) pthread_yield();
-  }
-
-  args->sync_idx=!args->sync_idx;
+  return testSuccess;
 }
 
-void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
 
   *algBw = baseBw;
@@ -81,26 +55,49 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
   *busBw = baseBw * factor;
 }
 
-void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
+  return testSuccess;
 }
 
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+struct testColl allGatherTest = {
+  "AllGather",
+  AllGatherGetCollByteCount,
+  AllGatherInitData,
+  AllGatherGetBw,
+  AllGatherRunColl
+};
+
+void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allGatherTest;
   ncclDataType_t *run_types;
   const char **run_typenames;
   int type_count;
 
-  if ((int)type != -1) { 
+  if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
     run_typenames = &typeName;
-  } else { 
+  } else {
     type_count = ncclNumTypes;
     run_types = test_types;
     run_typenames = test_typenames;
   }
 
-  for (int i=0; i<type_count; i++) { 
-     TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, 0, 1);
-  }   
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  }
+  return testSuccess;
 }
+
+struct testEngine allGatherEngine = {
+  AllGatherGetBuffSize,
+  AllGatherRunTest
+};
+
+#pragma weak ncclTestEngine=allGatherEngine
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index f41aff5708..bd8daaf0a2 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -1,89 +1,51 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "cuda_runtime.h"
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op",
-      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
   PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
 }
 
-void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
-    *sendcount = count;
-    *recvcount = count;
-    *sameExpected = 1;
-    *procSharedCount = 0;
-    *sendInplaceOffset = 0;
-    *recvInplaceOffset = 0;
-    *paramcount = *sendcount;
- }
-
-void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  size_t count = args->nbytes / wordSize(type);
-
-  while (args->sync[args->sync_idx] != args->thread) pthread_yield();
-
-  for (int i=0; i<args->nGpus; i++) {
-    int device;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-
-    if (is_first && i == 0) {
-      CUDACHECK(cudaMemcpy(args->expected[0], data, count*wordSize(type), cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate(args->expected[0], data, count, type, op);
-    }
-
-    if (in_place == 0) {
-      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
-    }
-    CUDACHECK(cudaDeviceSynchronize());
-  }
-
-  args->sync[args->sync_idx] = args->thread + 1;
-
-  if (args->thread+1 == args->nThreads) {
-#ifdef MPI_SUPPORT
-    // Last thread does the MPI reduction
-    if (args->nbytes > 0) {
-      void* remote, *remoteHost = malloc(args->nbytes);
-      void* myInitialData = malloc(args->nbytes);
-      memcpy(myInitialData, args->expectedHost[0], args->nbytes);
-      CUDACHECK(cudaHostRegister(remoteHost, args->nbytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
-      CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));
-      for (int i=0; i<args->nProcs; i++) {
-        if (i == args->proc) {
-          MPI_Bcast(myInitialData, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
-          free(myInitialData);
-        } else {
-          MPI_Bcast(remoteHost, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
-          Accumulate(args->expected[0], remote, count, type, op);
-          cudaDeviceSynchronize();
-        }
-      }
-      CUDACHECK(cudaHostUnregister(remoteHost));
-      free(remoteHost);
-    }
-#endif
-    args->sync[args->sync_idx] = 0;
-  } else {
-    while (args->sync[args->sync_idx]) pthread_yield();
-  }
-
-  args->sync_idx = !args->sync_idx;
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
 }
 
-void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
   *algBw = baseBw;
@@ -91,40 +53,62 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
   *busBw = baseBw * factor;
 }
 
-void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
+  return testSuccess;
 }
 
+struct testColl allReduceTest = {
+  "AllReduce",
+  AllReduceGetCollByteCount,
+  AllReduceInitData,
+  AllReduceGetBw,
+  AllReduceRunColl
+};
 
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allReduceTest;
   ncclDataType_t *run_types;
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
 
-  if ((int)type != -1) { 
+  if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
     run_typenames = &typeName;
-  } else { 
+  } else {
     type_count = ncclNumTypes;
     run_types = test_types;
     run_typenames = test_typenames;
   }
 
-  if ((int)op != -1) { 
+  if ((int)op != -1) {
     op_count = 1;
     run_ops = &op;
     run_opnames = &opName;
-  } else { 
+  } else {
     op_count = ncclNumOps;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
 
-  for (int i=0; i<type_count; i++) { 
-      for (int j=0; j<op_count; j++) { 
-          TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], 0, 1);
-      }
-  }   
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
+    }
+  }
+  return testSuccess;
 }
+
+struct testEngine allReduceEngine = {
+  AllReduceGetBuffSize,
+  AllReduceRunTest
+};
+
+#pragma weak ncclTestEngine=allReduceEngine
diff --git a/src/broadcast.cu b/src/broadcast.cu
index fe3d26deeb..c62a99ff62 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -1,78 +1,50 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "cuda_runtime.h"
 #include "common.h"
-#include <assert.h>
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s        out-of-place\n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "root", 
-      "time", "algbw", "busbw", "res");
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
   PRINT("%12li  %12li  %6s  %6i", size, count, typeName, root);
 }
 
-void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
-    *sendcount = count;
-    *recvcount = count;
-    *sameExpected = 0;
-    *procSharedCount = count;
-    *sendInplaceOffset = 0;
-    *recvInplaceOffset = 0;
-    *paramcount = *sendcount;
+void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
 }
 
-void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  int root_proc = root/(args->nThreads*args->nGpus);
-  int root_thread = (root/args->nGpus)%(args->nThreads);
-  int root_gpu = root%args->nGpus;
-
-  assert(args->expectedBytes == args->nbytes);
-
-  if (root_thread == args->thread) {
-      if (root_proc == args->proc) {  
-         CUDACHECK(cudaMemcpy(args->procSharedHost,
-                    args->sendbuffs[root_gpu],
-                    args->nbytes, cudaMemcpyDeviceToHost));
-      }
-#ifdef MPI_SUPPORT 
-      MPI_Bcast(args->procSharedHost, args->nbytes, MPI_BYTE, root_proc, MPI_COMM_WORLD);
-#endif
-
-      args->sync[0] = 0;
-  }
-
-  Barrier(args);
+testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-     int device;
-     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); 
-     CUDACHECK(cudaSetDevice(device));
-
-     //set expected buf to zero at root, copy over source data at others
-     if ((root_proc == args->proc) 
-         && (root_thread == args->thread) 
-         && (root_gpu == i)) { 
-         memset(args->expectedHost[i], 0, args->nbytes); 
-     } else { 
-         memcpy(args->expectedHost[i], args->procSharedHost, args->nbytes);
-     }
-
-     //reset recvbufs to zero
-     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
-     CUDACHECK(cudaDeviceSynchronize());
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
+    CUDACHECK(cudaDeviceSynchronize());
   }
-
-  Barrier(args);
+  return testSuccess;
 }
 
-void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
   *algBw = baseBw;
@@ -80,42 +52,69 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
   *busBw = baseBw * factor;
 }
 
-void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  int rank; 
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
-  if (rank == root) { 
+#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+  NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
+#else
+  if (rank == root) {
       NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
-  } else { 
+  } else {
       NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
-  } 
+  }
+#endif
+  return testSuccess;
 }
 
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+struct testColl broadcastTest = {
+  "Broadcast",
+  BroadcastGetCollByteCount,
+  BroadcastInitData,
+  BroadcastGetBw,
+  BroadcastRunColl
+};
+
+void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &broadcastTest;
   ncclDataType_t *run_types;
   const char **run_typenames;
   int type_count;
-  int begin_root, end_root; 
+  int begin_root, end_root;
 
-  if ((int)type != -1) { 
+  if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
     run_typenames = &typeName;
-  } else { 
+  } else {
     type_count = ncclNumTypes;
     run_types = test_types;
     run_typenames = test_typenames;
   }
 
-  if (root != -1) { 
-     begin_root = end_root = root;
-  } else { 
-     begin_root = 0;
-     end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  if (root != -1) {
+    begin_root = end_root = root;
+  } else {
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;
   }
 
-  for (int i=0; i<type_count; i++) { 
-       for (int j=begin_root; j<=end_root; j++) {
-          TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, j, 0);
-       }
-  }   
+  for (int i=0; i<type_count; i++) {
+    for (int j=begin_root; j<=end_root; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+    }
+  }
+  return testSuccess;
 }
+
+struct testEngine broadcastEngine = {
+  BroadcastGetBuffSize,
+  BroadcastRunTest
+};
+
+#pragma weak ncclTestEngine=broadcastEngine
diff --git a/src/common.cu b/src/common.cu
index f47e0f5da5..5a3ae529d6 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1,13 +1,14 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "common.h"
 #include <pthread.h>
 #include <cstdio>
 #include <getopt.h>
+#include <libgen.h>
 #include "cuda.h"
 
 #if NCCL_MAJOR >= 2
@@ -22,13 +23,20 @@ const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
 
 thread_local int is_main_thread = 0;
 
+// Command line parameter defaults
+static int nThreads = 1;
+static int nGpus = 1;
+static size_t minBytes = 32*1024*1024;
+static size_t maxBytes = 32*1024*1024;
+static size_t stepBytes = 1*1024*1024;
+static size_t stepFactor = 1;
 static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
+static int agg_iters = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
-static int swap_args = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 
@@ -83,12 +91,11 @@ template<typename T> __device__
 float toFloat(T a) {
   return (float)a;
 }
-template<> __device__ 
+template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
 
-
 template<typename T, int BSIZE> __global__
 void deltaKern(void* A_, void* B_, size_t count, double* max) {
   const T* A = (const T*)A_;
@@ -102,7 +109,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     if( delta > locmax ) {
       locmax = delta;
 #ifdef DEBUG_PRINT
-      if (delta > .1) printf("Error at %d/%d : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
+      if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
 #endif
     }
   }
@@ -119,7 +126,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 }
 
 
-void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
     case ncclHalf:
       deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
@@ -142,223 +149,112 @@ void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type
     case ncclUint64:
       deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
   }
-}
-
-#define CURAND_CHK(cmd)                                                         \
-    do {                                                                        \
-      curandStatus_t error = (cmd);                                             \
-      if (error != CURAND_STATUS_SUCCESS) {                                     \
-        printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__);       \
-        exit(EXIT_FAILURE);                                                     \
-      }                                                                         \
-    } while (false)
-
-
-template<typename T>
-void GenerateRandom(curandGenerator_t generator, T * const dest,
-    const size_t N);
-
-template<>
-void GenerateRandom<int8_t>(curandGenerator_t generator, int8_t * const dest,
-    const size_t N) {
-  size_t align = (4 - (((size_t)dest) & 3)) % 4;
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align),
-      N * sizeof(int8_t) / sizeof(int)));
-  CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice));
-}
-template<>
-void GenerateRandom<uint8_t>(curandGenerator_t generator, uint8_t * const dest,
-    const size_t N) {
-  size_t align = (4 - (((size_t)dest) & 3)) % 4;
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align),
-      N * sizeof(uint8_t) / sizeof(int)));
-  CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice));
-}
-
-template<>
-void GenerateRandom<int32_t>(curandGenerator_t generator, int32_t * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
-}
-
-template<>
-void GenerateRandom<uint32_t>(curandGenerator_t generator, uint32_t * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
-}
-
-template<>
-void GenerateRandom<float>(curandGenerator_t generator, float * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerateUniform(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<uint64_t>(curandGenerator_t generator, uint64_t * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2));
-}
-
-template<>
-void GenerateRandom<int64_t>(curandGenerator_t generator, int64_t * const dest,
-    const size_t N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2));
-}
-
-template<typename T>
-void RandomizeType(void* dest, const size_t N, const int randomSeed) {
-  T* ptr = (T*)dest;
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
-  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
-  GenerateRandom<T>(gen, ptr, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
   CUDACHECK(cudaDeviceSynchronize());
+  return testSuccess;
 }
 
-__global__ void halve(const float * src, half* dest, size_t N) {
-  for(int tid = threadIdx.x + blockIdx.x*blockDim.x;
-      tid < N; tid += blockDim.x * gridDim.x)
-    dest[tid] = __float2half(src[tid]);
+// For integer values, we use values between 0 and 255
+template<typename T>
+__device__ T testValue(const size_t offset, const int rep, const int rank) {
+  uint8_t v = (rep+rank+offset) % 256;
+  return (T)v;
 }
 
-void RandomizeHalf(void* dest, const size_t N, const int randomSeed) {
-  half* ptr = (half*)dest;
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
-  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
-
-  float* temp;
-  CUDACHECK(cudaMalloc(&temp, N*sizeof(float)));
-  GenerateRandom<float>(gen, temp, N);
-  halve<<<128, 512>>>(temp, ptr, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
-  CUDACHECK(cudaFree(temp));
-  CUDACHECK(cudaDeviceSynchronize());
+// For floating point datatype, we use values between 0 and 1 otherwise the
+// Product operation will produce NaNs.
+template<>
+__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
+  return __float2half(testValue<float>(offset, rep, rank));
 }
 
-void Randomize(void* ptr, const size_t count, ncclDataType_t type, const int seed) {
-  switch (type) {
-    case ncclChar:   RandomizeType<int8_t>  (ptr, count, seed); break;
-#if NCCL_MAJOR >= 2
-    case ncclUint8:  RandomizeType<uint8_t> (ptr, count, seed); break;
-#endif
-    case ncclInt:    RandomizeType<int32_t> (ptr, count, seed); break;
-#if NCCL_MAJOR >= 2
-    case ncclUint32: RandomizeType<uint32_t>(ptr, count, seed); break;
-#endif
-    case ncclInt64:  RandomizeType<int64_t> (ptr, count, seed); break;
-    case ncclUint64: RandomizeType<uint64_t>(ptr, count, seed); break;
-    case ncclHalf:   RandomizeHalf          (ptr, count, seed); break;
-    case ncclFloat:  RandomizeType<float>   (ptr, count, seed); break;
-    case ncclDouble: RandomizeType<double>  (ptr, count, seed); break;
-  }
-}
+// Operations
+template<typename T>
+__device__ T ncclOpSum(T a, T b) { return a+b; }
+template<typename T>
+__device__ T ncclOpProd(T a, T b) { return a*b; }
+template<typename T>
+__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
+template<typename T>
+__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
 
-template<typename T, int OP> __global__ static
-void accumKern(T* acum, const T* contrib, size_t N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    T c = contrib[i];
-    T a = acum[i];
-    if(OP == ncclSum) {
-      acum[i] = a+c;
-    } else if(OP == ncclProd) {
-      acum[i] = a*c;
-    } else if(OP == ncclMax) {
-      acum[i] = (a > c) ? a : c;
-    } else if(OP == ncclMin) {
-      acum[i] = (a < c) ? a : c;
+// Definitions for half
+template<>
+__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
+template<>
+__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
+template<>
+__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
+template<>
+__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
+
+template<typename T, T (*Op)(T, T)>
+__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
+    T val = testValue<T>(o+offset, rep, 0);
+    for (int i=1; i<nranks; i++) {
+      val = Op(val, testValue<T>(o+offset, rep, i));
     }
+    data[o] = val;
   }
 }
 
-template<> __global__
-void accumKern<half, ncclSum>(half* acum, const half* contrib, size_t N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( a + c );
-  }
-}
+#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type>>
+#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
-template<> __global__
-void accumKern<half, ncclProd>(half* acum, const half* contrib, size_t N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( a * c );
-  }
-}
+static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+};
 
-template<> __global__
-void accumKern<half, ncclMax>(half* acum, const half* contrib, size_t N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( (a>c) ? a : c );
-  }
-}
-
-template<> __global__
-void accumKern<half, ncclMin>(half* acum, const half* contrib, size_t N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( (a<c) ? a : c );
-  }
+testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
+  dim3 grid = { 32, 1, 1 };
+  dim3 block = { 256, 1, 1 };
+  void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*ncclNumOps+op], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
 }
 
 template<typename T>
-void accVecType(void* out, void* in, size_t n, ncclRedOp_t op) {
-  switch(op) {
-    case ncclSum:  accumKern<T, ncclSum> <<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclProd: accumKern<T, ncclProd><<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclMax:  accumKern<T, ncclMax> <<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclMin:  accumKern<T, ncclMin> <<<256,256>>>((T*)out, (T*)in, n); break;
-    default:
-      printf("Unknown reduction operation.\n");
-      exit(EXIT_FAILURE);
-  }
+__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x)
+    data[o] = testValue<T>(o, rep, rank);
 }
 
-void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op) {
-  switch (type) {
-    case ncclChar:   accVecType<int8_t>   (out, in, n, op); break;
-#if NCCL_MAJOR >= 2
-    case ncclUint8:  accVecType<uint8_t>  (out, in, n, op); break;
-#endif
-    case ncclInt:  accVecType<int32_t>  (out, in, n, op); break;
-#if NCCL_MAJOR >= 2
-    case ncclUint32: accVecType<uint32_t> (out, in, n, op); break;
-#endif
-    case ncclInt64:  accVecType<int64_t>  (out, in, n, op); break;
-    case ncclUint64: accVecType<uint64_t> (out, in, n, op); break;
-    case ncclHalf:   accVecType<half>     (out, in, n, op); break;
-    case ncclFloat:  accVecType<float>    (out, in, n, op); break;
-    case ncclDouble: accVecType<double>   (out, in, n, op); break;
-    default:
-      printf("Unknown reduction type.\n");
-      exit(EXIT_FAILURE);
-  }
+static void* const initDataKerns[ncclNumTypes] = {
+  (void*)InitDataKernel<  int8_t>,
+  (void*)InitDataKernel< uint8_t>,
+  (void*)InitDataKernel< int32_t>,
+  (void*)InitDataKernel<uint32_t>,
+  (void*)InitDataKernel< int64_t>,
+  (void*)InitDataKernel<uint64_t>,
+  (void*)InitDataKernel<    half>,
+  (void*)InitDataKernel<   float>,
+  (void*)InitDataKernel<  double>
+};
+
+template<typename T>
+testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) {
+  T* ptr = (T*)dest;
+  InitDataKernel<<<16, 512>>>(ptr, N, rep, rank);
+  return testSuccess;
 }
 
-void Barrier(struct threadArgs_t* args)
+testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) {
+  dim3 grid = { 32, 1, 1 };
+  dim3 block = { 256, 1, 1 };
+  void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank };
+  CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+void Barrier(struct threadArgs* args)
 {
   while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
 
@@ -376,16 +272,7 @@ void Barrier(struct threadArgs_t* args)
   args->barrier_idx=!args->barrier_idx;
 }
 
-void RandomizeAccumulate(void* data, void* accum, size_t count, ncclDataType_t type, ncclRedOp_t op, int seed, int rank) {
-  Randomize(data, count, type, seed);
-  if (rank == 0) {
-    CUDACHECK(cudaMemcpy(accum, data, count*wordSize(type), cudaMemcpyDeviceToHost));
-  } else {
-    Accumulate(accum, data, count, type, op);
-  }
-}
-
-double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
   size_t count = args->expectedBytes/wordSize(type);
   double maxDelta = 0.0;
   for (int i=0; i<args->nGpus; i++) {
@@ -394,24 +281,25 @@ double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
     CUDACHECK(cudaSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
-    CheckDelta(data , args->expected[i], count, type, args->delta);
-    cudaDeviceSynchronize();
+    TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta));
     maxDelta = std::max(*(args->deltaHost), maxDelta);
 
 #ifdef DEBUG_PRINT
-    if (rank == 0) { 
-       int *temp = (int *)malloc(args->expectedBytes);
+    if (rank == 0) {
+       int *expectedHost = (int *)malloc(args->expectedBytes);
+       int *dataHost = (int *)malloc(args->expectedBytes);
 
+       cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost);
        printf("\n Expected: ");
-       for(int j=0; j<args->expectedBytes/sizeof(int); j++) { 
-       	printf("%d:%d ", j, *((int *)args->expectedHost[0] + j));
+       for(int j=0; j<args->expectedBytes/sizeof(int); j++) {
+         printf("%d:%d ", j, expectedHost[j]);
        }
        printf("\n");
 
-       cudaMemcpy(temp, data, args->expectedBytes, cudaMemcpyDeviceToHost);
+       cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
        printf("\n Actual: ");
-       for (int j=0; j<args->expectedBytes/sizeof(int); j++) { 
-       	printf("%d:%d ", j, *((int *)temp + j));
+       for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
+         printf("%d:%d ", j, dataHost[j]);
        }
        printf("\n");
        free(temp);
@@ -420,173 +308,173 @@ double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,
   }
   double nranks = args->nProcs*args->nThreads*args->nGpus;
   if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
-  return maxDelta;
+  *delta = maxDelta;
+  return testSuccess;
 }
 
-void InitSend(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  size_t count = args->sendBytes / wordSize(type);
-  static int rep = 1;
-  for (int i=0; i<args->nGpus; i++) {
-    int device;
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
-    void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i];
-    int seed = rank+count+rep+in_place;
-    Randomize(data, count, type, seed);
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
+  cudaError_t cudaErr;
+  int remaining = ngpus;
+  int* done = (int*)malloc(sizeof(int)*ngpus);
+  memset(done, 0, sizeof(int)*ngpus);
+  while (remaining) {
+   int idle = 1;
+   for (int i=0; i<ngpus; i++) {
+     if (done[i]) continue;
 
-#ifdef DEBUG_PRINT
-    if (rank == 2) { 
-       int *temp = (int *)malloc(args->sendBytes);
-       cudaMemcpy(temp, data, args->sendBytes, cudaMemcpyDeviceToHost);
-       printf("\n Send Data at rank %d:", rank);
-       for (int i=0; i<args->sendBytes/sizeof(int); i++) { 
-       	printf("%d:%d ", i, *((int *)temp + i));
+     cudaErr = cudaStreamQuery(streams[i]);
+     if (cudaErr == cudaSuccess) {
+       done[i] = 1;
+       remaining--;
+       idle = 0;
+       continue;
+     }
+
+     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+     if (comms) {
+       ncclResult_t ncclAsyncErr;
+       NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
+       if (ncclAsyncErr != ncclSuccess) {
+         // An asynchronous error happened. Stop the operation and destroy
+         // the communicator
+         for (int i=0; i<ngpus; i++)
+           NCCLCHECK(ncclCommAbort(comms[i]));
+         // Abort the perf test
+         NCCLCHECK(ncclAsyncErr);
        }
-       printf("\n");
-       free(temp);
-    }
+     }
 #endif
+   }
 
-    cudaDeviceSynchronize();
+   if (idle) pthread_yield();  // let other threads (including NCCL threads) use the CPU
   }
-  rep++;
+  free(done);  // release the per-stream completion flags allocated at the top of this function
+  return testSuccess;
 }
 
-#define CHECK 1
-
-void startColl(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int thread_offset) {
+testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int iter) {
   size_t count = args->nbytes / wordSize(type);
 
-  if (swap_args) {
-      args = (struct threadArgs_t*)args->proc_args + (args->thread + thread_offset)%args->nThreads;
-  }
+  // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
+  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  size_t shift = (totalnbytes * iter) % args->maxbytes;
+  if (shift + totalnbytes > args->maxbytes) shift = 0;
 
-  if (args->nGpus == 1) {
-    int rank = args->proc*args->nThreads + args->thread;
-    RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[0] + args->sendInplaceOffset*rank)) : args->sendbuffs[0]),
-        (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[0] + args->recvInplaceOffset*rank) : args->recvbuffs[0]),
-        count, type, op, root, args->comms[0], args->streams[0]);
-  } else {
-    NCCLCHECK(ncclGroupStart());
-    for (int i = 0; i < args->nGpus; i++) {
+  if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
+  for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-      int cudaDev;
-      NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
-      CUDACHECK(cudaSetDevice(cudaDev));
+    int cudaDev;
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
+    CUDACHECK(cudaSetDevice(cudaDev));
 #endif
-      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-      RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank)) : args->sendbuffs[i]),
-          (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank) : args->recvbuffs[i]),
-          count, type, op, root, args->comms[i], args->streams[i]);
-    }
-    NCCLCHECK(ncclGroupEnd());
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
+    char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
+    TESTCHECK(args->collTest->runColl(
+          (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
+          (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
+        count, type, op, root, args->comms[i], args->streams[i]));
   }
+  if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd());
 
-  if (swap_args || blocking_coll) {
-    //if args have been swapped, complete op before returning
-    for (int i = 0; i < args->nGpus; ++i) {
-      cudaError_t err = cudaErrorNotReady;
-      while (err == cudaErrorNotReady) { 
-          err = cudaStreamQuery(args->streams[i]);
-          pthread_yield();	
-      }
-      CUDACHECK(err);
-    }
+  if (blocking_coll) {
+    // Complete op before returning
+    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
   }
   if (blocking_coll) Barrier(args);
+  return testSuccess;
 }
 
-void completeColl(struct threadArgs_t* args) {
-  //it swap_args was enabled, op would have been completed immediately
-  if (swap_args || blocking_coll) return;
+testResult_t completeColl(struct threadArgs* args) {
+  if (blocking_coll) return testSuccess;
 
-  for (int i = 0; i < args->nGpus; ++i) {
-    cudaError_t err = cudaErrorNotReady;
-    while (err == cudaErrorNotReady) { 
-        err = cudaStreamQuery(args->streams[i]);
-        pthread_yield();	
-    }
-    CUDACHECK(err);
-  }
+  TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  return testSuccess;
 }
 
-void BenchTime(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
-  
+
   // Sync
-  startColl(args, type, op, root, in_place, 0);
-  completeColl(args);
+  TESTCHECK(startColl(args, type, op, root, in_place, 0));
+  TESTCHECK(completeColl(args));
 
   Barrier(args);
 
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
-      startColl(args, type, op, root, in_place, iter); 
+    if (agg_iters>1) NCCLCHECK(ncclGroupStart());
+    for (int aiter = 0; aiter < agg_iters; aiter++) {
+      TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter));
+    }
+    if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
-  completeColl(args);
+  TESTCHECK(completeColl(args));
 
   auto delta = std::chrono::high_resolution_clock::now() - start;
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
-  deltaSec = deltaSec/iters;
+  deltaSec = deltaSec/(iters*agg_iters);
 
   double algBw, busBw;
-  GetBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
 
   Barrier(args);
 
-  if (datacheck) { 
-      InitSend(args, type, op, root, in_place, args->thread == 0 ? 1 : 0);
-      InitRecvResult(args, type, op, root, in_place, args->thread == 0 ? 1 : 0);
-      cudaDeviceSynchronize();
-  }
-
-  //test validation in single itertion, should ideally be included into the multi-iteration run
-  startColl(args, type, op, root, in_place, 0); 
-  completeColl(args);
-
   double maxDelta = 0;
-#ifdef CHECK
-  if (datacheck) { 
-     maxDelta = CheckData(args, type, op, root, in_place);
-  } else { 
-     maxDelta = -1.0;
-  }
-#else
-     maxDelta = -1.0;
-#endif
+  static __thread int rep = 0;
+  rep++;
+  if (datacheck) {
+      // Initialize sendbuffs, recvbuffs and expected
+      TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
-  //aggregate delta from all threads and procs
-  Barrier(args);
-  if (args->thread == 0) {
-      for (int i=1; i<args->nThreads; i++) { 
+      //test validation in single iteration, should ideally be included into the multi-iteration run
+      TESTCHECK(startColl(args, type, op, root, in_place, 0));
+      TESTCHECK(completeColl(args));
+
+      TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+
+      //aggregate delta from all threads and procs
+      Barrier(args);
+      if (args->thread == 0) {
+        for (int i=1; i<args->nThreads; i++) {
           maxDelta += args->deltaThreads[i];
-      }
+        }
 #ifdef MPI_SUPPORT
-      MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+        MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 #endif
+      }
+      Barrier(args);
   }
-  Barrier(args);
 
-  if (datacheck) { 
-     PRINT("  %7.3f  %5.2f  %5.2f  %7.0le", deltaSec * 1.0E3, algBw, busBw,
-         maxDelta);
+  double timeUsec = deltaSec*1.0E6;  // average time per collective, in microseconds
+  char timeStr[10];
+  if (timeUsec > 10000.0) {
+    snprintf(timeStr, sizeof(timeStr), "%7.0f", timeUsec);  // bounded: values >= 1e9 us would need 11 bytes
+  } else if (timeUsec > 100.0) {
+    snprintf(timeStr, sizeof(timeStr), "%7.1f", timeUsec);
   } else {
-     PRINT("  %7.3f  %5.2f  %5.2f  \tN/A", deltaSec * 1.0E3, algBw, busBw);
+    snprintf(timeStr, sizeof(timeStr), "%7.2f", timeUsec);
+  }
+  if (datacheck) {
+     PRINT("  %7s  %6.2f  %6.2f  %5.0le", timeStr, algBw, busBw, maxDelta);
+  } else {
+     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
   }
 
   args->bw[0] += busBw;
   args->bw_count[0]++;
+  return testSuccess;
 }
 
-void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) {
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
   int nranks = args->nProcs*args->nGpus*args->nThreads;
-  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset, procSharedCount;
-  int sameExpected;
-  
+  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
+
   count = size / wordSize(type);
-  getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, &procSharedCount, &sameExpected, (size_t)count, (size_t)nranks);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
 
   args->nbytes = paramCount * wordSize(type);
   args->sendBytes = sendCount * wordSize(type);
@@ -595,260 +483,224 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) {
   args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
 }
 
-void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace) {
-  // Warm-up
+testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
+  // Warm-up for large size
   setupArgs(args->maxbytes, type, args);
   for (int iter = 0; iter < warmup_iters; iter++) {
-     startColl(args, type, op, root, 0, iter);
+    TESTCHECK(startColl(args, type, op, root, 0, iter));
   }
-  completeColl(args);
+  TESTCHECK(completeColl(args));
+
+  // Warm-up again at the smallest message size (args->minbytes); the pass above used args->maxbytes
+  setupArgs(args->minbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+    TESTCHECK(startColl(args, type, op, root, 0, iter));
+  }
+  TESTCHECK(completeColl(args));
 
   // Benchmark
   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
       print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
-      BenchTime(args, type, op, root, 0);
-      if (inPlace) BenchTime(args, type, op, root, 1);
+      TESTCHECK(BenchTime(args, type, op, root, 0));
+      TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
   }
+  return testSuccess;
 }
 
-
-void* threadRunTests(void* args) {
-  struct threadArgs_t* targs = (struct threadArgs_t*)args;
+testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus;
+  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
   CUDACHECK(cudaSetDevice(gpuid));
-
-  RunTest(targs, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]);
-
-  return NULL;
+  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+  return testSuccess;
 }
 
-void* threadInit(void* args) {
-  struct threadArgs_t* targs = (struct threadArgs_t*)args;
+testResult_t threadInit(struct threadArgs* args) {
   char hostname[1024];
   getHostName(hostname, 1024);
-  int nranks =  targs->nProcs*targs->nThreads*targs->nGpus;
+  int nranks =  args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
-  is_main_thread = (targs->proc == 0 && targs->thread == 0) ? 1 : 0;
+  is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
 
   NCCLCHECK(ncclGroupStart());
-  for (int i=0; i<targs->nGpus; i++) {
-    int rank = targs->proc*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i;
-    int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i;
+  for (int i=0; i<args->nGpus; i++) {
+    int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
     CUDACHECK(cudaSetDevice(gpuid));
-    NCCLCHECK(ncclCommInitRank(targs->comms+i, nranks, targs->ncclId, rank));
+    NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
 
-  PRINT("# Using devices\n");
-  for (int p=0; p<targs->nProcs; p++) {
-    if (p == targs->proc) {
-      for (int t=0; t<targs->nThreads; t++) {
-        if (t == targs->thread) {
-          for (int i=0; i<targs->nGpus; i++) {
-            int cudaDev;
-            int rank;
-            cudaDeviceProp prop;
-            NCCLCHECK(ncclCommCuDevice(targs->comms[i], &cudaDev));
-            NCCLCHECK(ncclCommUserRank(targs->comms[i], &rank));
-            CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-            printf("#   Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev,
-                prop.pciBusID, prop.name);
-            fflush(stdout);
-          }
-          Barrier(targs);
-          fflush(stdout);
-	}
-      }
-    }
+  TESTCHECK(threadRunTests(args));
+
+  for (int i=0; i<args->nGpus; i++) {
+    NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
+  return testSuccess;
+}
 
-  threadRunTests(args);
-
+void* threadLauncher(void* thread_) {
+  struct testThread* thread = (struct testThread*)thread_;
+  thread->ret = thread->func(&thread->args);
   return NULL;
 }
-
-void AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, void **expectedHost, size_t nbytes, int nranks, int sameExpected) {
-    static int is_first = 1;
-    static void *cached_ptr = NULL;
-    static void *cached_hostptr = NULL;
-
-    CUDACHECK(cudaMalloc(sendbuff, sendBytes));
-    //work around for inline reduce scatter where recv count is smaller that send count
-    CUDACHECK(cudaMalloc(recvbuff, (sendBytes > recvBytes) ? sendBytes : recvBytes));
-
-    if (is_first || !sameExpected) {
-        *expectedHost = malloc(recvBytes);
-        CUDACHECK(cudaHostRegister(*expectedHost, recvBytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
-        CUDACHECK(cudaHostGetDevicePointer(expected, *expectedHost, 0));
-        cached_ptr = *expected;
-        cached_hostptr = *expectedHost;
-        is_first = 0;
-    } else {
-        *expected = cached_ptr;
-        *expectedHost = cached_hostptr;
-    }
-}
- 
-int ncclstringtotype(char *str) { 
-    for (int t=0; t<ncclNumTypes; t++) {
-      if (strcmp(str, test_typenames[t]) == 0) {
-        return t;
-      }
-    }
-    if (strcmp(str, "all") == 0) {
-      return -1;
-    }
-    printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[nccltype]);
-    return nccltype;
+testResult_t threadLaunch(struct testThread* thread) { // starts threadLauncher on a new pthread; returns testInternalError if the thread cannot be created
+  if (pthread_create(&thread->thread, NULL, threadLauncher, thread) != 0) return testInternalError; // otherwise run() would pthread_join a handle that was never started
+  return testSuccess;
 }
 
-int ncclstringtoop (char *str) { 
-    for (int o=0; o<ncclNumOps; o++) {
-      if (strcmp(str, test_opnames[o]) == 0) {
-        return o;
-      }
-    }
-    if (strcmp(str, "all") == 0) {
-      return -1;
-    }
-    printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclop]);
-    return ncclop;
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+    CUDACHECK(cudaMalloc(sendbuff, nbytes)); // send buffer is sized by nbytes; sendBytes is not used in this function
+    CUDACHECK(cudaMalloc(recvbuff, nbytes)); // recv buffer is sized by nbytes as well
+    CUDACHECK(cudaMalloc(expected, recvBytes)); // `expected` buffer is sized by recvBytes
+    return testSuccess; // reached only if all three allocations succeeded (CUDACHECK returns testCudaError otherwise)
 }
 
+testResult_t run(); // Main function
+
 int main(int argc, char* argv[]) {
- int nThreads = 1, nGpus = 1;
- size_t minBytes = 32*1024*1024, maxBytes = 32*1024*1024, stepBytes = 1*1024*1024, stepFactor = 1;
- int longindex;
- int nProcs = 1, proc = 0;
- int localRank = 0;
- char hostname[1024];
- getHostName(hostname, 1024);
- 
- static struct option longopts[] = {
-    {"nthreads", required_argument, 0, 't'}, 
-    {"ngpus", required_argument, 0, 'g'}, 
-    {"minbytes", required_argument, 0, 'b'}, 
-    {"maxbytes", required_argument, 0, 'e'}, 
-    {"stepbytes", required_argument, 0, 'i'},
-    {"stepfactor", required_argument, 0, 'f'},
-    {"iters", required_argument, 0, 'n'},
-    {"warmup_iters", required_argument, 0, 'w'},
-    {"swap_comms", required_argument, 0, 's'},
-    {"parallel_init", required_argument, 0, 'p'},
-    {"check", required_argument, 0, 'c'},
-    {"blocking", required_argument, 0, 'z'},
-    {"op", required_argument, 0, 'o'},
-    {"datatype", required_argument, 0, 'd'},
-    {"root", required_argument, 0, 'r'},
-    {"help", no_argument, 0, 'h'}
- };
-
- while(1) {
-      int c;
-      c = getopt_long(argc, argv, "t:g:b:e:i:f:n:w:s:p:c:o:d:r:z:h", longopts, &longindex);
-
-      if (c == -1)
-         break;
-
-      switch(c) {
-         case 't':
-             nThreads = strtol(optarg, NULL, 0);
-             break;
-         case 'g':
-             nGpus = strtol(optarg, NULL, 0);
-             break;
-         case 'b':
-             minBytes = (size_t)parsesize(optarg);
-             break;
-         case 'e':
-             maxBytes = (size_t)parsesize(optarg);
-             break;
-         case 'i':
-             stepBytes = strtol(optarg, NULL, 0);
-             break;
-         case 'f':
-             stepFactor = strtol(optarg, NULL, 0);
-             break;
-	 case 'n':
-	     iters = (int)strtol(optarg, NULL, 0);
-	     break;
-	 case 'w':
-	     warmup_iters = (int)strtol(optarg, NULL, 0);
-	     break;
-	 case 's':
-	     swap_args = (int)strtol(optarg, NULL, 0);
-	     break;
-	 case 'c':
-	     datacheck = (int)strtol(optarg, NULL, 0);
-	     break;
-	 case 'p':
-	     parallel_init = (int)strtol(optarg, NULL, 0);
-	     break;
-	 case 'o':
-	     ncclop = ncclstringtoop(optarg);
-	     break;
-	 case 'd':
-	     nccltype = ncclstringtotype(optarg);
-	     break;
-	 case 'r':
-	     ncclroot = strtol(optarg, NULL, 0);
-	     break;
-	 case 'z':
-	     blocking_coll = strtol(optarg, NULL, 0);
-	     break;
-         case 'h':
-	         printf("USAGE: ./test \n\t" 
-	 	 "[-t,--nthreads <num threads>] \n\t "
-		 "[-g,--ngpus <gpus per thread>] \n\t "
-		 "[-b,--minbytes <min size in bytes>] \n\t "
-		 "[-e,--maxbytes <max size in bytes>] \n\t "
-	         "[-i,--stepbytes <increment size>] \n\t "
-		 "[-f,--stepfactor <increment factor>] \n\t "
-		 "[-n,--iters <iteration count>] \n\t "
-		 "[-w,--warmup_iters <warmup iteration count>] \n\t" 
-		 "[-s,--swap_args <0/1>] \n\t "
-		 "[-p,--parallel_init <0/1>] \n\t "
-		 "[-c,--check <0/1>] \n\t "
-		 "[-o,--op <sum/prod/min/max/all>] \n\t "
-		 "[-d,--datatype <nccltype/all>] \n\t "
-		 "[-r,--root <root>] \n\t "
-		 "[-z,--blocking <0/1>] \n\t "
-		 "[-h,--help]\n");
-	         return 0;
-	 default: 
-	         printf("invalid option \n");
-	         printf("USAGE: ./test \n\t" 
-	 	 "[-t,--nthreads <num threads>] \n\t "
-		 "[-g,--ngpus <gpus per thread>] \n\t "
-		 "[-b,--minbytes <min size in bytes>] \n\t "
-		 "[-e,--maxbytes <max size in bytes>] \n\t "
-	         "[-i,--stepbytes <increment size>] \n\t "
-		 "[-f,--stepfactor <increment factor>] \n\t "
-		 "[-n,--iters <iteration count>] \n\t "
-		 "[-w,--warmup_iters <warmup iteration count>] \n\t" 
-		 "[-s,--swap_args <0/1>] \n\t "
-		 "[-p,--parallel_init <0/1>] \n\t "
-		 "[-c,--check <0/1>] \n\t "
-		 "[-o,--op <sum/prod/min/max/all>] \n\t "
-		 "[-d,--datatype <nccltype/all>] \n\t "
-		 "[-r,--root <root>] \n\t "
-		 "[-z,--blocking <0/1>] \n\t "
-		 "[-h,--help]\n");
-	         return 0;
-      }
-  }
-
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
 
+  // Parse args
+  int longindex;
+  static struct option longopts[] = {
+    {"nthreads", required_argument, 0, 't'},
+    {"ngpus", required_argument, 0, 'g'},
+    {"minbytes", required_argument, 0, 'b'},
+    {"maxbytes", required_argument, 0, 'e'},
+    {"stepbytes", required_argument, 0, 'i'},
+    {"stepfactor", required_argument, 0, 'f'},
+    {"iters", required_argument, 0, 'n'},
+    {"agg_iters", required_argument, 0, 'm'},
+    {"warmup_iters", required_argument, 0, 'w'},
+    {"parallel_init", required_argument, 0, 'p'},
+    {"check", required_argument, 0, 'c'},
+    {"op", required_argument, 0, 'o'},
+    {"datatype", required_argument, 0, 'd'},
+    {"root", required_argument, 0, 'r'},
+    {"blocking", required_argument, 0, 'z'},
+    {"help", no_argument, 0, 'h'}
+  };
+
+  while(1) {
+    int c;
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
+
+    if (c == -1)
+      break;
+
+    switch(c) {
+      case 't':
+        nThreads = strtol(optarg, NULL, 0);
+        break;
+      case 'g':
+        nGpus = strtol(optarg, NULL, 0);
+        break;
+      case 'b':
+        minBytes = (size_t)parsesize(optarg);
+        break;
+      case 'e':
+        maxBytes = (size_t)parsesize(optarg);
+        break;
+      case 'i':
+        stepBytes = strtol(optarg, NULL, 0);
+        break;
+      case 'f':
+        stepFactor = strtol(optarg, NULL, 0);
+        break;
+      case 'n':
+        iters = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'm':
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 2)
+        agg_iters = (int)strtol(optarg, NULL, 0); // needs NCCL >= 2.2; (major, minor) is compared as a pair so that e.g. 3.0 also qualifies
+#else
+        printf("Option -m not supported before NCCL 2.2. Ignoring\n");
+#endif
+        break;
+      case 'w':
+        warmup_iters = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'c':
+        datacheck = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'p':
+        parallel_init = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'o':
+        ncclop = ncclstringtoop(optarg);
+        break;
+      case 'd':
+        nccltype = ncclstringtotype(optarg);
+        break;
+      case 'r':
+        ncclroot = strtol(optarg, NULL, 0);
+        break;
+      case 'z':
+        blocking_coll = strtol(optarg, NULL, 0);
+        break;
+      case 'h':
+	printf("USAGE: %s \n\t"
+            "[-t,--nthreads <num threads>] \n\t"
+            "[-g,--ngpus <gpus per thread>] \n\t"
+            "[-b,--minbytes <min size in bytes>] \n\t"
+            "[-e,--maxbytes <max size in bytes>] \n\t"
+            "[-i,--stepbytes <increment size>] \n\t"
+            "[-f,--stepfactor <increment factor>] \n\t"
+            "[-n,--iters <iteration count>] \n\t"
+            "[-m,--agg_iters <aggregated iteration count>] \n\t"
+            "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-p,--parallel_init <0/1>] \n\t"
+            "[-c,--check <0/1>] \n\t"
+            "[-o,--op <sum/prod/min/max/all>] \n\t"
+            "[-d,--datatype <nccltype/all>] \n\t"
+            "[-r,--root <root>] \n\t"
+            "[-z,--blocking <0/1>] \n\t"
+            "[-h,--help]\n",
+	    basename(argv[0]));
+	return 0;
+      default:
+        printf("invalid option \n");
+	printf("USAGE: %s \n\t"
+            "[-t,--nthreads <num threads>] \n\t"
+            "[-g,--ngpus <gpus per thread>] \n\t"
+            "[-b,--minbytes <min size in bytes>] \n\t"
+            "[-e,--maxbytes <max size in bytes>] \n\t"
+            "[-i,--stepbytes <increment size>] \n\t"
+            "[-f,--stepfactor <increment factor>] \n\t"
+            "[-n,--iters <iteration count>] \n\t"
+            "[-m,--agg_iters <aggregated iteration count>] \n\t"
+            "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-p,--parallel_init <0/1>] \n\t"
+            "[-c,--check <0/1>] \n\t"
+            "[-o,--op <sum/prod/min/max/all>] \n\t"
+            "[-d,--datatype <nccltype/all>] \n\t"
+            "[-r,--root <root>] \n\t"
+            "[-z,--blocking <0/1>] \n\t"
+            "[-h,--help]\n",
+	    basename(argv[0]));
+	return 0;
+    }
+  }
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
+#endif
+  return run();
+}
+
+testResult_t run() {
+  int nProcs = 1, proc = 0;
+  int localRank = 0;
+  char hostname[1024];
+  getHostName(hostname, 1024);
+
+#ifdef MPI_SUPPORT
   MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
   uint64_t hostHashs[nProcs];
@@ -861,14 +713,38 @@ int main(int argc, char* argv[]) {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  if (proc == 0) { 
-      printf("nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, 
-      			(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
-      if (swap_args) printf("Swap Comms Enabled: swapping communicators among threads for each iteration \n");
-      if (blocking_coll) printf("Blocking Enabled: wait for completion and barrier after each collective \n"); 
-      if (parallel_init) printf("Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); 
+  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+      (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
+  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
+  PRINT("#\n");
+
+  PRINT("# Using devices\n");
+#define MAX_LINE 2048
+  char line[MAX_LINE] = ""; // start as an empty string so it prints safely even if the loop below appends nothing
+  int len = 0; // snprintf returns the untruncated length, so len may exceed MAX_LINE after a truncation; hence the guard below
+  for (int i=0; i<nThreads*nGpus; i++) {
+    int cudaDev = localRank*nThreads*nGpus+i;
+    int rank = proc*nThreads*nGpus+i;
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    if (len < MAX_LINE) len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
+                    rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
   }
 
+#ifdef MPI_SUPPORT
+  char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
+  // Gather every process's fixed-size (MAX_LINE bytes) device listing to root (0), in rank order
+  MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
+  if (proc == 0) {
+    for (int p = 0; p < nProcs; p++)
+      PRINT("%s", lines+MAX_LINE*p);
+    free(lines);
+  }
+#else
+  PRINT("%s", line);
+#endif
+
   ncclUniqueId ncclId;
   if (proc == 0) {
     NCCLCHECK(ncclGetUniqueId(&ncclId));
@@ -880,23 +756,14 @@ int main(int argc, char* argv[]) {
   void* sendbuffs[nGpus*nThreads];
   void* recvbuffs[nGpus*nThreads];
   void* expected[nGpus*nThreads];
-  void* expectedHost[nGpus*nThreads];
-  void *procSharedHost, *procShared;
-  size_t sendBytes, recvBytes, paramBytes, procSharedBytes, sendInplaceOffset, recvInplaceOffset; 
-  int sameExpected;
+  size_t sendBytes, recvBytes;
 
-  getCollByteCount(&sendBytes, &recvBytes, &paramBytes, &sendInplaceOffset, &recvInplaceOffset, &procSharedBytes, &sameExpected, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
 
   for (int i=0; i<nGpus*nThreads; i++) {
     CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
-    AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, expectedHost+i, (size_t)maxBytes, nProcs*nThreads*nGpus, sameExpected);
-    CUDACHECK(cudaStreamCreate(streams+i));
-  }
-
-  if (procSharedBytes > 0) { 
-      procSharedHost = malloc(procSharedBytes);
-      CUDACHECK(cudaHostRegister(procSharedHost, procSharedBytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
-      CUDACHECK(cudaHostGetDevicePointer(&procShared, procSharedHost, 0));
+    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus)); // propagate allocation failures instead of discarding the result
+    CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
@@ -910,128 +777,113 @@ int main(int argc, char* argv[]) {
        NCCLCHECK(ncclGroupStart());
        for (int i=0; i<nGpus*nThreads; i++) {
          CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
-         NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i)); 
+         NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
        }
        NCCLCHECK(ncclGroupEnd());
      }
-
-     PRINT("# NCCL Tests compiled with NCCL %d.%d\n", NCCL_MAJOR, NCCL_MINOR);
-     PRINT("# Using devices\n");
-     for (int p=0; p<nProcs; p++) {
-       if (p == proc) {
-         for (int i=0; i<nThreads*nGpus; i++) {
-           int cudaDev;
-           int rank;
-           cudaDeviceProp prop;
-           NCCLCHECK(ncclCommCuDevice(comms[i], &cudaDev));
-           NCCLCHECK(ncclCommUserRank(comms[i], &rank));
-           CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-           printf("#   Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev,
-               prop.pciBusID, prop.name);
-           fflush(stdout);
-         }
-       }
-#ifdef MPI_SUPPORT
-       MPI_Barrier(MPI_COMM_WORLD);
-#endif
-       fflush(stdout);
-     }
   }
 
   int errors[nThreads];
   double bw[nThreads];
-  double delta[nThreads];
+  double* delta;
+  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
     errors[t] = bw_count[t] = 0;
   }
 
-  PRINT("\n");
+  PRINT("#\n");
   print_header();
 
   int* sync = (int*)calloc(2, sizeof(int));
   int* barrier = (int*)calloc(2, sizeof(int));
 
-  pthread_t threads[nThreads];
-  struct threadArgs_t args[nThreads];
+  struct testThread threads[nThreads];
+  memset(threads, 0, sizeof(struct testThread)*nThreads);
 
   for (int t=nThreads-1; t>=0; t--) {
-    args[t].proc_args = (void *)args;
-    args[t].minbytes=minBytes;
-    args[t].maxbytes=maxBytes;
-    args[t].stepbytes=stepBytes;
-    args[t].stepfactor=stepFactor;
-    args[t].localRank = localRank;
+    threads[t].args.minbytes=minBytes;
+    threads[t].args.maxbytes=maxBytes;
+    threads[t].args.stepbytes=stepBytes;
+    threads[t].args.stepfactor=stepFactor;
+    threads[t].args.localRank = localRank;
 
-    args[t].nProcs=nProcs;
-    args[t].proc=proc;
-    args[t].nThreads=nThreads;
-    args[t].thread=t;
-    args[t].nGpus=nGpus;
-    args[t].sendbuffs = sendbuffs+t*nGpus;
-    args[t].recvbuffs = recvbuffs+t*nGpus;
-    args[t].ncclId = ncclId;
-    args[t].comms=comms+t*nGpus;
-    args[t].streams=streams+t*nGpus;
+    threads[t].args.nProcs=nProcs;
+    threads[t].args.proc=proc;
+    threads[t].args.nThreads=nThreads;
+    threads[t].args.thread=t;
+    threads[t].args.nGpus=nGpus;
+    threads[t].args.sendbuffs = sendbuffs+t*nGpus;
+    threads[t].args.recvbuffs = recvbuffs+t*nGpus;
+    threads[t].args.expected = expected+t*nGpus;
+    threads[t].args.ncclId = ncclId;
+    threads[t].args.comms=comms+t*nGpus;
+    threads[t].args.streams=streams+t*nGpus;
 
-    args[t].expectedHost = expectedHost + t*nGpus;
-    args[t].expected = expected + t*nGpus;
-    args[t].procSharedHost = procSharedHost; 
-    args[t].procShared = procShared; 
-    args[t].barrier = (volatile int*)barrier;
-    args[t].barrier_idx = 0;
-    args[t].sync = (volatile int*)sync;
-    args[t].sync_idx = 0;
-    args[t].deltaThreads = delta;
-    args[t].deltaHost = (delta + t);
-    CUDACHECK(cudaHostRegister(args[t].deltaHost, sizeof(double), cudaHostRegisterPortable|cudaHostRegisterMapped));
-    CUDACHECK(cudaHostGetDevicePointer(&args[t].delta, args[t].deltaHost, 0));
-    args[t].errors=errors+t;
-    args[t].bw=bw+t;
-    args[t].bw_count=bw_count+t;
+    threads[t].args.barrier = (volatile int*)barrier;
+    threads[t].args.barrier_idx = 0;
+    threads[t].args.sync = (volatile int*)sync;
+    threads[t].args.sync_idx = 0;
+    threads[t].args.deltaThreads = delta;
+    threads[t].args.deltaHost = (delta + t);
+    threads[t].args.delta = delta;
+    threads[t].args.errors=errors+t;
+    threads[t].args.bw=bw+t;
+    threads[t].args.bw_count=bw_count+t;
 
-    if (!parallel_init) { 
-       if (t) 
-         pthread_create(threads+t, NULL, threadRunTests, args+t);
-       else
-         threadRunTests(args);
-    } else {
-        if (t || (parallel_init && (proc == 0))) 
-         pthread_create(threads+t, NULL, threadInit, args+t);
-       else  
-         threadInit(args);
+    threads[t].func = parallel_init ? threadInit : threadRunTests;
+    if (t)
+      TESTCHECK(threadLaunch(threads+t));
+    else
+      TESTCHECK(threads[t].func(&threads[t].args));
+  }
+
+  // Wait for other threads and accumulate stats and errors
+  for (int t=nThreads-1; t>=0; t--) {
+    if (t) pthread_join(threads[t].thread, NULL);
+    TESTCHECK(threads[t].ret);
+    if (t) {
+      errors[0] += errors[t];
+      bw[0] += bw[t];
+      bw_count[0] += bw_count[t];
     }
   }
 
-  // Wait for other threads
-  for (int t=nThreads-1; t>=0; t--) {
-    if (t || (parallel_init && (proc == 0))) pthread_join(threads[t], NULL);
-    errors[0] += errors[t];
-    bw[0] += bw[t];
-    bw_count[0] += bw_count[t];
-  }
-
 #ifdef MPI_SUPPORT
-    MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #endif
 
-  for(int i=0; i<nGpus*nThreads; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
+  if (!parallel_init) { // with parallel_init, threadInit destroys each thread's communicators itself
+    for(int i=0; i<nGpus*nThreads; ++i)
+      NCCLCHECK(ncclCommDestroy(comms[i]));
+  }
+  free(comms); // the array itself is released in both modes; all worker threads have been joined above
+
+  // Free off CUDA allocated memory
+  for (int i=0; i<nGpus*nThreads; i++) {
+    CUDACHECK(cudaFree(sendbuffs[i]));
+    CUDACHECK(cudaFree(recvbuffs[i]));
+    CUDACHECK(cudaFree(expected[i]));
+  }
+  CUDACHECK(cudaFreeHost(delta));
 
   char* str = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = str ? atof(str) : -1;
   bw[0] /= bw_count[0];
 
-  PRINT(" Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT(" Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
-  PRINT("\n");
+  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("#\n");
 #ifdef MPI_SUPPORT
   MPI_Finalize();
 #endif
+
+  // 'cuda-memcheck --leak-check full' requires this
+  cudaDeviceReset();
+
   if (errors[0] || bw[0] < check_avg_bw*(0.9))
     exit(EXIT_FAILURE);
-  else 
+  else
     exit(EXIT_SUCCESS);
 }
diff --git a/src/common.h b/src/common.h
index 81b0436d75..8fb5b8cadf 100644
--- a/src/common.h
+++ b/src/common.h
@@ -1,8 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
+#ifndef __COMMON_H__
+#define __COMMON_H__
 
 #include "nccl.h"
 #include <stdio.h>
@@ -17,23 +19,75 @@
 #define CUDACHECK(cmd) do {                         \
   cudaError_t e = cmd;                              \
   if( e != cudaSuccess ) {                          \
-    printf("Cuda failure %s:%d '%s'\n",             \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
+         hostname,                                  \
         __FILE__,__LINE__,cudaGetErrorString(e));   \
-    exit(EXIT_FAILURE);                             \
+    return testCudaError;                           \
   }                                                 \
 } while(0)
 
 #define NCCLCHECK(cmd) do {                         \
   ncclResult_t r = cmd;                             \
   if (r!= ncclSuccess) {                            \
-    printf("NCCL failure %s:%d '%s'\n",             \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test NCCL failure %s:%d '%s'\n",    \
+         hostname,                                  \
         __FILE__,__LINE__,ncclGetErrorString(r));   \
-    exit(EXIT_FAILURE);                             \
+    return testNcclError;                           \
   }                                                 \
 } while(0)
 
-struct threadArgs_t {
-  void *proc_args;
+typedef enum {
+  testSuccess = 0,
+  testInternalError = 1,
+  testCudaError = 2,
+  testNcclError = 3,
+  testCuRandError = 4
+} testResult_t;
+
+// Relay a non-success testResult_t to the caller (via return), printing hostname and file:line as a trace
+#define TESTCHECK(cmd) do {                         \
+  testResult_t r = cmd;                             \
+  if (r!= testSuccess) {                            \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf(" .. %s: Test failure %s:%d\n",          \
+         hostname,                                  \
+        __FILE__,__LINE__);                         \
+    return r;                                       \
+  }                                                 \
+} while(0)
+
+struct testColl {
+  const char name[20];
+  void (*getCollByteCount)(
+      size_t *sendcount, size_t *recvcount, size_t *paramcount,
+      size_t *sendInplaceOffset, size_t *recvInplaceOffset,
+      size_t count, int nranks);
+  testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
+      ncclRedOp_t op, int root, int rep, int in_place);
+  void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
+      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+};
+extern struct testColl allReduceTest;
+extern struct testColl allGatherTest;
+extern struct testColl reduceScatterTest;
+extern struct testColl broadcastTest;
+extern struct testColl reduceTest;
+
+struct testEngine {
+  void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
+  testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
+      const char* typeName, ncclRedOp_t op, const char* opName);
+};
+
+extern struct testEngine ncclTestEngine;
+
+struct threadArgs {
   size_t nbytes;
   size_t minbytes;
   size_t maxbytes;
@@ -55,11 +109,8 @@ struct threadArgs_t {
   ncclComm_t* comms;
   cudaStream_t* streams;
 
-  void** expectedHost;
   void** expected;
   size_t expectedBytes;
-  void* procSharedHost;
-  void* procShared;
   volatile int* sync;
   int sync_idx;
   volatile int* barrier;
@@ -72,27 +123,28 @@ struct threadArgs_t {
   int* errors;
   double* bw;
   int* bw_count;
+
+  struct testColl* collTest;
+};
+
+typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
+struct testThread {
+  pthread_t thread;
+  threadFunc_t func;
+  struct threadArgs args;
+  testResult_t ret;
 };
 
 #include <chrono>
 
 // Provided by common.cu
-extern void Barrier(struct threadArgs_t* args);
-extern void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root, int inPlace);
-extern void Randomize(void* ptr, size_t count, ncclDataType_t type, int seed);
-extern void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op);
-extern void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax);
-extern double DeltaMaxValue(ncclDataType_t type);
+extern void Barrier(struct threadArgs* args);
+extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
+extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
+extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
+extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks); // must match the definition in common.cu
 
 // Provided by each coll
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName);
-extern void GetBw(size_t count, int typeSize, double sec, double* algBw, double* busBw, int nranks);
-extern void RunColl(void* sendbuf, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op,  int root, ncclComm_t comm, cudaStream_t stream);
-extern void InitData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,  int in_place, int is_first);
-extern double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op);
-extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
-extern void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op,  int root, int in_place, int is_first);
-extern void getCollByteCount(size_t *sendbytes, size_t *recvbytes, size_t *parambytes, size_t *sendInlineOffset, size_t *recvInlineOffset, size_t *procSharedBytes, int *sameexpected, size_t nbytes, int nranks);
 extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
 extern void print_header();
 
@@ -152,7 +204,33 @@ extern const char *test_typenames[ncclNumTypes];
 extern ncclRedOp_t test_ops[ncclNumOps];
 extern const char *test_opnames[ncclNumOps];
 
+static int ncclstringtotype(char *str) { // Look up str in test_typenames and return the index of the matching entry.
+    for (int t=0; t<ncclNumTypes; t++) {
+      if (strcmp(str, test_typenames[t]) == 0) {
+        return t;
+      }
+    }
+    if (strcmp(str, "all") == 0) {
+      return -1; // "all" sentinel; the RunTest functions compare (int)type against -1 to sweep every type
+    }
+    printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]); // unknown name: warn and fall back instead of failing
+    return ncclFloat;
+}
+
+static int ncclstringtoop (char *str) { // Look up str in test_opnames and return the index of the matching entry.
+    for (int o=0; o<ncclNumOps; o++) {
+      if (strcmp(str, test_opnames[o]) == 0) {
+        return o;
+      }
+    }
+    if (strcmp(str, "all") == 0) {
+      return -1; // "all" sentinel; the RunTest functions compare (int)op against -1 to sweep every op
+    }
+    printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]); // unknown name: warn and fall back instead of failing
+    return ncclSum;
+}
+
 extern thread_local int is_main_thread;
 #define PRINT if (is_main_thread) printf
 
-
+#endif
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 1a56b571cf..020a4bc36f 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -1,7 +1,7 @@
 /*************************************************************************
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #ifndef NCCL1_COMPAT_H
diff --git a/src/reduce.cu b/src/reduce.cu
index 0bc9a7db83..08825e45b0 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -1,159 +1,123 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <assert.h>
 #include "cuda_runtime.h"
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %6s %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op", "root",
-      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop", "root",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
   PRINT("%12li  %12li  %6s  %6s  %6i", size, count, typeName, opName, root);
 }
 
-void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
-    *sendcount = count;
-    *recvcount = count;
-    *sameExpected = 0;
-    *procSharedCount = count;
-    *sendInplaceOffset = 0;
-    *recvInplaceOffset = 0;
-    *paramcount = *sendcount;
- }
-
-void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  size_t count = args->expectedBytes / wordSize(type);
-  int root_gpu = root%args->nGpus;
-
-  assert(args->expectedBytes == args->nbytes);
-
-  while (args->sync[args->sync_idx] != args->thread) pthread_yield();
-
-  for (int i=0; i<args->nGpus; i++) {
-    int device;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-
-    if (is_first && i == 0) {
-      CUDACHECK(cudaMemcpy(args->procSharedHost, data, count*wordSize(type), cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate(args->procShared, data, count, type, op);
-    }
-
-    if (in_place == 0) {
-      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    }
-    CUDACHECK(cudaDeviceSynchronize());
-  }
-
-  args->sync[args->sync_idx] = args->thread + 1;
-
-  if (args->thread+1 == args->nThreads) {
-#ifdef MPI_SUPPORT
-    int root_proc = root/(args->nThreads*args->nGpus);
-    if (args->expectedBytes) {
-      // Last thread does the MPI reduction
-      if (root_proc == args->proc) { 
-        void* temp, *tempHost = malloc(args->expectedBytes);
-        CUDACHECK(cudaHostRegister(tempHost, args->expectedBytes, 0));
-        CUDACHECK(cudaHostGetDevicePointer(&temp, tempHost, 0));
-
-        for (int i=0; i<args->nProcs; i++) {
-          if (i == args->proc) continue;
-          MPI_Recv(tempHost, args->expectedBytes, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
-          Accumulate(args->procShared, temp, count, type, op);
-          CUDACHECK(cudaDeviceSynchronize());
-        }
-
-        CUDACHECK(cudaHostUnregister(tempHost));
-        free(tempHost);
-      } else {
-        MPI_Send(args->procSharedHost, args->expectedBytes, MPI_BYTE, root_proc, 0, MPI_COMM_WORLD);
-      }
-    }
-#endif
-    args->sync[args->sync_idx] = 0;
-  } else {
-    while (args->sync[args->sync_idx]) pthread_yield();
-  }
-
-  //if root fill expected bytes with reduced data
-  // else if in_place, leave fill it with original data, else set to zero
-  for (int i=0; i<args->nGpus; i++) {
-      int rank = (args->proc*args->nThreads + args->thread)*args->nGpus + i;
-      if (rank == root) { 
-          memcpy(args->expectedHost[root_gpu], args->procSharedHost, args->expectedBytes); 
-      } else { 
-         if (in_place == 1) {
-              CUDACHECK(cudaMemcpy(args->expectedHost[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDeviceToHost));
-          } else {
-              memset(args->expectedHost[i], 0, args->expectedBytes); 
-          }
-      } 
-  }
-
-  args->sync_idx = !args->sync_idx;
+void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
 }
 
-void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { // Fill the input and expected buffers for one Reduce run on every GPU owned by this thread.
+  size_t sendcount = args->sendBytes / wordSize(type); // byte sizes converted to element counts
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus; // total ranks = processes * threads * GPUs per thread
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; // device index derived from localRank, thread and i
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); // rank index across processes, threads and GPUs
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; // in-place runs take their input from the receive buffer
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); // default expectation: recvbuff exactly as initialized above
+    if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); // only the root rank's expected buffer is overwritten by InitDataReduce
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
   *algBw = baseBw;
   *busBw = baseBw;
 }
 
-void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
+  return testSuccess;
 }
 
+struct testColl reduceTest = { // Name and callbacks for the Reduce collective; installed into args->collTest by ReduceRunTest.
+  "Reduce",
+  ReduceGetCollByteCount,
+  ReduceInitData,
+  ReduceGetBw,
+  ReduceRunColl
+};
 
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { // Report only the send/recv counts that ReduceGetCollByteCount computes.
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset; // outputs the callee requires but this wrapper discards
+  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &reduceTest;
   ncclDataType_t *run_types;
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
   int begin_root, end_root;
 
-  if ((int)type != -1) { 
+  if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
     run_typenames = &typeName;
-  } else { 
+  } else {
     type_count = ncclNumTypes;
     run_types = test_types;
     run_typenames = test_typenames;
   }
 
-  if ((int)op != -1) { 
+  if ((int)op != -1) {
     op_count = 1;
     run_ops = &op;
     run_opnames = &opName;
-  } else { 
+  } else {
     op_count = ncclNumOps;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
 
-  if (root != -1) { 
-     begin_root = end_root = root;
-  } else { 
-     begin_root = 0;
-     end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  if (root != -1) {
+    begin_root = end_root = root;
+  } else {
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;
   }
 
-  for (int i=0; i<type_count; i++) { 
-      for (int j=0; j<op_count; j++) { 
-         for (int k=begin_root; k<=end_root; k++) { 
-             TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k, 1);
-         }
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      for (int k=begin_root; k<=end_root; k++) {
+        TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
       }
-  }   
+    }
+  }
+  return testSuccess;
 }
+
+struct testEngine reduceEngine = { // Buffer-size and run entry points for Reduce; the #pragma weak that follows aliases ncclTestEngine to it.
+  ReduceGetBuffSize,
+  ReduceRunTest
+};
+
+#pragma weak ncclTestEngine=reduceEngine
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index ef2b1b79d3..0b1d986952 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -1,99 +1,52 @@
 /*************************************************************************
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "cuda_runtime.h"
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op",
-      "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
   PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
 }
 
-void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
-    *sendcount = (count/nranks)*nranks;
-    *recvcount = count/nranks;
-    *sameExpected = 0;
-    *procSharedCount = *sendcount;
-    *sendInplaceOffset = 0;
-    *recvInplaceOffset = count/nranks;
-    *paramcount = *recvcount;
+void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = (count/nranks)*nranks;
+  *recvcount = count/nranks;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = count/nranks;
+  *paramcount = *recvcount;
 }
 
-void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
-  size_t recvbytes = args->expectedBytes;
-  size_t recvcount = args->expectedBytes / wordSize(type);
-  size_t sendbytes = args->sendBytes;
+testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
-
-  while (args->sync[args->sync_idx] != args->thread) pthread_yield();
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int device;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-
-    if (is_first && i == 0) {
-      CUDACHECK(cudaMemcpy(args->procSharedHost, data, sendbytes, cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate(args->procShared, data, sendcount, type, op);
-    }
-
-    CUDACHECK(cudaDeviceSynchronize());
-    if (in_place == 0) {
-      CUDACHECK(cudaMemset(args->recvbuffs[i], 0, recvbytes));
-    }
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
     CUDACHECK(cudaDeviceSynchronize());
   }
-
-  args->sync[args->sync_idx] = args->thread + 1;
-
-  if (args->thread+1 == args->nThreads) {
-#ifdef MPI_SUPPORT
-    if (sendbytes > 0) {
-      // Last thread does the MPI reduction
-      void* remote, *remoteHost = malloc(sendbytes);
-      void* myInitialData = malloc(sendbytes);
-      memcpy(myInitialData, args->procSharedHost, sendbytes);
-      CUDACHECK(cudaHostRegister(remoteHost, sendbytes, 0));
-      CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));
-
-      for (int i=0; i<args->nProcs; i++) {
-        if (i == args->proc) {
-          MPI_Bcast(myInitialData, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
-          free(myInitialData);
-        } else {
-          MPI_Bcast(remoteHost, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
-          Accumulate(args->procShared, remote, sendcount, type, op);
-          cudaDeviceSynchronize();
-        }
-      }
-      CUDACHECK(cudaHostUnregister(remoteHost));
-      free(remoteHost);
-    }
-#endif
-    args->sync[args->sync_idx] = 0;
-  } else {
-    while (args->sync[args->sync_idx]) pthread_yield();
-  }
-
-  for (int i=0; i<args->nGpus; i++) {
-      int offset = ((args->proc*args->nThreads + args->thread)*args->nGpus + i)*recvbytes;
-      memcpy(args->expectedHost[i], (void *)((uintptr_t)args->procSharedHost + offset), recvbytes);
-  }
-
-  args->sync_idx = !args->sync_idx;
+  return testSuccess;
 }
 
-void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
 
   *algBw = baseBw;
@@ -101,17 +54,32 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
   *busBw = baseBw * factor;
 }
 
-void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
+  return testSuccess;
 }
 
-void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+struct testColl reduceScatterTest = { // Name and callbacks for the ReduceScatter collective; installed into args->collTest by ReduceScatterRunTest.
+  "ReduceScatter",
+  ReduceScatterGetCollByteCount,
+  ReduceScatterInitData,
+  ReduceScatterGetBw,
+  ReduceScatterRunColl
+};
+
+void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { // Report only the send/recv counts that ReduceScatterGetCollByteCount computes.
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset; // outputs the callee requires but this wrapper discards
+  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &reduceScatterTest;
   ncclDataType_t *run_types;
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
 
-  if ((int)type != -1) { 
+  if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
     run_typenames = &typeName;
@@ -121,19 +89,27 @@ void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const cha
     run_typenames = test_typenames;
   }
 
-  if ((int)op != -1) { 
+  if ((int)op != -1) {
     run_ops = &op;
     run_opnames = &opName;
     op_count = 1;
-  } else { 
+  } else {
     op_count = sizeof(test_ops)/sizeof(test_ops[0]);
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
 
-  for (int i=0; i<type_count; i++) { 
-      for (int j=0; j<op_count; j++) { 
-          TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], 0, 1);
-      }
-  }   
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
+    }
+  }
+  return testSuccess;
 }
+
+struct testEngine reduceScatterEngine = { // Buffer-size and run entry points for ReduceScatter; the #pragma weak that follows aliases ncclTestEngine to it.
+  ReduceScatterGetBuffSize,
+  ReduceScatterRunTest
+};
+
+#pragma weak ncclTestEngine=reduceScatterEngine

From a15f771cb2bb4655a364c2a166b26e7c7a522713 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 9 Apr 2019 15:51:40 -0700
Subject: [PATCH 013/233] hipify nccl-tests to become rccl-tests

---
 README.md             |  22 ++++----
 doc/PERFORMANCE.md    |  10 ++--
 src/Makefile          |  57 ++++++++-----------
 src/all_gather.cu     |  10 ++--
 src/all_reduce.cu     |  10 ++--
 src/broadcast.cu      |  10 ++--
 src/common.cu         | 124 +++++++++++++++++++++---------------------
 src/common.h          |  17 +++---
 src/nccl1_compat.h    |  10 ++--
 src/reduce.cu         |  12 ++--
 src/reduce_scatter.cu |  12 ++--
 11 files changed, 143 insertions(+), 151 deletions(-)

diff --git a/README.md b/README.md
index 7a4bbbc6ca..13292fb93b 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,26 @@
-# NCCL Tests
+# RCCL Tests
 
-These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl)
+These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl)
 
 ## Build
 
 To build the tests, just type `make`.
 
-If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
+If ROCm is not installed in /opt/rocm, you may specify ROCM\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME.
 
 ```shell
-$ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+$ make ROCM_HOME=/path/to/rocm NCCL_HOME=/path/to/rccl
 ```
 
-NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
+RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
 
 ```shell
-$ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+$ make MPI=1 MPI_HOME=/path/to/mpi ROCM_HOME=/path/to/rocm NCCL_HOME=/path/to/rccl
 ```
 
 ## Usage
 
-NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
+RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
 
 ### Quick examples
 
@@ -51,7 +51,7 @@ All tests support the same set of arguments :
   * Increments can be either fixed or a multiplication factor. Only one of those should be used
     * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
     * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
-* NCCL operations arguments
+* RCCL operations arguments
   * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
   * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
   * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
@@ -60,11 +60,11 @@ All tests support the same set of arguments :
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
 * Test operation
-  * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
+  * `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default : 0.
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
-  * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
+  * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+RCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 
diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index 7cc6ecee66..dd049bf6e9 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -1,6 +1,6 @@
-# Performance reported by NCCL tests
+# Performance reported by RCCL tests
 
-NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
+RCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
 
 # Time
 
@@ -24,7 +24,7 @@ Algorithm bandwidth is using the most commonly used formula for bandwidth : size
 While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks.
 Most benchmarks only provide time measurements, which is hard to interpret for large sizes. Some others also provide algorithms bandwidth, but see that depending on the number of ranks, that bandwidth varies (and decreases as the number of ranks increase).
 
-To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
+To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
 This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
 Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used.
 
@@ -78,7 +78,7 @@ And the Bus Bandwidth is therefore computed as :
 
  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 
-Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
+Note that here, S is the size in bytes of the total array, which for RCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
 
 ### AllGather
 
@@ -96,7 +96,7 @@ And the Bus Bandwidth is therefore computed as :
 
  `B = S/t * (n-1)/n = algbw * (n-1)/n`
 
-Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
+Note that here, S is the size in bytes of the total array, which for RCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
 
 ### Broadcast
 
diff --git a/src/Makefile b/src/Makefile
index 034cc672fa..bb18157045 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,41 +4,30 @@
 # See LICENSE.txt for license information
 #
 
-CUDA_HOME ?= /usr/local/cuda
+ROCM_HOME ?= /opt/rocm
+MPI_HOME ?= /usr/lib/openmpi
 PREFIX ?= /usr/local
 VERBOSE ?= 0
 DEBUG ?= 0
 
-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC = $(CUDA_HOME)/bin/nvcc
+HIPCC = $(ROCM_HOME)/hip/bin/hipcc
+CXX = $(HIPCC)
 
-# Better define NVCC_GENCODE in your environment to the minimal set
-# of archs to reduce compile time.
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-		-gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-		-gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-		-gencode=arch=compute_70,code=compute_70 \
-		-gencode=arch=compute_70,code=sm_70
-
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
-
-LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
-NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+HIPCUFLAGS :=
+HIPCUFLAGS += -I$(ROCM_HOME)/include
+HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl
+HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip
+HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include
+LDFLAGS    := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
 
 ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3 -g
-CXXFLAGS  += -O3 -g
+HIPCUFLAGS += -O3
 else
-NVCUFLAGS += -O0 -G -g
-CXXFLAGS  += -O0 -g -ggdb3
+HIPCUFLAGS += -O0 -g -ggdb3
 endif
 
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
-else
+ifeq ($(VERBOSE), 0)
 .SILENT:
 endif
 
@@ -46,16 +35,16 @@ endif
 
 BUILDDIR ?= ../build
 ifneq ($(NCCL_HOME), "")
-NVCUFLAGS += -I$(NCCL_HOME)/include/
-NVLDFLAGS += -L$(NCCL_HOME)/lib
+HIPCUFLAGS += -I$(NCCL_HOME)/include/
+HIPLDFLAGS   += -L$(NCCL_HOME)/lib
 endif
 
 ifeq ($(MPI), 1)
-NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
-NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include
+HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
 endif
-LIBRARIES += curand nccl nvToolsExt
-NVLDFLAGS += $(LIBRARIES:%=-l%)
+LIBRARIES += rccl
+HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
@@ -71,10 +60,12 @@ clean:
 ${DST_DIR}/%.o: %.cu common.h
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
 
 ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}
 
diff --git a/src/all_gather.cu b/src/all_gather.cu
index cfb2ec356b..e9d382cd69 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 #include "common.h"
 
 void print_header() {
@@ -34,15 +34,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, type, rep, rank));
     for (int j=0; j<nranks; j++) {
       TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
     }
-    CUDACHECK(cudaDeviceSynchronize());
+    HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -55,7 +55,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index bd8daaf0a2..4fcb9a0e48 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 #include "common.h"
 
 void print_header() {
@@ -34,13 +34,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, type, rep, rank));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    CUDACHECK(cudaDeviceSynchronize());
+    HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -53,7 +53,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index c62a99ff62..4a7cdb9ae2 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 #include "common.h"
 
 void print_header() {
@@ -33,13 +33,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
     TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
-    CUDACHECK(cudaDeviceSynchronize());
+    HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -52,7 +52,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
diff --git a/src/common.cu b/src/common.cu
index 5a3ae529d6..9fe70e5986 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
@@ -9,7 +10,6 @@
 #include <cstdio>
 #include <getopt.h>
 #include <libgen.h>
-#include "cuda.h"
 
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
@@ -129,27 +129,27 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
     case ncclHalf:
-      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<half, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
     case ncclFloat:
-      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<float, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
     case ncclDouble:
-      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<double, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
 
     case ncclChar:
 #if NCCL_MAJOR >= 2
     case ncclUint8:
 #endif
-      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<uint8_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
     case ncclInt:
 #if NCCL_MAJOR >= 2
     case ncclUint32:
 #endif
-      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<uint32_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
     case ncclInt64:
     case ncclUint64:
-      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      hipLaunchKernelGGL((deltaKern<uint64_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
   }
-  CUDACHECK(cudaDeviceSynchronize());
+  HIPCHECK(hipDeviceSynchronize());
   return testSuccess;
 }
 
@@ -196,61 +196,63 @@ template<>
 __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
 
 template<typename T, T (*Op)(T, T)>
-__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
+__global__ void InitDataReduceKernel(void* data, const size_t N, const size_t offset, const int rep, const int nranks) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
     T val = testValue<T>(o+offset, rep, 0);
     for (int i=1; i<nranks; i++) {
       val = Op(val, testValue<T>(o+offset, rep, i));
     }
-    data[o] = val;
+    ((T*)data)[o] = val;
   }
 }
 
-#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type>>
+typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, const int rep, const int nranks);
+
+#define KERN(type, op) InitDataReduceKernel<type, op<type>>
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
-static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
+static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
   dim3 grid = { 32, 1, 1 };
   dim3 block = { 256, 1, 1 };
-  void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
-  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*ncclNumOps+op], grid, block, args, 0, cudaStreamDefault));
+  hipLaunchKernelGGL((redInitDataKerns[type*ncclNumOps+op]), grid, block, 0, 0, data, count, offset, rep, nranks);
   return testSuccess;
 }
 
 template<typename T>
-__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
+__global__ void InitDataKernel(void* data, const size_t N, const int rep, const int rank) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x)
-    data[o] = testValue<T>(o, rep, rank);
+    ((T*)data)[o] = testValue<T>(o, rep, rank);
 }
 
-static void* const initDataKerns[ncclNumTypes] = {
-  (void*)InitDataKernel<  int8_t>,
-  (void*)InitDataKernel< uint8_t>,
-  (void*)InitDataKernel< int32_t>,
-  (void*)InitDataKernel<uint32_t>,
-  (void*)InitDataKernel< int64_t>,
-  (void*)InitDataKernel<uint64_t>,
-  (void*)InitDataKernel<    half>,
-  (void*)InitDataKernel<   float>,
-  (void*)InitDataKernel<  double>
+typedef void(*initDataKern_t)(void* data, const size_t N, const int rep, const int rank);
+
+static initDataKern_t const initDataKerns[ncclNumTypes] = {
+  InitDataKernel<  int8_t>,
+  InitDataKernel< uint8_t>,
+  InitDataKernel< int32_t>,
+  InitDataKernel<uint32_t>,
+  InitDataKernel< int64_t>,
+  InitDataKernel<uint64_t>,
+  InitDataKernel<    half>,
+  InitDataKernel<   float>,
+  InitDataKernel<  double>
 };
 
 template<typename T>
 testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) {
   T* ptr = (T*)dest;
-  InitDataKernel<<<16, 512>>>(ptr, N, rep, rank);
+  hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank);
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) {
   dim3 grid = { 32, 1, 1 };
   dim3 block = { 256, 1, 1 };
-  void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank };
-  CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault));
+  hipLaunchKernelGGL((initDataKerns[type]), grid, block, 0, 0, data, count, rep, rank);
   return testSuccess;
 }
 
@@ -279,7 +281,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    HIPCHECK(hipSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
     TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta));
     maxDelta = std::max(*(args->deltaHost), maxDelta);
@@ -289,14 +291,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
        int *expectedHost = (int *)malloc(args->expectedBytes);
        int *dataHost = (int *)malloc(args->expectedBytes);
 
-       cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost);
+       hipMemcpy(expectedHost, args->expected[0], args->expectedBytes, hipMemcpyDeviceToHost);
        printf("\n Expected: ");
        for(int j=0; j<args->expectedBytes/sizeof(int); j++) {
          printf("%d:%d ", j, expectedHost[j]);
        }
        printf("\n");
 
-       cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
+       hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
        printf("\n Actual: ");
        for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
          printf("%d:%d ", j, dataHost[j]);
@@ -312,8 +314,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   return testSuccess;
 }
 
-testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
-  cudaError_t cudaErr;
+testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
+  hipError_t hipErr;
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
@@ -322,15 +324,15 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
    for (int i=0; i<ngpus; i++) {
      if (done[i]) continue;
 
-     cudaErr = cudaStreamQuery(streams[i]);
-     if (cudaErr == cudaSuccess) {
+     hipErr = hipStreamQuery(streams[i]);
+     if (hipErr == hipSuccess) {
        done[i] = 1;
        remaining--;
        idle = 0;
        continue;
      }
 
-     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
+     if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
      if (comms) {
@@ -365,9 +367,9 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    int cudaDev;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
-    CUDACHECK(cudaSetDevice(cudaDev));
+    int hipDev;
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
+    HIPCHECK(hipSetDevice(hipDev));
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -514,7 +516,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  CUDACHECK(cudaSetDevice(gpuid));
+  HIPCHECK(hipSetDevice(gpuid));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -531,7 +533,7 @@ testResult_t threadInit(struct threadArgs* args) {
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -555,9 +557,9 @@ testResult_t threadLaunch(struct testThread* thread) {
 }
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
-    CUDACHECK(cudaMalloc(sendbuff, nbytes));
-    CUDACHECK(cudaMalloc(recvbuff, nbytes));
-    CUDACHECK(cudaMalloc(expected, recvBytes));
+    HIPCHECK(hipMalloc(sendbuff, nbytes));
+    HIPCHECK(hipMalloc(recvbuff, nbytes));
+    HIPCHECK(hipMalloc(expected, recvBytes));
     return testSuccess;
 }
 
@@ -724,12 +726,12 @@ testResult_t run() {
   char line[MAX_LINE];
   int len = 0;
   for (int i=0; i<nThreads*nGpus; i++) {
-    int cudaDev = localRank*nThreads*nGpus+i;
+    int hipDev = localRank*nThreads*nGpus+i;
     int rank = proc*nThreads*nGpus+i;
-    cudaDeviceProp prop;
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    hipDeviceProp_t prop;
+    HIPCHECK(hipGetDeviceProperties(&prop, hipDev));
     len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+                    rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name);
   }
 
 #if MPI_SUPPORT
@@ -752,7 +754,7 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
 #endif
-  cudaStream_t streams[nGpus*nThreads];
+  hipStream_t streams[nGpus*nThreads];
   void* sendbuffs[nGpus*nThreads];
   void* recvbuffs[nGpus*nThreads];
   void* expected[nGpus*nThreads];
@@ -761,9 +763,9 @@ testResult_t run() {
   ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
 
   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+    HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
     AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
-    CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
+    HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
@@ -776,7 +778,7 @@ testResult_t run() {
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int i=0; i<nGpus*nThreads; i++) {
-         CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+         HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
          NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
        }
        NCCLCHECK(ncclGroupEnd());
@@ -786,7 +788,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
+  HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads, hipHostMallocPortable | hipHostMallocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
@@ -860,13 +862,13 @@ testResult_t run() {
     free(comms);
   }
 
-  // Free off CUDA allocated memory
+  // Free off HIP allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaFree(sendbuffs[i]));
-    CUDACHECK(cudaFree(recvbuffs[i]));
-    CUDACHECK(cudaFree(expected[i]));
+    HIPCHECK(hipFree(sendbuffs[i]));
+    HIPCHECK(hipFree(recvbuffs[i]));
+    HIPCHECK(hipFree(expected[i]));
   }
-  CUDACHECK(cudaFreeHost(delta));
+  HIPCHECK(hipHostFree(delta));
 
   char* str = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = str ? atof(str) : -1;
@@ -879,8 +881,8 @@ testResult_t run() {
   MPI_Finalize();
 #endif
 
-  // 'cuda-memcheck --leak-check full' requires this
-  cudaDeviceReset();
+  // 'hip-memcheck --leak-check full' requires this
+  hipDeviceReset();
 
   if (errors[0] || bw[0] < check_avg_bw*(0.9))
     exit(EXIT_FAILURE);
diff --git a/src/common.h b/src/common.h
index 8fb5b8cadf..be4000dd64 100644
--- a/src/common.h
+++ b/src/common.h
@@ -6,24 +6,23 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__
 
-#include "nccl.h"
+#include "rccl.h"
 #include <stdio.h>
 #include <algorithm>
-#include <curand.h>
 #ifdef MPI_SUPPORT
 #include "mpi.h"
 #endif
 #include <pthread.h>
 #include "nccl1_compat.h"
 
-#define CUDACHECK(cmd) do {                         \
-  cudaError_t e = cmd;                              \
-  if( e != cudaSuccess ) {                          \
+#define HIPCHECK(cmd) do {                         \
+  hipError_t e = cmd;                              \
+  if( e != hipSuccess ) {                          \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
+    printf("%s: Test HIP failure %s:%d '%s'\n",    \
          hostname,                                  \
-        __FILE__,__LINE__,cudaGetErrorString(e));   \
+        __FILE__,__LINE__,hipGetErrorString(e));   \
     return testCudaError;                           \
   }                                                 \
 } while(0)
@@ -71,7 +70,7 @@ struct testColl {
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+      ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -107,7 +106,7 @@ struct threadArgs {
   size_t recvInplaceOffset;
   ncclUniqueId ncclId;
   ncclComm_t* comms;
-  cudaStream_t* streams;
+  hipStream_t* streams;
 
   void** expected;
   size_t expectedBytes;
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 020a4bc36f..726669c885 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -20,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
 
 static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   CHECKCOUNT(count);
   return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
 }
 static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
   CHECKCOUNT(count);
   return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
 }
 static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
   CHECKCOUNT(count);
   return ncclBcast(buff, (int)count, datatype, root, comm, stream);
 }
 static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream) {
+    hipStream_t stream) {
   CHECKCOUNT(recvcount);
   return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
 }
 static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
   CHECKCOUNT(sendcount);
   return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
 }
diff --git a/src/reduce.cu b/src/reduce.cu
index 08825e45b0..5a286c1b6b 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 #include "common.h"
 
 void print_header() {
@@ -34,14 +34,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
+    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
     if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    CUDACHECK(cudaDeviceSynchronize());
+    HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -52,7 +52,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw;
 }
 
-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
   return testSuccess;
 }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 0b1d986952..3906621e96 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "cuda_runtime.h"
+#include <hip/hip_runtime.h>
 #include "common.h"
 
 void print_header() {
@@ -34,14 +34,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(gpuid));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
+    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
-    CUDACHECK(cudaDeviceSynchronize());
+    HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -54,7 +54,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
   *busBw = baseBw * factor;
 }
 
-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }

From 71e663e62d4ffb124c470dba7a286f291653161f Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Wed, 10 Apr 2019 15:28:40 -0700
Subject: [PATCH 014/233] Adding AMD copyright notices

---
 LICENSE.txt           | 1 +
 Makefile              | 1 +
 README.md             | 4 +++-
 src/Makefile          | 1 +
 src/all_gather.cu     | 1 +
 src/all_reduce.cu     | 1 +
 src/broadcast.cu      | 1 +
 src/common.cu         | 4 +++-
 src/common.h          | 3 ++-
 src/nccl1_compat.h    | 1 +
 src/reduce.cu         | 1 +
 src/reduce_scatter.cu | 1 +
 12 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index 4573c07c44..d2e566e3e2 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,6 @@
 
  Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
+ Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index 29409a8422..6a90587888 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 #
 # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENCE.txt for license information
 #
diff --git a/README.md b/README.md
index 13292fb93b..e96ce21599 100644
--- a/README.md
+++ b/README.md
@@ -66,5 +66,7 @@ All tests support the same set of arguments :
 
 ## Copyright
 
-RCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+RCCL tests are provided under the BSD license.
+All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 
diff --git a/src/Makefile b/src/Makefile
index bb18157045..56d52405bb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,6 @@
 #
 # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/all_gather.cu b/src/all_gather.cu
index e9d382cd69..45615ccd27 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 4fcb9a0e48..177674085e 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 4a7cdb9ae2..4119d9eefb 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/common.cu b/src/common.cu
index 9fe70e5986..cd2974189f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1,10 +1,12 @@
-#include "hip/hip_runtime.h"
+
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#include "hip/hip_runtime.h"
 #include "common.h"
 #include <pthread.h>
 #include <cstdio>
diff --git a/src/common.h b/src/common.h
index be4000dd64..2ddf40b2ee 100644
--- a/src/common.h
+++ b/src/common.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -192,7 +193,7 @@ static size_t wordSize(ncclDataType_t type) {
     case ncclInt64:
     case ncclUint64:
     case ncclDouble:
-    //case ncclFloat64: 
+    //case ncclFloat64:
       return 8;
     default: return 0;
   }
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 726669c885..3c241d3d14 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/reduce.cu b/src/reduce.cu
index 5a286c1b6b..3e9056ad05 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 3906621e96..c466ca284b 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/

From 4474fe168d5c8b38f56ec2931d093102c74ef3d1 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 18 Apr 2019 10:34:55 -0700
Subject: [PATCH 015/233] workaround weak symbol issue

hcc prints "error: alias must point to a defined variable or function"
---
 src/all_gather.cu     | 6 ++----
 src/all_reduce.cu     | 6 ++----
 src/broadcast.cu      | 6 ++----
 src/reduce.cu         | 6 ++----
 src/reduce_scatter.cu | 6 ++----
 5 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index 45615ccd27..2e6c880160 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -96,9 +96,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
   return testSuccess;
 }
 
-struct testEngine allGatherEngine = {
+struct testEngine ncclTestEngine = {
   AllGatherGetBuffSize,
   AllGatherRunTest
-};
-
-#pragma weak ncclTestEngine=allGatherEngine
+};
\ No newline at end of file
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 177674085e..acc7c9c69a 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -107,9 +107,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
   return testSuccess;
 }
 
-struct testEngine allReduceEngine = {
+struct testEngine ncclTestEngine = {
   AllReduceGetBuffSize,
   AllReduceRunTest
-};
-
-#pragma weak ncclTestEngine=allReduceEngine
+};
\ No newline at end of file
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 4119d9eefb..bb29738ee0 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -113,9 +113,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
   return testSuccess;
 }
 
-struct testEngine broadcastEngine = {
+struct testEngine ncclTestEngine = {
   BroadcastGetBuffSize,
   BroadcastRunTest
-};
-
-#pragma weak ncclTestEngine=broadcastEngine
+};
\ No newline at end of file
diff --git a/src/reduce.cu b/src/reduce.cu
index 3e9056ad05..541930797b 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -116,9 +116,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
   return testSuccess;
 }
 
-struct testEngine reduceEngine = {
+struct testEngine ncclTestEngine = {
   ReduceGetBuffSize,
   ReduceRunTest
-};
-
-#pragma weak ncclTestEngine=reduceEngine
+};
\ No newline at end of file
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index c466ca284b..10856cc3da 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -108,9 +108,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
   return testSuccess;
 }
 
-struct testEngine reduceScatterEngine = {
+struct testEngine ncclTestEngine = {
   ReduceScatterGetBuffSize,
   ReduceScatterRunTest
-};
-
-#pragma weak ncclTestEngine=reduceScatterEngine
+};
\ No newline at end of file

From 10e1572f726054ccef30f08526897e5f08fbe54c Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Mon, 22 Apr 2019 10:25:49 -0700
Subject: [PATCH 016/233] Adding way to specify a custom RCCL shared library
 file to link against

---
 src/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 56d52405bb..f1a6a6e2e3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,13 +14,14 @@ DEBUG ?= 0
 HIPCC = $(ROCM_HOME)/hip/bin/hipcc
 CXX = $(HIPCC)
 
+
 HIPCUFLAGS :=
 HIPCUFLAGS += -I$(ROCM_HOME)/include
 HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl
 HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip
 HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include
 LDFLAGS    := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3

From 7e80ea9d3afd93985413cd5a59e4fd909d666d02 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 26 Apr 2019 17:00:31 -0700
Subject: [PATCH 017/233] fix build with 1.0 library

---
 src/common.cu | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index cd2974189f..81351e0433 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -214,7 +214,11 @@ typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, co
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
 static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
+#if NCCL_MAJOR >= 2
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+#else
+  OPS(char), OPS(int32_t), OPS(half), OPS(float), OPS(double), OPS(int64_t), OPS(uint64_t)
+#endif
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
@@ -233,6 +237,7 @@ __global__ void InitDataKernel(void* data, const size_t N, const int rep, const
 typedef void(*initDataKern_t)(void* data, const size_t N, const int rep, const int rank);
 
 static initDataKern_t const initDataKerns[ncclNumTypes] = {
+#if NCCL_MAJOR >= 2
   InitDataKernel<  int8_t>,
   InitDataKernel< uint8_t>,
   InitDataKernel< int32_t>,
@@ -242,6 +247,15 @@ static initDataKern_t const initDataKerns[ncclNumTypes] = {
   InitDataKernel<    half>,
   InitDataKernel<   float>,
   InitDataKernel<  double>
+#else
+  InitDataKernel<    char>,
+  InitDataKernel< int32_t>,
+  InitDataKernel<    half>,
+  InitDataKernel<   float>,
+  InitDataKernel<  double>,
+  InitDataKernel< int64_t>,
+  InitDataKernel<uint64_t>,
+#endif
 };
 
 template<typename T>
@@ -336,6 +350,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
 
      if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
 
+#if NCCL_MAJOR >= 2
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
      if (comms) {
        ncclResult_t ncclAsyncErr;
@@ -349,6 +364,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
          NCCLCHECK(ncclAsyncErr);
        }
      }
+#endif
 #endif
    }
 
@@ -543,7 +559,11 @@ testResult_t threadInit(struct threadArgs* args) {
   TESTCHECK(threadRunTests(args));
 
   for (int i=0; i<args->nGpus; i++) {
+#if NCCL_MAJOR >= 2
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
+#else
+    ncclCommDestroy(args->comms[i]);
+#endif
   }
   return testSuccess;
 }
@@ -860,7 +880,11 @@ testResult_t run() {
 
   if (!parallel_init) {
     for(int i=0; i<nGpus*nThreads; ++i)
+#if NCCL_MAJOR >= 2
       NCCLCHECK(ncclCommDestroy(comms[i]));
+#else
+      ncclCommDestroy(comms[i]);
+#endif
     free(comms);
   }
 

From 3f89175af5bc93db0d36758a0e4217f49b7147fa Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 1 May 2019 12:58:04 -0700
Subject: [PATCH 018/233] allow using different memory types for input and
 output buffers

---
 src/common.cu | 37 ++++++++++++++++++++++++++++++++-----
 src/common.h  | 15 +++++++++++++++
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 81351e0433..61084eb1bd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -22,6 +22,7 @@ const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "dou
 #endif
 ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
 const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
+const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host"};
 
 thread_local int is_main_thread = 0;
 
@@ -41,6 +42,7 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int memorytype = 0;
 
 double parsesize(char *value) {
     long long int units;
@@ -579,10 +581,22 @@ testResult_t threadLaunch(struct testThread* thread) {
 }
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+  if (memorytype == ncclFine) {
+    HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
+    HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+    HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+  }
+  else if (memorytype == ncclHost) {
+    HIPCHECK(hipHostMalloc(sendbuff, nbytes));
+    HIPCHECK(hipHostMalloc(recvbuff, nbytes));
+    HIPCHECK(hipHostMalloc(expected, recvBytes));
+  }
+  else {
     HIPCHECK(hipMalloc(sendbuff, nbytes));
     HIPCHECK(hipMalloc(recvbuff, nbytes));
     HIPCHECK(hipMalloc(expected, recvBytes));
-    return testSuccess;
+  }
+  return testSuccess;
 }
 
 testResult_t run(); // Main function
@@ -609,12 +623,13 @@ int main(int argc, char* argv[]) {
     {"datatype", required_argument, 0, 'd'},
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
+    {"memory_type", required_argument, 0, 'y'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:h", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -669,6 +684,9 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'y':
+        memorytype = ncclstringtomtype(optarg);
+        break;
       case 'h':
 	printf("USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
@@ -686,6 +704,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
+            "[-y,--memory_type <coarse/fine/host>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -707,6 +726,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
+            "[-y,--memory_type <coarse/fine/host>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -890,9 +910,16 @@ testResult_t run() {
 
   // Free off HIP allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-    HIPCHECK(hipFree(sendbuffs[i]));
-    HIPCHECK(hipFree(recvbuffs[i]));
-    HIPCHECK(hipFree(expected[i]));
+    if (memorytype == ncclHost) {
+      HIPCHECK(hipHostFree(sendbuffs[i]));
+      HIPCHECK(hipHostFree(recvbuffs[i]));
+      HIPCHECK(hipHostFree(expected[i]));
+    }
+    else {
+      HIPCHECK(hipFree(sendbuffs[i]));
+      HIPCHECK(hipFree(recvbuffs[i]));
+      HIPCHECK(hipFree(expected[i]));
+    }
   }
   HIPCHECK(hipHostFree(delta));
 
diff --git a/src/common.h b/src/common.h
index 2ddf40b2ee..dd98d547df 100644
--- a/src/common.h
+++ b/src/common.h
@@ -203,6 +203,11 @@ extern ncclDataType_t test_types[ncclNumTypes];
 extern const char *test_typenames[ncclNumTypes];
 extern ncclRedOp_t test_ops[ncclNumOps];
 extern const char *test_opnames[ncclNumOps];
+typedef enum { ncclCoarse        = 0,   // "coarse": AllocateBuffs uses plain hipMalloc (the default)
+               ncclFine          = 1,   // "fine": hipExtMallocWithFlags with hipDeviceMallocFinegrained
+               ncclHost          = 2,   // "host": hipHostMalloc, released with hipHostFree
+               nccl_NUM_MTYPES   = 3 } ncclMemoryType_t;  // number of memory types; sizes test_memorytypes[]
+extern const char *test_memorytypes[nccl_NUM_MTYPES];
 
 static int ncclstringtotype(char *str) {
     for (int t=0; t<ncclNumTypes; t++) {
@@ -230,6 +235,16 @@ static int ncclstringtoop (char *str) {
     return ncclSum;
 }
 
+// Map a memory-type name to its index in test_memorytypes[]; unknown names fall back to ncclCoarse.
+static int ncclstringtomtype (char *str) {
+    int mtype = 0;
+    for (; mtype < nccl_NUM_MTYPES; mtype++)
+      if (!strcmp(str, test_memorytypes[mtype])) return mtype;
+    mtype = ncclCoarse;  // no table entry matched: warn and use the default
+    printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[mtype]);
+    return mtype;
+}
+
 extern thread_local int is_main_thread;
 #define PRINT if (is_main_thread) printf
 

From 677e2701e7a69b29ab758f3a20701afa0f4a8cfe Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Tue, 7 May 2019 18:27:25 +0000
Subject: [PATCH 019/233] Adding copyright notice.

---
 NOTICES.txt | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 NOTICES.txt

diff --git a/NOTICES.txt b/NOTICES.txt
new file mode 100644
index 0000000000..6f49d61763
--- /dev/null
+++ b/NOTICES.txt
@@ -0,0 +1,66 @@
+Notices and Licenses file
+_______________________________________________________________
+
+Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
+Copyright (c) 2016-2017, NVIDIA CORPORATION.
+Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+nvidia-nccl-tests v2.0.0 (BSD2)
+Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.

From 86f053be841023885cb24bef1096229a2e5327df Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 13 May 2019 23:45:28 +0000
Subject: [PATCH 020/233] enable building with mpich

Use following command to build: MPICH=1 make
---
 src/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index f1a6a6e2e3..acf41d7e5a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -44,7 +44,11 @@ endif
 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include
 HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+else ifeq($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich
+HIPLDFLAGS += -L/usr/lib -lmpi
 endif
+
 LIBRARIES += rccl
 HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 

From 79356ec21874624b39a3b5a1bfd8b2ec85b624fc Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Wed, 15 May 2019 16:59:47 +0000
Subject: [PATCH 021/233] Updating README to include CUSTOM_RCCL_LIB.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e96ce21599..c442de3934 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ These tests check both the performance and the correctness of RCCL operations. T
 
 To build the tests, just type `make`.
 
-If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify RCCL\_HOME.
+If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
 
 ```shell
-$ make HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
+$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
 ```
 
 RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.

From 3aa32972f4b0bbb59eda1d7eb8e1a90a28fa7638 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Wed, 15 May 2019 11:22:34 -0600
Subject: [PATCH 022/233] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c442de3934..2731d65c65 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # RCCL Tests
 
-These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl)
+These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
 
 ## Build
 
@@ -67,6 +67,8 @@ All tests support the same set of arguments :
 ## Copyright
 
 RCCL tests are provided under the BSD license.
+
 All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+
 All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 

From 3fac1d679be8bb10dc12d786025d35579ff9de58 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 16 May 2019 13:18:23 -0700
Subject: [PATCH 023/233] Fix missing space in Makefile

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index acf41d7e5a..78470b8f48 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -44,7 +44,7 @@ endif
 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include
 HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
-else ifeq($(MPICH), 1)
+else ifeq ($(MPICH), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich
 HIPLDFLAGS += -L/usr/lib -lmpi
 endif

From 924521ff570069f2969377001af9913b3b026065 Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Fri, 28 Jun 2019 09:52:44 -0600
Subject: [PATCH 024/233] Adding unit tests and files for CI (#4)

* Adding initial unit test and Jenkins code.

Fixing scope of unit tests

Adding unit tests and files for CI

Fixing Jenkinsfile

* Removing typos from Jenkinsfile

* Making some fixes to the Jenkins file; temporarily disabling MPI

* Making corrections to Jenkinsfile

* Correcting dockerNodes entry in Jenkinsfile

* Fixed Jenkinsfile for CI

* Correcting Jenkinsfile for CI

* Updating README to include instructions on how to run unit tests.
---
 Jenkinsfile                |  82 +++++++++++++++++++++++++++++
 README.md                  |  12 +++++
 install.sh                 |  98 +++++++++++++++++++++++++++++++++++
 src/common.cu              |  18 +++++--
 test/__init__.py           |  20 ++++++++
 test/conftest.py           |  23 +++++++++
 test/test_AllGather.py     | 102 +++++++++++++++++++++++++++++++++++++
 test/test_AllReduce.py     | 102 +++++++++++++++++++++++++++++++++++++
 test/test_Broadcast.py     | 102 +++++++++++++++++++++++++++++++++++++
 test/test_Reduce.py        | 102 +++++++++++++++++++++++++++++++++++++
 test/test_ReduceScatter.py | 102 +++++++++++++++++++++++++++++++++++++
 11 files changed, 758 insertions(+), 5 deletions(-)
 create mode 100644 Jenkinsfile
 create mode 100755 install.sh
 create mode 100644 test/__init__.py
 create mode 100644 test/conftest.py
 create mode 100644 test/test_AllGather.py
 create mode 100644 test/test_AllReduce.py
 create mode 100644 test/test_Broadcast.py
 create mode 100644 test/test_Reduce.py
 create mode 100644 test/test_ReduceScatter.py

diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000000..7589636c68
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,82 @@
+#!/usr/bin/env groovy
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS
+@Library('rocJenkins@noDocker') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+
+////////////////////////////////////////////////////////////////////////
+// Mostly generated from snippet generator 'properties; set job properties'
+// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
+properties([
+    pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
+    buildDiscarder(logRotator(
+      artifactDaysToKeepStr: '',
+      artifactNumToKeepStr: '',
+      daysToKeepStr: '',
+      numToKeepStr: '10')),
+    disableConcurrentBuilds(),
+    [$class: 'CopyArtifactPermissionProperty', projectNames: '*']
+   ])
+
+
+////////////////////////////////////////////////////////////////////////
+import java.nio.file.Path;
+
+rcclTestsCI:
+{
+    def rcclTests = new rocProject('rcclTests')
+    // customize for project
+    rcclTests.paths.build_command = './install.sh'
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(['RCCL'], rcclTests)
+
+    boolean formatCheck = false
+
+    def compileCommand =
+    {
+        platform, project->
+
+        project.paths.construct_build_prefix()
+
+        def command = """#!/usr/bin/env bash
+                  set -x
+                  rm -rf rccl
+                  git clone https://github.com/ROCmSoftwarePlatform/rccl
+                  cd rccl
+                  export RCCL_PATH=${WORKSPACE}/rccl/rccl-install
+                  ./install.sh -i --prefix=\$RCCL_PATH
+                  cd ..
+                  cd ${project.paths.project_build_prefix}
+                  ${project.paths.build_command} --rccl_home=\$RCCL_PATH
+                """
+	  sh command
+    }
+    def testCommand =
+    {
+        platform, project->
+
+        def command = """#!/usr/bin/env bash
+                set -x
+                LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml
+            """
+
+        sh command
+        //junit "${project.paths.project_build_prefix}/build/release/*.xml"
+    }
+
+    def packageCommand =
+    {
+        platform, project->
+
+        def command = """
+                      """
+    }
+
+    buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
+}
diff --git a/README.md b/README.md
index 2731d65c65..dc3120f119 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,18 @@ All tests support the same set of arguments :
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
 
+## Unit tests
+
+Unit tests for rccl-tests are implemented with pytest (python3 is also required).  Several notes for the unit tests:
+
+1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
+2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
+
+The unit tests can be invoked within the rccl-tests root, or in the test subfolder.  An example call to the unit tests:
+```shell
+$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
+```
+
 ## Copyright
 
 RCCL tests are provided under the BSD license.
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000000..32e5dc4d4e
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+# #################################################
+# helper functions
+# #################################################
+function display_help()   # Print the usage summary for this script to stdout.
+{
+    echo "RCCL-tests build & installation helper script"
+    echo "./install.sh [-h|--help] [-m|--mpi] [-t|--test] [--rccl_home <path>] [--mpi_home <path>]"
+    echo "    [-h|--help] Prints this help message."
+    echo "    [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
+    printf '%s\n' "    [-t|--test] Run the pytest unit tests after building." "    [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
+    echo "    [--mpi_home] Specify path to your MPI installation."
+}
+
+# #################################################
+# global variables
+# #################################################
+run_tests=false
+build_release=true
+mpi_enabled=false
+rccl_dir=/opt/rocm/rccl
+mpi_dir=""
+# #################################################
+# Parameter parsing
+# #################################################
+
+# Probe for an enhanced getopt (handles whitespace and long options); exit status 4 from "getopt -T" is taken as that signal.
+getopt -T
+if [[ $? -eq 4 ]]; then   # $? is the status of the "getopt -T" probe above
+    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")   # rccl_home/mpi_home take a value
+else
+    echo "Need a new version of getopt"
+    exit 1
+fi
+
+if [[ $? -ne 0 ]]; then   # status of the if-statement above, i.e. of the GETOPT_PARSE=$(getopt ...) assignment
+    echo "getopt invocation failed; could not parse the command line";
+    exit 1
+fi
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+    case "$1" in
+        -h|--help)      # print usage and stop
+            display_help
+            exit 0
+            ;;
+        -m|--mpi)       # boolean flag: consumes one word
+            mpi_enabled=true
+            shift ;;
+        -t|--test)
+            run_tests=true
+            shift ;;
+        --rccl_home)    # option with a value: consumes two words
+            rccl_dir=$2
+            shift 2 ;;
+        --mpi_home)
+            mpi_dir=$2
+            shift 2 ;;
+        --) shift ; break ;;   # end-of-options marker: stop parsing
+        *)  echo "Unexpected command line parameter received; aborting";
+            exit 1
+            ;;
+    esac
+done
+
+# Install the pre-commit hook
+#bash ./githooks/install
+
+build_dir=./build
+# #################################################
+# prep
+# #################################################
+# ensure a clean build environment
+rm -rf ${build_dir}
+
+if ($mpi_enabled); then
+    if [[ ${mpi_dir} -eq "" ]]; then
+        echo "MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument."
+        exit 1
+    else
+        make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
+    fi
+else
+    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
+fi
+
+# Optionally, run tests if they're enabled.
+if ($run_tests); then
+    if ($mpi_enabled); then
+        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
+    else
+        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
+    fi
+fi
diff --git a/src/common.cu b/src/common.cu
index 61084eb1bd..d708a7a916 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -292,7 +292,7 @@ void Barrier(struct threadArgs* args)
   args->barrier_idx=!args->barrier_idx;
 }
 
-testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
+testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta, bool *error) {
   size_t count = args->expectedBytes/wordSize(type);
   double maxDelta = 0.0;
   for (int i=0; i<args->nGpus; i++) {
@@ -327,7 +327,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #endif
   }
   double nranks = args->nProcs*args->nThreads*args->nGpus;
-  if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  if (maxDelta > DeltaMaxValue(type)*(nranks - 1))
+  {
+    args->errors[0]++;
+    *error = true;
+  }
   *delta = maxDelta;
   return testSuccess;
 }
@@ -446,6 +450,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);
 
   double maxDelta = 0;
+  bool error = false;
   static __thread int rep = 0;
   rep++;
   if (datacheck) {
@@ -456,7 +461,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       TESTCHECK(startColl(args, type, op, root, in_place, 0));
       TESTCHECK(completeColl(args));
 
-      TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+      TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, &error));
 
       //aggregate delta from all threads and procs
       Barrier(args);
@@ -466,6 +471,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
         }
 #ifdef MPI_SUPPORT
         MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+        MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);
 #endif
       }
       Barrier(args);
@@ -481,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     sprintf(timeStr, "%7.2f", timeUsec);
   }
   if (datacheck) {
-     PRINT("  %7s  %6.2f  %6.2f  %5.0le", timeStr, algBw, busBw, maxDelta);
+     PRINT("  %7s  %6.2f  %6.2f  %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : "");
   } else {
      PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
   }
@@ -757,7 +763,7 @@ testResult_t run() {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+  PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
       (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
   if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
@@ -887,6 +893,7 @@ testResult_t run() {
   for (int t=nThreads-1; t>=0; t--) {
     if (t) pthread_join(threads[t].thread, NULL);
     TESTCHECK(threads[t].ret);
+
     if (t) {
       errors[0] += errors[t];
       bw[0] += bw[t];
@@ -927,6 +934,7 @@ testResult_t run() {
   double check_avg_bw = str ? atof(str) : -1;
   bw[0] /= bw_count[0];
 
+  if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n");
   PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
   PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
   PRINT("#\n");
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000000..cfd487930d
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1,20 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
\ No newline at end of file
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000000..79ce9b8ef8
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,23 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+def pytest_addoption(parser):  # registers the extra command-line option used by this test suite
+    parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")  # read by the test_*MPI tests via request.config.getoption('--hostfile'); empty string when not given
\ No newline at end of file
diff --git a/test/test_AllGather.py b/test/test_AllGather.py
new file mode 100644
index 0000000000..2d3d74bcef
--- /dev/null
+++ b/test/test_AllGather.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]  # values passed to the executable's -t option
+nprocs = ["2"]  # values passed to "mpirun -np" by the MPI test
+ngpus_single = ["1","2","4"]  # -g values for the single-process test
+ngpus_mpi = ["1","2"]  # -g values for the MPI test
+byte_range = [("4", "128M")]  # (-b, -e) pairs
+op = ["sum", "prod", "min", "max"]  # -o values
+step_factor = ["2"]  # -f values
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]  # -d values
+memory_type = ["coarse","fine", "host"]  # -y values
+
+path = os.path.dirname(os.path.abspath(__file__))  # directory containing this test file
+executable = path + "/../build/all_gather_perf"  # binary under test, located relative to this file
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_gather_perf in one process and fail unless it exits with status 0."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+    rccl_test = subprocess.run(" ".join(args), stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    if rccl_test.returncode != 0:
+        # subprocess.run() without check=True never raises CalledProcessError,
+        # so the captured output has to be reported here, on a non-zero exit.
+        print(rccl_test.stdout)
+        pytest.fail("AllGather test error(s) detected.")
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_gather_perf under mpirun and fail unless it exits with status 0.
+
+    memory_type is a parametrized argument. Without it the name resolves to the
+    module-level list, so the "fine" check never matches and joining the
+    hostfile argument list raises TypeError.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+
+    args = ["mpirun -np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): Open MPI's -host takes a host list and -hostfile a file;
+        # confirm which one the --hostfile pytest option is meant to feed.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p 1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+
+    args_str = " ".join(args)
+    print(args_str)
+    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+
+    # subprocess.run() without check=True never raises CalledProcessError, so
+    # the exit status is the only failure signal. stdout is not captured here,
+    # so the program's output has already gone to the test's own stdout.
+    if rccl_test.returncode != 0:
+        pytest.fail("AllGather test error(s) detected.")
\ No newline at end of file
diff --git a/test/test_AllReduce.py b/test/test_AllReduce.py
new file mode 100644
index 0000000000..b3cb5f99ff
--- /dev/null
+++ b/test/test_AllReduce.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]  # values passed to the executable's -t option
+nprocs = ["2"]  # values passed to "mpirun -np" by the MPI test
+ngpus_single = ["1","2","4"]  # -g values for the single-process test
+ngpus_mpi = ["1","2"]  # -g values for the MPI test
+byte_range = [("4", "128M")]  # (-b, -e) pairs
+op = ["sum", "prod", "min", "max"]  # -o values
+step_factor = ["2"]  # -f values
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]  # -d values
+memory_type = ["coarse","fine", "host"]  # -y values
+
+path = os.path.dirname(os.path.abspath(__file__))  # directory containing this test file
+executable = path + "/../build/all_reduce_perf"  # binary under test, located relative to this file
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_reduce_perf in one process and fail unless it exits with status 0."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+    rccl_test = subprocess.run(" ".join(args), stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    if rccl_test.returncode != 0:
+        # subprocess.run() without check=True never raises CalledProcessError,
+        # so the captured output has to be reported here, on a non-zero exit.
+        print(rccl_test.stdout)
+        pytest.fail("AllReduce test error(s) detected.")
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_reduce_perf under mpirun and fail unless it exits with status 0.
+
+    memory_type is a parametrized argument. Without it the name resolves to the
+    module-level list, so the "fine" check never matches and joining the
+    hostfile argument list raises TypeError.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+
+    args = ["mpirun -np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): Open MPI's -host takes a host list and -hostfile a file;
+        # confirm which one the --hostfile pytest option is meant to feed.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p 1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+
+    args_str = " ".join(args)
+    print(args_str)
+    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+
+    # subprocess.run() without check=True never raises CalledProcessError, so
+    # the exit status is the only failure signal. stdout is not captured here,
+    # so the program's output has already gone to the test's own stdout.
+    if rccl_test.returncode != 0:
+        pytest.fail("AllReduce test error(s) detected.")
\ No newline at end of file
diff --git a/test/test_Broadcast.py b/test/test_Broadcast.py
new file mode 100644
index 0000000000..f4b8b38363
--- /dev/null
+++ b/test/test_Broadcast.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]  # values passed to the executable's -t option
+nprocs = ["2"]  # values passed to "mpirun -np" by the MPI test
+ngpus_single = ["1","2","4"]  # -g values for the single-process test
+ngpus_mpi = ["1","2"]  # -g values for the MPI test
+byte_range = [("4", "128M")]  # (-b, -e) pairs
+op = ["sum", "prod", "min", "max"]  # -o values
+step_factor = ["2"]  # -f values
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]  # -d values
+memory_type = ["coarse","fine", "host"]  # -y values
+
+path = os.path.dirname(os.path.abspath(__file__))  # directory containing this test file
+executable = path + "/../build/broadcast_perf"  # binary under test, located relative to this file
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run broadcast_perf in one process and fail unless it exits with status 0."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+    rccl_test = subprocess.run(" ".join(args), stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    if rccl_test.returncode != 0:
+        # subprocess.run() without check=True never raises CalledProcessError,
+        # so the captured output has to be reported here, on a non-zero exit.
+        print(rccl_test.stdout)
+        pytest.fail("Broadcast test error(s) detected.")
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run broadcast_perf under mpirun and fail unless it exits with status 0.
+
+    memory_type is a parametrized argument. Without it the name resolves to the
+    module-level list, so the "fine" check never matches and joining the
+    hostfile argument list raises TypeError.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+
+    args = ["mpirun -np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): Open MPI's -host takes a host list and -hostfile a file;
+        # confirm which one the --hostfile pytest option is meant to feed.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p 1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+
+    args_str = " ".join(args)
+    print(args_str)
+    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+
+    # subprocess.run() without check=True never raises CalledProcessError, so
+    # the exit status is the only failure signal. stdout is not captured here,
+    # so the program's output has already gone to the test's own stdout.
+    if rccl_test.returncode != 0:
+        pytest.fail("Broadcast test error(s) detected.")
\ No newline at end of file
diff --git a/test/test_Reduce.py b/test/test_Reduce.py
new file mode 100644
index 0000000000..5df694490d
--- /dev/null
+++ b/test/test_Reduce.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]  # values passed to the executable's -t option
+nprocs = ["2"]  # values passed to "mpirun -np" by the MPI test
+ngpus_single = ["1","2","4"]  # -g values for the single-process test
+ngpus_mpi = ["1","2"]  # -g values for the MPI test
+byte_range = [("4", "128M")]  # (-b, -e) pairs
+op = ["sum", "prod", "min", "max"]  # -o values
+step_factor = ["2"]  # -f values
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]  # -d values
+memory_type = ["coarse","fine", "host"]  # -y values
+
+path = os.path.dirname(os.path.abspath(__file__))  # directory containing this test file
+executable = path + "/../build/reduce_perf"  # binary under test, located relative to this file
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_perf in one process and fail unless it exits with status 0."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+    rccl_test = subprocess.run(" ".join(args), stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    if rccl_test.returncode != 0:
+        # subprocess.run() without check=True never raises CalledProcessError,
+        # so the captured output has to be reported here, on a non-zero exit.
+        print(rccl_test.stdout)
+        pytest.fail("Reduce test error(s) detected.")
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_perf under mpirun and fail unless it exits with status 0.
+
+    memory_type is a parametrized argument. Without it the name resolves to the
+    module-level list, so the "fine" check never matches and joining the
+    hostfile argument list raises TypeError.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+
+    args = ["mpirun -np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): Open MPI's -host takes a host list and -hostfile a file;
+        # confirm which one the --hostfile pytest option is meant to feed.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p 1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+
+    args_str = " ".join(args)
+    print(args_str)
+    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+
+    # subprocess.run() without check=True never raises CalledProcessError, so
+    # the exit status is the only failure signal. stdout is not captured here,
+    # so the program's output has already gone to the test's own stdout.
+    if rccl_test.returncode != 0:
+        pytest.fail("Reduce test error(s) detected.")
\ No newline at end of file
diff --git a/test/test_ReduceScatter.py b/test/test_ReduceScatter.py
new file mode 100644
index 0000000000..66b431b00a
--- /dev/null
+++ b/test/test_ReduceScatter.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]  # values passed to the executable's -t option
+nprocs = ["2"]  # values passed to "mpirun -np" by the MPI test
+ngpus_single = ["1","2","4"]  # -g values for the single-process test
+ngpus_mpi = ["1","2"]  # -g values for the MPI test
+byte_range = [("4", "128M")]  # (-b, -e) pairs
+op = ["sum", "prod", "min", "max"]  # -o values
+step_factor = ["2"]  # -f values
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]  # -d values
+memory_type = ["coarse","fine", "host"]  # -y values
+
+path = os.path.dirname(os.path.abspath(__file__))  # directory containing this test file
+executable = path + "/../build/reduce_scatter_perf"  # binary under test, located relative to this file
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_scatter_perf in one process and fail unless it exits with status 0."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+    rccl_test = subprocess.run(" ".join(args), stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    if rccl_test.returncode != 0:
+        # subprocess.run() without check=True never raises CalledProcessError,
+        # so the captured output has to be reported here, on a non-zero exit.
+        print(rccl_test.stdout)
+        pytest.fail("ReduceScatter test error(s) detected.")
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_scatter_perf under mpirun and fail unless it exits with status 0.
+
+    memory_type is a parametrized argument. Without it the name resolves to the
+    module-level list, so the "fine" check never matches and joining the
+    hostfile argument list raises TypeError.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+
+    args = ["mpirun -np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): Open MPI's -host takes a host list and -hostfile a file;
+        # confirm which one the --hostfile pytest option is meant to feed.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p 1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    if memory_type == "fine":
+        # Prefixed to the shell command line as an environment assignment.
+        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+
+    args_str = " ".join(args)
+    print(args_str)
+    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+
+    # subprocess.run() without check=True never raises CalledProcessError, so
+    # the exit status is the only failure signal. stdout is not captured here,
+    # so the program's output has already gone to the test's own stdout.
+    if rccl_test.returncode != 0:
+        pytest.fail("ReduceScatter test error(s) detected.")
\ No newline at end of file

From 043eef69996a825698c5679d8419ee12768740d0 Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Thu, 11 Jul 2019 15:36:21 +0000
Subject: [PATCH 025/233] Checking that number of requested GPUs is not more
 than number of available GPUs

---
 src/common.cu | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index d708a7a916..d4d528ea7c 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -738,6 +738,15 @@ int main(int argc, char* argv[]) {
 	return 0;
     }
   }
+
+  int numDevices;
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  if (nGpus > numDevices)
+  {
+      fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
+      return testNcclError;
+  }
+
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif

From 23c374475f0472a06b461ad5ba5d09b5312a1f3c Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 26 Jul 2019 00:12:41 +0000
Subject: [PATCH 026/233] Allow call ncclCommAbort on Ctrl+C

---
 src/common.cu | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index d4d528ea7c..4f97a4847f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -12,6 +12,7 @@
 #include <cstdio>
 #include <getopt.h>
 #include <libgen.h>
+#include <signal.h>
 
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
@@ -43,6 +44,7 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int memorytype = 0;
+static ncclResult_t ncclabort = ncclSuccess;
 
 double parsesize(char *value) {
     long long int units;
@@ -336,6 +338,21 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   return testSuccess;
 }
 
+// SIGINT handler. NOTE(review): printf/getchar/exit are not async-signal-safe.
+void INThandler(int sig) {
+  signal(sig, SIG_IGN);  // ignore further interrupts while prompting
+  printf("\nDo you want to call ncclCommAbort before exit? [y/n] ");
+  int c = getchar();  // int, so EOF stays distinct from every character
+  if (c != 'y' && c != 'Y')
+    exit (0);
+  // Checked in testStreamSynchronize, which then aborts the communicators.
+  ncclabort = ncclSystemError;
+  // Discard the rest of the answer line, stopping at the newline or EOF.
+  while (c != '\n' && c != EOF)
+    c = getchar();
+  signal(SIGINT, INThandler);
+}
+
 testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
   hipError_t hipErr;
   int remaining = ngpus;
@@ -361,13 +378,17 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
      if (comms) {
        ncclResult_t ncclAsyncErr;
        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
-       if (ncclAsyncErr != ncclSuccess) {
+       if (ncclAsyncErr != ncclSuccess || ncclabort != ncclSuccess) {
          // An asynchronous error happened. Stop the operation and destroy
          // the communicator
          for (int i=0; i<ngpus; i++)
            NCCLCHECK(ncclCommAbort(comms[i]));
+         // Let all kernels exit
+         for (int i=0; i<ngpus; i++)
+           HIPCHECK(hipStreamSynchronize(streams[i]));
          // Abort the perf test
          NCCLCHECK(ncclAsyncErr);
+         NCCLCHECK(ncclabort);
        }
      }
 #endif
@@ -608,6 +629,12 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
 testResult_t run(); // Main function
 
 int main(int argc, char* argv[]) {
+#if NCCL_MAJOR >= 2
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+  // may call ncclCommAbort
+  signal(SIGINT, INThandler);
+#endif
+#endif
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
 

From ab82f1af6f5a373bb71fe0c44b8d772db24a03af Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 9 Aug 2019 10:22:14 -0700
Subject: [PATCH 027/233] Fix memory leak and possible buffer overrun

---
 src/common.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 4f97a4847f..36f2aa0941 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -398,6 +398,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
    // We might want to let other threads (including NCCL threads) use the CPU.
    if (idle) pthread_yield();
   }
+  free(done);
   return testSuccess;
 }
 
@@ -814,7 +815,7 @@ testResult_t run() {
     int rank = proc*nThreads*nGpus+i;
     hipDeviceProp_t prop;
     HIPCHECK(hipGetDeviceProperties(&prop, hipDev));
-    len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
+    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
                     rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name);
   }
 

From ca7a565236ce9353d1fe56026afa5a2b0e7bb9f1 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Fri, 16 Aug 2019 09:06:28 -0700
Subject: [PATCH 028/233] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7a4bbbc6ca..b8b65676b0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # NCCL Tests
 
-These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl)
+These tests check both the performance and the correctness of [NCCL](http://github.com/nvidia/nccl) operations.
 
 ## Build
 

From 13d0ddd12e93d72ce69a083e5811439a3b658f73 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 25 Sep 2019 14:07:04 -0700
Subject: [PATCH 029/233] Init data for throughput iterations to avoid all zero
 data

---
 src/common.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 36f2aa0941..e13aa3521b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -851,6 +851,9 @@ testResult_t run() {
     HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
     AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
     HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+    // initialize data buffer to avoid all zero data
+    TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i));
+    HIPCHECK(hipDeviceSynchronize());
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL

From a2af1d959dd01a68b55a8f31aa44538d58dc0c35 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 10 Oct 2019 10:51:05 -0700
Subject: [PATCH 030/233] Update README.md

Checks are now fully local, so there is no need to disable them at scale.
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b8b65676b0..791bed2599 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
-Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, disabling checks :
+Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
 ```shell
-$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0
+$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
 ```
 
 ### Performance

From 24eb972cae0f0cbe228ddcf9d653db38688d8cfd Mon Sep 17 00:00:00 2001
From: Pak Lui <5041261+paklui@users.noreply.github.com>
Date: Thu, 17 Oct 2019 15:38:37 -0700
Subject: [PATCH 031/233] fix syntax error for string comparison

---
 install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index 32e5dc4d4e..7c8a865ef5 100755
--- a/install.sh
+++ b/install.sh
@@ -78,7 +78,7 @@ build_dir=./build
 rm -rf ${build_dir}
 
 if ($mpi_enabled); then
-    if [[ ${mpi_dir} -eq "" ]]; then
+    if [[ ${mpi_dir} == "" ]]; then
         echo "MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument."
         exit 1
     else

From aa0f02bee034b85450075c5c0d0ed373f42d8e2d Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 11 Nov 2019 11:37:45 -0800
Subject: [PATCH 032/233] Fix incorrect print out when data size is greater
 than 4GB

---
 src/common.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index e13aa3521b..af0e2d22dd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include <libgen.h>
 #include <signal.h>
+#include <algorithm>
 
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
@@ -551,7 +552,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Benchmark
   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
-      print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+      print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
       TESTCHECK(BenchTime(args, type, op, root, 0));
       TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");

From 32399955afc00d566d481e56c050bf22dcd11e76 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 12 Nov 2019 23:03:59 +0000
Subject: [PATCH 033/233] Fix build with RCCL 1.x API

---
 src/common.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index af0e2d22dd..5bf78eeebe 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -853,7 +853,11 @@ testResult_t run() {
     AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
     HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
     // initialize data buffer to avoid all zero data
+#if NCCL_MAJOR >= 2
     TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i));
+#else
+    TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclChar, 0, i));
+#endif
     HIPCHECK(hipDeviceSynchronize());
   }
 

From 1cda2f52b6a3b96a5035c049519a988f8e3bccfa Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 15 Nov 2019 13:46:03 -0800
Subject: [PATCH 034/233] Add bf16 support in rccl-tests

---
 src/Makefile        |   3 +-
 src/common.cu       |  26 ++++-
 src/common.h        |   4 +
 src/rccl_bfloat16.h | 253 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 283 insertions(+), 3 deletions(-)
 create mode 100644 src/rccl_bfloat16.h

diff --git a/src/Makefile b/src/Makefile
index 78470b8f48..157a351e5c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -15,11 +15,10 @@ HIPCC = $(ROCM_HOME)/hip/bin/hipcc
 CXX = $(HIPCC)
 
 
-HIPCUFLAGS :=
+HIPCUFLAGS := -std=c++14
 HIPCUFLAGS += -I$(ROCM_HOME)/include
 HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl
 HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip
-HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include
 LDFLAGS    := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
 HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
 
diff --git a/src/common.cu b/src/common.cu
index 5bf78eeebe..07ebcd90a3 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -7,6 +7,7 @@
  ************************************************************************/
 
 #include "hip/hip_runtime.h"
+#include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
 #include <cstdio>
@@ -16,8 +17,13 @@
 #include <algorithm>
 
 #if NCCL_MAJOR >= 2
+#if RCCL_BFLOAT16 == 1
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, ncclBfloat16};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double", "bf16"};
+#else
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
 const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+#endif
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
@@ -78,6 +84,9 @@ double DeltaMaxValue(ncclDataType_t type) {
 #endif
     case ncclInt64:
     case ncclUint64: return 1e-200;
+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+    case ncclBfloat16: return 1e-2;
+#endif
   }
   return 1e-200;
 }
@@ -155,6 +164,10 @@ testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataTyp
     case ncclInt64:
     case ncclUint64:
       hipLaunchKernelGGL((deltaKern<uint64_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+    case ncclBfloat16:
+      hipLaunchKernelGGL((deltaKern<rccl_bfloat16, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
+#endif
   }
   HIPCHECK(hipDeviceSynchronize());
   return testSuccess;
@@ -181,6 +194,10 @@ template<>
 __device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
   return __float2half(testValue<float>(offset, rep, rank));
 }
+template<>
+__device__ rccl_bfloat16 testValue<rccl_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return (float)testValue<float>(offset, rep, rank);
+}
 
 // Operations
 template<typename T>
@@ -220,7 +237,11 @@ typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, co
 
 static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
 #if NCCL_MAJOR >= 2
+#if RCCL_BFLOAT16 == 1
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), OPS(rccl_bfloat16)
+#else
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+#endif
 #else
   OPS(char), OPS(int32_t), OPS(half), OPS(float), OPS(double), OPS(int64_t), OPS(uint64_t)
 #endif
@@ -251,7 +272,10 @@ static initDataKern_t const initDataKerns[ncclNumTypes] = {
   InitDataKernel<uint64_t>,
   InitDataKernel<    half>,
   InitDataKernel<   float>,
-  InitDataKernel<  double>
+  InitDataKernel<  double>,
+#if RCCL_BFLOAT16 == 1
+  InitDataKernel<rccl_bfloat16>
+#endif
 #else
   InitDataKernel<    char>,
   InitDataKernel< int32_t>,
diff --git a/src/common.h b/src/common.h
index dd98d547df..54f216c9c5 100644
--- a/src/common.h
+++ b/src/common.h
@@ -195,6 +195,10 @@ static size_t wordSize(ncclDataType_t type) {
     case ncclDouble:
     //case ncclFloat64:
       return 8;
+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+    case ncclBfloat16:
+      return 2;
+#endif
     default: return 0;
   }
 }
diff --git a/src/rccl_bfloat16.h b/src/rccl_bfloat16.h
new file mode 100644
index 0000000000..06b053a626
--- /dev/null
+++ b/src/rccl_bfloat16.h
@@ -0,0 +1,253 @@
+/**
+ * MIT License
+ *
+ * Copyright 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*!\file
+ * \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
+ */
+
+#ifndef _RCCL_BFLOAT16_H_
+#define _RCCL_BFLOAT16_H_
+
+#if __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+// If this is a C compiler, C++ compiler below C++14, or a host-only compiler, we only
+// include a minimal definition of rccl_bfloat16
+
+#include <stdint.h>
+/*! \brief Struct to represent a 16 bit brain floating point number. */
+typedef struct
+{
+    uint16_t data;
+} rccl_bfloat16;
+
+#else // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <ostream>
+#include <type_traits>
+
+struct rccl_bfloat16
+{
+    uint16_t data;
+
+    __host__ __device__ rccl_bfloat16() = default;
+
+    // round upper 16 bits of IEEE float to convert to bfloat16
+    explicit constexpr __host__ __device__ rccl_bfloat16(float f)
+        : data(float_to_bfloat16(f))
+    {
+    }
+
+    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
+    constexpr __host__ __device__ operator float() const
+    {
+        union
+        {
+            uint32_t int32;
+            float    fp32;
+        } u = {uint32_t(data) << 16};
+        return u.fp32;
+    }
+
+private:
+    static constexpr __host__ __device__ uint16_t float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+            // least significant bits of the float mantissa are greater than 0x8000,
+            // or if they are equal to 0x8000 and the least significant bit of the
+            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+            // has the value 0x7f, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
+        }
+        else if(u.int32 & 0xffff)
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 16 bits of the mantissa are 1, we set the least significant bit
+            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+            // the bloat16's mantissa bits are all 0.
+            u.int32 |= 0x10000; // Preserve signaling NaN
+        }
+        return uint16_t(u.int32 >> 16);
+    }
+};
+
+typedef struct
+{
+    uint16_t data;
+} rccl_bfloat16_public;
+
+static_assert(std::is_standard_layout<rccl_bfloat16>{},
+              "rccl_bfloat16 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+static_assert(std::is_trivial<rccl_bfloat16>{},
+              "rccl_bfloat16 is not a trivial type, and thus is "
+              "incompatible with C.");
+
+static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
+                  && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
+              "internal rccl_bfloat16 does not match public rccl_bfloat16");
+
+inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
+{
+    return os << float(bf16);
+}
+constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
+{
+    return a;
+}
+constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
+{
+    a.data ^= 0x8000;
+    return a;
+}
+constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return rccl_bfloat16(float(a) + float(b));
+}
+constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return rccl_bfloat16(float(a) - float(b));
+}
+constexpr __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return rccl_bfloat16(float(a) * float(b));
+}
+constexpr __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return rccl_bfloat16(float(a) / float(b));
+}
+constexpr __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return float(a) < float(b);
+}
+constexpr __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return float(a) == float(b);
+}
+constexpr __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return b < a;
+}
+constexpr __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return !(a > b);
+}
+constexpr __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return !(a == b);
+}
+constexpr __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
+{
+    return !(a < b);
+}
+constexpr __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
+{
+    return a = a + b;
+}
+constexpr __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
+{
+    return a = a - b;
+}
+constexpr __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
+{
+    return a = a * b;
+}
+constexpr __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
+{
+    return a = a / b;
+}
+constexpr __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
+{
+    return a += rccl_bfloat16(1.0f);
+}
+constexpr __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
+{
+    return a -= rccl_bfloat16(1.0f);
+}
+constexpr __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
+{
+    rccl_bfloat16 orig = a;
+    ++a;
+    return orig;
+}
+constexpr __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
+{
+    rccl_bfloat16 orig = a;
+    --a;
+    return orig;
+}
+
+namespace std
+{
+    constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
+    {
+        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
+    }
+    constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
+    {
+        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
+    }
+    constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
+    {
+        return !(a.data & 0x7fff);
+    }
+    inline rccl_bfloat16 sin(rccl_bfloat16 a)
+    {
+        return rccl_bfloat16(sinf(float(a)));
+    }
+    inline rccl_bfloat16 cos(rccl_bfloat16 a)
+    {
+        return rccl_bfloat16(cosf(float(a)));
+    }
+}
+
+#endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#endif // _RCCL_BFLOAT16_H_

From bd53e98df32f0eff71556bdfb74543e1dda3c1d7 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 22 Nov 2019 10:29:32 -0800
Subject: [PATCH 035/233] Fix build error with hip-clang

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 07ebcd90a3..3edba38831 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -196,7 +196,7 @@ __device__ half testValue<half>(const size_t offset, const int rep, const int ra
 }
 template<>
 __device__ rccl_bfloat16 testValue<rccl_bfloat16>(const size_t offset, const int rep, const int rank) {
-  return (float)testValue<float>(offset, rep, rank);
+  return rccl_bfloat16(testValue<float>(offset, rep, rank));
 }
 
 // Operations

From 0f173234bb2837327d806e9e4de9af3dda9a7043 Mon Sep 17 00:00:00 2001
From: Wei Zhang <wzam@amazon.com>
Date: Mon, 16 Dec 2019 16:18:22 -0800
Subject: [PATCH 036/233] Add -L$(MPI_HOME)/lib64 to NVLDFLAGS

In some cases, the MPI library is not in $(MPI_HOME)/lib but
in $(MPI_HOME)/lib64, for example on Red Hat-like Linux systems
(CentOS, Amazon Linux) where MPI is installed by yum or rpm.

Under such circumstances, the current Makefile fails to build.
This patch addresses the issue by adding -L$(MPI_HOME)/lib64 to
NVLDFLAGS in src/Makefile.

Signed-off-by: Wei Zhang <wzam@amazon.com>
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 034cc672fa..ed723d4210 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,7 +52,7 @@ endif
 
 ifeq ($(MPI), 1)
 NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
-NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi
+NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi
 endif
 LIBRARIES += curand nccl nvToolsExt
 NVLDFLAGS += $(LIBRARIES:%=-l%)

From 6e9e05972b3dfd709304beaebd7359f7a232cb44 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 11 Mar 2020 13:40:17 -0700
Subject: [PATCH 037/233] Add option for stress testing

---
 src/common.cu | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 3edba38831..6aa2a02450 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -51,6 +51,7 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int memorytype = 0;
+static int stress_cycles = 1;
 static ncclResult_t ncclabort = ncclSuccess;
 
 double parsesize(char *value) {
@@ -573,13 +574,16 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   }
   TESTCHECK(completeColl(args));
 
-  // Benchmark
-  for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
-      setupArgs(size, type, args);
-      print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
-      TESTCHECK(BenchTime(args, type, op, root, 0));
-      TESTCHECK(BenchTime(args, type, op, root, 1));
-      PRINT("\n");
+  for (size_t iter = 0; iter < stress_cycles; iter++) {
+    if (iter > 0) PRINT("# Testing %d cycle.\n", iter+1);
+    // Benchmark
+    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
+        setupArgs(size, type, args);
+        print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+        TESTCHECK(BenchTime(args, type, op, root, 0));
+        TESTCHECK(BenchTime(args, type, op, root, 1));
+        PRINT("\n");
+    }
   }
   return testSuccess;
 }
@@ -683,12 +687,13 @@ int main(int argc, char* argv[]) {
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
     {"memory_type", required_argument, 0, 'y'},
+    {"stress_cycles", required_argument, 0, 's'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:h", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -746,6 +751,9 @@ int main(int argc, char* argv[]) {
       case 'y':
         memorytype = ncclstringtomtype(optarg);
         break;
+      case 's':
+        stress_cycles = strtol(optarg, NULL, 0);
+        break;
       case 'h':
 	printf("USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"

From 119a0ecf600f30d6b82897126f6301e15b6582b8 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 17 Mar 2020 12:00:19 -0700
Subject: [PATCH 038/233] Add alltoall perf test

---
 src/Makefile    |   2 +-
 src/alltoall.cu | 117 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/common.cu   |   4 +-
 src/common.h    |  14 +++---
 4 files changed, 129 insertions(+), 8 deletions(-)
 create mode 100644 src/alltoall.cu

diff --git a/src/Makefile b/src/Makefile
index 034cc672fa..33ca479422 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -60,7 +60,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/alltoall.cu b/src/alltoall.cu
new file mode 100644
index 0000000000..aea9370f65
--- /dev/null
+++ b/src/alltoall.cu
@@ -0,0 +1,117 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+}
+
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = (count/nranks)*nranks;
+  *recvcount = (count/nranks)*nranks;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count/nranks;
+}
+
+testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    char* str = getenv("NCCL_TESTS_DEVICE");
+    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place alltoall
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));
+  *busBw = baseBw * factor;
+}
+
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nRanks; r++) {
+    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
+  return testSuccess;
+}
+
+struct testColl alltoAllTest = {
+  "AlltoAll",
+  AlltoAllGetCollByteCount,
+  AlltoAllInitData,
+  AlltoAllGetBw,
+  AlltoAllRunColl
+};
+
+void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &alltoAllTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  }
+  return testSuccess;
+}
+
+struct testEngine alltoAllEngine = {
+  AlltoAllGetBuffSize,
+  AlltoAllRunTest
+};
+
+#pragma weak ncclTestEngine=alltoAllEngine
diff --git a/src/common.cu b/src/common.cu
index 5a3ae529d6..6d44b9a6ec 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -307,7 +307,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #endif
   }
   double nranks = args->nProcs*args->nThreads*args->nGpus;
-  if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
   *delta = maxDelta;
   return testSuccess;
 }
@@ -832,6 +832,8 @@ testResult_t run() {
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
 
+    threads[t].args.reportErrors = 1;
+
     threads[t].func = parallel_init ? threadInit : threadRunTests;
     if (t)
       TESTCHECK(threadLaunch(threads+t));
diff --git a/src/common.h b/src/common.h
index 8fb5b8cadf..a2d7ae2958 100644
--- a/src/common.h
+++ b/src/common.h
@@ -17,25 +17,25 @@
 #include "nccl1_compat.h"
 
 #define CUDACHECK(cmd) do {                         \
-  cudaError_t e = cmd;                              \
-  if( e != cudaSuccess ) {                          \
+  cudaError_t err = cmd;                            \
+  if( err != cudaSuccess ) {                        \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
     printf("%s: Test CUDA failure %s:%d '%s'\n",    \
          hostname,                                  \
-        __FILE__,__LINE__,cudaGetErrorString(e));   \
+        __FILE__,__LINE__,cudaGetErrorString(err)); \
     return testCudaError;                           \
   }                                                 \
 } while(0)
 
 #define NCCLCHECK(cmd) do {                         \
-  ncclResult_t r = cmd;                             \
-  if (r!= ncclSuccess) {                            \
+  ncclResult_t res = cmd;                           \
+  if (res != ncclSuccess) {                         \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
     printf("%s: Test NCCL failure %s:%d '%s'\n",    \
          hostname,                                  \
-        __FILE__,__LINE__,ncclGetErrorString(r));   \
+        __FILE__,__LINE__,ncclGetErrorString(res)); \
     return testNcclError;                           \
   }                                                 \
 } while(0)
@@ -124,6 +124,8 @@ struct threadArgs {
   double* bw;
   int* bw_count;
 
+  int reportErrors;
+
   struct testColl* collTest;
 };
 

From 6932a583e74c7685fcc4e7206e367d917f3e0485 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 19 Mar 2020 10:18:39 -0700
Subject: [PATCH 039/233] Add gather and scatter test

---
 src/Makefile   |   2 +-
 src/common.cu  |  21 ++++----
 src/gather.cu  | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/scatter.cu | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 269 insertions(+), 10 deletions(-)
 create mode 100644 src/gather.cu
 create mode 100644 src/scatter.cu

diff --git a/src/Makefile b/src/Makefile
index b02d6e886d..e109b8c3ae 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -54,7 +54,7 @@ HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/common.cu b/src/common.cu
index d14c286125..908a69b9c5 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -16,6 +16,8 @@
 #include <signal.h>
 #include <algorithm>
 
+//#define DEBUG_PRINT
+
 #if NCCL_MAJOR >= 2
 #if RCCL_BFLOAT16 == 1
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, ncclBfloat16};
@@ -326,6 +328,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   for (int i=0; i<args->nGpus; i++) {
     int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    if (rank != root && strcmp(args->collTest->name, "Gather") == 0)
+      continue;
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
     HIPCHECK(hipSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
@@ -333,25 +337,24 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     maxDelta = std::max(*(args->deltaHost), maxDelta);
 
 #ifdef DEBUG_PRINT
-    if (rank == 0) {
+    //if (rank == 0) {
        int *expectedHost = (int *)malloc(args->expectedBytes);
        int *dataHost = (int *)malloc(args->expectedBytes);
 
-       hipMemcpy(expectedHost, args->expected[0], args->expectedBytes, hipMemcpyDeviceToHost);
-       printf("\n Expected: ");
+       hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost);
+       printf("\n Rank [%d] Expected: ", rank);
        for(int j=0; j<args->expectedBytes/sizeof(int); j++) {
          printf("%d:%d ", j, expectedHost[j]);
        }
-       printf("\n");
-
        hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
-       printf("\n Actual: ");
+       printf("\n Rank [%d] Actual: ", rank);
        for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
          printf("%d:%d ", j, dataHost[j]);
        }
        printf("\n");
-       free(temp);
-    }
+       free(dataHost);
+       free(expectedHost);
+    //}
 #endif
   }
   double nranks = args->nProcs*args->nThreads*args->nGpus;
@@ -571,7 +574,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   TESTCHECK(completeColl(args));
 
   for (size_t iter = 0; iter < stress_cycles; iter++) {
-    if (iter > 0) PRINT("# Testing %ld cycle.\n", iter+1);
+    if (iter > 0) PRINT("# Testing %lu cycle.\n", iter+1);
     // Benchmark
     for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
         setupArgs(size, type, args);
diff --git a/src/gather.cu b/src/gather.cu
new file mode 100644
index 0000000000..5230fdc81d
--- /dev/null
+++ b/src/gather.cu
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <hip/hip_runtime.h>
+#include "common.h"
+
+//#define DEBUG_PRINT
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s", size, count, typeName);
+}
+
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count/nranks;
+  *recvcount = (count/nranks)*nranks;
+  *sendInplaceOffset = count/nranks;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    HIPCHECK(hipSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+#ifdef DEBUG_PRINT
+    int *dataHost = (int *)malloc(args->sendBytes);
+    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+    printf("\n Rank [%d] Init: ", rank);
+    for (int j=0; j<args->sendBytes/sizeof(int); j++) {
+     printf("%d:%d ", j, dataHost[j]);
+    }
+    printf("\n");
+    free(dataHost);
+#endif
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+    }
+    HIPCHECK(hipDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  NCCLCHECK(ncclGroupStart());
+  if (rank == root) {
+  for (int r=0; r<nRanks; r++)
+    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl gatherTest = {
+  "Gather",
+  GatherGetCollByteCount,
+  GatherInitData,
+  GatherGetBw,
+  GatherRunColl
+};
+
+void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &gatherTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", root));
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  GatherGetBuffSize,
+  GatherRunTest
+};
diff --git a/src/scatter.cu b/src/scatter.cu
new file mode 100644
index 0000000000..18ebb75b07
--- /dev/null
+++ b/src/scatter.cu
@@ -0,0 +1,129 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <hip/hip_runtime.h>
+#include "common.h"
+
+//#define DEBUG_PRINT
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+}
+
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = (count/nranks)*nranks;
+  *recvcount = count/nranks;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *recvcount;
+}
+
+testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    HIPCHECK(hipSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    if (rank == root) {
+      for (int j=0; j<nranks; j++) {
+        TESTCHECK(InitData(((char*)data)+args->expectedBytes*j, recvcount, type, rep, j));
+      }
+#ifdef DEBUG_PRINT
+      int *dataHost = (int *)malloc(args->sendBytes);
+      hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+      printf("\n Rank [%d] Init: ", rank);
+      for (int j=0; j<args->sendBytes/sizeof(int); j++) {
+        printf("%d:%d ", j, dataHost[j]);
+      }
+      printf("\n");
+      free(dataHost);
+#endif
+    }
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, rank));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  NCCLCHECK(ncclGroupStart());
+  if (rank == root) {
+    for (int r=0; r<nRanks; r++)
+      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl scatterTest = {
+  "Scatter",
+  ScatterGetCollByteCount,
+  ScatterInitData,
+  ScatterGetBw,
+  ScatterRunColl
+};
+
+void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &scatterTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", root));
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  ScatterGetBuffSize,
+  ScatterRunTest
+};

From 07b8876277596948512fab4df2b705d6e1e6ca15 Mon Sep 17 00:00:00 2001
From: Sourav Chakraborty <souchakr@amd.com>
Date: Fri, 10 Apr 2020 07:51:39 -0700
Subject: [PATCH 040/233] Improve makefile to avoid LD_LIBRARY_PATH

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index e109b8c3ae..8a63340968 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,11 +37,11 @@ endif
 BUILDDIR ?= ../build
 ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)/include/
-HIPLDFLAGS   += -L$(NCCL_HOME)/lib
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME)/lib -L$(NCCL_HOME)/lib
 endif
 
 ifeq ($(MPI), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
 HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
 else ifeq ($(MPICH), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich

From 0d7c4db33ec7d2859b28a2cbe1b07baf227923f0 Mon Sep 17 00:00:00 2001
From: Sourav Chakraborty <souchakr@amd.com>
Date: Fri, 10 Apr 2020 07:53:38 -0700
Subject: [PATCH 041/233] Add sendrecv benchmark

---
 src/Makefile    |   2 +-
 src/sendrecv.cu | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 src/sendrecv.cu

diff --git a/src/Makefile b/src/Makefile
index 8a63340968..ac317eb26b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -54,7 +54,7 @@ HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
new file mode 100644
index 0000000000..180d8c26a9
--- /dev/null
+++ b/src/sendrecv.cu
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <hip/hip_runtime.h>
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s", size, count, typeName);
+}
+
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    HIPCHECK(hipSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    TESTCHECK(InitData(args->sendbuffs[i], sendcount, type, rep, rank));
+    TESTCHECK(InitData(args->recvbuffs[i], recvcount, type, rep, rank));
+    int src = rank < nranks/2 ? rank : rank - nranks/2;
+    TESTCHECK(InitData(args->expected[i],  recvcount, type, rep, src));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = nranks/2;
+  *busBw = baseBw * factor;
+}
+
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  int rank, peer, nranks, npairs;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  NCCLCHECK(ncclCommCount(comm, &nranks));
+  npairs = nranks / 2;
+#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+  if (rank < npairs) {
+      peer = rank + npairs;
+      NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream));
+  } else if (rank < 2*npairs) {
+      peer = rank - npairs;
+      NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream));
+  }
+#endif
+  return testSuccess;
+}
+
+struct testColl sendrecvTest = {
+  "SendRecv",
+  SendRecvGetCollByteCount,
+  SendRecvInitData,
+  SendRecvGetBw,
+  SendRecvRunColl
+};
+
+void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &sendrecvTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", 0));
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  SendRecvGetBuffSize,
+  SendRecvRunTest
+};

From 2813c968264b2368264dbfc9f8e864b3d0ef7aa4 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 13 Apr 2020 15:51:57 -0700
Subject: [PATCH 042/233] Add option to use alltoall, gather and scatter API

These APIs launch the RCCL kernel implementation by default. If the environment
variable RCCL_ALLTOALL_KERNEL_DISABLE=1 is set, the APIs instead use a wrapper
around ncclSend and ncclRecv.
---
 src/alltoall.cu |  8 ++++++++
 src/gather.cu   | 12 ++++++++++++
 src/scatter.cu  |  9 +++++++++
 3 files changed, 29 insertions(+)

diff --git a/src/alltoall.cu b/src/alltoall.cu
index 4d86bdaae5..41bbc780a8 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -8,6 +8,8 @@
 #include <hip/hip_runtime.h>
 #include "common.h"
 
+#define USE_RCCL_GATHER_SCATTER
+
 void print_header() {
   PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
   PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
@@ -65,12 +67,18 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
   size_t rankOffset = count * wordSize(type);
   if (count == 0) return testSuccess;
 
+#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
+  NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
+#else
   NCCLCHECK(ncclGroupStart());
   for (int r=0; r<nRanks; r++) {
     NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
     NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
   }
   NCCLCHECK(ncclGroupEnd());
+#endif
+#endif
 
   return testSuccess;
 }
diff --git a/src/gather.cu b/src/gather.cu
index 5230fdc81d..65dc714893 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -9,6 +9,7 @@
 #include "common.h"
 
 //#define DEBUG_PRINT
+#define USE_RCCL_GATHER_SCATTER
 
 void print_header() {
   PRINT("# %10s  %12s  %6s            out-of-place                       in-place          \n", "", "", "");
@@ -57,6 +58,8 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
     }
     HIPCHECK(hipDeviceSynchronize());
   }
+  // We don't support in-place gather
+  args->reportErrors = in_place ? 0 : 1;
   return testSuccess;
 }
 
@@ -76,6 +79,13 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat
 
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
+#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
+  if (rank == root)
+    NCCLCHECK(ncclGather(sendbuff, recvbuff, count, type, root, comm, stream));
+  else
+    NCCLCHECK(ncclGather(sendbuff, 0, count, type, root, comm, stream));
+#else
   NCCLCHECK(ncclGroupStart());
   if (rank == root) {
   for (int r=0; r<nRanks; r++)
@@ -83,6 +93,8 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat
   }
   NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
   NCCLCHECK(ncclGroupEnd());
+#endif
+#endif
   return testSuccess;
 }
 
diff --git a/src/scatter.cu b/src/scatter.cu
index 18ebb75b07..d741391300 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -9,6 +9,7 @@
 #include "common.h"
 
 //#define DEBUG_PRINT
+#define USE_RCCL_GATHER_SCATTER
 
 void print_header() {
   PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
@@ -59,6 +60,8 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
     TESTCHECK(InitData(args->expected[i], recvcount, type, rep, rank));
     HIPCHECK(hipDeviceSynchronize());
   }
+  // We don't support in-place scatter
+  args->reportErrors = in_place ? 0 : 1;
   return testSuccess;
 }
 
@@ -78,6 +81,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa
 
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
+#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
+  NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream));
+#else
   NCCLCHECK(ncclGroupStart());
   if (rank == root) {
     for (int r=0; r<nRanks; r++)
@@ -85,6 +92,8 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa
   }
   NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
   NCCLCHECK(ncclGroupEnd());
+#endif
+#endif
   return testSuccess;
 }
 

From 95e498c939be8ec098fdda262952fc00f4012baa Mon Sep 17 00:00:00 2001
From: saadrahim <44449863+saadrahim@users.noreply.github.com>
Date: Thu, 7 May 2020 12:29:07 -0600
Subject: [PATCH 043/233] Restarting CI (#6)

---
 .jenkins/common.groovy              | 37 +++++++++++++
 .jenkins/precheckin.groovy          | 81 +++++++++++++++++++++++++++++
 docker/dockerfile-build-centos      | 41 +++++++++++++++
 docker/dockerfile-build-ubuntu-rock | 43 +++++++++++++++
 docker/dockerfile-install-centos    |  8 +++
 docker/dockerfile-install-ubuntu    |  8 +++
 6 files changed, 218 insertions(+)
 create mode 100644 .jenkins/common.groovy
 create mode 100644 .jenkins/precheckin.groovy
 create mode 100644 docker/dockerfile-build-centos
 create mode 100644 docker/dockerfile-build-ubuntu-rock
 create mode 100644 docker/dockerfile-install-centos
 create mode 100644 docker/dockerfile-install-ubuntu

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
new file mode 100644
index 0000000000..5bf86a4765
--- /dev/null
+++ b/.jenkins/common.groovy
@@ -0,0 +1,37 @@
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+def runCompileCommand(platform, project, jobName)
+{
+    project.paths.construct_build_prefix()
+
+    String hipclangArgs = jobName.contains('hipclang') ? '--hip-clang' : ''
+    def getRCCL = auxiliary.getLibrary('rccl',platform.jenkinsLabel,'develop')
+
+    def command = """#!/usr/bin/env bash
+                set -x
+                ${getRCCL}
+                ${auxiliary.exitIfNotSuccess()}
+                cd ${project.paths.project_build_prefix}
+                ${project.paths.build_command}
+                ${auxiliary.exitIfNotSuccess()}
+            """
+
+    platform.runCommand(this,command)
+}
+
+def runTestCommand (platform, project)
+{
+    String sudo = auxiliary.sudo(platform.jenkinsLabel)
+
+    def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}
+		python3 -m pytest -k "not MPI" --verbose --junitxml=./testreport.xml
+            """
+
+   platform.runCommand(this, command)
+   junit "${project.paths.project_build_prefix}/build/release/test/*.xml"
+}
+
+return this
diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
new file mode 100644
index 0000000000..aae81c922e
--- /dev/null
+++ b/.jenkins/precheckin.groovy
@@ -0,0 +1,81 @@
+#!/usr/bin/env groovy
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
+@Library('rocJenkins@pong') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj  = new rocProject('rccl-tests', 'PreCheckin')
+    prj.paths.build_command = './install.sh'
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    boolean formatCheck = false
+
+    def commonGroovy
+
+    def compileCommand =
+    {
+        platform, project->
+
+        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
+        commonGroovy.runCompileCommand(platform, project, jobName)
+    }
+    
+    def testCommand =
+    {
+        platform, project->
+
+        commonGroovy.runTestCommand(platform, project)
+    }
+
+    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
+}
+
+ci: { 
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
+                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
+                        "rocm-docker":[]]
+    propertyList = auxiliary.appendPropertyList(propertyList)
+
+    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['rccl906']]), 
+                       "rocm-docker":([ubuntu16:['rccl906']])]
+    jobNameList = auxiliary.appendJobNameList(jobNameList)
+    jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu16:['rccl906']]
+    
+    propertyList.each 
+    {
+        jobName, property->
+        if (urlJobName == jobName)
+            properties(auxiliary.addCommonProperties(property))
+    }
+
+    jobNameList.each 
+    {
+        jobName, nodeDetails->
+        if (urlJobName == jobName)
+            stage(jobName) {
+                runCI(nodeDetails, jobName)
+            }
+    }
+
+    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
+    if(!jobNameList.keySet().contains(urlJobName))
+    {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        stage(urlJobName) {
+            runCI([ubuntu16:['rccl906']], urlJobName)
+        }
+    }
+}
\ No newline at end of file
diff --git a/docker/dockerfile-build-centos b/docker/dockerfile-build-centos
new file mode 100644
index 0000000000..6e48134bfa
--- /dev/null
+++ b/docker/dockerfile-build-centos
@@ -0,0 +1,41 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# This Dockerfile provides a starting point for a ROCm installation of rccl
+
+# Parameters related to building rccl
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+USER root
+ARG user_uid
+
+# Install dependent packages
+RUN yum install -y --nogpgcheck \
+    sudo \
+    chrpath \
+    rock-dkms \
+    rocm-cmake \
+    centos-release-scl \
+    devtoolset-7 \
+    ca-certificates \
+    git \
+    cmake3 \
+    make \
+    libgomp \
+    clang \
+    clang-devel \
+    gcc-c++ \
+    pkgconfig \
+    numactl-libs 
+
+RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
+    'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
+
+# The docker pipeline runs containers with a particular uid.
+# Create a jenkins user with this specific uid so it can use sudo privileges.
+# Grant any member of the video group password-less sudo privileges.
+RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
+    echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
+    chmod 400 /etc/sudoers.d/sudo-nopasswd
+
diff --git a/docker/dockerfile-build-ubuntu-rock b/docker/dockerfile-build-ubuntu-rock
new file mode 100644
index 0000000000..f7e17d500a
--- /dev/null
+++ b/docker/dockerfile-build-ubuntu-rock
@@ -0,0 +1,43 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+ARG user_uid
+
+# Install dependent packages
+# Dependencies:
+# * hcc-config.cmake: pkg-config
+# * tensile: python2.7, python-yaml
+# * rocblas-test: gfortran, googletest
+# * rocblas-bench: libboost-program-options-dev
+# * libhsakmt.so: libnuma1
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    rock-dkms \
+    sudo \
+    ca-certificates \
+    chrpath \
+    git \
+    make \
+    cmake \
+    pkg-config \
+    python2.7 \
+    python-yaml \
+    python3-pytest \
+    rocm-cmake \
+    libboost-program-options-dev \
+    libnuma1 \
+    libomp-dev \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# docker pipeline runs containers with particular uid
+# create a jenkins user with this specific uid so it can use sudo priviledges
+# Grant any member of sudo group password-less sudo privileges
+RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
+    mkdir -p /etc/sudoers.d/ && \
+    echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
+
diff --git a/docker/dockerfile-install-centos b/docker/dockerfile-install-centos
new file mode 100644
index 0000000000..2ccd6337f6
--- /dev/null
+++ b/docker/dockerfile-install-centos
@@ -0,0 +1,8 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+#empty for now
diff --git a/docker/dockerfile-install-ubuntu b/docker/dockerfile-install-ubuntu
new file mode 100644
index 0000000000..d0b70e37c1
--- /dev/null
+++ b/docker/dockerfile-install-ubuntu
@@ -0,0 +1,8 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+#empty for now

From a698b55cf50688041f470c1b15dbe3d35e66b14f Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 11 May 2020 15:17:53 -0700
Subject: [PATCH 044/233] Update rccl_bfloat16.h to match rocBLAS

---
 src/rccl_bfloat16.h | 77 ++++++++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 28 deletions(-)

diff --git a/src/rccl_bfloat16.h b/src/rccl_bfloat16.h
index 06b053a626..cbc6e059a5 100644
--- a/src/rccl_bfloat16.h
+++ b/src/rccl_bfloat16.h
@@ -1,7 +1,7 @@
 /**
  * MIT License
  *
- * Copyright 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -29,9 +29,9 @@
 #ifndef _RCCL_BFLOAT16_H_
 #define _RCCL_BFLOAT16_H_
 
-#if __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
 
-// If this is a C compiler, C++ compiler below C++14, or a host-only compiler, we only
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
 // include a minimal definition of rccl_bfloat16
 
 #include <stdint.h>
@@ -41,7 +41,7 @@ typedef struct
     uint16_t data;
 } rccl_bfloat16;
 
-#else // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
 
 #include <cmath>
 #include <cstddef>
@@ -54,16 +54,26 @@ struct rccl_bfloat16
 {
     uint16_t data;
 
+    enum truncate_t
+    {
+        truncate
+    };
+
     __host__ __device__ rccl_bfloat16() = default;
 
     // round upper 16 bits of IEEE float to convert to bfloat16
-    explicit constexpr __host__ __device__ rccl_bfloat16(float f)
+    explicit __host__ __device__ rccl_bfloat16(float f)
         : data(float_to_bfloat16(f))
     {
     }
 
+    explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
+        : data(truncate_float_to_bfloat16(f))
+    {
+    }
+
     // zero extend lower 16 bits of bfloat16 to convert to IEEE float
-    constexpr __host__ __device__ operator float() const
+    __host__ __device__ operator float() const
     {
         union
         {
@@ -74,7 +84,7 @@ struct rccl_bfloat16
     }
 
 private:
-    static constexpr __host__ __device__ uint16_t float_to_bfloat16(float f)
+    static __host__ __device__ uint16_t float_to_bfloat16(float f)
     {
         union
         {
@@ -115,6 +125,17 @@ private:
         }
         return uint16_t(u.int32 >> 16);
     }
+
+    // Truncate instead of rounding, preserving SNaN
+    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+    }
 };
 
 typedef struct
@@ -138,86 +159,86 @@ inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
 {
     return os << float(bf16);
 }
-constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
+inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
 {
     return a;
 }
-constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
+inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
 {
     a.data ^= 0x8000;
     return a;
 }
-constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return rccl_bfloat16(float(a) + float(b));
 }
-constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return rccl_bfloat16(float(a) - float(b));
 }
-constexpr __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return rccl_bfloat16(float(a) * float(b));
 }
-constexpr __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return rccl_bfloat16(float(a) / float(b));
 }
-constexpr __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return float(a) < float(b);
 }
-constexpr __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return float(a) == float(b);
 }
-constexpr __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return b < a;
 }
-constexpr __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return !(a > b);
 }
-constexpr __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return !(a == b);
 }
-constexpr __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
+inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
 {
     return !(a < b);
 }
-constexpr __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
 {
     return a = a + b;
 }
-constexpr __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
 {
     return a = a - b;
 }
-constexpr __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
 {
     return a = a * b;
 }
-constexpr __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
+inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
 {
     return a = a / b;
 }
-constexpr __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
+inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
 {
     return a += rccl_bfloat16(1.0f);
 }
-constexpr __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
+inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
 {
     return a -= rccl_bfloat16(1.0f);
 }
-constexpr __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
+inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
 {
     rccl_bfloat16 orig = a;
     ++a;
     return orig;
 }
-constexpr __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
+inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
 {
     rccl_bfloat16 orig = a;
     --a;
@@ -248,6 +269,6 @@ namespace std
     }
 }
 
-#endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
+#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
 
 #endif // _RCCL_BFLOAT16_H_

From 97a26afc2626650783ff94f36b1254d809b4d7d0 Mon Sep 17 00:00:00 2001
From: saadrahim <44449863+saadrahim@users.noreply.github.com>
Date: Fri, 22 May 2020 09:58:42 -0600
Subject: [PATCH 045/233] Update common.groovy

---
 .jenkins/common.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 5bf86a4765..c893f9fd8e 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -31,7 +31,7 @@ def runTestCommand (platform, project)
             """
 
    platform.runCommand(this, command)
-   junit "${project.paths.project_build_prefix}/build/release/test/*.xml"
+   junit "${project.paths.project_build_prefix}/*.xml"
 }
 
 return this

From 622771cc4e5db5a589c6ef0febafe1fe5367d848 Mon Sep 17 00:00:00 2001
From: saadrahim <44449863+saadrahim@users.noreply.github.com>
Date: Tue, 2 Jun 2020 10:19:38 -0600
Subject: [PATCH 046/233] Removing old Jenkinsfile that is no longer needed

---
 Jenkinsfile | 82 -----------------------------------------------------
 1 file changed, 82 deletions(-)
 delete mode 100644 Jenkinsfile

diff --git a/Jenkinsfile b/Jenkinsfile
deleted file mode 100644
index 7589636c68..0000000000
--- a/Jenkinsfile
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env groovy
-// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS
-@Library('rocJenkins@noDocker') _
-
-// This is file for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-import com.amd.project.*
-import com.amd.docker.*
-
-////////////////////////////////////////////////////////////////////////
-// Mostly generated from snippet generator 'properties; set job properties'
-// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
-properties([
-    pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
-    buildDiscarder(logRotator(
-      artifactDaysToKeepStr: '',
-      artifactNumToKeepStr: '',
-      daysToKeepStr: '',
-      numToKeepStr: '10')),
-    disableConcurrentBuilds(),
-    [$class: 'CopyArtifactPermissionProperty', projectNames: '*']
-   ])
-
-
-////////////////////////////////////////////////////////////////////////
-import java.nio.file.Path;
-
-rcclTestsCI:
-{
-    def rcclTests = new rocProject('rcclTests')
-    // customize for project
-    rcclTests.paths.build_command = './install.sh'
-
-    // Define test architectures, optional rocm version argument is available
-    def nodes = new dockerNodes(['RCCL'], rcclTests)
-
-    boolean formatCheck = false
-
-    def compileCommand =
-    {
-        platform, project->
-
-        project.paths.construct_build_prefix()
-
-        def command = """#!/usr/bin/env bash
-                  set -x
-                  rm -rf rccl
-                  git clone https://github.com/ROCmSoftwarePlatform/rccl
-                  cd rccl
-                  export RCCL_PATH=${WORKSPACE}/rccl/rccl-install
-                  ./install.sh -i --prefix=\$RCCL_PATH
-                  cd ..
-                  cd ${project.paths.project_build_prefix}
-                  ${project.paths.build_command} --rccl_home=\$RCCL_PATH
-                """
-	  sh command
-    }
-    def testCommand =
-    {
-        platform, project->
-
-        def command = """#!/usr/bin/env bash
-                set -x
-                LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml
-            """
-
-        sh command
-        //junit "${project.paths.project_build_prefix}/build/release/*.xml"
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        def command = """
-                      """
-    }
-
-    buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}

From ba924dac95c794540b582cdbc480398fb3f64930 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Wed, 3 Jun 2020 15:07:51 -0700
Subject: [PATCH 047/233] Fix #43 : Add .gitignore for build dir

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..a0a013e438
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENCE.txt for license information
+/build

From 83b846cf4fb8e16f6e07f5577168c40db7173e65 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 10 Jun 2020 23:24:08 +0000
Subject: [PATCH 048/233] Correct size when init sendbuff

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 908a69b9c5..ed88f51905 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -885,9 +885,9 @@ testResult_t run() {
     HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
     // initialize data buffer to avoid all zero data
 #if NCCL_MAJOR >= 2
-    TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i));
+    TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
 #else
-    TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclChar, 0, i));
+    TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclChar, 0, i));
 #endif
     HIPCHECK(hipDeviceSynchronize());
   }

From 7a833631b2ba685627aec257627a966f58e26bd4 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Mon, 15 Jun 2020 08:54:21 -0700
Subject: [PATCH 049/233] Remove sm_30

---
 src/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ed723d4210..56d2e6345d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -15,8 +15,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-		-gencode=arch=compute_35,code=sm_35 \
+NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \
 		-gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \

From af4fa0f4cf7c2c3db0540da7ac8d6efc1d526635 Mon Sep 17 00:00:00 2001
From: Luke Yeager <lyeager@nvidia.com>
Date: Tue, 7 Jan 2020 13:30:19 -0800
Subject: [PATCH 050/233] Fix some memory leaks

---
 src/common.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 5a3ae529d6..2c5e38eca3 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -302,7 +302,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
          printf("%d:%d ", j, dataHost[j]);
        }
        printf("\n");
-       free(temp);
+       free(expectedHost);
+       free(dataHost);
     }
 #endif
   }
@@ -351,6 +352,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
    // We might want to let other threads (including NCCL threads) use the CPU.
    if (idle) pthread_yield();
   }
+  free(done);
   return testSuccess;
 }
 

From 07ac716c1ac5999964bd583806ec37e928251119 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 18 Jun 2020 15:00:05 -0700
Subject: [PATCH 051/233] Fix #47 : compilation error on NCCL<2.7

Return an error when trying to run alltoall test when compiled
against NCCL<2.7.
---
 src/alltoall.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/alltoall.cu b/src/alltoall.cu
index aea9370f65..31cfca090d 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -64,14 +64,18 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
   size_t rankOffset = count * wordSize(type);
   if (count == 0) return testSuccess;
 
+#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
+  printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
+  return testNcclError;
+#else
   NCCLCHECK(ncclGroupStart());
   for (int r=0; r<nRanks; r++) {
     NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
     NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
   }
   NCCLCHECK(ncclGroupEnd());
-
   return testSuccess;
+#endif
 }
 
 struct testColl alltoAllTest = {

From ec1b5e22e618d342698fda659efdd5918da6bd9f Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Fri, 19 Jun 2020 10:40:33 -0700
Subject: [PATCH 052/233] Change all_gather/reduce_scatter algbw to match the
 documentation.

Fix #45 : All_gather and reduce_scatter algorithm bandwidth was
computed as count*typesize*(nranks-1)/time, which is not consistent with the
way we compute it for other collectives.

This change makes algbw higher; busbw is unchanged.
---
 src/all_gather.cu     | 4 ++--
 src/reduce_scatter.cu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index cfb2ec356b..f5bc44c57d 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -48,10 +48,10 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
 }
 
 void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
-  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+  double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
 
   *algBw = baseBw;
-  double factor = 1;
+  double factor = ((double)(nranks - 1))/((double)nranks);
   *busBw = baseBw * factor;
 }
 
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 0b1d986952..86e789c15d 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -47,10 +47,10 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
 }
 
 void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
-  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+  double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
 
   *algBw = baseBw;
-  double factor = 1;
+  double factor = ((double)(nranks - 1))/((double)nranks);
   *busBw = baseBw * factor;
 }
 

From b2603a2e85436b63b80a02b5fc45df84fe42be7b Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 23 Jun 2020 18:16:46 -0700
Subject: [PATCH 053/233] Add gencode for CUDA11

---
 src/Makefile | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2440db1672..0770f080ed 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,14 +13,25 @@ CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
 NVCC = $(CUDA_HOME)/bin/nvcc
 
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -genncode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_80,code=sm_80 \
+                -gencode=arch=compute_80,code=compute_80
+else
 NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \
-		-gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
-		-gencode=arch=compute_70,code=compute_70 \
-		-gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=compute_70
+endif
 
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 

From afdaf59b3b179af51553614c85925dd2ab0a39a4 Mon Sep 17 00:00:00 2001
From: Luke Yeager <lukeyeager@users.noreply.github.com>
Date: Wed, 24 Jun 2020 14:39:22 -0700
Subject: [PATCH 054/233] Fix typo in src/Makefile

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 0770f080ed..52169bb3e1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -21,7 +21,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
 NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
-                -genncode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=sm_70 \
                 -gencode=arch=compute_80,code=sm_80 \
                 -gencode=arch=compute_80,code=compute_80
 else

From 346cb164427e3e3de1acbe068587af65d17e6aff Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Mon, 6 Jul 2020 17:12:50 -0700
Subject: [PATCH 055/233] Change scatter and gather bandwidth calculation to
 match alltoall (#7)

---
 src/gather.cu  | 11 +++++++----
 src/scatter.cu | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/gather.cu b/src/gather.cu
index 65dc714893..4b98ede241 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -64,10 +64,10 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
 }
 
 void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
-  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
   *algBw = baseBw;
-  double factor = 1;
+  double factor = ((double)(nranks-1))/((double)(nranks));
   *busBw = baseBw * factor;
 }
 
@@ -79,7 +79,10 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat
 
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
-#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
+  printf("NCCL 2.7 or later is needed for gather. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
+  return testNcclError;
+#else
 #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
   if (rank == root)
     NCCLCHECK(ncclGather(sendbuff, recvbuff, count, type, root, comm, stream));
@@ -93,9 +96,9 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat
   }
   NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
   NCCLCHECK(ncclGroupEnd());
-#endif
 #endif
   return testSuccess;
+#endif
 }
 
 struct testColl gatherTest = {
diff --git a/src/scatter.cu b/src/scatter.cu
index d741391300..4dbbda25a3 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -66,10 +66,10 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
 }
 
 void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
-  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
   *algBw = baseBw;
-  double factor = 1;
+  double factor = ((double)(nranks-1))/((double)(nranks));
   *busBw = baseBw * factor;
 }
 
@@ -81,7 +81,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa
 
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
-#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7
+#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
+  printf("NCCL 2.7 or later is needed for scatter. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
+  return testNcclError;
+#else
 #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
   NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream));
 #else
@@ -92,9 +95,9 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa
   }
   NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
   NCCLCHECK(ncclGroupEnd());
-#endif
 #endif
   return testSuccess;
+#endif
 }
 
 struct testColl scatterTest = {

From 3d63a84d97b49edeb63e9b7e0be407ecbe7f008e Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 21 Aug 2020 21:34:55 +0000
Subject: [PATCH 056/233] Add cumask option

---
 src/common.cu | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index bc9ac3185c..e1fb769c95 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -55,6 +55,7 @@ static int blocking_coll = 0;
 static int memorytype = 0;
 static int stress_cycles = 1;
 static ncclResult_t ncclabort = ncclSuccess;
+static uint32_t cumask[4];
 
 double parsesize(char *value) {
     long long int units;
@@ -687,12 +688,13 @@ int main(int argc, char* argv[]) {
     {"blocking", required_argument, 0, 'z'},
     {"memory_type", required_argument, 0, 'y'},
     {"stress_cycles", required_argument, 0, 's'},
+    {"cumask", required_argument, 0, 'u'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:u:h", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -753,6 +755,16 @@ int main(int argc, char* argv[]) {
       case 's':
         stress_cycles = strtol(optarg, NULL, 0);
         break;
+      case 'u':
+        {
+          int nmasks = 0;
+          char *mask = strtok(optarg, ",");
+          while (mask != NULL && nmasks < 4) {
+            cumask[nmasks++] = strtol(mask, NULL, 16);
+            mask = strtok(NULL, ",");
+          };
+        }
+        break;
       case 'h':
 	printf("USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
@@ -771,6 +783,8 @@ int main(int argc, char* argv[]) {
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
             "[-y,--memory_type <coarse/fine/host>] \n\t"
+            "[-s,--stress_cycles <number of cycles>] \n\t"
+            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -793,6 +807,8 @@ int main(int argc, char* argv[]) {
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
             "[-y,--memory_type <coarse/fine/host>] \n\t"
+            "[-s,--stress_cycles <number of cycles>] \n\t"
+            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -882,7 +898,13 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
     AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
-    HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+    if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
+      PRINT("cumask: ");
+      for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+      PRINT("\n");
+      HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+    } else
+      HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
     // initialize data buffer to avoid all zero data
 #if NCCL_MAJOR >= 2
     TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));

From 58dcd35af23f64a9becb08e8cf4c872177133227 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 27 Aug 2020 23:45:47 +0000
Subject: [PATCH 057/233] Add alltoallv test

---
 src/Makefile     |   2 +-
 src/alltoallv.cu | 190 +++++++++++++++++++++++++++++++++++++++++++++++
 src/common.cu    |  41 +++++-----
 3 files changed, 215 insertions(+), 18 deletions(-)
 create mode 100644 src/alltoallv.cu

diff --git a/src/Makefile b/src/Makefile
index ac317eb26b..5b7ffa965c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -54,7 +54,7 @@ HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv alltoallv
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
new file mode 100644
index 0000000000..7993059441
--- /dev/null
+++ b/src/alltoallv.cu
@@ -0,0 +1,190 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <hip/hip_runtime.h>
+#include "common.h"
+
+#define USE_RCCL_GATHER_SCATTER
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+}
+
+void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  if (count < nranks*nranks/2) {
+    *sendcount = 0;
+    *recvcount = 0;
+    *sendInplaceOffset = 0;
+    *recvInplaceOffset = 0;
+    *paramcount = 0;
+  } else {
+    *sendcount = (count/nranks)*nranks;
+    *recvcount = (count/nranks)*nranks;
+    *sendInplaceOffset = 0;
+    *recvInplaceOffset = 0;
+    *paramcount = count/nranks;
+  }
+}
+
+testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    char* str = getenv("NCCL_TESTS_DEVICE");
+    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    HIPCHECK(hipSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+#if 0
+    int *dataHost = (int *)malloc(args->sendBytes);
+    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+    printf(" Rank [%d] Original: ", rank);
+    for(int j=0; j<sendcount; j++) {
+      printf("%d:%d ", j, dataHost[j]);
+    }
+    printf("\n");
+    free(dataHost);
+#endif
+    size_t rdisp = 0;
+    size_t data_count = sendcount*2/nranks;
+    size_t chunksize = data_count/nranks;
+    for (int j=0; j<nranks; j++) {
+      size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+      if (j+rank == nranks-1)
+          rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
+      size_t sdisp = 0;
+      for (int k=0; k<nranks; k++) {
+        scount = ((k+j)%nranks)*chunksize;
+        if (k+j == nranks-1)
+          scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+        if (k == rank)
+          break;
+        sdisp += scount;
+      }
+      TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
+      rdisp += rcount;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+  }
+  // We don't support in-place alltoall
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));
+  *busBw = baseBw * factor;
+}
+
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  int nranks;
+  NCCLCHECK(ncclCommCount(comm, &nranks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  #define MAX_ALLTOALLV_RANKS 256
+  static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS];
+  if (count == 0) return testSuccess;
+  if (nranks > MAX_ALLTOALLV_RANKS) {
+    printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS);
+    return testNcclError;
+  }
+
+  size_t disp = 0;
+  size_t chunksize = count*2/nranks;
+  for (int i = 0; i < nranks; i++) {
+      size_t scount = ((i+rank)%nranks)*chunksize;
+      if (i+rank == nranks-1)
+          scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
+      sendcounts[i] = recvcounts[i] = scount;
+      sdispls[i] = rdispls[i] = disp;
+      disp += scount;
+  }
+
+#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
+  printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
+  return testNcclError;
+#else
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nranks; r++) {
+    if (sendcounts[r] != 0) {
+      NCCLCHECK(ncclSend(
+          ((char*)sendbuff) + sdispls[r] * wordSize(type),
+          sendcounts[r],
+          type,
+          r,
+          comm,
+          stream));
+    }
+    if (recvcounts[r] != 0) {
+      NCCLCHECK(ncclRecv(
+          ((char*)recvbuff) + rdispls[r] * wordSize(type),
+          recvcounts[r],
+          type,
+          r,
+          comm,
+          stream));
+    }
+  }
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+#endif
+}
+
+struct testColl alltoAllTest = {
+  "AlltoAllv",
+  AlltoAllvGetCollByteCount,
+  AlltoAllvInitData,
+  AlltoAllvGetBw,
+  AlltoAllvRunColl
+};
+
+void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &alltoAllTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  AlltoAllvGetBuffSize,
+  AlltoAllvRunTest
+};
diff --git a/src/common.cu b/src/common.cu
index e1fb769c95..23d884cb9b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -128,7 +128,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     double delta = absDiff(A[i], B[i]);
     if( delta > locmax ) {
       locmax = delta;
-#ifdef DEBUG_PRINT
+#if 0
       if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
 #endif
     }
@@ -339,23 +339,30 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
 #ifdef DEBUG_PRINT
     //if (rank == 0) {
-       int *expectedHost = (int *)malloc(args->expectedBytes);
-       int *dataHost = (int *)malloc(args->expectedBytes);
+      int *expectedHost = (int *)malloc(args->expectedBytes);
+      int *dataHost = (int *)malloc(args->expectedBytes);
 
-       hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost);
-       printf("\n Rank [%d] Expected: ", rank);
-       for(int j=0; j<args->expectedBytes/sizeof(int); j++) {
-         printf("%d:%d ", j, expectedHost[j]);
-       }
-       hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
-       printf("\n Rank [%d] Actual: ", rank);
-       for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
-         printf("%d:%d ", j, dataHost[j]);
-       }
-       printf("\n");
-       free(expectedHost);
-       free(dataHost);
-    }
+      hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost);
+      hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
+      int j, k, l;
+      for (j=0; j<args->expectedBytes/sizeof(int); j++)
+        if (expectedHost[j] != dataHost[j]) break;
+      k = j;
+      for (; j<args->expectedBytes/sizeof(int); j++)
+        if (expectedHost[j] == dataHost[j]) break;
+      l = j;
+      printf("\n Rank [%d] Expected: ", rank);
+      for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
+        printf("%d:%d ", j, expectedHost[j]);
+      }
+      printf("\n Rank [%d] Actual  : ", rank);
+      for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
+        printf("%d:%d ", j, dataHost[j]);
+      }
+      printf("\n");
+      free(expectedHost);
+      free(dataHost);
+    //}
 #endif
   }
   double nranks = args->nProcs*args->nThreads*args->nGpus;

From 0d1940e18eafa337f558b9d6410399eb3d97f96d Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 17 Sep 2020 14:54:12 -0700
Subject: [PATCH 058/233] Prioritize NCCL_HOME

---
 src/Makefile | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 5b7ffa965c..260e98a282 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,13 +14,19 @@ DEBUG ?= 0
 HIPCC = $(ROCM_HOME)/hip/bin/hipcc
 CXX = $(HIPCC)
 
-
 HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+endif
 HIPCUFLAGS += -I$(ROCM_HOME)/include
 HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl
 HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip
-LDFLAGS    := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
+LDFLAGS    += -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3
@@ -35,10 +41,6 @@ endif
 .PHONY: build clean
 
 BUILDDIR ?= ../build
-ifneq ($(NCCL_HOME), "")
-HIPCUFLAGS += -I$(NCCL_HOME)/include/
-HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME)/lib -L$(NCCL_HOME)/lib
-endif
 
 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi

From 3f1dfacc9503434560d6e16551865b7d8d816a96 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 10 Sep 2020 22:31:19 +0000
Subject: [PATCH 059/233] Add test for alltoallv API

---
 src/alltoallv.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 7993059441..30577fea43 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -122,6 +122,9 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
 #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
   printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
   return testNcclError;
+#else
+#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
+  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, type, comm, stream));
 #else
   NCCLCHECK(ncclGroupStart());
   for (int r=0; r<nranks; r++) {
@@ -145,6 +148,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
     }
   }
   NCCLCHECK(ncclGroupEnd());
+#endif
   return testSuccess;
 #endif
 }

From 4c0ec7347ca6de4772b436452ee28a588b055eb6 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 25 Sep 2020 18:06:09 +0000
Subject: [PATCH 060/233] Fix build error

---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 260e98a282..f4e1f805c0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,6 +10,7 @@ MPI_HOME ?= /usr/lib/openmpi
 PREFIX ?= /usr/local
 VERBOSE ?= 0
 DEBUG ?= 0
+NCCL_HOME ?= ""
 
 HIPCC = $(ROCM_HOME)/hip/bin/hipcc
 CXX = $(HIPCC)

From bf4a866109667bea8c7fa142d940ef00f46fc2d6 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 19 Oct 2020 14:06:23 -0400
Subject: [PATCH 061/233] Uses nullptr as send buffer for non-root ranks during
 scatter

---
 src/scatter.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/scatter.cu b/src/scatter.cu
index 4dbbda25a3..b18ed382ce 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -86,7 +86,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa
   return testNcclError;
 #else
 #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER)
-  NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream));
+  if (rank == root)
+    NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream));
+  else
+    NCCLCHECK(ncclScatter(0, recvbuff, count, type, root, comm, stream));
 #else
   NCCLCHECK(ncclGroupStart());
   if (rank == root) {

From e3f9e281f1a03abf983b81e1a1804ef7214dee6b Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 19 Oct 2020 14:43:01 -0700
Subject: [PATCH 062/233] Fix mpich linking option

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index f4e1f805c0..8b33b66b13 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -48,7 +48,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
 HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
 else ifeq ($(MPICH), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich
-HIPLDFLAGS += -L/usr/lib -lmpi
+HIPLDFLAGS += -L/usr/lib -lmpich
 endif
 
 LIBRARIES += rccl

From d310466d882948cf44fc2a75ee419ce2a6958ce3 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 14 Dec 2020 18:01:04 -0500
Subject: [PATCH 063/233] Fix alltoallv test

---
 src/alltoallv.cu | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 30577fea43..fb6d0acde8 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -101,7 +101,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
   #define MAX_ALLTOALLV_RANKS 256
-  static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS];
+  static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
   if (count == 0) return testSuccess;
   if (nranks > MAX_ALLTOALLV_RANKS) {
     printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS);
@@ -114,9 +114,10 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
       size_t scount = ((i+rank)%nranks)*chunksize;
       if (i+rank == nranks-1)
           scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
-      sendcounts[i] = recvcounts[i] = scount;
-      sdispls[i] = rdispls[i] = disp;
+      sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount;
+      sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp;
       disp += scount;
+      //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type), sdispls[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type));
   }
 
 #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
@@ -124,23 +125,23 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   return testNcclError;
 #else
 #if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
-  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, type, comm, stream));
+  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, recvbuff, recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, type, comm, stream));
 #else
   NCCLCHECK(ncclGroupStart());
   for (int r=0; r<nranks; r++) {
-    if (sendcounts[r] != 0) {
+    if (sendcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
       NCCLCHECK(ncclSend(
-          ((char*)sendbuff) + sdispls[r] * wordSize(type),
-          sendcounts[r],
+          ((char*)sendbuff) + sdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
+          sendcounts[r+rank*MAX_ALLTOALLV_RANKS],
           type,
           r,
           comm,
           stream));
     }
-    if (recvcounts[r] != 0) {
+    if (recvcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
       NCCLCHECK(ncclRecv(
-          ((char*)recvbuff) + rdispls[r] * wordSize(type),
-          recvcounts[r],
+          ((char*)recvbuff) + rdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
+          recvcounts[r+rank*MAX_ALLTOALLV_RANKS],
           type,
           r,
           comm,

From 3117033150250c096f4ff00089f5aad056e8abac Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 15 Dec 2020 22:05:50 -0500
Subject: [PATCH 064/233] Add support for testing memory allocated with
 hipMallocManaged

---
 src/common.cu | 9 +++++++--
 src/common.h  | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 23d884cb9b..401ba46c2b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -32,7 +32,7 @@ const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "dou
 #endif
 ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
 const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
-const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host"};
+const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"};
 
 thread_local int is_main_thread = 0;
 
@@ -655,6 +655,11 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     HIPCHECK(hipHostMalloc(recvbuff, nbytes));
     HIPCHECK(hipHostMalloc(expected, recvBytes));
   }
+  else if (memorytype == ncclManaged) {
+    HIPCHECK(hipMallocManaged(sendbuff, nbytes));
+    HIPCHECK(hipMallocManaged(recvbuff, nbytes));
+    HIPCHECK(hipMallocManaged(expected, recvBytes));
+  }
   else {
     HIPCHECK(hipMalloc(sendbuff, nbytes));
     HIPCHECK(hipMalloc(recvbuff, nbytes));
@@ -813,7 +818,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
-            "[-y,--memory_type <coarse/fine/host>] \n\t"
+            "[-y,--memory_type <coarse/fine/host/managed>] \n\t"
             "[-s,--stress_cycles <number of cycles>] \n\t"
             "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-h,--help]\n",
diff --git a/src/common.h b/src/common.h
index 8de2efaa4b..a498cce8e6 100644
--- a/src/common.h
+++ b/src/common.h
@@ -212,7 +212,8 @@ extern const char *test_opnames[ncclNumOps];
 typedef enum { ncclCoarse        = 0,
                ncclFine          = 1,
                ncclHost          = 2,
-               nccl_NUM_MTYPES   = 3 } ncclMemoryType_t;
+               ncclManaged       = 3,
+               nccl_NUM_MTYPES   = 4 } ncclMemoryType_t;
 extern const char *test_memorytypes[nccl_NUM_MTYPES];
 
 static int ncclstringtotype(char *str) {

From da67a81c8e43496e442931ccacf5fc3fd1b4e91e Mon Sep 17 00:00:00 2001
From: Jithin Jose <jijos@microsoft.com>
Date: Fri, 18 Dec 2020 10:12:54 -0800
Subject: [PATCH 065/233] Use DJB2a hash algorithm in getHostHash()

---
 src/common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.h b/src/common.h
index a2d7ae2958..0fb5aa4467 100644
--- a/src/common.h
+++ b/src/common.h
@@ -165,10 +165,10 @@ static void getHostName(char* hostname, int maxlen) {
 #include <stdint.h>
 
 static uint64_t getHostHash(const char* string) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
   for (int c = 0; string[c] != '\0'; c++){
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }

From ae1ce98e69dfec377261ad168214dfc8d47aa996 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 4 Jan 2021 11:37:32 -0800
Subject: [PATCH 066/233] Add boot_id to the hostname hash due to collisions on
 Azure

Fixes #60
---
 src/common.h | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/common.h b/src/common.h
index 0fb5aa4467..865ee258cd 100644
--- a/src/common.h
+++ b/src/common.h
@@ -164,15 +164,46 @@ static void getHostName(char* hostname, int maxlen) {
 
 #include <stdint.h>
 
-static uint64_t getHostHash(const char* string) {
+static uint64_t getHash(const char* string, size_t n) {
   // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
-  for (int c = 0; string[c] != '\0'; c++){
+  for (size_t c = 0; c < n; c++) {
     result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }
 
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances.
+ * Equivalent to a hash of:
+ *
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ * If the boot_id file cannot be opened or read, only the hostname is hashed. */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
+static uint64_t getHostHash(const char* hostname) {
+  char hostHash[1024] = "";  // zero-filled, so the last byte stays '\0'
+
+  // Fallback is the hostname alone; copy at most size-1 bytes so strlen() is safe
+  (void) strncpy(hostHash, hostname, sizeof(hostHash)-1);
+  int offset = strlen(hostHash);
+
+  FILE *file = fopen(HOSTID_FILE, "r");
+  if (file != NULL) {
+    char *p;  // allocated by fscanf's "%ms" conversion, released with free()
+    if (fscanf(file, "%ms", &p) == 1) {
+        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+        free(p);
+    }
+    fclose(file);  // close only a stream that was opened; fclose(NULL) is invalid
+  }
+
+  // Make sure the string is terminated
+  hostHash[sizeof(hostHash)-1]='\0';
+
+  return getHash(hostHash, strlen(hostHash));
+}
+
 static size_t wordSize(ncclDataType_t type) {
   switch(type) {
     case ncclChar:

From e5f1482efb91c1e0d505ec9b61070dc9f5b60e28 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Mon, 4 Jan 2021 16:51:16 -0500
Subject: [PATCH 067/233] Add tests code that can print info and reset
 input/output buffers

---
 src/common.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 401ba46c2b..6363899965 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -659,6 +659,11 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     HIPCHECK(hipMallocManaged(sendbuff, nbytes));
     HIPCHECK(hipMallocManaged(recvbuff, nbytes));
     HIPCHECK(hipMallocManaged(expected, recvBytes));
+#if 0
+    HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
+    HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
+    HIPCHECK(hipMemset(*expected, 0, recvBytes));
+#endif
   }
   else {
     HIPCHECK(hipMalloc(sendbuff, nbytes));
@@ -910,6 +915,7 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
     AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
+    //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
     if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
       PRINT("cumask: ");
       for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);

From 7677f3f6081cfafb89947b675cde565c63a3f98c Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 20 Jan 2021 17:08:40 -0800
Subject: [PATCH 068/233] Do not allocate memory for expected buffer if
 checking disabled

This allows the tests to be run with larger buffers
---
 src/common.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 19129d66ec..ff4e1fd857 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -559,7 +559,7 @@ testResult_t threadLaunch(struct testThread* thread) {
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
-    CUDACHECK(cudaMalloc(expected, recvBytes));
+    if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
     return testSuccess;
 }
 
@@ -764,7 +764,7 @@ testResult_t run() {
 
   for (int i=0; i<nGpus*nThreads; i++) {
     CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
-    AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
+    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus));
     CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
@@ -868,7 +868,7 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     CUDACHECK(cudaFree(sendbuffs[i]));
     CUDACHECK(cudaFree(recvbuffs[i]));
-    CUDACHECK(cudaFree(expected[i]));
+    if (datacheck) CUDACHECK(cudaFree(expected[i]));
   }
   CUDACHECK(cudaFreeHost(delta));
 

From 39086cdc0ac04e6fc34e06945517e26410da6c45 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 3 Feb 2021 21:16:18 -0500
Subject: [PATCH 069/233] Revert "Allow call ncclCommAbort on Ctrl+C"

This reverts commit 23c374475f0472a06b461ad5ba5d09b5312a1f3c.
---
 src/common.cu | 29 +----------------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 6363899965..443c140fcc 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -13,7 +13,6 @@
 #include <cstdio>
 #include <getopt.h>
 #include <libgen.h>
-#include <signal.h>
 #include <algorithm>
 
 //#define DEBUG_PRINT
@@ -54,7 +53,6 @@ static int parallel_init = 0;
 static int blocking_coll = 0;
 static int memorytype = 0;
 static int stress_cycles = 1;
-static ncclResult_t ncclabort = ncclSuccess;
 static uint32_t cumask[4];
 
 double parsesize(char *value) {
@@ -371,21 +369,6 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   return testSuccess;
 }
 
-void INThandler(int sig) {
-  char  c;
-
-  signal(sig, SIG_IGN);
-  printf("\nDo you want to call ncclCommAbort before exit? [y/n] ");
-  c = getchar();
-  if (c == 'y' || c == 'Y') {
-    ncclabort = ncclSystemError;
-    signal(SIGINT, INThandler);
-  }
-  else
-    exit (0);
-  getchar(); // Get new line character
-}
-
 testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
   hipError_t hipErr;
   int remaining = ngpus;
@@ -411,17 +394,13 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
      if (comms) {
        ncclResult_t ncclAsyncErr;
        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
-       if (ncclAsyncErr != ncclSuccess || ncclabort != ncclSuccess) {
+       if (ncclAsyncErr != ncclSuccess) {
          // An asynchronous error happened. Stop the operation and destroy
          // the communicator
          for (int i=0; i<ngpus; i++)
            NCCLCHECK(ncclCommAbort(comms[i]));
-         // Let all kernels to exit
-         for (int i=0; i<ngpus; i++)
-           HIPCHECK(hipStreamSynchronize(streams[i]));
          // Abort the perf test
          NCCLCHECK(ncclAsyncErr);
-         NCCLCHECK(ncclabort);
        }
      }
 #endif
@@ -676,12 +655,6 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
 testResult_t run(); // Main function
 
 int main(int argc, char* argv[]) {
-#if NCCL_MAJOR >= 2
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-  // may call ncclCommAbort
-  signal(SIGINT, INThandler);
-#endif
-#endif
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
 

From e37545e4911c210558baba789941ea7bf59db00d Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 15 Mar 2021 14:44:06 -0700
Subject: [PATCH 070/233] Add support for new datatype: bfloat16

---
 src/common.cu | 41 +++++++++++++++++++++++++++++++++++++----
 src/common.h  |  3 +++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index ff4e1fd857..4589593b07 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -12,8 +12,16 @@
 #include "cuda.h"
 
 #if NCCL_MAJOR >= 2
-ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
-const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+                                           ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+                                            "bfloat16"
+#endif
+};
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
@@ -61,6 +69,9 @@ double parsesize(char *value) {
 double DeltaMaxValue(ncclDataType_t type) {
   switch(type) {
     case ncclHalf: return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16: return 1e-2;
+#endif
     case ncclFloat: return 1e-5;
     case ncclDouble: return 1e-12;
     case ncclInt:
@@ -95,6 +106,12 @@ template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
 
 template<typename T, int BSIZE> __global__
 void deltaKern(void* A_, void* B_, size_t count, double* max) {
@@ -128,6 +145,10 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 
 testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+#endif
     case ncclHalf:
       deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
@@ -174,6 +195,12 @@ template<>
 __device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
   return __float2half(testValue<float>(offset, rep, rank));
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
 
 // Operations
 template<typename T>
@@ -210,7 +237,10 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
-  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  OPS(__nv_bfloat16)
+#endif
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
@@ -236,7 +266,10 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel<    half>,
   (void*)InitDataKernel<   float>,
-  (void*)InitDataKernel<  double>
+  (void*)InitDataKernel<  double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  (void*)InitDataKernel<__nv_bfloat16>,
+#endif
 };
 
 template<typename T>
diff --git a/src/common.h b/src/common.h
index 865ee258cd..c869254669 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,6 +213,9 @@ static size_t wordSize(ncclDataType_t type) {
 #endif
       return 1;
     case ncclHalf:
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+#endif
     //case ncclFloat16:
       return 2;
     case ncclInt:

From 5373e3c6307d64711c8b03b86b9eafd7d9d45bbd Mon Sep 17 00:00:00 2001
From: Stanley Tsang <stanley.tsang@amd.com>
Date: Tue, 16 Mar 2021 20:38:13 +0000
Subject: [PATCH 071/233] Disabling host and fine memory types for CI testing

---
 .jenkins/common.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index c893f9fd8e..14c644b026 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -27,7 +27,7 @@ def runTestCommand (platform, project)
     def command = """#!/usr/bin/env bash
                 set -x
                 cd ${project.paths.project_build_prefix}
-		python3 -m pytest -k "not MPI" --verbose --junitxml=./testreport.xml
+		python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
             """
 
    platform.runCommand(this, command)

From 0fccaec26f25e13f78c6d3cc1e4ba30c2c363451 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 16 Apr 2021 18:23:28 -0400
Subject: [PATCH 072/233] Update mpich include path

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 8b33b66b13..fa506c567a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,7 +47,7 @@ ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
 HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
 else ifeq ($(MPICH), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich
+HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
 HIPLDFLAGS += -L/usr/lib -lmpich
 endif
 

From e12c35d84b026acc5fb573f7ac5a732430eedd32 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 27 May 2021 09:12:52 -0700
Subject: [PATCH 073/233] Update PERFORMANCE.md

---
 doc/PERFORMANCE.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index 7cc6ecee66..21fef609af 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -46,7 +46,7 @@ A tree would do it hierarchically :
 
 `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
 
-In all cases, we need n-1 additions and n assignations for each element. Since every step is on a different rank except potentially one (the last input and the first output),
+In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
 we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation.
 
 Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
@@ -82,7 +82,7 @@ Note that here, S is the size in bytes of the total array, which for NCCL is equ
 
 ### AllGather
 
-The AllGather operation requires only to perform the assignation part of the allReduce operation :
+The AllGather operation requires only to perform the assignment part of the allReduce operation :
 
  `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
 

From c4de829d9131d83e4b0ca5c08cd9a8eca2dfc289 Mon Sep 17 00:00:00 2001
From: Greg Inozemtsev <ginozemt@amazon.com>
Date: Wed, 2 Jun 2021 17:52:11 -0700
Subject: [PATCH 074/233] Cleanup argument error handling and messages

Add error checking for minbytes and maxbytes arguments

Also accept lowercase literals when parsing size arguments and print errors and usage on stderr.
---
 src/common.cu | 69 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index ff4e1fd857..25fc7dac7a 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -40,22 +40,40 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 
-double parsesize(char *value) {
+static double parsesize(const char *value) { // Parse "<number>[G|M|K]" (suffix case-insensitive) into a byte count; returns -1.0 on malformed input
     long long int units;
     double size;
+    char size_lit;
 
-    if (strchr(value, 'G') != NULL) {
-        units=1024*1024*1024;
-    } else if (strchr(value, 'M') != NULL) {
-        units=1024*1024;
-    } else if (strchr(value, 'K') != NULL) {
-        units=1024;
-    } else {
-        units=1;
+    int count = sscanf(value, "%lf %c", &size, &size_lit); // %c, not %1s: %1s would also store a NUL, overflowing the single char
+
+    switch (count) {
+    case 2:
+      switch (size_lit) {
+      case 'G':
+      case 'g':
+        units = 1024*1024*1024;
+        break;
+      case 'M':
+      case 'm':
+        units = 1024*1024;
+        break;
+      case 'K':
+      case 'k':
+        units = 1024;
+        break;
+      default:
+        return -1.0; // unrecognized unit suffix
+      };
+      break;
+    case 1:
+      units = 1;
+      break;
+    default:
+      return -1.0; // no leading number could be parsed
     }
 
-    size = atof(value)*units;
-    return size;
+    return size * units;
 }
 
 double DeltaMaxValue(ncclDataType_t type) {
@@ -570,6 +588,7 @@ int main(int argc, char* argv[]) {
   setlinebuf(stdout);
 
   // Parse args
+  double parsed;
   int longindex;
   static struct option longopts[] = {
     {"nthreads", required_argument, 0, 't'},
@@ -605,10 +624,20 @@ int main(int argc, char* argv[]) {
         nGpus = strtol(optarg, NULL, 0);
         break;
       case 'b':
-        minBytes = (size_t)parsesize(optarg);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'minbytes'\n");
+          return -1;
+        }
+        minBytes = (size_t)parsed;
         break;
       case 'e':
-        maxBytes = (size_t)parsesize(optarg);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'maxbytes'\n");
+          return -1;
+        }
+        maxBytes = (size_t)parsed;
         break;
       case 'i':
         stepBytes = strtol(optarg, NULL, 0);
@@ -623,7 +652,7 @@ int main(int argc, char* argv[]) {
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
         agg_iters = (int)strtol(optarg, NULL, 0);
 #else
-        printf("Option -m not supported before NCCL 2.2. Ignoring\n");
+        fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
 #endif
         break;
       case 'w':
@@ -648,7 +677,7 @@ int main(int argc, char* argv[]) {
         blocking_coll = strtol(optarg, NULL, 0);
         break;
       case 'h':
-	printf("USAGE: %s \n\t"
+        fprintf(stderr, "USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
             "[-g,--ngpus <gpus per thread>] \n\t"
             "[-b,--minbytes <min size in bytes>] \n\t"
@@ -668,8 +697,8 @@ int main(int argc, char* argv[]) {
 	    basename(argv[0]));
 	return 0;
       default:
-        printf("invalid option \n");
-	printf("USAGE: %s \n\t"
+        fprintf(stderr, "invalid option \n");
+        fprintf(stderr, "USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
             "[-g,--ngpus <gpus per thread>] \n\t"
             "[-b,--minbytes <min size in bytes>] \n\t"
@@ -690,6 +719,12 @@ int main(int argc, char* argv[]) {
 	return 0;
     }
   }
+  if (minBytes > maxBytes) {
+    fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
+           (unsigned long long)minBytes,
+           (unsigned long long)maxBytes);
+    return -1;
+  }
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif

From cde7e769c1879a77daddebe9da164513e030105b Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 17 Jun 2021 14:08:43 -0700
Subject: [PATCH 075/233] Add support for ncclAvg operation

---
 src/all_gather.cu     |  2 +-
 src/all_reduce.cu     |  4 +-
 src/alltoall.cu       |  2 +-
 src/broadcast.cu      |  2 +-
 src/common.cu         | 92 ++++++++++++++++++++++++++++++++++++-------
 src/common.h          |  5 ++-
 src/reduce.cu         |  4 +-
 src/reduce_scatter.cu |  4 +-
 8 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index f5bc44c57d..ee1d0ea0b9 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -84,7 +84,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index bd8daaf0a2..52dce8993c 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -83,7 +83,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
@@ -93,7 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_ops = &op;
     run_opnames = &opName;
   } else {
-    op_count = ncclNumOps;
+    op_count = test_opnum;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 31cfca090d..4afd3eb947 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -102,7 +102,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index c62a99ff62..f7c0094864 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -92,7 +92,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/common.cu b/src/common.cu
index 4589593b07..1313079e79 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -11,23 +11,41 @@
 #include <libgen.h>
 #include "cuda.h"
 
+int test_ncclVersion = 0; // init'd with ncclGetVersion()
+
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
                                            ncclBfloat16
 #endif
 };
 const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
                                             "bfloat16"
 #endif
 };
+
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+int test_typenum = 10;
+#else
+int test_typenum = 9;
+#endif
+
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
+int test_typenum = 7;
 #endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg};
+const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"};
+int test_opnum = 5;
+#else
 ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
 const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
+int test_opnum = 4;
+#endif
 
 thread_local int is_main_thread = 0;
 
@@ -126,7 +144,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     if( delta > locmax ) {
       locmax = delta;
 #ifdef DEBUG_PRINT
-      if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
+      if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
 #endif
     }
   }
@@ -222,23 +240,48 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(
 template<>
 __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
 
-template<typename T, T (*Op)(T, T)>
+template<typename T>
+__device__ T ncclPostOpIdent(T x, int n) { return x; }
+
+template<typename T>
+__device__ T ncclPostOpDiv(T x, int n) { return x/n; }
+template<>
+__device__ half ncclPostOpDiv<half>(half x, int n) { return __float2half(__half2float(x)/n); }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); }
+#endif
+
+template<typename T, T (*Op)(T, T), T(*PostOp)(T,int)>
 __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
     T val = testValue<T>(o+offset, rep, 0);
     for (int i=1; i<nranks; i++) {
       val = Op(val, testValue<T>(o+offset, rep, i));
     }
-    data[o] = val;
+    data[o] = PostOp(val, nranks);
   }
 }
 
-#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type>>
-#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
+#define KERN(type, op, postop) (void*)InitDataReduceKernel<type, op<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)
+#else
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent)
+#endif
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   OPS(__nv_bfloat16)
 #endif
 };
@@ -267,7 +310,7 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<    half>,
   (void*)InitDataKernel<   float>,
   (void*)InitDataKernel<  double>,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   (void*)InitDataKernel<__nv_bfloat16>,
 #endif
 };
@@ -367,7 +410,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
      if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-     if (comms) {
+     if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
        ncclResult_t ncclAsyncErr;
        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
        if (ncclAsyncErr != ncclSuccess) {
@@ -602,6 +645,17 @@ int main(int argc, char* argv[]) {
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
 
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+    ncclGetVersion(&test_ncclVersion);
+  #else
+    test_ncclVersion = NCCL_VERSION_CODE;
+  #endif
+  //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) &&  test_ncclVersion < NCCL_VERSION(2,10,0)) {
+    test_opnum -= 1; // exclude ncclAvg
+    test_typenum -= 1; // exclude bfloat16
+  }
+
   // Parse args
   int longindex;
   static struct option longopts[] = {
@@ -653,7 +707,7 @@ int main(int argc, char* argv[]) {
         iters = (int)strtol(optarg, NULL, 0);
         break;
       case 'm':
-#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
         agg_iters = (int)strtol(optarg, NULL, 0);
 #else
         printf("Option -m not supported before NCCL 2.2. Ignoring\n");
@@ -693,7 +747,11 @@ int main(int argc, char* argv[]) {
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
             "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
@@ -701,8 +759,8 @@ int main(int argc, char* argv[]) {
 	    basename(argv[0]));
 	return 0;
       default:
-        printf("invalid option \n");
-	printf("USAGE: %s \n\t"
+        if (c != 'h') printf("invalid option '%c'\n", c);
+        printf("USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
             "[-g,--ngpus <gpus per thread>] \n\t"
             "[-b,--minbytes <min size in bytes>] \n\t"
@@ -714,7 +772,11 @@ int main(int argc, char* argv[]) {
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
             "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
@@ -899,8 +961,8 @@ testResult_t run() {
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaFree(sendbuffs[i]));
-    CUDACHECK(cudaFree(recvbuffs[i]));
+    if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
   }
   CUDACHECK(cudaFreeHost(delta));
diff --git a/src/common.h b/src/common.h
index c869254669..00103f7722 100644
--- a/src/common.h
+++ b/src/common.h
@@ -235,10 +235,13 @@ static size_t wordSize(ncclDataType_t type) {
   }
 }
 
+extern int test_ncclVersion; // init'd with ncclGetVersion()
 extern ncclDataType_t test_types[ncclNumTypes];
 extern const char *test_typenames[ncclNumTypes];
 extern ncclRedOp_t test_ops[ncclNumOps];
 extern const char *test_opnames[ncclNumOps];
+extern int test_opnum;
+extern int test_typenum;
 
 static int ncclstringtotype(char *str) {
     for (int t=0; t<ncclNumTypes; t++) {
@@ -254,7 +257,7 @@ static int ncclstringtotype(char *str) {
 }
 
 static int ncclstringtoop (char *str) {
-    for (int o=0; o<ncclNumOps; o++) {
+    for (int o=0; o<test_opnum; o++) {
       if (strcmp(str, test_opnames[o]) == 0) {
         return o;
       }
diff --git a/src/reduce.cu b/src/reduce.cu
index 08825e45b0..e40b501b7e 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -83,7 +83,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
@@ -93,7 +93,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
     run_ops = &op;
     run_opnames = &opName;
   } else {
-    op_count = ncclNumOps;
+    op_count = test_opnum;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 86e789c15d..c6de434ebe 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -84,7 +84,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
@@ -94,7 +94,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
     run_opnames = &opName;
     op_count = 1;
   } else {
-    op_count = sizeof(test_ops)/sizeof(test_ops[0]);
+    op_count = test_opnum;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }

From 526eacadf79debe32ae0078cf2cb0fc9ed9c1baf Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 28 Jun 2021 10:12:34 -0700
Subject: [PATCH 076/233] Fixed formatting for bfloat16 support

---
 src/all_gather.cu     | 8 ++++----
 src/all_reduce.cu     | 8 ++++----
 src/alltoall.cu       | 8 ++++----
 src/broadcast.cu      | 8 ++++----
 src/reduce.cu         | 8 ++++----
 src/reduce_scatter.cu | 8 ++++----
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index ee1d0ea0b9..0b9e0cc939 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s            out-of-place                       in-place          \n", "", "", "");
-  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
+  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s", size, count, typeName);
+  PRINT("%12li  %12li  %8s", size, count, typeName);
 }
 
 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 52dce8993c..9b6b7f02b9 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
 }
 
 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 4afd3eb947..865099743d 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
 }
 
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
diff --git a/src/broadcast.cu b/src/broadcast.cu
index f7c0094864..e2b4421ac5 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6i", size, count, typeName, root);
+  PRINT("%12li  %12li  %8s  %6i", size, count, typeName, root);
 }
 
 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
diff --git a/src/reduce.cu b/src/reduce.cu
index e40b501b7e..278768881d 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop", "root",
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop", "root",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s  %6i", size, count, typeName, opName, root);
+  PRINT("%12li  %12li  %8s  %6s  %6i", size, count, typeName, opName, root);
 }
 
 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index c6de434ebe..b0c4fab52e 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -8,15 +8,15 @@
 #include "common.h"
 
 void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
         "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
         "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 }
 
 void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
 }
 
 void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {

From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 28 Jun 2021 14:19:45 -0700
Subject: [PATCH 077/233] Added support for CUDA graph capture/replay (-G)

---
 src/common.cu | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 1313079e79..c180294644 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -65,6 +65,7 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int cudaGraphLaunches = 0;
 
 double parsesize(char *value) {
     long long int units;
@@ -481,6 +482,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
+    }
+  }
+
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
@@ -490,11 +500,40 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
+
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Resync CPU, restart timing, launch cuda graph
+    Barrier(args);
+    start = std::chrono::high_resolution_clock::now();
+    for (int l=0; l<cudaGraphLaunches; l++) {
+      for (int i=0; i<args->nGpus; i++) {
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+      }
+    }
+  }
+
   TESTCHECK(completeColl(args));
 
   auto delta = std::chrono::high_resolution_clock::now() - start;
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
+  if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
 
   double algBw, busBw;
   args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
@@ -508,10 +547,41 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
+      if (cudaGraphLaunches >= 1) {
+        // Begin cuda graph capture for data check
+        for (int i=0; i<args->nGpus; i++) {
+          CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+        }
+      }
+
       //test validation in single itertion, should ideally be included into the multi-iteration run
       TESTCHECK(startColl(args, type, op, root, in_place, 0));
+
+      if (cudaGraphLaunches >= 1) {
+        // End cuda graph capture
+        for (int i=0; i<args->nGpus; i++) {
+          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+        }
+        // Instantiate cuda graph
+        for (int i=0; i<args->nGpus; i++) {
+          CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+        }
+        // Launch cuda graph
+        for (int i=0; i<args->nGpus; i++) {
+          CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+        }
+      }
+
       TESTCHECK(completeColl(args));
 
+      if (cudaGraphLaunches >= 1) {
+        //destroy cuda graph
+        for (int i=0; i<args->nGpus; i++) {
+          CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+          CUDACHECK(cudaGraphDestroy(graphs[i]));
+        }
+      }
+
       TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
 
       //aggregate delta from all threads and procs
@@ -674,12 +744,13 @@ int main(int argc, char* argv[]) {
     {"datatype", required_argument, 0, 'd'},
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
+    {"cudagraph", required_argument, 0, 'G'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -734,6 +805,13 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
+        cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
+#endif
+        break;
       case 'h':
 	printf("USAGE: %s \n\t"
             "[-t,--nthreads <num threads>] \n\t"
@@ -755,6 +833,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -780,6 +859,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;

From 9dae3d3a37a7505a9eb0622be4268e2d2a3cb5f9 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 28 Jun 2021 16:49:10 -0700
Subject: [PATCH 078/233] Added new tests: scatter, sendrecv, hypercube

---
 src/Makefile     |   4 +-
 src/hypercube.cu | 124 +++++++++++++++++++++++++++++++++++++++++++++
 src/scatter.cu   | 125 ++++++++++++++++++++++++++++++++++++++++++++++
 src/sendrecv.cu  | 127 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 378 insertions(+), 2 deletions(-)
 create mode 100644 src/hypercube.cu
 create mode 100644 src/scatter.cu
 create mode 100644 src/sendrecv.cu

diff --git a/src/Makefile b/src/Makefile
index 52169bb3e1..26e653e7d6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/hypercube.cu b/src/hypercube.cu
new file mode 100644
index 0000000000..142f1a6359
--- /dev/null
+++ b/src/hypercube.cu
@@ -0,0 +1,124 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+#define ALIGN 4
+
+void print_header() {  // Prints three '#'-prefixed header rows: group titles, column names, column units
+  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");  // group titles over the two result halves
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",  // column names
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",  // column units
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // Prints the leading size/count/type cells of a result row
+  PRINT("%12li  %12li  %8s", size, count, typeName);  // opName and root are accepted but not printed here; no newline is emitted
+}
+
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {  // Derives the five output sizes from count and nranks (outputs are in the same units as count)
+  size_t base = (count/(ALIGN*nranks))*ALIGN;  // per-rank share of count, rounded down to a multiple of ALIGN
+  *sendcount = base;  // each rank sends one share
+  *recvcount = base*nranks;  // receive side holds one share per rank
+  *sendInplaceOffset = base;
+  *recvInplaceOffset = 0;
+  *paramcount = base;  // same value as sendcount
+}
+
+testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {  // Initialises input, receive and expected buffers for each of this thread's GPUs; op and root are unused
+  size_t sendcount = args->sendBytes / wordSize(type);  // bytes -> elements
+  size_t recvcount = args->expectedBytes / wordSize(type);  // not referenced again in this function
+  int nranks = args->nProcs*args->nThreads*args->nGpus;  // total rank count across processes, threads and GPUs
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;  // device index derived from localRank, thread and i
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);  // global rank derived from proc, thread and i
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));  // zero the receive buffer before initialising
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];  // in-place: input lives in this rank's slot of the receive buffer
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {  // expected buffer: slot j is initialised with the same call used for rank j's input
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+    }
+    CUDACHECK(cudaDeviceSynchronize());  // wait for the device before moving to the next GPU
+  }
+  return testSuccess;
+}
+
+void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {  // Writes algorithm and bus bandwidth for one operation that took sec seconds
+  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;  // count*typesize*(nranks-1) bytes, divided by 1e9 and by elapsed seconds
+
+  *algBw = baseBw;
+  double factor = 1;  // factor of 1: bus bandwidth is reported equal to algorithm bandwidth
+  *busBw = baseBw * factor;
+}
+
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {  // Enqueues pairwise ncclSend/ncclRecv exchanges on stream; op and root are unused
+  char* sbuff = (char*)sendbuff;
+  char* rbuff = (char*)recvbuff;
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankSize = count * wordSize(type);  // bytes occupied by one rank's count elements
+
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));  // copy own input into its slot unless it is already there
+
+  // Hypercube AllGather
+  for (int mask=1; mask<nRanks; mask<<=1) {  // mask doubles each step; the peer for a step is rank^mask
+    NCCLCHECK(ncclGroupStart());
+    int s = rank & ~(mask-1);  // rank with the bits below mask cleared: first slot of the block being sent
+    int r = s ^ mask;  // first slot of the block being received
+    NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));  // send mask consecutive slots
+    NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));  // receive mask consecutive slots
+    NCCLCHECK(ncclGroupEnd());
+  }  // NOTE(review): rank^mask can be >= nRanks when nRanks is not a power of two -- confirm callers restrict nRanks
+  return testSuccess;
+}
+
+struct testColl hyperCubeTest = {  // Callback table that HyperCubeRunTest stores in args->collTest
+  "HyperCube",  // test name
+  HyperCubeGetCollByteCount,
+  HyperCubeInitData,
+  HyperCubeGetBw,
+  HyperCubeRunColl
+};
+
+void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {  // Forwards to HyperCubeGetCollByteCount and exposes only sendcount and recvcount
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;  // remaining outputs are computed and then discarded
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {  // Runs TimeTest once per selected data type; root, op and opName are not forwarded
+  args->collTest = &hyperCubeTest;  // install this test's callbacks
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {  // a single type was requested
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {  // type == -1: iterate over every entry of test_types
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));  // op fixed to 0, op name to "", root to -1
+  }
+  return testSuccess;
+}
+
+struct testEngine hyperCubeEngine = {  // Engine entry points for this test binary
+  HyperCubeGetBuffSize,  // buffer-size query
+  HyperCubeRunTest  // test driver
+};
+
+#pragma weak ncclTestEngine=hyperCubeEngine
diff --git a/src/scatter.cu b/src/scatter.cu
new file mode 100644
index 0000000000..93ab2e694a
--- /dev/null
+++ b/src/scatter.cu
@@ -0,0 +1,125 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {  // Prints three '#'-prefixed header rows: group titles, column names, column units
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");  // group titles over the two result halves
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",  // column names, including a root column
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",  // column units
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // Prints the leading size/count/type/root cells of a result row
+  PRINT("%12li  %12li  %8s  %6i", size, count, typeName, root);  // opName is accepted but not printed here; no newline is emitted
+}
+
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {  // Derives the five output sizes from count and nranks (outputs are in the same units as count)
+  *sendcount = (count/nranks)*nranks;  // count rounded down to a multiple of nranks
+  *recvcount = count/nranks;  // one rank's share
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = count/nranks;
+  *paramcount = count/nranks;  // same value as recvcount
+}
+
+testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {  // Initialises input, receive and expected buffers for each of this thread's GPUs; op is unused
+  size_t sendcount = args->sendBytes / wordSize(type);  // bytes -> elements
+  size_t recvcount = args->expectedBytes / wordSize(type);  // bytes -> elements
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;  // device index derived from localRank, thread and i
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);  // global rank derived from proc, thread and i
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));  // zero the receive buffer before initialising
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];  // in-place runs use the receive buffer as input
+    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));  // only the root's input is initialised
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));  // rep is offset by rank*recvcount -- presumably selects this rank's slice of the root pattern; verify against InitData
+    CUDACHECK(cudaDeviceSynchronize());  // wait for the device before moving to the next GPU
+  }
+  return testSuccess;
+}
+
+void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {  // Writes algorithm and bus bandwidth for one operation that took sec seconds
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;  // count*nranks*typesize bytes, divided by 1e9 and by elapsed seconds
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));  // bus bandwidth scales algorithm bandwidth by (nranks-1)/nranks
+  *busBw = baseBw * factor;
+}
+
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {  // Enqueues a scatter from root built from ncclSend/ncclRecv; op is unused
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankOffset = count * wordSize(type);  // byte stride between consecutive ranks' chunks in sendbuff
+  if (count == 0) return testSuccess;  // nothing is enqueued for an empty transfer
+
+  NCCLCHECK(ncclGroupStart());
+  if (rank == root) {  // root posts one send per rank, itself included
+    for (int r=0; r<nRanks; r++) {
+      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+    }
+  }
+  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));  // every rank, root included, posts a receive from root
+  NCCLCHECK(ncclGroupEnd());
+
+  return testSuccess;
+}
+
+struct testColl scatterTest = {  // Callback table that ScatterRunTest stores in args->collTest
+  "Scatter",  // test name
+  ScatterGetCollByteCount,
+  ScatterInitData,
+  ScatterGetBw,
+  ScatterRunColl
+};
+
+void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {  // Forwards to ScatterGetCollByteCount and exposes only sendcount and recvcount
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;  // remaining outputs are computed and then discarded
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {  // Runs TimeTest for every selected (data type, root) pair; op and opName are not forwarded
+  args->collTest = &scatterTest;  // install this test's callbacks
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+  int begin_root, end_root;  // inclusive range of roots to test
+
+  if ((int)type != -1) {  // a single type was requested
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {  // type == -1: iterate over every entry of test_types
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if (root != -1) {  // a single root was requested
+    begin_root = end_root = root;
+  } else {  // root == -1: iterate over every rank as root
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;  // last rank index
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=begin_root; j<=end_root; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));  // op fixed to 0 and op name to ""
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine scatterEngine = {  // Engine entry points for this test binary
+  ScatterGetBuffSize,  // buffer-size query
+  ScatterRunTest  // test driver
+};
+
+#pragma weak ncclTestEngine=scatterEngine
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
new file mode 100644
index 0000000000..8bebc48e3d
--- /dev/null
+++ b/src/sendrecv.cu
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {  // Prints three '#'-prefixed header rows: group titles, column names, column units
+  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");  // group titles over the two result halves
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",  // column names
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",  // column units
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // Prints the leading size/count/type cells of a result row
+  PRINT("%12li  %12li  %8s", size, count, typeName);  // opName and root are accepted but not printed here; no newline is emitted
+}
+
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {  // Derives the five output sizes from count; nranks is unused
+  *sendcount = count;  // the full count is sent
+  *recvcount = count;  // and the same amount is received
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {  // Initialises input, receive and expected buffers for each of this thread's GPUs; op and root are unused
+  size_t sendcount = args->sendBytes / wordSize(type);  // bytes -> elements
+  size_t recvcount = args->expectedBytes / wordSize(type);  // bytes -> elements
+  int nranks = args->nProcs*args->nThreads*args->nGpus;  // total rank count across processes, threads and GPUs
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;  // device index derived from localRank, thread and i
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);  // global rank derived from proc, thread and i
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));  // zero the receive buffer before initialising
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];  // in-place runs use the receive buffer as input
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    int peer = (rank-1+nranks)%nranks;  // previous rank, wrapping from 0 to nranks-1
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));  // expected data is initialised with the same call used for the previous rank's input
+    CUDACHECK(cudaDeviceSynchronize());  // wait for the device before moving to the next GPU
+  }
+  // We don't support in-place sendrecv
+  args->reportErrors = in_place ? 0 : 1;  // reportErrors is cleared for in-place runs and set otherwise
+  return testSuccess;
+}
+
+void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {  // Writes algorithm and bus bandwidth for one operation that took sec seconds; nranks is unused
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;  // count*typesize bytes, divided by 1e9 and by elapsed seconds
+
+  *algBw = baseBw;
+  double factor = 1;  // factor of 1: bus bandwidth is reported equal to algorithm bandwidth
+  *busBw = baseBw * factor;
+}
+
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {  // Enqueues one send to the next rank and one receive from the previous rank; op and root are unused
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  int recvPeer = (rank-1+nRanks) % nRanks;  // previous rank, wrapping from 0 to nRanks-1
+  int sendPeer = (rank+1) % nRanks;  // next rank, wrapping from nRanks-1 to 0
+
+  NCCLCHECK(ncclGroupStart());  // send and receive are issued inside one group
+  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
+  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl sendRecvTest = {  // Callback table that SendRecvRunTest stores in args->collTest
+  "SendRecv",  // test name
+  SendRecvGetCollByteCount,
+  SendRecvInitData,
+  SendRecvGetBw,
+  SendRecvRunColl
+};
+
+void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {  // Forwards to SendRecvGetCollByteCount and exposes only sendcount and recvcount
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;  // remaining outputs are computed and then discarded
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {  // Runs TimeTest for every selected (data type, op) pair; root is not forwarded
+  args->collTest = &sendRecvTest;  // install this test's callbacks
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) {  // a single type was requested
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {  // type == -1: iterate over every entry of test_types
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) {  // a single op was requested
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else {  // op == -1: iterate over every entry of test_ops
+    op_count = test_opnum;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));  // root fixed to -1
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine sendRecvEngine = {  // Engine entry points for this test binary
+  SendRecvGetBuffSize,  // buffer-size query
+  SendRecvRunTest  // test driver
+};
+
+#pragma weak ncclTestEngine=sendRecvEngine

From 1ae8cdc315d10c3f65764a1915e0fb0f1563d893 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 28 Jun 2021 18:23:12 -0700
Subject: [PATCH 079/233] Resync with changes in gitlab-master code

---
 src/common.cu | 81 +++++++++++++++++++++++----------------------------
 src/common.h  |  5 ++--
 2 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index c180294644..7aad2c1868 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -67,6 +67,8 @@ static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
 
+#define NUM_BLOCKS 32
+
 double parsesize(char *value) {
     long long int units;
     double size;
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   const T* A = (const T*)A_;
   const T* B = (const T*)B_;
   __shared__ double temp[BSIZE];
-  int tid = threadIdx.x;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
   double locmax = 0.0;
-  for(int i=tid; i<count; i+=blockDim.x) {
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
 
     double delta = absDiff(A[i], B[i]);
     if( delta > locmax ) {
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     }
   }
 
+  tid = threadIdx.x;
   temp[tid] = locmax;
   for(int stride = BSIZE/2; stride > 1; stride>>=1) {
     __syncthreads();
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   }
   __syncthreads();
   if( threadIdx.x == 0)
-    *max = temp[0] > temp[1] ? temp[0] : temp[1];
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
 }
 
-
-testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
     case ncclBfloat16:
-      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
 #endif
     case ncclHalf:
-      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
-      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclDouble:
-      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
 
     case ncclChar:
 #if NCCL_MAJOR >= 2
     case ncclUint8:
 #endif
-      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt:
 #if NCCL_MAJOR >= 2
     case ncclUint32:
 #endif
-      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt64:
     case ncclUint64:
-      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
   }
   CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
   return testSuccess;
 }
 
@@ -438,8 +441,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
   size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
-  size_t shift = (totalnbytes * iter) % args->maxbytes;
-  if (shift + totalnbytes > args->maxbytes) shift = 0;
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);
 
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {
 
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
+  if (datacheck) {
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
+  }
 
   // Sync
   TESTCHECK(startColl(args, type, op, root, in_place, 0));
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 
   double timeUsec = deltaSec*1.0E6;
-  char timeStr[10];
+  char timeStr[100];
   if (timeUsec > 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
-  } else if (timeUsec > 100.0) {
+  } else if (timeUsec >= 100.0) {
     sprintf(timeStr, "%7.1f", timeUsec);
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
-      case 'h':
-	printf("USAGE: %s \n\t"
-            "[-t,--nthreads <num threads>] \n\t"
-            "[-g,--ngpus <gpus per thread>] \n\t"
-            "[-b,--minbytes <min size in bytes>] \n\t"
-            "[-e,--maxbytes <max size in bytes>] \n\t"
-            "[-i,--stepbytes <increment size>] \n\t"
-            "[-f,--stepfactor <increment factor>] \n\t"
-            "[-n,--iters <iteration count>] \n\t"
-            "[-m,--agg_iters <aggregated iteration count>] \n\t"
-            "[-w,--warmup_iters <warmup iteration count>] \n\t"
-            "[-p,--parallel_init <0/1>] \n\t"
-            "[-c,--check <0/1>] \n\t"
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
-#else
-            "[-o,--op <sum/prod/min/max/all>] \n\t"
-#endif
-            "[-d,--datatype <nccltype/all>] \n\t"
-            "[-r,--root <root>] \n\t"
-            "[-z,--blocking <0/1>] \n\t"
-            "[-G,--cudagraph <num graph launches>] \n\t"
-            "[-h,--help]\n",
-	    basename(argv[0]));
-	return 0;
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif
-  return run();
+  TESTCHECK(run());
+  return 0;
 }
 
 testResult_t run() {
@@ -900,6 +883,7 @@ testResult_t run() {
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
+  size_t maxMem = ~0;
   for (int i=0; i<nThreads*nGpus; i++) {
     int cudaDev = localRank*nThreads*nGpus+i;
     int rank = proc*nThreads*nGpus+i;
@@ -907,6 +891,7 @@ testResult_t run() {
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
                     rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+    maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
 #if MPI_SUPPORT
@@ -918,10 +903,18 @@ testResult_t run() {
       PRINT("%s", lines+MAX_LINE*p);
     free(lines);
   }
+  MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
 #else
   PRINT("%s", line);
 #endif
 
+  // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
+  size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
+  if (maxBytes > memMaxBytes) {
+    maxBytes = memMaxBytes;
+    if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
+  }
+
   ncclUniqueId ncclId;
   if (proc == 0) {
     NCCLCHECK(ncclGetUniqueId(&ncclId));
@@ -963,7 +956,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
+  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
@@ -1003,7 +996,7 @@ testResult_t run() {
     threads[t].args.sync = (volatile int*)sync;
     threads[t].args.sync_idx = 0;
     threads[t].args.deltaThreads = delta;
-    threads[t].args.deltaHost = (delta + t);
+    threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
     threads[t].args.delta = delta;
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
diff --git a/src/common.h b/src/common.h
index 00103f7722..44b298dfd2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -54,8 +54,8 @@ typedef enum {
   if (r!= testSuccess) {                            \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf(" .. %s: Test failure %s:%d\n",          \
-         hostname,                                  \
+    printf(" .. %s pid %d: Test failure %s:%d\n",   \
+         hostname, getpid(),                        \
         __FILE__,__LINE__);                         \
     return r;                                       \
   }                                                 \
@@ -78,6 +78,7 @@ extern struct testColl allGatherTest;
 extern struct testColl reduceScatterTest;
 extern struct testColl broadcastTest;
 extern struct testColl reduceTest;
+extern struct testColl alltoAllTest;
 
 struct testEngine {
   void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);

From 1dfc76ecccb73f7e7336730663e5901d6d1600ef Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 30 Jun 2021 19:36:07 -0700
Subject: [PATCH 080/233] Added new option to report average iteration time

---
 src/common.cu | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 7aad2c1868..d9f036879e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -66,6 +66,10 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
+#ifdef MPI_SUPPORT
+// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
+static int average = 1;
+#endif
 
 #define NUM_BLOCKS 32
 
@@ -533,6 +537,23 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+#ifdef MPI_SUPPORT
+  switch (average) {
+  case 1:
+    // Calculate the average time across all ranks
+    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus);
+    break;
+  case 2:
+    // Obtain the minimum time across all ranks
+    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+    break;
+  case 3:
+    // Obtain the maximum time across all ranks
+    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    break;
+  }
+#endif
 
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
@@ -752,12 +773,13 @@ int main(int argc, char* argv[]) {
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
     {"cudagraph", required_argument, 0, 'G'},
+    {"average", required_argument, 0, 'a'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -819,6 +841,11 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
+#ifdef MPI_SUPPORT
+      case 'a':
+        average = (int)strtol(optarg, NULL, 0);
+        break;
+#endif
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -842,6 +869,9 @@ int main(int argc, char* argv[]) {
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
             "[-G,--cudagraph <num graph launches>] \n\t"
+#ifdef MPI_SUPPORT
+            "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+#endif
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;

From 11cff17a04e268ea0a82cc8517fdcfde3414280e Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 6 Jul 2021 14:47:50 -0700
Subject: [PATCH 081/233] Updated with new command line arguments

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 791bed2599..bff6433b89 100644
--- a/README.md
+++ b/README.md
@@ -52,19 +52,21 @@ All tests support the same set of arguments :
     * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
     * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
 * NCCL operations arguments
-  * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
+  * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
   * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
   * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
 * Performance
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
+  * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 

From 547e119d350a8ad9034c4e75c8664e62c60bf599 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 8 Jul 2021 16:42:40 -0700
Subject: [PATCH 082/233] Fix issues with MPI_Allreduce and multi-threaded
 tests

---
 src/common.cu | 71 ++++++++++++++++++++++++++-------------------------
 src/common.h  |  4 +--
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index d4ee519107..4768b9be3e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -356,12 +356,9 @@ testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const
   return testSuccess;
 }
 
-void Barrier(struct threadArgs* args)
-{
+void Barrier(struct threadArgs* args) {
   while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
-
   args->barrier[args->barrier_idx] = args->thread + 1;
-
   if (args->thread+1 == args->nThreads) {
 #ifdef MPI_SUPPORT
     MPI_Barrier(MPI_COMM_WORLD);
@@ -370,7 +367,35 @@ void Barrier(struct threadArgs* args)
   } else {
     while (args->barrier[args->barrier_idx]) pthread_yield();
   }
+  args->barrier_idx=!args->barrier_idx;
+}
 
+// Inter-thread/process barrier+allreduce
+void Allreduce(struct threadArgs* args, double* value, int average) {
+  while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
+  double val = *value;
+  if (args->thread > 0) {
+    double val2 = args->reduce[args->barrier_idx];
+    if (average == 1) val += val2;
+    if (average == 2) val = std::min(val, val2);
+    if (average == 3) val = std::max(val, val2);
+  }
+  if (average || args->thread == 0) args->reduce[args->barrier_idx] = val;
+  args->barrier[args->barrier_idx] = args->thread + 1;
+  if (args->thread+1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    if (average != 0) {
+      MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
+      MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD);
+    }
+#endif
+    if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads;
+    args->reduce[1-args->barrier_idx] = 0;
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    while (args->barrier[args->barrier_idx]) pthread_yield();
+  }
+  *value = args->reduce[args->barrier_idx];
   args->barrier_idx=!args->barrier_idx;
 }
 
@@ -383,7 +408,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
     CUDACHECK(cudaSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
-    TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta));
+    TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
     maxDelta = std::max(*(args->deltaHost), maxDelta);
 
 #ifdef DEBUG_PRINT
@@ -555,23 +580,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
-#ifdef MPI_SUPPORT
-  switch (average) {
-  case 1:
-    // Calculate the average time across all ranks
-    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus);
-    break;
-  case 2:
-    // Obtain the minimum time across all ranks
-    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
-    break;
-  case 3:
-    // Obtain the maximum time across all ranks
-    MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    break;
-  }
-#endif
+  Allreduce(args, &deltaSec, average);
 
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
@@ -631,21 +640,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
 
       //aggregate delta from all threads and procs
-      Barrier(args);
-      if (args->thread == 0) {
-        for (int i=1; i<args->nThreads; i++) {
-          maxDelta += args->deltaThreads[i];
-        }
-#ifdef MPI_SUPPORT
-        MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-#endif
-      }
-      Barrier(args);
+      Allreduce(args, &maxDelta, 3);
   }
 
   double timeUsec = deltaSec*1.0E6;
   char timeStr[100];
-  if (timeUsec > 10000.0) {
+  if (timeUsec >= 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
   } else if (timeUsec >= 100.0) {
     sprintf(timeStr, "%7.1f", timeUsec);
@@ -875,6 +875,7 @@ int main(int argc, char* argv[]) {
         average = (int)strtol(optarg, NULL, 0);
         break;
 #endif
+      case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -1033,6 +1034,7 @@ testResult_t run() {
 
   int* sync = (int*)calloc(2, sizeof(int));
   int* barrier = (int*)calloc(2, sizeof(int));
+  double* reduce = (double*)calloc(2, sizeof(double));
 
   struct testThread threads[nThreads];
   memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1058,11 +1060,10 @@ testResult_t run() {
 
     threads[t].args.barrier = (volatile int*)barrier;
     threads[t].args.barrier_idx = 0;
+    threads[t].args.reduce = (volatile double*)reduce;
     threads[t].args.sync = (volatile int*)sync;
     threads[t].args.sync_idx = 0;
-    threads[t].args.deltaThreads = delta;
     threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
-    threads[t].args.delta = delta;
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
diff --git a/src/common.h b/src/common.h
index 44b298dfd2..f789c787cd 100644
--- a/src/common.h
+++ b/src/common.h
@@ -8,6 +8,7 @@
 
 #include "nccl.h"
 #include <stdio.h>
+#include <cstdint>
 #include <algorithm>
 #include <curand.h>
 #ifdef MPI_SUPPORT
@@ -116,11 +117,10 @@ struct threadArgs {
   int sync_idx;
   volatile int* barrier;
   int barrier_idx;
+  volatile double* reduce;
   int syncRank;
   int syncNranks;
-  double* deltaThreads;
   double* deltaHost;
-  double* delta;
   int* errors;
   double* bw;
   int* bw_count;

From b9f90d12a906a0dc5e49f1ede6a52c7779289c01 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 12 Jul 2021 11:43:57 -0700
Subject: [PATCH 083/233] Removed MPI_SUPPORT conditional compilation of
 average flag

---
 src/common.cu | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4768b9be3e..c343342ffa 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -66,10 +66,8 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
-#ifdef MPI_SUPPORT
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
-#endif
 
 #define NUM_BLOCKS 32
 
@@ -870,11 +868,9 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
-#ifdef MPI_SUPPORT
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
-#endif
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -899,9 +895,7 @@ int main(int argc, char* argv[]) {
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
             "[-G,--cudagraph <num graph launches>] \n\t"
-#ifdef MPI_SUPPORT
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
-#endif
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;

From 1f8f5416863a3082975b10eaa05fecee6fe870c8 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 13 Jul 2021 10:17:05 -0700
Subject: [PATCH 084/233] Add CUDA graph support only for CUDA 11.3 and later
 builds

Fixes #90
---
 src/common.cu | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index c343342ffa..6a26c6c4e8 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -534,6 +534,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
+#if CUDART_VERSION >= 11030
   cudaGraph_t graphs[args->nGpus];
   cudaGraphExec_t graphExec[args->nGpus];
   if (cudaGraphLaunches >= 1) {
@@ -542,6 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
     }
   }
+#endif
 
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
@@ -553,6 +555,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
 
+#if CUDART_VERSION >= 11030
   if (cudaGraphLaunches >= 1) {
     // End cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
@@ -571,6 +574,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       }
     }
   }
+#endif
 
   TESTCHECK(completeColl(args));
 
@@ -580,6 +584,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
   Allreduce(args, &deltaSec, average);
 
+#if CUDART_VERSION >= 11030
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
     for (int i=0; i<args->nGpus; i++) {
@@ -587,6 +592,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       CUDACHECK(cudaGraphDestroy(graphs[i]));
     }
   }
+#endif
 
   double algBw, busBw;
   args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
@@ -600,16 +606,19 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
+#if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus; i++) {
           CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
         }
       }
+#endif
 
       //test validation in single itertion, should ideally be included into the multi-iteration run
       TESTCHECK(startColl(args, type, op, root, in_place, 0));
 
+#if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         // End cuda graph capture
         for (int i=0; i<args->nGpus; i++) {
@@ -624,9 +633,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
           CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
         }
       }
+#endif
 
       TESTCHECK(completeColl(args));
 
+#if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         //destroy cuda graph
         for (int i=0; i<args->nGpus; i++) {
@@ -634,6 +645,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
           CUDACHECK(cudaGraphDestroy(graphs[i]));
         }
       }
+#endif
 
       TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
 

From cc34c545098145bc148e5035e4c8e767b4d71ece Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 21 Jul 2021 14:19:48 -0700
Subject: [PATCH 085/233] Use ROCM_PATH instead of ROCM_HOME

---
 src/Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 31e0fda431..cb2bdb09d5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -5,14 +5,14 @@
 # See LICENSE.txt for license information
 #
 
-ROCM_HOME ?= /opt/rocm
+ROCM_PATH ?= /opt/rocm
 MPI_HOME ?= /usr/lib/openmpi
 PREFIX ?= /usr/local
 VERBOSE ?= 0
 DEBUG ?= 0
 NCCL_HOME ?= ""
 
-HIPCC = $(ROCM_HOME)/hip/bin/hipcc
+HIPCC = $(ROCM_PATH)/hip/bin/hipcc
 CXX = $(HIPCC)
 
 HIPCUFLAGS := -std=c++14
@@ -23,11 +23,11 @@ ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)
 HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
-HIPCUFLAGS += -I$(ROCM_HOME)/include
-HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl
-HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip
-LDFLAGS    += -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
+HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3

From f773748b464ea76930d3aa4cd24f270f6c955cb8 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 13 Sep 2021 14:43:22 -0700
Subject: [PATCH 086/233] Resync with NCCL 2.11

New operator: mulsum
New test: gather
---
 src/Makefile  |   2 +-
 src/common.cu | 201 +++++++++++++++++++++++++++++++++++---------------
 src/common.h  |   9 ++-
 src/gather.cu | 131 ++++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 64 deletions(-)
 create mode 100644 src/gather.cu

diff --git a/src/Makefile b/src/Makefile
index 26e653e7d6..c8491ea537 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/common.cu b/src/common.cu
index 6a26c6c4e8..05f814d923 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -14,37 +14,37 @@
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
 #if NCCL_MAJOR >= 2
-ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
-#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-                                           ncclBfloat16
-#endif
-};
-const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
-#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-                                            "bfloat16"
-#endif
-};
+  ncclDataType_t test_types[ncclNumTypes] = {
+    ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    , ncclBfloat16
+  #endif
+  };
+  const char *test_typenames[ncclNumTypes] = {
+    "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    , "bfloat16"
+  #endif
+  };
+  int test_typenum = -1;
 
-#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-int test_typenum = 10;
+  const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"};
+  ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    , ncclAvg
+  #endif
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+    , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand
+  #endif
+  };
+  int test_opnum = -1;
 #else
-int test_typenum = 9;
-#endif
-
-#else
-ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
-const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
-int test_typenum = 7;
-#endif
-
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg};
-const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"};
-int test_opnum = 5;
-#else
-ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
-const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
-int test_opnum = 4;
+  ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
+  const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
+  int test_typenum = 7;
+  const char *test_opnames[] = {"sum", "prod", "max", "min"};
+  ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin};
+  int test_opnum = 4;
 #endif
 
 thread_local int is_main_thread = 0;
@@ -265,45 +265,73 @@ template<>
 __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
 
 template<typename T>
-__device__ T ncclPostOpIdent(T x, int n) { return x; }
-
+__device__ T ncclPPOpIdent(T x, int arg) { return x; }
 template<typename T>
-__device__ T ncclPostOpDiv(T x, int n) { return x/n; }
+__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
+template<typename T>
+__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
 template<>
-__device__ half ncclPostOpDiv<half>(half x, int n) { return __float2half(__half2float(x)/n); }
+__device__ half ncclPPOpMul(half x, int arg) {
+  return __float2half(__half2float(x)*float(arg));
+}
+template<>
+__device__ half ncclPPOpDiv(half x, int n) {
+  return __float2half(__half2float(x)/n);
+}
 #if defined(__CUDA_BF16_TYPES_EXIST__)
 template<>
-__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); }
+__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
+  return __float2bfloat16(__bfloat162float(x)*float(arg));
+}
+template<>
+__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
+  return __float2bfloat16(__bfloat162float(x)/n);
+}
 #endif
 
-template<typename T, T (*Op)(T, T), T(*PostOp)(T,int)>
+__host__ __device__ int preMulScalar(int rank) {
+  return 1 + rank%2;
+}
+
+template<typename T, T (*Op)(T, T), T(*PreOp)(T,int), T(*PostOp)(T,int)>
 __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
     T val = testValue<T>(o+offset, rep, 0);
+    val = PreOp(val, preMulScalar(0));
     for (int i=1; i<nranks; i++) {
-      val = Op(val, testValue<T>(o+offset, rep, i));
+      T val1 = testValue<T>(o+offset, rep, i);
+      val1 = PreOp(val1, preMulScalar(i));
+      val = Op(val, val1);
     }
     data[o] = PostOp(val, nranks);
   }
 }
 
-#define KERN(type, op, postop) (void*)InitDataReduceKernel<type, op<type>, postop<type> >
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
   #define OPS(type) \
-    KERN(type, ncclOpSum, ncclPostOpIdent), \
-    KERN(type, ncclOpProd, ncclPostOpIdent), \
-    KERN(type, ncclOpMax, ncclPostOpIdent), \
-    KERN(type, ncclOpMin, ncclPostOpIdent), \
-    KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
+    KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
 #else
   #define OPS(type) \
-    KERN(type, ncclOpSum, ncclPostOpIdent), \
-    KERN(type, ncclOpProd, ncclPostOpIdent), \
-    KERN(type, ncclOpMax, ncclPostOpIdent), \
-    KERN(type, ncclOpMin, ncclPostOpIdent)
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
 #endif
 
-static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
+static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
 #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   OPS(__nv_bfloat16)
@@ -314,7 +342,7 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset,
   dim3 grid = { 32, 1, 1 };
   dim3 block = { 256, 1, 1 };
   void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
-  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*ncclNumOps+op], grid, block, args, 0, cudaStreamDefault));
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault));
   return testSuccess;
 }
 
@@ -335,7 +363,7 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<   float>,
   (void*)InitDataKernel<  double>,
 #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-  (void*)InitDataKernel<__nv_bfloat16>,
+  (void*)InitDataKernel<__nv_bfloat16>
 #endif
 };
 
@@ -481,7 +509,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
   return testSuccess;
 }
 
-testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int iter) {
+testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) {
   size_t count = args->nbytes / wordSize(type);
 
   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
@@ -499,10 +527,49 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
     char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
+    ncclRedOp_t op;
+
+    if(opIndex < ncclNumOps) {
+      op = opIndex;
+    }
+    #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+    else {
+      union {
+        int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
+        half f16; float f32; double f64;
+        #if defined(__CUDA_BF16_TYPES_EXIST__)
+        __nv_bfloat16 bf16;
+        #endif
+      };
+      int scalar = preMulScalar(rank);
+      switch(type) {
+      case ncclInt8: i8 = int8_t(scalar); break;
+      case ncclUint8: u8 = uint8_t(scalar); break;
+      case ncclInt32: i32 = int32_t(scalar); break;
+      case ncclUint32: u32 = uint32_t(scalar); break;
+      case ncclInt64: i64 = int32_t(scalar); break;
+      case ncclUint64: u64 = uint32_t(scalar); break;
+      case ncclFloat16: f16 = __float2half(float(scalar)); break;
+      case ncclFloat32: f32 = float(scalar); break;
+      case ncclFloat64: f64 = double(scalar); break;
+      #if defined(__CUDA_BF16_TYPES_EXIST__)
+      case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break;
+      #endif
+      }
+      NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
+    }
+    #endif
+
     TESTCHECK(args->collTest->runColl(
           (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
           (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
         count, type, op, root, args->comms[i], args->streams[i]));
+
+    #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+    if(opIndex >= ncclNumOps) {
+      NCCLCHECK(ncclRedOpDestroy(op, args->comms[i]));
+    }
+    #endif
   }
   if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd());
 
@@ -540,7 +607,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
+      // Thread local mode is needed for:
+      // - Multi-thread mode
+      // - P2P pre-connect
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
     }
   }
 #endif
@@ -610,7 +680,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus; i++) {
-          CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+          CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
         }
       }
 #endif
@@ -777,10 +847,19 @@ int main(int argc, char* argv[]) {
     test_ncclVersion = NCCL_VERSION_CODE;
   #endif
   //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
-  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) &&  test_ncclVersion < NCCL_VERSION(2,10,0)) {
-    test_opnum -= 1; // exclude ncclAvg
-    test_typenum -= 1; // exclude bfloat16
-  }
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0)
+    test_opnum = 4;
+    test_typenum = 9;
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
+      test_opnum++; // ncclAvg
+      #if defined(__CUDA_BF16_TYPES_EXIST__)
+        test_typenum++; // bfloat16
+      #endif
+    }
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
+      test_opnum++; // PreMulSum
+    }
+  #endif
 
   // Parse args
   double parsed;
@@ -803,7 +882,8 @@ int main(int argc, char* argv[]) {
     {"blocking", required_argument, 0, 'z'},
     {"cudagraph", required_argument, 0, 'G'},
     {"average", required_argument, 0, 'a'},
-    {"help", no_argument, 0, 'h'}
+    {"help", no_argument, 0, 'h'},
+    {}
   };
 
   while(1) {
@@ -898,7 +978,9 @@ int main(int argc, char* argv[]) {
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <0/1>] \n\t"
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+            "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
             "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
 #else
             "[-o,--op <sum/prod/min/max/all>] \n\t"
@@ -993,6 +1075,7 @@ testResult_t run() {
   }
 #ifdef MPI_SUPPORT
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
+  MPI_Barrier(MPI_COMM_WORLD);
 #endif
   cudaStream_t streams[nGpus*nThreads];
   void* sendbuffs[nGpus*nThreads];
diff --git a/src/common.h b/src/common.h
index f789c787cd..e13816f6f8 100644
--- a/src/common.h
+++ b/src/common.h
@@ -237,12 +237,13 @@ static size_t wordSize(ncclDataType_t type) {
 }
 
 extern int test_ncclVersion; // init'd with ncclGetVersion()
-extern ncclDataType_t test_types[ncclNumTypes];
-extern const char *test_typenames[ncclNumTypes];
-extern ncclRedOp_t test_ops[ncclNumOps];
-extern const char *test_opnames[ncclNumOps];
+constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
 extern int test_opnum;
 extern int test_typenum;
+extern ncclDataType_t test_types[ncclNumTypes];
+extern const char *test_typenames[ncclNumTypes];
+extern ncclRedOp_t test_ops[];
+extern const char *test_opnames[];
 
 static int ncclstringtotype(char *str) {
     for (int t=0; t<ncclNumTypes; t++) {
diff --git a/src/gather.cu b/src/gather.cu
new file mode 100644
index 0000000000..d0cfa5dabb
--- /dev/null
+++ b/src/gather.cu
@@ -0,0 +1,131 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {  // Prints the three-row column header of the gather results table through PRINT.
+  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");  // banner row naming the two column groups
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");  // column titles (same four repeated for out-of-place and in-place)
+  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");  // units row matching the titles above
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {  // Prints the leading columns of one result row; opName is not printed for gather.
+  PRINT("%12zu  %12zu  %8s  %6i", size, count, typeName, root);  // size and count are size_t, so %zu is the matching conversion
+}
+
+// Gather sizes: every rank sends count/nranks elements; the receive side holds nranks such chunks.
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  const size_t chunk = count/nranks;  // integer division: any remainder is dropped
+  *sendcount = *paramcount = *sendInplaceOffset = chunk;
+  *recvcount = chunk*nranks;
+  *recvInplaceOffset = 0;
+}
+
+// Prepares send data and expected results for one gather run on every GPU driven by this thread.
+testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];  // in-place: this rank's slot inside its receive buffer
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));  // baseline expectation: the receive buffer exactly as initialized above
+    if (rank == root) {  // only the root expects the chunks of all ranks, laid out in rank order
+      for (int j=0; j<nranks; j++) {
+        TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+      }
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Gather bandwidth in GB/s: algBw counts count*nranks*typesize bytes over sec seconds;
+// busBw is algBw scaled by (nranks-1)/nranks.
+void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double gbPerSec = (double)(count * nranks * typesize) / 1.0E9 / sec;
+  const double busScale = ((double)(nranks-1))/((double)(nranks));
+  *algBw = gbPerSec;
+  *busBw = gbPerSec * busScale;
+}
+
+// Gather built from point-to-point calls: every rank sends count elements to root, and
+// root posts one receive per rank; all calls are issued inside a single NCCL group.
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int commSize, myRank;
+  NCCLCHECK(ncclCommCount(comm, &commSize));
+  NCCLCHECK(ncclCommUserRank(comm, &myRank));
+  const size_t chunkBytes = count * wordSize(type);  // byte stride between per-rank slots in recvbuff
+  if (count == 0) return testSuccess;  // nothing to transfer
+
+  char* const recvBase = (char*)recvbuff;
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
+  for (int peer = 0; myRank == root && peer < commSize; peer++) {
+    // Slot `peer` of recvbuff receives the chunk sent by rank `peer`.
+    NCCLCHECK(ncclRecv(recvBase + peer*chunkBytes, count, type, peer, comm, stream));
+  }
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl gatherTest = {  // Gather collective descriptor; GatherRunTest stores its address in args->collTest
+  "Gather",  // name string of the collective
+  GatherGetCollByteCount,  // send/recv/param counts and in-place offsets
+  GatherInitData,  // fills send buffers and expected results
+  GatherGetBw,  // algorithm and bus bandwidth computation
+  GatherRunColl  // issues the send/recv calls that implement the gather
+};
+
+void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {  // Reports only the send/recv counts computed by GatherGetCollByteCount.
+  size_t ignoredParam, ignoredSendOffset, ignoredRecvOffset;  // outputs the callee requires but this function discards
+  GatherGetCollByteCount(sendcount, recvcount, &ignoredParam, &ignoredSendOffset, &ignoredRecvOffset, count, nranks);
+}
+
+// Test-engine entry point: times the gather collective for every selected
+// (data type, root) combination.
+//
+//   args     - per-thread test state; its collTest is pointed at gatherTest.
+//   root     - root rank to test, or -1 to use every rank in turn as root.
+//   type     - data type to test, or -1 to run all test_typenum entries of test_types.
+//   typeName - printable name used when a single type is requested.
+//   op, opName - not used; TimeTest is always given (ncclRedOp_t)0 and "".
+//
+// Returns testSuccess once every TimeTest call has gone through TESTCHECK.
+testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &gatherTest;
+
+  // Data types: the single requested one, or the whole global table.
+  const bool singleType = ((int)type != -1);
+  const int type_total = singleType ? 1 : test_typenum;
+  ncclDataType_t *types = singleType ? &type : test_types;
+  const char **names = singleType ? &typeName : test_typenames;
+
+  // Roots: the single requested one, or ranks 0 .. nProcs*nThreads*nGpus-1.
+  const bool singleRoot = (root != -1);
+  const int first_root = singleRoot ? root : 0;
+  const int last_root = singleRoot ? root : args->nProcs*args->nThreads*args->nGpus-1;
+
+  for (int t = 0; t < type_total; t++) {
+    for (int r = first_root; r <= last_root; r++) {
+      TESTCHECK(TimeTest(args, types[t], names[t], (ncclRedOp_t)0, "", r));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine gatherEngine = {  // Engine entry points for the gather test; the weak pragma below aliases ncclTestEngine to it
+  GatherGetBuffSize,  // buffer sizing callback
+  GatherRunTest  // test driver callback
+};
+
+#pragma weak ncclTestEngine=gatherEngine

From dc1ad4853d7ec738387d42a75a58a98d7af00c7b Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 22 Sep 2021 08:43:01 -0700
Subject: [PATCH 087/233] Fix divide by zero error

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 6db2b4328b..4b62741fa3 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -527,7 +527,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->nbytes / wordSize(type);
 
   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
-  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
   size_t shift = totalnbytes * (iter % steps);
 

From 8b35847d36b14442caacfb08ff62ab52d0fc9f31 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 23 Sep 2021 16:39:11 -0700
Subject: [PATCH 088/233] Use rccl_bfloat16 class

---
 src/common.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4b62741fa3..64f5e92263 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -157,10 +157,10 @@ template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(RCCL_BFLOAT16)
 template<> __device__
-float toFloat(__nv_bfloat16 a) {
-  return __bfloat162float(a);
+float toFloat(rccl_bfloat16 a) {
+  return (float)(a);
 }
 #endif
 
@@ -551,8 +551,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       union {
         int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
         half f16; float f32; double f64;
-        #if defined(__CUDA_BF16_TYPES_EXIST__)
-        __nv_bfloat16 bf16;
+        #if defined(RCCL_BFLOAT16)
+        rccl_bfloat16 bf16;
         #endif
       };
       int scalar = preMulScalar(rank);
@@ -566,8 +566,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat16: f16 = __float2half(float(scalar)); break;
       case ncclFloat32: f32 = float(scalar); break;
       case ncclFloat64: f64 = double(scalar); break;
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
-      case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break;
+      #if defined(RCCL_BFLOAT16)
+      case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); break;
       #endif
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -892,7 +892,7 @@ int main(int argc, char* argv[]) {
     test_typenum = 9;
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
       test_opnum++; // ncclAvg
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
+      #if defined(RCCL_BFLOAT16)
         test_typenum++; // bfloat16
       #endif
     }

From 7130fa6096466f80b0c310b9a3070b6556c0e158 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 25 Oct 2021 16:30:57 -0700
Subject: [PATCH 089/233] Add MPI_IBM build option

---
 src/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index c8491ea537..9a1f62eeb0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -64,6 +64,10 @@ ifeq ($(MPI), 1)
 NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
 NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi
 endif
+ifeq ($(MPI_IBM),1)
+NVCUFLAGS += -DMPI_SUPPORT
+NVLDFLAGS += -lmpi_ibm
+endif
 LIBRARIES += curand nccl nvToolsExt
 NVLDFLAGS += $(LIBRARIES:%=-l%)
 

From de3ddbe261d553d4356ffcd548f4f8d893f193e0 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 10 Nov 2021 09:14:22 -0800
Subject: [PATCH 090/233] Add option to statically link cudart

Build with CUDARTLIB=cudart_static to remove dynamic linkage

Also removed unused curand and nvToolsExt dependencies

BUG 95
---
 src/Makefile | 5 +++--
 src/common.h | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 9a1f62eeb0..2a399db7fa 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,6 +12,7 @@ DEBUG ?= 0
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
 NVCC = $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
 
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -36,7 +37,7 @@ endif
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 
 LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
-NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
 
 ifeq ($(DEBUG), 0)
 NVCUFLAGS += -O3 -g
@@ -68,7 +69,7 @@ ifeq ($(MPI_IBM),1)
 NVCUFLAGS += -DMPI_SUPPORT
 NVLDFLAGS += -lmpi_ibm
 endif
-LIBRARIES += curand nccl nvToolsExt
+LIBRARIES += nccl
 NVLDFLAGS += $(LIBRARIES:%=-l%)
 
 DST_DIR := $(BUILDDIR)
diff --git a/src/common.h b/src/common.h
index e13816f6f8..bd84d01853 100644
--- a/src/common.h
+++ b/src/common.h
@@ -10,7 +10,6 @@
 #include <stdio.h>
 #include <cstdint>
 #include <algorithm>
-#include <curand.h>
 #ifdef MPI_SUPPORT
 #include "mpi.h"
 #endif
@@ -46,7 +45,6 @@ typedef enum {
   testInternalError = 1,
   testCudaError = 2,
   testNcclError = 3,
-  testCuRandError = 4
 } testResult_t;
 
 // Relay errors up and trace

From 602b745ff48f4aa7d73e8fd946442b666fae344f Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 16 Nov 2021 07:50:18 -0800
Subject: [PATCH 091/233] Add missing hipStreamDestroy at test exit

---
 src/common.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 64f5e92263..98fa8dcfb7 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1268,6 +1268,10 @@ testResult_t run() {
     free(comms);
   }
 
+  for (int i=0; i<nGpus*nThreads; i++) {
+    HIPCHECK(hipStreamDestroy(streams[i]));
+  }
+
   // Free off HIP allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
     if (memorytype == ncclHost) {

From 698524e42e84c1db44f4e86a18b80356bbe3dd26 Mon Sep 17 00:00:00 2001
From: Ziyue Yang <yzylivezh@hotmail.com>
Date: Sat, 19 Feb 2022 00:31:40 +0800
Subject: [PATCH 092/233] move to a2a api (#9)

---
 src/alltoall.cu | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/alltoall.cu b/src/alltoall.cu
index ba3c6f1088..4b8e66d5a2 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -60,23 +60,8 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
 }
 
 testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  size_t rankOffset = count * wordSize(type);
-  if (count == 0) return testSuccess;
-
-#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
-  printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
-  return testNcclError;
-#else
-  NCCLCHECK(ncclGroupStart());
-  for (int r=0; r<nRanks; r++) {
-    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
-    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
-  }
-  NCCLCHECK(ncclGroupEnd());
+  NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
-#endif
 }
 
 struct testColl alltoAllTest = {

From 47238336d92a3395cdf5a90a71f720a99706e3d2 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 31 Mar 2022 13:18:02 -0400
Subject: [PATCH 093/233] Update include path for custom RCCL build

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index ef77730c70..ec0301b758 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -20,7 +20,7 @@ LDFLAGS    :=
 HIPLDFLAGS :=
 
 ifneq ($(NCCL_HOME), "")
-HIPCUFLAGS += -I$(NCCL_HOME)
+HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
 HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include

From 6156759a40d6f2f39bc78f53edae363943938c47 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 6 Apr 2022 16:46:17 +0000
Subject: [PATCH 094/233] Print GPU's full PCI bus ID

---
 src/common.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 98fa8dcfb7..b0fc63094f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1114,8 +1114,10 @@ testResult_t run() {
     int rank = proc*nThreads*nGpus+i;
     hipDeviceProp_t prop;
     HIPCHECK(hipGetDeviceProperties(&prop, hipDev));
-    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name);
+    char busIdStr[] = "00000000:00:00.0";
+    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
+    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
+                    rank, getpid(), hostname, hipDev, busIdStr, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 

From 3d6f70659a0356d68d396ae5923e5ef95150b4eb Mon Sep 17 00:00:00 2001
From: amdkila <47991923+amdkila@users.noreply.github.com>
Date: Thu, 28 May 2020 10:34:30 -0600
Subject: [PATCH 095/233] Check for error code in install script (#2)

---
 install.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/install.sh b/install.sh
index 7c8a865ef5..c56a6bfdde 100755
--- a/install.sh
+++ b/install.sh
@@ -67,6 +67,14 @@ while true; do
     esac
     done
 
+# Stop the install script when the previous command failed.
+# $1 - exit status to test (the call site passes "$?"); a non-zero value exits with that status.
+check_exit_code( )
+{
+  (( $1 != 0 )) && exit $1
+  return 0
+}
+
 # Install the pre-commit hook
 #bash ./githooks/install
 
@@ -87,6 +95,7 @@ if ($mpi_enabled); then
 else
     make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
 fi
+check_exit_code "$?"
 
 # Optionally, run tests if they're enabled.
 if ($run_tests); then

From 5cd2374edb3c5094d95874bd5361afde0da59de3 Mon Sep 17 00:00:00 2001
From: Edgar <edgar.gabriel@amd.com>
Date: Fri, 18 Mar 2022 11:37:05 -0400
Subject: [PATCH 096/233] create branch up-to-date with rccl-test

---
 src/common.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index b0fc63094f..45225ff10a 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1118,6 +1118,8 @@ testResult_t run() {
     HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
     len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
                     rank, getpid(), hostname, hipDev, busIdStr, prop.name);
+    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
+                    rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 

From 0500f2f132914e39320e4886af33c0552fcad14b Mon Sep 17 00:00:00 2001
From: Edgar <edgar.gabriel@amd.com>
Date: Fri, 18 Mar 2022 11:42:15 -0400
Subject: [PATCH 097/233] implementation of multi-rank support in rccl-tests.

---
 src/all_gather.cu     |  23 ++--
 src/all_reduce.cu     |  21 ++--
 src/alltoall.cu       |  21 ++--
 src/alltoallv.cu      |  69 ++++++-----
 src/broadcast.cu      |  19 +++-
 src/common.cu         | 258 +++++++++++++++++++++++++++++-------------
 src/common.h          |   3 +
 src/gather.cu         |  25 ++--
 src/hypercube.cu      |  22 ++--
 src/reduce.cu         |  23 ++--
 src/reduce_scatter.cu |  23 ++--
 src/scatter.cu        |  18 ++-
 src/sendrecv.cu       |  21 ++--
 13 files changed, 365 insertions(+), 181 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index 0ca428dbed..bc1c59969c 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -31,17 +31,24 @@ void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      for (int j=0; j<nranks; j++) {
+	TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
+      }
+      k++;
     }
     HIPCHECK(hipDeviceSynchronize());
   }
@@ -99,4 +106,4 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
 struct testEngine ncclTestEngine = {
   AllGatherGetBuffSize,
   AllGatherRunTest
-};
\ No newline at end of file
+};
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 1c1d73a9d2..e76ee38dff 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -31,16 +31,23 @@ void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k = 0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
+      k++;
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
@@ -110,4 +117,4 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
 struct testEngine ncclTestEngine = {
   AllReduceGetBuffSize,
   AllReduceRunTest
-};
\ No newline at end of file
+};
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 4b8e66d5a2..48020e4fa3 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -31,18 +31,25 @@ void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para
 testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     char* str = getenv("NCCL_TESTS_DEVICE");
     int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      for (int j=0; j<nranks; j++) {
+	TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
+      }
+      k++;
     }
     HIPCHECK(hipDeviceSynchronize());
   }
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index fb6d0acde8..7a39bcce7b 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -41,44 +41,51 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     char* str = getenv("NCCL_TESTS_DEVICE");
     int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
 #if 0
-    int *dataHost = (int *)malloc(args->sendBytes);
-    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
-    printf(" Rank [%d] Original: ", rank);
-    for(int j=0; j<sendcount; j++) {
-      printf("%d:%d ", j, dataHost[j]);
-    }
-    printf("\n");
-    free(dataHost);
-#endif
-    size_t rdisp = 0;
-    size_t data_count = sendcount*2/nranks;
-    size_t chunksize = data_count/nranks;
-    for (int j=0; j<nranks; j++) {
-      size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
-      if (j+rank == nranks-1)
-          rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
-      size_t sdisp = 0;
-      for (int k=0; k<nranks; k++) {
-        scount = ((k+j)%nranks)*chunksize;
-        if (k+j == nranks-1)
-          scount += (sendcount-chunksize*(nranks-1)*nranks/2);
-        if (k == rank)
-          break;
-        sdisp += scount;
+      int *dataHost = (int *)malloc(args->sendBytes);
+      hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+      printf(" Rank [%d] Original: ", rank);
+      for(int j=0; j<sendcount; j++) {
+	printf("%d:%d ", j, dataHost[j]);
       }
-      TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
-      rdisp += rcount;
+      printf("\n");
+      free(dataHost);
+#endif
+      size_t rdisp = 0;
+      size_t data_count = sendcount*2/nranks;
+      size_t chunksize = data_count/nranks;
+      for (int j=0; j<nranks; j++) {
+	size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+	if (j+rank == nranks-1)
+          rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
+	size_t sdisp = 0;
+	for (int k=0; k<nranks; k++) {
+	  scount = ((k+j)%nranks)*chunksize;
+	  if (k+j == nranks-1)
+	    scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+	  if (k == rank)
+	    break;
+	  sdisp += scount;
+	}
+	TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
+	rdisp += rcount;
+      }
+      k++;
     }
     HIPCHECK(hipDeviceSynchronize());
   }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 61f0a9952a..dffb6b6256 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -32,14 +32,21 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root));
+      k++;
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
@@ -116,4 +123,4 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
 struct testEngine ncclTestEngine = {
   BroadcastGetBuffSize,
   BroadcastRunTest
-};
\ No newline at end of file
+};
diff --git a/src/common.cu b/src/common.cu
index 45225ff10a..c31cff308e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -78,6 +78,9 @@ static uint32_t cumask[4];
 static int cudaGraphLaunches = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
+static int numDevices = 1;
+static int ranksPerGpu = 1;
+static int enable_multiranks = 0;
 
 #define NUM_BLOCKS 32
 
@@ -117,6 +120,38 @@ static double parsesize(const char *value) {
     return size * units;
 }
 
+// Tells whether the NCCL library in use at run time is recent enough.
+//
+//   rmajor, rminor, rpatch - minimum required version, one component each.
+//
+// Returns true when the version reported by ncclGetVersion() is greater than or
+// equal to rmajor.rminor.rpatch, and false otherwise.
+// Example: code 21003 decodes to 2.10.3, so minReqVersion(2,10,0) is true and
+// minReqVersion(2,11,0) is false.
+// NOTE(review): the status returned by ncclGetVersion() is not checked.
+static bool minReqVersion(int rmajor, int rminor, int rpatch)
+{
+  int version;
+  ncclGetVersion(&version);
+
+  // Codes below 10000 are split as major*1000 + minor*100 + patch; larger codes
+  // as major*10000 + minor*100 + patch.
+  const int majorUnit = (version < 10000) ? 1000 : 10000;
+  const int major = version / majorUnit;
+  const int rem   = version % majorUnit;
+  const int minor = rem / 100;
+  const int patch = rem % 100;
+
+  // Compare component by component; the first one that differs decides.
+  if (major != rmajor) {
+    return major > rmajor;
+  }
+  if (minor != rminor) {
+    return minor > rminor;
+  }
+  return patch >= rpatch;
+}
+
 double DeltaMaxValue(ncclDataType_t type) {
   switch(type) {
     case ncclHalf: return 1e-2;
@@ -437,9 +472,9 @@ void Allreduce(struct threadArgs* args, double* value, int average) {
 testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
   size_t count = args->expectedBytes/wordSize(type);
   double maxDelta = 0.0;
-  for (int i=0; i<args->nGpus; i++) {
+  for (int i=0; i<args->nGpus*args->nRanks; i++) {
     int device;
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
     HIPCHECK(hipSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
@@ -474,20 +509,20 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     //}
 #endif
   }
-  double nranks = args->nProcs*args->nThreads*args->nGpus;
+  double nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
   if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
   *delta = maxDelta;
   return testSuccess;
 }
 
-testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
+testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) {
   hipError_t hipErr;
-  int remaining = ngpus;
-  int* done = (int*)malloc(sizeof(int)*ngpus);
-  memset(done, 0, sizeof(int)*ngpus);
+  int remaining = nStreams;
+  int* done = (int*)malloc(sizeof(int)*nStreams);
+  memset(done, 0, sizeof(int)*nStreams);
   while (remaining) {
    int idle = 1;
-   for (int i=0; i<ngpus; i++) {
+   for (int i=0; i<nStreams; i++) {
      if (done[i]) continue;
 
      hipErr = hipStreamQuery(streams[i]);
@@ -507,7 +542,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
        if (ncclAsyncErr != ncclSuccess) {
          // An asynchronous error happened. Stop the operation and destroy
          // the communicator
-         for (int i=0; i<ngpus; i++)
+         for (int i=0; i<nStreams; i++)
            NCCLCHECK(ncclCommAbort(comms[i]));
          // Abort the perf test
          NCCLCHECK(ncclAsyncErr);
@@ -531,14 +566,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
   size_t shift = totalnbytes * (iter % steps);
 
-  if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
-  for (int i = 0; i < args->nGpus; i++) {
+  if (args->nGpus> 1 || args->nRanks > 1) NCCLCHECK(ncclGroupStart());
+  for (int i = 0; i < args->nGpus*args->nRanks; i++) {
 #ifndef NCCL_MAJOR
     int hipDev;
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
     HIPCHECK(hipSetDevice(hipDev));
 #endif
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
     char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
     ncclRedOp_t op;
@@ -585,11 +620,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     #endif
   }
-  if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd());
+  if (args->nGpus > 1 || args->nRanks > 1) NCCLCHECK(ncclGroupEnd());
 
   if (blocking_coll) {
     // Complete op before returning
-    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+    TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms));
   }
   if (blocking_coll) Barrier(args);
   return testSuccess;
@@ -598,10 +633,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 testResult_t completeColl(struct threadArgs* args) {
   if (blocking_coll) return testSuccess;
 
-  TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms));
   return testSuccess;
 }
 
+//EDGAR: Revisit because of cudaGraphLaunches
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
@@ -616,11 +652,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);
 
 #if CUDART_VERSION >= 11030
-  hipGraph_t graphs[args->nGpus];
-  hipGraphExec_t graphExec[args->nGpus];
+  hipGraph_t graphs[args->nGpus*args->nRanks];
+  hipGraphExec_t graphExec[args->nGpus*args->nRanks];
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
-    for (int i=0; i<args->nGpus; i++) {
+    for (int i=0; i<args->nGpus*args->nRanks; i++) {
       // Thread local mode is needed for:
       // - Multi-thread mode
       // - P2P pre-connect
@@ -642,18 +678,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #if CUDART_VERSION >= 11030
   if (cudaGraphLaunches >= 1) {
     // End cuda graph capture
-    for (int i=0; i<args->nGpus; i++) {
+    for (int i=0; i<args->nGpus*args->nRanks; i++) {
       HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
     }
     // Instantiate cuda graph
-    for (int i=0; i<args->nGpus; i++) {
+    for (int i=0; i<args->nGpus*args->nRanks; i++) {
       HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
     }
     // Resync CPU, restart timing, launch cuda graph
     Barrier(args);
     start = std::chrono::high_resolution_clock::now();
     for (int l=0; l<cudaGraphLaunches; l++) {
-      for (int i=0; i<args->nGpus; i++) {
+      for (int i=0; i<args->nGpus*args->nRanks; i++) {
         HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
       }
     }
@@ -671,7 +707,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #if CUDART_VERSION >= 11030
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
-    for (int i=0; i<args->nGpus; i++) {
+    for (int i=0; i<args->nGpus*args->nRanks; i++) {
       HIPCHECK(hipGraphExecDestroy(graphExec[i]));
       HIPCHECK(hipGraphDestroy(graphs[i]));
     }
@@ -679,7 +715,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #endif
 
   double algBw, busBw;
-  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus*args->nRanks);
 
   Barrier(args);
 
@@ -694,7 +730,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
-        for (int i=0; i<args->nGpus; i++) {
+        for (int i=0; i<args->nGpus*args->nRanks; i++) {
           HIPCHECK(chiptreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
         }
       }
@@ -706,15 +742,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         // End cuda graph capture
-        for (int i=0; i<args->nGpus; i++) {
+        for (int i=0; i<args->nGpus*args->nRanks; i++) {
           HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
         }
         // Instantiate cuda graph
-        for (int i=0; i<args->nGpus; i++) {
+        for (int i=0; i<args->nGpus*args->nRanks; i++) {
           HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
         }
         // Launch cuda graph
-        for (int i=0; i<args->nGpus; i++) {
+        for (int i=0; i<args->nGpus*args->nRanks; i++) {
           HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
         }
       }
@@ -725,7 +761,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #if CUDART_VERSION >= 11030
       if (cudaGraphLaunches >= 1) {
         //destroy cuda graph
-        for (int i=0; i<args->nGpus; i++) {
+        for (int i=0; i<args->nGpus*args->nRanks; i++) {
           HIPCHECK(hipGraphExecDestroy(graphExec[i]));
           HIPCHECK(hipGraphDestroy(graphs[i]));
         }
@@ -759,7 +795,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 }
 
 void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
-  int nranks = args->nProcs*args->nGpus*args->nThreads;
+  int nranks = args->nProcs*args->nGpus*args->nThreads*args->nRanks;
   size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
 
   count = size / wordSize(type);
@@ -806,6 +842,8 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
+  if (enable_multiranks)
+    gpuid = gpuid % numDevices;
   HIPCHECK(hipSetDevice(gpuid));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
@@ -814,23 +852,33 @@ testResult_t threadRunTests(struct threadArgs* args) {
 testResult_t threadInit(struct threadArgs* args) {
   char hostname[1024];
   getHostName(hostname, 1024);
-  int nranks =  args->nProcs*args->nThreads*args->nGpus;
+  int nranks =  args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
   //set main thread again
   is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
 
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
-    int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (enable_multiranks)
+      gpuid = gpuid % numDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
+
+    for (int j=0; j<args->nRanks; j++) {
+      int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
+      if (args->enable_multiranks)
+	NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
+#ifdef RCCL_MULTIRANKPERGPU
+      else
+	NCCLCHECK(ncclCommInitRankMulti(args->comms+i*args->nRanks+j, nranks, args->ncclId, rank, rank));
+#endif
+    }
   }
   NCCLCHECK(ncclGroupEnd());
 
   TESTCHECK(threadRunTests(args));
 
-  for (int i=0; i<args->nGpus; i++) {
+  for (int i=0; i<args->nGpus*args->nRanks; i++) {
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
   return testSuccess;
@@ -925,13 +973,21 @@ int main(int argc, char* argv[]) {
     {"cumask", required_argument, 0, 'u'},
     {"cudagraph", required_argument, 0, 'G'},
     {"average", required_argument, 0, 'a'},
+#ifdef RCCL_MULTIRANKPERGPU
+    {"enable_multiranks", required_argument, 0, 'x'},
+    {"ranks_per_gpu", required_argument, 0, 'R'},
+#endif
     {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
+#ifdef RCCL_MULTIRANKPERGPU
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex);
+#else
     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex);
+#endif
 
     if (c == -1)
       break;
@@ -1022,6 +1078,14 @@ int main(int argc, char* argv[]) {
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
+#ifdef RCCL_MULTIRANKPERGPU
+      case 'x':
+        enable_multiranks = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'R':
+        ranksPerGpu = (int)strtol(optarg, NULL, 0);
+        break;
+#endif
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1052,26 +1116,43 @@ int main(int argc, char* argv[]) {
             "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+#ifdef RCCL_MULTIRANKPERGPU
+            "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
+            "[-R,--ranks_per_gpu] \n\t"
+#endif
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
     }
   }
 
-  int numDevices;
   HIPCHECK(hipGetDeviceCount(&numDevices));
   if (nGpus > numDevices)
   {
       fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
       return testNcclError;
   }
-
   if (minBytes > maxBytes) {
     fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
            (unsigned long long)minBytes,
            (unsigned long long)maxBytes);
     return -1;
   }
+  if (!minReqVersion(2, 12, 12) && enable_multiranks) {
+     fprintf(stderr, "Multiple Ranks per GPU requested, but rccl library found does not support this feature.\n");
+     fprintf(stderr, "Please check LD_LIBRARY_PATH. Resetting enable_multiranks and ranksPerGpu to default values.\n");
+     enable_multiranks = 0;
+     ranksPerGpu       = 1;
+  }
+
+  if (enable_multiranks && parallel_init) {
+    fprintf(stderr, "Cannot use parallel_init when using multiple ranks per GPU.\n");
+    return -1;
+  }
+  if (ranksPerGpu > 1 && !enable_multiranks) {
+    fprintf(stderr, "Need to enable multiranks option to use multiple ranks per GPU\n");
+    return -1;
+  }
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif
@@ -1098,7 +1179,7 @@ testResult_t run() {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+  PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, ranksPerGpu, minBytes, maxBytes,
       (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
   if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
@@ -1111,18 +1192,20 @@ testResult_t run() {
   size_t maxMem = ~0;
   for (int i=0; i<nThreads*nGpus; i++) {
     int hipDev = localRank*nThreads*nGpus+i;
-    int rank = proc*nThreads*nGpus+i;
+    if (enable_multiranks)
+      hipDev = hipDev % numDevices;
     hipDeviceProp_t prop;
     HIPCHECK(hipGetDeviceProperties(&prop, hipDev));
-    char busIdStr[] = "00000000:00:00.0";
-    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
-    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
-                    rank, getpid(), hostname, hipDev, busIdStr, prop.name);
-    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name);
-    maxMem = std::min(maxMem, prop.totalGlobalMem);
-  }
 
+    for (int j=0; j<ranksPerGpu; j++) {
+	int rank = proc*nThreads*nGpus*ranksPerGpu+i*ranksPerGpu + j;
+        char busIdStr[] = "00000000:00:00.0";
+    	HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
+	len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
+			rank, getpid(), hostname, hipDev, busIdStr, prop.name);
+	maxMem = std::min(maxMem, prop.totalGlobalMem);
+    }
+  }
 #if MPI_SUPPORT
   char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
   // Gather all output in rank order to root (0)
@@ -1152,42 +1235,61 @@ testResult_t run() {
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
-  hipStream_t streams[nGpus*nThreads];
-  void* sendbuffs[nGpus*nThreads];
-  void* recvbuffs[nGpus*nThreads];
-  void* expected[nGpus*nThreads];
+  hipStream_t streams[nGpus*nThreads*ranksPerGpu];
+  void* sendbuffs[nGpus*nThreads*ranksPerGpu];
+  void* recvbuffs[nGpus*nThreads*ranksPerGpu];
+  void* expected[nGpus*nThreads*ranksPerGpu];
   size_t sendBytes, recvBytes;
 
-  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
 
-  for (int i=0; i<nGpus*nThreads; i++) {
-    HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
-    AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus);
-    //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
-    if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
-      PRINT("cumask: ");
-      for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
-      PRINT("\n");
-      HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
-    } else
-      HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
-    // initialize data buffer to avoid all zero data
-    TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
+  for (int ii=0; ii<nGpus*nThreads; ii++) {
+    int gpuid = localRank*nThreads*nGpus+ii;
+    if (enable_multiranks)
+      gpuid = gpuid % numDevices;
+    HIPCHECK(hipSetDevice(gpuid));
+    for (int j=0; j<ranksPerGpu; j++) {
+      int i = ii*ranksPerGpu+j;
+      AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu);
+      //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
+      if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
+	PRINT("cumask: ");
+	for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+	PRINT("\n");
+	HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+      } else
+	HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      // initialize data buffer to avoid all zero data
+      TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
+  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
   if (!parallel_init) {
-     if (nProcs == 1) {
+     if (nProcs == 1 && !enable_multiranks) {
        int gpuArray[nGpus*nThreads];
        for (int i=0; i<nGpus*nThreads; i++) gpuArray[i] = i;
        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpuArray));
      } else {
        NCCLCHECK(ncclGroupStart());
-       for (int i=0; i<nGpus*nThreads; i++) {
-         HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i));
-         NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
+       for (int ii=0; ii<nGpus*nThreads; ii++) {
+	 int gpuid = localRank*nThreads*nGpus+ii;
+         if (enable_multiranks) {
+	   gpuid = gpuid % numDevices;
+	 }
+         HIPCHECK(hipSetDevice(gpuid));
+	 if (!enable_multiranks) {
+	   NCCLCHECK(ncclCommInitRank(comms+ii, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
+	 }
+#ifdef RCCL_MULTIRANKPERGPU
+	 else
+	   for (int j=0; j<ranksPerGpu; j++) {
+	     int i = ii*ranksPerGpu+j;
+	     NCCLCHECK(ncclCommInitRankMulti(comms+i, nProcs*nThreads*nGpus*ranksPerGpu, ncclId, proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
+	   }
+#endif
        }
        NCCLCHECK(ncclGroupEnd());
      }
@@ -1219,18 +1321,20 @@ testResult_t run() {
     threads[t].args.stepbytes=stepBytes;
     threads[t].args.stepfactor=stepFactor;
     threads[t].args.localRank = localRank;
-
+    threads[t].args.localNumDevices = numDevices;
+    threads[t].args.enable_multiranks = enable_multiranks;
+    threads[t].args.nRanks = ranksPerGpu;
     threads[t].args.nProcs=nProcs;
     threads[t].args.proc=proc;
     threads[t].args.nThreads=nThreads;
     threads[t].args.thread=t;
     threads[t].args.nGpus=nGpus;
-    threads[t].args.sendbuffs = sendbuffs+t*nGpus;
-    threads[t].args.recvbuffs = recvbuffs+t*nGpus;
-    threads[t].args.expected = expected+t*nGpus;
+    threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
+    threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
+    threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
     threads[t].args.ncclId = ncclId;
-    threads[t].args.comms=comms+t*nGpus;
-    threads[t].args.streams=streams+t*nGpus;
+    threads[t].args.comms=comms+t*nGpus*ranksPerGpu;
+    threads[t].args.streams=streams+t*nGpus*ranksPerGpu;
 
     threads[t].args.barrier = (volatile int*)barrier;
     threads[t].args.barrier_idx = 0;
@@ -1267,17 +1371,17 @@ testResult_t run() {
 #endif
 
   if (!parallel_init) {
-    for(int i=0; i<nGpus*nThreads; ++i)
+    for(int i=0; i<nGpus*nThreads*ranksPerGpu; ++i)
       NCCLCHECK(ncclCommDestroy(comms[i]));
     free(comms);
   }
 
-  for (int i=0; i<nGpus*nThreads; i++) {
+  for (int i=0; i<nGpus*nThreads*ranksPerGpu; i++) {
     HIPCHECK(hipStreamDestroy(streams[i]));
   }
 
   // Free off HIP allocated memory
-  for (int i=0; i<nGpus*nThreads; i++) {
+  for (int i=0; i<nGpus*nThreads*ranksPerGpu; i++) {
     if (memorytype == ncclHost) {
       HIPCHECK(hipHostFree(sendbuffs[i]));
       HIPCHECK(hipHostFree(recvbuffs[i]));
diff --git a/src/common.h b/src/common.h
index 5ad6953335..10712727ce 100644
--- a/src/common.h
+++ b/src/common.h
@@ -102,6 +102,9 @@ struct threadArgs {
   int thread;
   int nGpus;
   int localRank;
+  int localNumDevices;
+  int enable_multiranks;
+  int nRanks;
   void** sendbuffs;
   size_t sendBytes;
   size_t sendInplaceOffset;
diff --git a/src/gather.cu b/src/gather.cu
index 8c97fe9f26..c293793d96 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -31,20 +31,27 @@ void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramc
 testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
-    if (rank == root) {
-      for (int j=0; j<nranks; j++) {
-        TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
+      if (rank == root) {
+	for (int j=0; j<nranks; j++) {
+	  TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
+	}
       }
+      k++;
     }
     HIPCHECK(hipDeviceSynchronize());
   }
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 946c9c670b..d654617ccd 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -33,17 +33,24 @@ void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      for (int j=0; j<nranks; j++) {
+	TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
+      }
+      k++;
     }
     HIPCHECK(hipDeviceSynchronize());
   }
@@ -66,7 +73,6 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
   size_t rankSize = count * wordSize(type);
-
   if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
 
   // Hypercube AllGather
diff --git a/src/reduce.cu b/src/reduce.cu
index d0792a49f9..7ea7b0f726 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -31,17 +31,24 @@ void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramc
 testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
-    if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
+      if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
+      k++;
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
@@ -119,4 +126,4 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
 struct testEngine ncclTestEngine = {
   ReduceGetBuffSize,
   ReduceRunTest
-};
\ No newline at end of file
+};
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index bf5cbede8d..23b99de35b 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -31,17 +31,24 @@ void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t
 testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
-    TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
+      TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
+      k++;
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
@@ -111,4 +118,4 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
 struct testEngine ncclTestEngine = {
   ReduceScatterGetBuffSize,
   ReduceScatterRunTest
-};
\ No newline at end of file
+};
diff --git a/src/scatter.cu b/src/scatter.cu
index 884ec96a46..ec8c06b092 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -32,14 +32,22 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root));
+      k++;
+
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   return testSuccess;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index 6ded375678..84d7398e42 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -31,17 +31,24 @@ void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para
 testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
-  int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
 
+  int k=0;
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    if (args->enable_multiranks)
+      gpuid = gpuid % args->localNumDevices;
     HIPCHECK(hipSetDevice(gpuid));
-    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
-    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    int peer = (rank-1+nranks)%nranks;
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
+
+    for (int l=0; l<args->nRanks; l++) {
+      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
+      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
+      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
+      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      int peer = (rank-1+nranks)%nranks;
+      TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer));
+      k++;
+    }
     HIPCHECK(hipDeviceSynchronize());
   }
   // We don't support in-place sendrecv

From 67544e2c3450cf04a6784f57ebd09b05853e35cb Mon Sep 17 00:00:00 2001
From: Edgar <edgar.gabriel@amd.com>
Date: Mon, 13 Jun 2022 09:34:59 -0400
Subject: [PATCH 098/233] update pytest before running CI

There seems to be an incompatibility between the python installation
used in the CI and pytest. Update pytest before running CI.
---
 .jenkins/common.groovy | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 14c644b026..70dbbd7a3c 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -27,6 +27,8 @@ def runTestCommand (platform, project)
     def command = """#!/usr/bin/env bash
                 set -x
                 cd ${project.paths.project_build_prefix}
+		python3 -m pip install --upgrade pytest
+		python3 -m pytest --version
 		python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
             """
 

From 9925195afc967bc50f82aecaa32c3c09866fbe15 Mon Sep 17 00:00:00 2001
From: akolliasAMD <99202231+akolliasAMD@users.noreply.github.com>
Date: Thu, 21 Jul 2022 10:28:53 -0600
Subject: [PATCH 099/233] updated alltoallV test to not have any zero values
 (#12)

updated alltoallV test to not have any zero values between ranks
---
 src/alltoallv.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 7a39bcce7b..cb8fcaff0d 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -71,12 +71,12 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
       size_t chunksize = data_count/nranks;
       for (int j=0; j<nranks; j++) {
 	size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
-	if (j+rank == nranks-1)
+	if ((j+rank)%nranks == 0)
           rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
 	size_t sdisp = 0;
 	for (int k=0; k<nranks; k++) {
 	  scount = ((k+j)%nranks)*chunksize;
-	  if (k+j == nranks-1)
+	  if ((k+j)%nranks == 0)
 	    scount += (sendcount-chunksize*(nranks-1)*nranks/2);
 	  if (k == rank)
 	    break;
@@ -119,7 +119,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   size_t chunksize = count*2/nranks;
   for (int i = 0; i < nranks; i++) {
       size_t scount = ((i+rank)%nranks)*chunksize;
-      if (i+rank == nranks-1)
+      if ((i+rank)%nranks == 0)
           scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
       sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount;
       sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp;

From 2af4f6bc3a02b761d138dd51171797f24281684f Mon Sep 17 00:00:00 2001
From: Eiden Yoshida <47196116+eidenyoshida@users.noreply.github.com>
Date: Thu, 28 Jul 2022 08:19:16 -0700
Subject: [PATCH 100/233] Allow gpu config override in CI (#14)

---
 .jenkins/precheckin.groovy | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
index aae81c922e..d316d47929 100644
--- a/.jenkins/precheckin.groovy
+++ b/.jenkins/precheckin.groovy
@@ -51,8 +51,10 @@ ci: {
 
     def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['rccl906']]), 
                        "rocm-docker":([ubuntu16:['rccl906']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList)
+                       
     jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu16:['rccl906']]
+    jobNameList = auxiliary.appendJobNameList(jobNameList)
+    
     
     propertyList.each 
     {

From d704668bf7376efd299d15cebe7b0e4ab183d5af Mon Sep 17 00:00:00 2001
From: Liam Wrubleski <Liam.Wrubleski@amd.com>
Date: Tue, 9 Aug 2022 11:17:07 -0600
Subject: [PATCH 101/233] Add CMake files to build & package (#15)

* Add CMake files to build & package

* Change build technique on CI

* Correct CI build command
---
 .jenkins/common.groovy |  5 ++-
 CMakeLists.txt         | 61 ++++++++++++++++++++++++++++++++++++
 src/CMakeLists.txt     | 71 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 src/CMakeLists.txt

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 70dbbd7a3c..7426d35d75 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -13,7 +13,10 @@ def runCompileCommand(platform, project, jobName)
                 ${getRCCL}
                 ${auxiliary.exitIfNotSuccess()}
                 cd ${project.paths.project_build_prefix}
-                ${project.paths.build_command}
+                cmake \
+                    -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
+                    -S . -B build
+                make -C build -j\$(nproc)
                 ${auxiliary.exitIfNotSuccess()}
             """
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000..539a1eae2b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,61 @@
+# ########################################################################
+# Copyright 2022 Advanced Micro Devices, Inc.
+# ########################################################################
+
+cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
+
+project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
+
+# Get ROCm path from environment if available
+if (DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
+else()
+    set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
+endif()
+
+# Set CMake/CPack variables
+list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
+set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
+set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
+set(CMAKE_CXX_STANDARD 14)
+
+# Get additional packages required
+find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
+find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
+
+include(ROCMSetupVersion)
+include(ROCMCreatePackage)
+include(ROCMInstallTargets)
+include(ROCMCheckTargetIds)
+include(ROCMClients)
+
+# Build variables
+option(USE_MPI "Build RCCL-tests with MPI support. Requires the MPI path to be set.")
+set(MPI_PATH "" CACHE PATH "Path to MPI installation")
+## Get default GPU targets using rocm_check_target_ids
+rocm_check_target_ids(
+    DEFAULT_AMDGPU_TARGETS
+    TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
+)
+set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
+
+# Find the MPI package if we're using MPI
+if (USE_MPI)
+    if(NOT MPI_PATH STREQUAL "")
+        set(MPI_HOME "${MPI_PATH}")
+    endif()
+    find_package(MPI REQUIRED MODULE)
+    add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT)
+endif()
+
+set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
+
+# Add all of the tests
+add_subdirectory(src)
+
+# Create ROCm standard packages
+rocm_create_package(
+    NAME rccl-separate-tests
+    DESCRIPTION "Tests for the ROCm Communication Collectives Library"
+    MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
+)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000000..b5a40aefc1
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,71 @@
+# ########################################################################
+# Copyright 2022 Advanced Micro Devices, Inc.
+# ########################################################################
+
+# Compile common object library
+set_property(SOURCE common.cu PROPERTY LANGUAGE CXX)
+add_library(rccl_common OBJECT common.cu)
+if(USE_MPI)
+    target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX)
+else()
+    target_link_libraries(rccl_common roc::rccl)
+endif()
+
+function(add_relative_test test_name test_target)
+    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
+    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
+        set(EXE_PATH ".")
+    endif()
+    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
+    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
+        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+            set(EXE_NAME "${test_target}")
+        endif()
+    endif()
+    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
+    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
+endfunction()
+
+function(add_rccl_test TEST)
+    set(TEST_SOURCE "${TEST}.cu")
+    set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
+    set(TEST_TARGET "${TEST}_perf")
+    add_executable(${TEST_TARGET} ${TEST_SOURCE})
+    target_link_libraries(
+        ${TEST_TARGET}
+        PRIVATE
+            rccl_common
+    )
+    if (NOT WIN32)
+        foreach(amdgpu_target ${AMDGPU_TARGETS})
+            target_link_libraries(${TEST_TARGET} PRIVATE --amdgpu-target=${amdgpu_target})
+        endforeach()
+    endif()
+    set_target_properties(
+        ${TEST_TARGET}
+        PROPERTIES
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+            # LINKER_LANGUAGE CXX
+    )
+    add_relative_test(${TEST} ${TEST_TARGET})
+    rocm_install(TARGETS ${TEST_TARGET})
+    # TODO: copy/install DLLs on Windows
+    set_target_properties(
+        ${TEST_TARGET} PROPERTIES
+        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib"
+    )
+endfunction()
+
+add_rccl_test(all_gather)
+add_rccl_test(all_reduce)
+add_rccl_test(alltoall)
+add_rccl_test(alltoallv)
+add_rccl_test(broadcast)
+add_rccl_test(gather)
+add_rccl_test(hypercube)
+add_rccl_test(reduce_scatter)
+add_rccl_test(reduce)
+add_rccl_test(scatter)
+add_rccl_test(sendrecv)

From 9025051bbb62fecc15429876d5d7543fad370ec0 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Tue, 9 Aug 2022 11:04:38 -0700
Subject: [PATCH 102/233] Fix missing error checking for AllocateBuffs due to
 merge (#17)

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index c31cff308e..a4577550ba 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1250,7 +1250,7 @@ testResult_t run() {
     HIPCHECK(hipSetDevice(gpuid));
     for (int j=0; j<ranksPerGpu; j++) {
       int i = ii*ranksPerGpu+j;
-      AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu);
+      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
       //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
       if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
 	PRINT("cumask: ");

From f6f3c44a7aec51141b87792773bb42206238919f Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Tue, 9 Aug 2022 16:45:27 -0600
Subject: [PATCH 103/233] Enabling hipGraph codepath for future support (#18)

---
 src/common.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index a4577550ba..bcb8df8b4b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -651,7 +651,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
   hipGraph_t graphs[args->nGpus*args->nRanks];
   hipGraphExec_t graphExec[args->nGpus*args->nRanks];
   if (cudaGraphLaunches >= 1) {
@@ -675,7 +675,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
   if (cudaGraphLaunches >= 1) {
     // End cuda graph capture
     for (int i=0; i<args->nGpus*args->nRanks; i++) {
@@ -704,7 +704,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
   Allreduce(args, &deltaSec, average);
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
     for (int i=0; i<args->nGpus*args->nRanks; i++) {
@@ -727,7 +727,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus*args->nRanks; i++) {
@@ -739,7 +739,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       //test validation in single itertion, should ideally be included into the multi-iteration run
       TESTCHECK(startColl(args, type, op, root, in_place, 0));
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
       if (cudaGraphLaunches >= 1) {
         // End cuda graph capture
         for (int i=0; i<args->nGpus*args->nRanks; i++) {
@@ -758,7 +758,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
       TESTCHECK(completeColl(args));
 
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50221310
       if (cudaGraphLaunches >= 1) {
         //destroy cuda graph
         for (int i=0; i<args->nGpus*args->nRanks; i++) {
@@ -1069,10 +1069,10 @@ int main(int argc, char* argv[]) {
         }
         break;
       case 'G':
-#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310
         cudaGraphLaunches = strtol(optarg, NULL, 0);
 #else
-        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
+        printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n");
 #endif
         break;
       case 'a':

From 45ec598ac4ca7fea8d3fd7b9e5e6206421bb3380 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Fri, 12 Aug 2022 14:42:17 +0000
Subject: [PATCH 104/233] Fix typo from previous merge

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index bcb8df8b4b..332cc3f272 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -731,7 +731,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus*args->nRanks; i++) {
-          HIPCHECK(chiptreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
+          HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
         }
       }
 #endif

From 51af5572bf8ebf197bac7de8cd6bc7d847339575 Mon Sep 17 00:00:00 2001
From: John Bachan <jbachan@nvidia.com>
Date: Fri, 19 Aug 2022 15:15:10 -0500
Subject: [PATCH 105/233] Resync with NCCL 2.13

* Added "verifiable", a suite of kernels for generating and verifying reduction
  input and output arrays in a bit-precise way.
* Data corruption errors now reported in number of wrong elements instead of max
  deviation.
* Use ncclGetLastError.
* Don't run hypercube on non-powers of 2 ranks.
* Fix to hypercube data verification.
* Use "thread local" as the default CUDA capture mode.
* Replaced pthread_yield -> sched_yield()
* Bugfix to the cpu-side barrier/allreduce implementations.
---
 src/Makefile                  |    8 +-
 src/all_gather.cu             |   16 +-
 src/all_reduce.cu             |   14 +-
 src/alltoall.cu               |   17 +-
 src/broadcast.cu              |   16 +-
 src/common.cu                 |  517 +++++---------
 src/common.h                  |   30 +-
 src/gather.cu                 |   18 +-
 src/hypercube.cu              |   27 +-
 src/reduce.cu                 |   14 +-
 src/reduce_scatter.cu         |   14 +-
 src/scatter.cu                |   16 +-
 src/sendrecv.cu               |   16 +-
 verifiable/Makefile           |   24 +
 verifiable/inexact_regress.cu |  177 +++++
 verifiable/verifiable.cu      | 1227 +++++++++++++++++++++++++++++++++
 verifiable/verifiable.h       |   59 ++
 verifiable/verifiable.mk      |   11 +
 18 files changed, 1706 insertions(+), 515 deletions(-)
 create mode 100644 verifiable/Makefile
 create mode 100644 verifiable/inexact_regress.cu
 create mode 100644 verifiable/verifiable.cu
 create mode 100644 verifiable/verifiable.h
 create mode 100644 verifiable/verifiable.mk

diff --git a/src/Makefile b/src/Makefile
index 2a399db7fa..137b9d7925 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -83,12 +83,16 @@ build: ${BIN_FILES}
 clean:
 	rm -rf ${DST_DIR}
 
-${DST_DIR}/%.o: %.cu common.h
+TEST_VERIFIABLE_SRCDIR := ../verifiable
+TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
+include ../verifiable/verifiable.mk
+
+${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 0b9e0cc939..1eaafddfab 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s", size, count, typeName);
-}
-
 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count/nranks;
   *recvcount = (count/nranks)*nranks;
@@ -38,9 +26,9 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
     CUDACHECK(cudaDeviceSynchronize());
   }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 9b6b7f02b9..9c65f25aba 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
-}
-
 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count;
   *recvcount = count;
@@ -38,7 +26,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
     CUDACHECK(cudaDeviceSynchronize());
   }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 865099743d..0eae1b07c9 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
-}
-
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = (count/nranks)*nranks;
   *recvcount = (count/nranks)*nranks;
@@ -39,9 +27,10 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
+      size_t partcount = sendcount/nranks;
+      TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
     }
     CUDACHECK(cudaDeviceSynchronize());
   }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index e2b4421ac5..40dcb5d885 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6i", size, count, typeName, root);
-}
-
 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count;
   *recvcount = count;
@@ -37,8 +25,8 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
+    if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
+    TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
     CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
diff --git a/src/common.cu b/src/common.cu
index 05f814d923..eaa3318f34 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -7,10 +7,13 @@
 #include "common.h"
 #include <pthread.h>
 #include <cstdio>
+#include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
 #include "cuda.h"
 
+#include "../verifiable/verifiable.h"
+
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
 #if NCCL_MAJOR >= 2
@@ -107,362 +110,154 @@ static double parsesize(const char *value) {
     return size * units;
 }
 
-double DeltaMaxValue(ncclDataType_t type) {
-  switch(type) {
-    case ncclHalf: return 1e-2;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    case ncclBfloat16: return 1e-2;
-#endif
-    case ncclFloat: return 1e-5;
-    case ncclDouble: return 1e-12;
-    case ncclInt:
-#if NCCL_MAJOR >= 2
-    case ncclUint8:
-    //case ncclInt32:
-    case ncclUint32:
-#endif
-    case ncclInt64:
-    case ncclUint64: return 1e-200;
-  }
-  return 1e-200;
-}
-
-template<typename T> __device__
-double absDiff(T a, T b) {
-  return fabs((double)(b - a));
-}
-
-template<> __device__
-double absDiff<half>(half a, half b) {
-  float x = __half2float(a);
-  float y = __half2float(b);
-  return fabs((double)(y-x));
-}
-
-template<typename T> __device__
-float toFloat(T a) {
-  return (float)a;
-}
-template<> __device__
-float toFloat(half a) {
-  return __half2float(a);
-}
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-template<> __device__
-float toFloat(__nv_bfloat16 a) {
-  return __bfloat162float(a);
-}
-#endif
-
-template<typename T, int BSIZE> __global__
-void deltaKern(void* A_, void* B_, size_t count, double* max) {
-  const T* A = (const T*)A_;
-  const T* B = (const T*)B_;
-  __shared__ double temp[BSIZE];
-  int tid = blockIdx.x*blockDim.x + threadIdx.x;
-  double locmax = 0.0;
-  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
-
-    double delta = absDiff(A[i], B[i]);
-    if( delta > locmax ) {
-      locmax = delta;
-#ifdef DEBUG_PRINT
-      if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
-#endif
-    }
-  }
-
-  tid = threadIdx.x;
-  temp[tid] = locmax;
-  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
-    __syncthreads();
-    if( tid < stride )
-      temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
-  }
-  __syncthreads();
-  if( threadIdx.x == 0)
-    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
-}
-
-testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
-  switch (type) {
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    case ncclBfloat16:
-      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-#endif
-    case ncclHalf:
-      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-    case ncclFloat:
-      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-    case ncclDouble:
-      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-
-    case ncclChar:
-#if NCCL_MAJOR >= 2
-    case ncclUint8:
-#endif
-      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-    case ncclInt:
-#if NCCL_MAJOR >= 2
-    case ncclUint32:
-#endif
-      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-    case ncclInt64:
-    case ncclUint64:
-      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
-  }
+testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { // Forwards the comparison of results vs. expected to ncclVerifiableVerify; presumably the number of mismatching elements is written to *wrongEltN (confirm in verifiable.h).
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); // Issued on cudaStreamDefault; whatever this call returns is not inspected here.
   CUDACHECK(cudaDeviceSynchronize());
-  for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
   return testSuccess;
 }
 
-// For integer values, we use values between 0 and 255
-template<typename T>
-__device__ T testValue(const size_t offset, const int rep, const int rank) {
-  uint8_t v = (rep+rank+offset) % 256;
-  return (T)v;
+testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { // Thin wrapper: delegates filling `data` to ncclVerifiablePrepareExpected (presumably the expected reduction result over nranks ranks; confirm in verifiable.h).
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); // Issued on cudaStreamDefault; no synchronization and no result check in this function.
+  return testSuccess; // Always reports success; the callee's outcome is not propagated.
 }
 
-// For floating point datatype, we use values between 0 and 1 otherwise the
-// Product operation will produce NaNs.
-template<>
-__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
-  return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
-}
-template<>
-__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
-  return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
-}
-template<>
-__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
-  return __float2half(testValue<float>(offset, rep, rank));
-}
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-template<>
-__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
-  return __float2bfloat16(testValue<float>(offset, rep, rank));
-}
-#endif
-
-// Operations
-template<typename T>
-__device__ T ncclOpSum(T a, T b) { return a+b; }
-template<typename T>
-__device__ T ncclOpProd(T a, T b) { return a*b; }
-template<typename T>
-__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
-template<typename T>
-__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
-
-// Definitions for half
-template<>
-__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
-template<>
-__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
-template<>
-__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
-template<>
-__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
-
-template<typename T>
-__device__ T ncclPPOpIdent(T x, int arg) { return x; }
-template<typename T>
-__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
-template<typename T>
-__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
-template<>
-__device__ half ncclPPOpMul(half x, int arg) {
-  return __float2half(__half2float(x)*float(arg));
-}
-template<>
-__device__ half ncclPPOpDiv(half x, int n) {
-  return __float2half(__half2float(x)/n);
-}
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-template<>
-__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
-  return __float2bfloat16(__bfloat162float(x)*float(arg));
-}
-template<>
-__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
-  return __float2bfloat16(__bfloat162float(x)/n);
-}
-#endif
-
-__host__ __device__ int preMulScalar(int rank) {
-  return 1 + rank%2;
+testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { // Thin wrapper: delegates filling `data` to ncclVerifiablePrepareInput (presumably the input contribution of `rank`; confirm in verifiable.h).
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); // Issued on cudaStreamDefault; no synchronization and no result check in this function.
+  return testSuccess; // Always reports success; the callee's outcome is not propagated.
 }
 
-template<typename T, T (*Op)(T, T), T(*PreOp)(T,int), T(*PostOp)(T,int)>
-__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
-  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
-    T val = testValue<T>(o+offset, rep, 0);
-    val = PreOp(val, preMulScalar(0));
-    for (int i=1; i<nranks; i++) {
-      T val1 = testValue<T>(o+offset, rep, i);
-      val1 = PreOp(val1, preMulScalar(i));
-      val = Op(val, val1);
-    }
-    data[o] = PostOp(val, nranks);
+void Barrier(struct threadArgs *args) { // Blocks until all args->nThreads local threads have arrived; with MPI_SUPPORT the releasing thread also calls MPI_Barrier.
+  thread_local int epoch = 0; // Per-thread 0/1 index selecting which lock/cond/counter slot this call uses; flipped on exit.
+  static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
+  static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
+  static int counter[2] = {0, 0}; // Threads arrived so far in each epoch; set back to 0 by the releasing thread.
+
+  pthread_mutex_lock(&lock[epoch]);
+  if(++counter[epoch] == args->nThreads) // When the final thread arrives, wake the releasing thread if it is already waiting below.
+    pthread_cond_broadcast(&cond[epoch]);
+
+  if(args->thread+1 == args->nThreads) { // The highest-numbered local thread acts as the releaser.
+    while(counter[epoch] != args->nThreads)
+      pthread_cond_wait(&cond[epoch], &lock[epoch]);
+    #ifdef MPI_SUPPORT
+      MPI_Barrier(MPI_COMM_WORLD); // Called while lock[epoch] is still held.
+    #endif
+    counter[epoch] = 0; // Zero is the release condition the other threads wait for.
+    pthread_cond_broadcast(&cond[epoch]);
   }
+  else {
+    while(counter[epoch] != 0) // Own arrival made the counter non-zero; wait for the releaser to reset it.
+      pthread_cond_wait(&cond[epoch], &lock[epoch]);
+  }
+  pthread_mutex_unlock(&lock[epoch]);
+  epoch ^= 1; // The next call from this thread uses the other slot.
 }
 
-#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
-  #define OPS(type) \
-    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
-    KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
-#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-  #define OPS(type) \
-    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
-#else
-  #define OPS(type) \
-    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
-    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
-#endif
-
-static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
-  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
-#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-  OPS(__nv_bfloat16)
-#endif
-};
-
-testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
-  dim3 grid = { 32, 1, 1 };
-  dim3 block = { 256, 1, 1 };
-  void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
-  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault));
-  return testSuccess;
-}
-
+// Inter-thread/process barrier+allreduce. The quality of the return value
+// for average=0 (which means broadcast from rank=0) is dubious. The returned
+// value will actually be the result of process-local broadcast from the local thread=0.
 template<typename T>
-__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
-  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x)
-    data[o] = testValue<T>(o, rep, rank);
-}
+void Allreduce(struct threadArgs* args, T* value, int average) { // average selects the mode: 0=local thread 0's value, 1=avg, 2=min, 3=max, 4=sum; the result overwrites *value.
+  thread_local int epoch = 0; // Per-thread 0/1 index selecting which lock/cond/accumulator/counter slot this call uses; flipped on exit.
+  static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
+  static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
+  static T accumulator[2]; // Function-local statics: each instantiated T gets its own set.
+  static int counter[2] = {0, 0}; // Threads arrived so far in each epoch; set back to 0 by the releasing thread.
 
-static void* const initDataKerns[ncclNumTypes] = {
-  (void*)InitDataKernel<  int8_t>,
-  (void*)InitDataKernel< uint8_t>,
-  (void*)InitDataKernel< int32_t>,
-  (void*)InitDataKernel<uint32_t>,
-  (void*)InitDataKernel< int64_t>,
-  (void*)InitDataKernel<uint64_t>,
-  (void*)InitDataKernel<    half>,
-  (void*)InitDataKernel<   float>,
-  (void*)InitDataKernel<  double>,
-#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-  (void*)InitDataKernel<__nv_bfloat16>
-#endif
-};
-
-template<typename T>
-testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) {
-  T* ptr = (T*)dest;
-  InitDataKernel<<<16, 512>>>(ptr, N, rep, rank);
-  return testSuccess;
-}
-
-testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) {
-  dim3 grid = { 32, 1, 1 };
-  dim3 block = { 256, 1, 1 };
-  void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank };
-  CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault));
-  return testSuccess;
-}
-
-void Barrier(struct threadArgs* args) {
-  while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
-  args->barrier[args->barrier_idx] = args->thread + 1;
-  if (args->thread+1 == args->nThreads) {
-#ifdef MPI_SUPPORT
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-    args->barrier[args->barrier_idx] = 0;
+  pthread_mutex_lock(&lock[epoch]);
+  if(counter[epoch] == 0) { // First arrival in this epoch seeds the accumulator.
+    if(average != 0 || args->thread == 0) accumulator[epoch] = *value; // In mode 0 only thread 0 may write.
   } else {
-    while (args->barrier[args->barrier_idx]) pthread_yield();
-  }
-  args->barrier_idx=!args->barrier_idx;
-}
-
-// Inter-thread/process barrier+allreduce
-void Allreduce(struct threadArgs* args, double* value, int average) {
-  while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
-  double val = *value;
-  if (args->thread > 0) {
-    double val2 = args->reduce[args->barrier_idx];
-    if (average == 1) val += val2;
-    if (average == 2) val = std::min(val, val2);
-    if (average == 3) val = std::max(val, val2);
-  }
-  if (average || args->thread == 0) args->reduce[args->barrier_idx] = val;
-  args->barrier[args->barrier_idx] = args->thread + 1;
-  if (args->thread+1 == args->nThreads) {
-#ifdef MPI_SUPPORT
-    if (average != 0) {
-      MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
-      MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD);
+    switch(average) { // Later arrivals fold their contribution into the accumulator.
+    case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break;
+    case /*avg*/1: accumulator[epoch] += *value; break;
+    case /*min*/2: accumulator[epoch] = std::min<T>(accumulator[epoch], *value); break;
+    case /*max*/3: accumulator[epoch] = std::max<T>(accumulator[epoch], *value); break;
+    case /*sum*/4: accumulator[epoch] += *value; break;
     }
-#endif
-    if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads;
-    args->reduce[1-args->barrier_idx] = 0;
-    args->barrier[args->barrier_idx] = 0;
-  } else {
-    while (args->barrier[args->barrier_idx]) pthread_yield();
   }
-  *value = args->reduce[args->barrier_idx];
-  args->barrier_idx=!args->barrier_idx;
+
+  if(++counter[epoch] == args->nThreads) // When the final thread arrives, wake the releasing thread if it is already waiting below.
+    pthread_cond_broadcast(&cond[epoch]);
+
+  if(args->thread+1 == args->nThreads) { // The highest-numbered local thread acts as the releaser.
+    while(counter[epoch] != args->nThreads)
+      pthread_cond_wait(&cond[epoch], &lock[epoch]);
+
+    #ifdef MPI_SUPPORT
+    if(average != 0) { // Mode 0 performs no cross-process exchange.
+      static_assert(std::is_same<T, long long>::value || std::is_same<T, double>::value, "Allreduce<T> only for T in {long long, double}");
+      MPI_Datatype ty = std::is_same<T, long long>::value ? MPI_LONG_LONG :
+                        std::is_same<T, double>::value ? MPI_DOUBLE :
+                        MPI_Datatype();
+      MPI_Op op = average == 1 ? MPI_SUM :
+                  average == 2 ? MPI_MIN :
+                  average == 3 ? MPI_MAX :
+                  average == 4 ? MPI_SUM : MPI_Op();
+      MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD); // In-place reduction of the process-local result, done while lock[epoch] is held.
+    }
+    #endif
+
+    if(average == 1) accumulator[epoch] /= args->nProcs*args->nThreads; // Only the avg mode divides, by processes times threads per process.
+    counter[epoch] = 0; // Zero is the release condition the other threads wait for.
+    pthread_cond_broadcast(&cond[epoch]);
+  }
+  else {
+    while(counter[epoch] != 0) // Own arrival made the counter non-zero; wait for the releaser to reset it.
+      pthread_cond_wait(&cond[epoch], &lock[epoch]);
+  }
+  pthread_mutex_unlock(&lock[epoch]);
+
+  *value = accumulator[epoch]; // Read after the mutex is released; the next call uses the other slot (epoch is flipped below).
+  epoch ^= 1;
 }
 
-testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
+testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) { // Checks every local GPU's receive buffer against args->expected and stores the summed per-GPU mismatch count in *wrongElts.
+  int nranks = args->nProcs*args->nGpus*args->nThreads;
   size_t count = args->expectedBytes/wordSize(type);
-  double maxDelta = 0.0;
+
+  int64_t *wrongPerGpu = nullptr; // One mismatch counter per local GPU, filled in through CheckDelta.
+  CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped)); // Mapped pinned host memory; must be released with cudaFreeHost.
+
   for (int i=0; i<args->nGpus; i++) {
     int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
     CUDACHECK(cudaSetDevice(device));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
-    TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
-    maxDelta = std::max(*(args->deltaHost), maxDelta);
 
-#ifdef DEBUG_PRINT
-    if (rank == 0) {
-       int *expectedHost = (int *)malloc(args->expectedBytes);
-       int *dataHost = (int *)malloc(args->expectedBytes);
+    TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); // NOTE(review): an early return from this or the checks above leaves wrongPerGpu allocated.
 
-       cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost);
-       printf("\n Expected: ");
-       for(int j=0; j<args->expectedBytes/sizeof(int); j++) {
-         printf("%d:%d ", j, expectedHost[j]);
-       }
-       printf("\n");
+#if 1 && DEBUG_PRINT
+    if (args->reportErrors && wrongPerGpu[i] != 0) { // Debug builds only: dump every differing element of this GPU's buffer.
+      printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
+      char *expectedHost = (char*)malloc(args->expectedBytes);
+      char *dataHost = (char*)malloc(args->expectedBytes);
+      int eltsz = wordSize(type);
+      cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
 
-       cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
-       printf("\n Actual: ");
-       for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
-         printf("%d:%d ", j, dataHost[j]);
-       }
-       printf("\n");
-       free(expectedHost);
-       free(dataHost);
+      for(int j=0; j<args->expectedBytes/eltsz; j++) {
+        unsigned long long want, got; // Raw element bytes, zero-extended to 64 bits for hex printing.
+        want = 0;
+        memcpy(&want, expectedHost + j*eltsz, eltsz);
+        got = 0;
+        memcpy(&got, dataHost + j*eltsz, eltsz);
+        if(want != got) {
+          printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got);
+        }
+      }
+      free(expectedHost);
+      free(dataHost);
     }
 #endif
   }
-  double nranks = args->nProcs*args->nThreads*args->nGpus;
-  if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
-  *delta = maxDelta;
+
+  *wrongElts = 0;
+  for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
+  cudaFreeHost(wrongPerGpu); // Pairs with cudaHostAlloc above; cudaFree is only for device allocations.
+
+  if (args->reportErrors && *wrongElts) args->errors[0]++; // One error is recorded per failing check, regardless of how many elements differ.
   return testSuccess;
 }
 
@@ -503,7 +298,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
    }
 
    // We might want to let other threads (including NCCL threads) use the CPU.
-   if (idle) pthread_yield();
+   if (idle) sched_yield();
   }
   free(done);
   return testSuccess;
@@ -541,19 +336,18 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
         __nv_bfloat16 bf16;
         #endif
       };
-      int scalar = preMulScalar(rank);
       switch(type) {
-      case ncclInt8: i8 = int8_t(scalar); break;
-      case ncclUint8: u8 = uint8_t(scalar); break;
-      case ncclInt32: i32 = int32_t(scalar); break;
-      case ncclUint32: u32 = uint32_t(scalar); break;
-      case ncclInt64: i64 = int32_t(scalar); break;
-      case ncclUint64: u64 = uint32_t(scalar); break;
-      case ncclFloat16: f16 = __float2half(float(scalar)); break;
-      case ncclFloat32: f32 = float(scalar); break;
-      case ncclFloat64: f64 = double(scalar); break;
+      case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
+      case ncclUint8: u8 = ncclVerifiablePremulScalar<uint8_t>(rank); break;
+      case ncclInt32: i32 = ncclVerifiablePremulScalar<int32_t>(rank); break;
+      case ncclUint32: u32 = ncclVerifiablePremulScalar<uint32_t>(rank); break;
+      case ncclInt64: i64 = ncclVerifiablePremulScalar<int64_t>(rank); break;
+      case ncclUint64: u64 = ncclVerifiablePremulScalar<uint64_t>(rank); break;
+      case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
+      case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
+      case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
       #if defined(__CUDA_BF16_TYPES_EXIST__)
-      case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break;
+      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
       #endif
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -607,9 +401,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      // Thread local mode is needed for:
-      // - Multi-thread mode
-      // - P2P pre-connect
+      // Thread local mode is needed for:
+      // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
+      // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
       CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
     }
   }
@@ -669,7 +464,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
-  double maxDelta = 0;
+  int64_t wrongElts = 0;
   static __thread int rep = 0;
   rep++;
   if (datacheck) {
@@ -717,10 +512,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       }
 #endif
 
-      TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+      TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts));
 
       //aggregate delta from all threads and procs
-      Allreduce(args, &maxDelta, 3);
+      long long wrongElts1 = wrongElts;
+      Allreduce(args, &wrongElts1, /*sum*/4);
+      wrongElts = wrongElts1;
   }
 
   double timeUsec = deltaSec*1.0E6;
@@ -733,9 +530,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     sprintf(timeStr, "%7.2f", timeUsec);
   }
   if (datacheck) {
-     PRINT("  %7s  %6.2f  %6.2f  %5.0le", timeStr, algBw, busBw, maxDelta);
+    PRINT("  %7s  %6.2f  %6.2f  %5g", timeStr, algBw, busBw, (double)wrongElts);
   } else {
-     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
+    PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
   }
 
   args->bw[0] += busBw;
@@ -775,7 +572,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Benchmark
   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
-      print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+      char rootName[100];
+      sprintf(rootName, "%6i", root);
+      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
       TESTCHECK(BenchTime(args, type, op, root, 0));
       TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
@@ -828,7 +627,7 @@ testResult_t threadLaunch(struct testThread* thread) {
   return testSuccess;
 }
 
-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
@@ -1027,8 +826,10 @@ testResult_t run() {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
-      (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
+  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
+        nThreads, nGpus, minBytes, maxBytes,
+        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
+        warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
   if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
   PRINT("#\n");
@@ -1087,7 +888,7 @@ testResult_t run() {
 
   for (int i=0; i<nGpus*nThreads; i++) {
     CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
-    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus));
+    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
@@ -1119,11 +920,11 @@ testResult_t run() {
   }
 
   PRINT("#\n");
-  print_header();
-
-  int* sync = (int*)calloc(2, sizeof(int));
-  int* barrier = (int*)calloc(2, sizeof(int));
-  double* reduce = (double*)calloc(2, sizeof(double));
+  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
+      "time", "algbw", "busbw", "#wrong", "time", "algbw", "busbw", "#wrong");
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 
   struct testThread threads[nThreads];
   memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1147,12 +948,6 @@ testResult_t run() {
     threads[t].args.comms=comms+t*nGpus;
     threads[t].args.streams=streams+t*nGpus;
 
-    threads[t].args.barrier = (volatile int*)barrier;
-    threads[t].args.barrier_idx = 0;
-    threads[t].args.reduce = (volatile double*)reduce;
-    threads[t].args.sync = (volatile int*)sync;
-    threads[t].args.sync_idx = 0;
-    threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
diff --git a/src/common.h b/src/common.h
index bd84d01853..51cf9da276 100644
--- a/src/common.h
+++ b/src/common.h
@@ -28,6 +28,21 @@
   }                                                 \
 } while(0)
 
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,12,10)
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t res = cmd;                           \
+  if (res != ncclSuccess) {                         \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test NCCL failure %s:%d "           \
+           "'%s / %s'\n",                           \
+           hostname,__FILE__,__LINE__,              \
+           ncclGetErrorString(res),                 \
+           ncclGetLastError(NULL));                 \
+    return testNcclError;                           \
+  }                                                 \
+} while(0)
+#else
 #define NCCLCHECK(cmd) do {                         \
   ncclResult_t res = cmd;                           \
   if (res != ncclSuccess) {                         \
@@ -39,6 +54,7 @@
     return testNcclError;                           \
   }                                                 \
 } while(0)
+#endif
 
 typedef enum {
   testSuccess = 0,
@@ -111,14 +127,6 @@ struct threadArgs {
 
   void** expected;
   size_t expectedBytes;
-  volatile int* sync;
-  int sync_idx;
-  volatile int* barrier;
-  int barrier_idx;
-  volatile double* reduce;
-  int syncRank;
-  int syncNranks;
-  double* deltaHost;
   int* errors;
   double* bw;
   int* bw_count;
@@ -141,8 +149,8 @@ struct testThread {
 // Provided by common.cu
 extern void Barrier(struct threadArgs* args);
 extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
-extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
-extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
+extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
+extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
 extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
 
 // Provided by each coll
@@ -228,7 +236,7 @@ static size_t wordSize(ncclDataType_t type) {
     case ncclInt64:
     case ncclUint64:
     case ncclDouble:
-    //case ncclFloat64: 
+    //case ncclFloat64:
       return 8;
     default: return 0;
   }
diff --git a/src/gather.cu b/src/gather.cu
index d0cfa5dabb..99088528d3 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6i", size, count, typeName, root);
-}
-
 void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count/nranks;
   *recvcount = (count/nranks)*nranks;
@@ -38,12 +26,10 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) {
-      for (int j=0; j<nranks; j++) {
-        TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
-      }
+      TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
     }
     CUDACHECK(cudaDeviceSynchronize());
   }
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 142f1a6359..ae9fbd0ad5 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -9,18 +9,6 @@
 
 #define ALIGN 4
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s", size, count, typeName);
-}
-
 void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   size_t base = (count/(ALIGN*nranks))*ALIGN;
   *sendcount = base;
@@ -41,9 +29,9 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
-      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
     CUDACHECK(cudaDeviceSynchronize());
   }
@@ -110,9 +98,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_typenames = test_typenames;
   }
 
-  for (int i=0; i<type_count; i++) {
-    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  // Check if this is a power of 2
+  int nRanks = args->nProcs*args->nThreads*args->nGpus;
+  if (nRanks && !(nRanks & (nRanks - 1))) {
+    for (int i=0; i<type_count; i++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+    }
+  } else {
+    printf("nRanks %d is not a power of 2, skipping\n", nRanks);
   }
+
   return testSuccess;
 }
 
diff --git a/src/reduce.cu b/src/reduce.cu
index 278768881d..c2707c75cc 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop", "root",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6s  %6i", size, count, typeName, opName, root);
-}
-
 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count;
   *recvcount = count;
@@ -38,7 +26,7 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
     CUDACHECK(cudaDeviceSynchronize());
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index b0c4fab52e..e4a59dc20e 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6s", size, count, typeName, opName);
-}
-
 void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = (count/nranks)*nranks;
   *recvcount = count/nranks;
@@ -38,7 +26,7 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
     CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
     CUDACHECK(cudaDeviceSynchronize());
diff --git a/src/scatter.cu b/src/scatter.cu
index 93ab2e694a..d244b2b8bc 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "root",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s  %6i", size, count, typeName, root);
-}
-
 void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = (count/nranks)*nranks;
   *recvcount = count/nranks;
@@ -37,8 +25,8 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));
+    if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
+    TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
     CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index 8bebc48e3d..e73a92b2d5 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -7,18 +7,6 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void print_header() {
-  PRINT("# %10s  %12s  %8s            out-of-place                       in-place          \n", "", "", "");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %8s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %8s", size, count, typeName);
-}
-
 void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   *sendcount = count;
   *recvcount = count;
@@ -38,9 +26,9 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
-    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
     int peer = (rank-1+nranks)%nranks;
-    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
+    TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
     CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place sendrecv
diff --git a/verifiable/Makefile b/verifiable/Makefile
new file mode 100644
index 0000000000..b141a2a7c5
--- /dev/null
+++ b/verifiable/Makefile
@@ -0,0 +1,24 @@
+include ../../makefiles/common.mk
+# all, clean and self_test name commands, not files, so all three are declared phony.
+.PHONY: all clean self_test
+# Output locations: everything built by this Makefile is placed under DST_DIR.
+BUILDDIR := $(abspath ../../build)
+NCCLDIR := $(BUILDDIR)
+NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
+DST_DIR := $(BUILDDIR)/test/verifiable
+
+all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+
+clean:
+	rm -rf $(DST_DIR)
+# Set before including verifiable.mk. NOTE(review): presumably verifiable.mk reads these and supplies the verifiable.o rule - confirm.
+TEST_VERIFIABLE_SRCDIR := .
+TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
+include verifiable.mk
+# Short alias so that `make self_test` works without spelling out the output path.
+self_test: $(DST_DIR)/self_test
+# Compile verifiable.cu into a standalone executable with SELF_TEST=1 defined.
+$(DST_DIR)/self_test: verifiable.cu verifiable.h
+	@printf "Linking  %s\n" $@
+	@mkdir -p $(DST_DIR)
+	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu
new file mode 100644
index 0000000000..d7bd545f62
--- /dev/null
+++ b/verifiable/inexact_regress.cu
@@ -0,0 +1,177 @@
+/* Generate parameters for our error bound model of floating point average
+ * (sum of scaled values) by sampling sums of random sequences for each
+ * floating point type.
+ *
+ * The model has parameters "coef" and "power", where for two floats a & b,
+ * they are close enough if and only if:
+ *   abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
+ *
+ * Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
+ *
+ * Compile with:
+ *   nvcc -gencode=arch=compute_80,code=sm_80
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdint>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+using std::uint64_t;
+using std::uint32_t;
+using bfloat16 = __nv_bfloat16;
+
+template<typename T>
+struct float_traits;
+
+template<>
+struct float_traits<float> {
+  static constexpr int mantissa_bits = 23;
+  static constexpr int exponent_bits = 8;
+  using uint_t = uint32_t;
+  __device__ static float make(double x) { return (float)x; }
+  __device__ static float make(uint64_t x) { return (float)x; }
+  __device__ static double todouble(float x) { return x; }
+  __device__ static float add(float a, float b) { return a+b; }
+  __device__ static float mul(float a, float b) { return a*b; }
+};
+template<>
+struct float_traits<double> {
+  static constexpr int mantissa_bits = 52;
+  static constexpr int exponent_bits = 11;
+  using uint_t = uint64_t;
+  __device__ static double make(double x) { return x; }
+  __device__ static double make(uint64_t x) { return (double)x; }
+  __device__ static double todouble(double x) { return x; }
+  __device__ static double add(double a, double b) { return a+b; }
+  __device__ static double mul(double a, double b) { return a*b; }
+};
+template<>
+struct float_traits<half> {
+  static constexpr int mantissa_bits = 10;
+  static constexpr int exponent_bits = 5;
+  using uint_t = uint16_t;
+  __device__ static half make(double x) { return __double2half(x); }
+  __device__ static half make(uint64_t x) { return __int2half_rn(x); }
+  __device__ static double todouble(half x) { return __half2float(x); }
+  __device__ static half add(half a, half b) { return __hadd(a, b); }
+  __device__ static half mul(half a, half b) { return __hmul(a, b); }
+};
+template<>
+struct float_traits<bfloat16> {
+  static constexpr int mantissa_bits = 7;
+  static constexpr int exponent_bits = 8;
+  using uint_t = uint16_t;
+  __device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
+  __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
+  __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
+  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
+  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
+};
+
+template<typename F>
+__device__ int compare(F a, F b) { // distance between the raw bit patterns of a and b
+  using bits_t = typename float_traits<F>::uint_t;
+  union { bits_t lhs_bits; F lhs; };
+  union { bits_t rhs_bits; F rhs; };
+  lhs_bits = 0; lhs = a;
+  rhs_bits = 0; rhs = b;
+  return lhs_bits < rhs_bits ? rhs_bits-lhs_bits : lhs_bits-rhs_bits;
+}
+
+struct xoshiro256ss { // small device-side PRNG; the state update below follows the xoshiro256** scheme
+	uint64_t s[4]; // 256 bits of generator state
+  __device__ xoshiro256ss(int seed) { // derive the four state words from fixed constants offset by the seed
+    constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
+    for(int i=0; i < 4; i++)
+      s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3; // (seed + i) differs per word, so each word gets a distinct offset
+  }
+  __device__ uint64_t operator()() { // returns the next 64-bit value and advances the state
+    auto rol64 = [](uint64_t x, int k) { // rotate x left by k bits (k must be in 1..63 for the shifts to be defined)
+      return (x << k) | (x >> (64 - k));
+    };
+    uint64_t const result = rol64(s[1] * 5, 7) * 9; // output is computed from the state before it is updated
+    uint64_t const t = s[1] << 17;
+    s[2] ^= s[0]; // the order of these xors matters: each one reads words already modified above it
+    s[3] ^= s[1];
+    s[1] ^= s[2];
+    s[0] ^= s[3];
+    s[2] ^= t;
+    s[3] = rol64(s[3], 45);
+    return result;
+  }
+};
+
+template<typename F>
+__global__ void kernel() { // samples scaled running sums in type F and prints a fitted (coef, expo) pair from thread 0
+  using traits = float_traits<F>;
+  constexpr int samps = 4<<10; // number of independent accumulators (4096)
+  __shared__ F accf[samps]; // running sum, in type F, of scalar*f
+  __shared__ double accd[samps]; // running sum, in double, of the unscaled f values
+
+  xoshiro256ss rng(threadIdx.x); // per-thread generator seeded by the thread index
+  float expo_avg = 1;
+  for(int pass=0; pass < 2; pass++) { // pass 0 estimates expo_avg; pass 1 reuses it to fit coef and prints the result
+    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x)); // per-thread scale factor applied to every sample
+    int err_max = 0; // largest bit-pattern distance seen so far in this pass
+    float coef = 0;
+    double expo_sum = 0;
+    int expo_n = 0;
+    int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits; // number of terms accumulated per round
+    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
+    //for(int round=0; round < 2; round++) {
+      for(int i=threadIdx.x; i < samps; i += blockDim.x) { // each thread resets only the slots it owns
+        accf[i] = 0;
+        accd[i] = 0;
+      }
+      __syncthreads();
+      for(int r=0; r < max_ranks; r++) { // r plays the role of the rank count in the error model
+        int err = 0;
+        for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
+          constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
+          double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0; // random mantissa-sized integer for float, constant 1.0 for other types
+          F f = traits::make(d);
+          accf[i] = traits::add(accf[i], traits::mul(scalar, f));
+          accd[i] += traits::todouble(f);
+          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
+          int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i]))); // sum-of-scaled versus scaled-sum, measured in bit-pattern distance
+          err = err > e ? err : e;
+        }
+        err = __reduce_max_sync(-1u, err); // maximum of err over the lanes named by the all-ones mask
+        err_max = err_max > err ? err_max : err;
+        if (r >= 2) {
+          // err = 1 + coef*pow(r,expo)
+          float c = float(err-1)/powf(float(r), expo_avg);
+          coef = coef > c ? coef : c; // keep the largest coefficient needed to cover any observed error
+        }
+        if (r >= 2) {
+          double expo = log2f(1+err_max)/log2f(r); // exponent that would explain err_max at this r with coef == 1
+          expo_sum += expo;
+          expo_n++;
+          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
+        }
+      }
+    }
+    if(pass==0)
+      expo_avg = expo_sum/expo_n; // average exponent from pass 0 feeds the coef fit in pass 1
+    else if(threadIdx.x == 0)
+      std::printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
+  }
+}
+
+int main() {
+  std::printf("type=float:\n");
+  kernel<float><<<1,32>>>();
+  cudaDeviceSynchronize();
+
+  std::printf("\ntype=half:\n");
+  kernel<half><<<1,32>>>();
+  cudaDeviceSynchronize();
+
+  std::printf("\ntype=bfloat16:\n");
+  kernel<bfloat16><<<1,32>>>();
+  cudaDeviceSynchronize();
+  return 0;
+}
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
new file mode 100644
index 0000000000..5f617ee188
--- /dev/null
+++ b/verifiable/verifiable.cu
@@ -0,0 +1,1227 @@
+#pragma nv_diag_suppress declared_but_not_referenced
+
+#include "verifiable.h"
+#include <nccl.h>
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
+  #define HAVE_ncclBfloat16 1
+#else
+  #define HAVE_ncclBfloat16 0
+#endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define HAVE_ncclAvg 1
+#else
+  #define HAVE_ncclAvg 0
+#endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+  #define HAVE_ncclPreMulSum 1
+#else
+  #define HAVE_ncclPreMulSum 0
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <unistd.h>
+
+using std::size_t;
+using std::int8_t;
+using std::int16_t;
+using std::int32_t;
+using std::int64_t;
+using std::uint8_t;
+using std::uint16_t;
+using std::uint32_t;
+using std::uint64_t;
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace { // file-local bit helpers: raw bit extraction and 64-bit mixing/hashing
+template<typename T> // bitsOf: overlay x on a zeroed unsigned long long and return that integer
+__device__ unsigned long long bitsOf(T x) {
+  union { unsigned long long ull; T val; } u;
+  u.ull = 0; // clear first so bytes not covered by T read as zero
+  u.val = x;
+  return u.ull;
+}
+// mixBits: deterministic scramble of a 64-bit value; callers in this file use it to advance their rng state.
+__host__ __device__ uint64_t mixBits(uint64_t x) {
+  union { uint32_t u32[2]; uint64_t u64; }; // lets the two 32-bit halves be modified individually
+  u64 = x;
+  u32[1] += 1;
+  u32[0] ^= u32[1];
+  u64 *= 0x9e3779b97f4a7c13u;
+  u32[0] ^= u32[1]<<16 ^ u32[1]>>16; // fold a 16-bit rotation of one half into the other
+  return u64;
+}
+// hashOf: deterministic 64-bit hash of the pair (a, b); b defaults to 0 for single-value use.
+__host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) {
+  a += uint64_t(1)<<32;
+  a += b;
+  a ^= a>>32; // fold the high half into the low half before multiplying
+  a *= 0x9e3779b97f4a7c13u;
+  a += b>>16 ^ b<<48; // mix in b a second time, rotated by 16 bits
+  a ^= a>>32;
+  a *= 0xc4ceb9fe1a85ec53u;
+  return a;
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+template<typename T>
+struct IsIntegral: std::is_integral<T> {};
+template<>
+struct IsIntegral<half>: std::false_type {};
+#ifdef __CUDA_BF16_TYPES_EXIST__
+template<>
+struct IsIntegral<__nv_bfloat16>: std::false_type {};
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Hide a value from arithmetic optimizations. Hopefully compiler cannot detect
+// that this is equivalent to the identity function. Returns x unchanged.
+template<typename T>
+__host__ __device__ T inhibit(T x) {
+  union { uint64_t u64; T val; }; // val is manipulated through its overlaid integer image
+  u64 = 0;
+  val = x;
+  u64 *= 0x0000000100000001u; // multiply by 2^32 + 1 ...
+  u64 *= 0xffffffff00000001u; // ... then by 2^64 - 2^32 + 1; the product of the two is 2^96 + 1, i.e. 1 modulo 2^64
+  return val;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+  template<typename Y, typename X>
+  __host__ __device__ Y castTo(X x) {
+    return Y(x);
+  }
+  template<typename Y>
+  __host__ __device__ Y castTo(float x) {
+    return Y(x);
+  }
+  template<>
+  __host__ __device__ half castTo<half>(float x) {
+    return __float2half(x);
+  }
+  #ifdef __CUDA_BF16_TYPES_EXIST__
+  template<>
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
+    return __float2bfloat16(x);
+  }
+  #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The reduction functions
+
+namespace {
+struct ReduceNil {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  template<typename T>
+  __host__ __device__ T operator()(T a, T /*b*/) const { return a; }
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+struct ReduceSum {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  template<typename T, typename=decltype(T()+T())>
+  __host__ __device__ T operator()(T a, T b) const { return a + b; }
+  __host__ __device__ half operator()(half a, half b) const {
+    #if __CUDA_ARCH__ >= 530
+      return __hadd(a, b);
+    #else
+      return __float2half(__half2float(a) + __half2float(b));
+    #endif
+  }
+  #ifdef __CUDA_BF16_TYPES_EXIST__
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hadd(a, b);
+    #else
+      return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
+    #endif
+  }
+  #endif
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+struct ReduceProd {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  template<typename T, typename=decltype(T()*T())>
+  __host__ __device__ T operator()(T a, T b) const { return a * b; }
+  __host__ __device__ half operator()(half a, half b) const {
+    #if __CUDA_ARCH__ >= 530
+      return __hmul(a, b);
+    #else
+      return __float2half(__half2float(a) * __half2float(b));
+    #endif
+  }
+  #ifdef __CUDA_BF16_TYPES_EXIST__
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hmul(a, b);
+    #else
+      return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
+    #endif
+  }
+  #endif
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+struct ReduceMin {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  template<typename T, typename=decltype(T()<T())>
+  __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
+  __host__ __device__ half operator()(half a, half b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hmin(a, b);
+    #elif __CUDA_ARCH__ >= 530
+      return __hlt(a, b) ? a : b;
+    #else
+      return __half2float(a) < __half2float(b) ? a : b;
+    #endif
+  }
+  #ifdef __CUDA_BF16_TYPES_EXIST__
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hmin(a, b);
+    //#elif __CUDA_ARCH__ >= 530
+    //  return __hlt(a, b) ? a : b;
+    #else
+      return __bfloat162float(a) < __bfloat162float(b) ? a : b;
+    #endif
+  }
+  #endif
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+struct ReduceMax {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  template<typename T, typename=decltype(T()>T())>
+  __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
+  __host__ __device__ half operator()(half a, half b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hmax(a, b);
+    #elif __CUDA_ARCH__ >= 530
+      return __hgt(a, b) ? a : b;
+    #else
+      return __half2float(a) > __half2float(b) ? a : b;
+    #endif
+  }
+  #ifdef __CUDA_BF16_TYPES_EXIST__
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __hmax(a, b);
+    //#elif __CUDA_ARCH__ >= 530
+    //  return __hgt(a, b) ? a : b;
+    #else
+      return __bfloat162float(a) > __bfloat162float(b) ? a : b;
+    #endif
+  }
+  #endif
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+struct ReducePreMulSum {
+  template<typename T>
+  __host__ __device__ T preOp(T x, int rank_me) const {
+    return ReduceProd()(x, ncclVerifiablePremulScalar<T>(rank_me));
+  }
+  template<typename T>
+  __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
+  template<typename T>
+  __host__ __device__ T postOp(T x) const { return x; }
+};
+
+template<typename T, bool integral = IsIntegral<T>::value>
+struct ReduceAvg_Base;
+
+template<typename T>
+struct ReduceAvg_Base<T, /*integral=*/true> {
+  int rank_n;
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
+  __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
+  __host__ __device__ T postOp(T x) const { return x/rank_n; }
+};
+
+template<typename T> // averaging for non-integral T: scale every input by 1/rank_n up front, then sum
+struct ReduceAvg_Base<T, /*integral=*/false> {
+  int rank_n; // divisor of the average
+  __host__ __device__ T preOp(T x, int /*rank_me*/) const {
+    using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type; // compute the reciprocal in float unless T is as wide as double
+    return ReduceProd()(inhibit(castTo<T>(T1(1)/T1(rank_n))), inhibit(x)); // both factors pass through inhibit() before the multiply in type T
+  }
+  __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
+  __host__ __device__ T postOp(T x) const { return x; } // nothing left to do: the division already happened in preOp
+};
+
+struct ReduceAvg {
+  int rank_n;
+  template<typename T>
+  __host__ __device__ T preOp(T x, int rank_me) const {
+    return ReduceAvg_Base<T>{rank_n}.preOp(x, rank_me);
+  }
+  template<typename T>
+  __host__ __device__ T operator()(T a, T b) const {
+    return ReduceAvg_Base<T>{rank_n}(a, b);
+  }
+  template<typename T>
+  __host__ __device__ T postOp(T x) const {
+    return ReduceAvg_Base<T>{rank_n}.postOp(x);
+  }
+};
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+template<typename T>
+struct FloatLayout;
+template<>
+struct FloatLayout<float> {
+  static constexpr int exponent_bits = 8, mantissa_bits = 23;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<double> {
+  static constexpr int exponent_bits = 11, mantissa_bits = 52;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<half> {
+  static constexpr int exponent_bits = 5, mantissa_bits = 10;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+#ifdef __CUDA_BF16_TYPES_EXIST__
+template<>
+struct FloatLayout<__nv_bfloat16> {
+  static constexpr int exponent_bits = 8, mantissa_bits = 7;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+#endif
+
+template<typename T> // makeFloat: assemble a T from raw sign, exponent-field and mantissa-field bits
+__host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
+  union { T ans; uint64_t bits; }; // NOTE(review): reading ans back relies on it overlaying the low-order bytes of bits (little-endian) - confirm
+  bits = sign; // sign ends up above the exponent field after the shifts below
+  bits <<= FloatLayout<T>::exponent_bits;
+  bits |= exp; // exp is OR-ed in as-is (no bias is added and no masking is done here)
+  bits <<= FloatLayout<T>::mantissa_bits;
+  bits |= mant; // mant is not masked here either, so it must already fit in mantissa_bits
+  return ans;
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+// High bits of multiplication are useful for generating bounded random values
+// from unbounded random values. For instance, given X a totally random 32-bit
+// integer, `umul32hi(X,n)` will be totally random within [0,n).
+__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
+#ifdef __CUDA_ARCH__
+  return __umulhi(a, b);
+#else
+  return uint64_t(a)*b >> 32;
+#endif
+}
+__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
+#ifdef __CUDA_ARCH__
+  return __umul64hi(a, b);
+#else
+  return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
+#endif
+}
+
+__host__ __device__ int clz32(int x) {
+#ifdef __CUDA_ARCH__
+  return __clz(x);
+#else
+  return x==0 ? 32 : __builtin_clz(x);
+#endif
+}
+__host__ __device__ int clz64(long long x) {
+#ifdef __CUDA_ARCH__
+  return __clzll(x);
+#else
+  return x==0 ? 64 : __builtin_clzll(x);
+#endif
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+// Returns a wildly permuted rank index. Useful when we know we want exactly N
+// random ranks to exhibit some behavior, we can just test if:
+// `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true
+// for well defined results. This mixes the bits of rng.
+__host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) {
+  uint32_t a = uint32_t(rng); // low and high halves of rng parameterize the two affine maps below
+  uint32_t b = uint32_t(rng>>32);
+  rng = mixBits(rng); // advance the caller's rng (it is passed by reference)
+
+  uint32_t r = rank_me;
+  // round down rank_n to largest pow2, then subtract 1
+  uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n); // doubles as the bit mask for "modulo that power of two"
+
+  // These are 1:1 functions modulo 2^n:
+  //   f(x) = x*a + b : for odd a, any b
+  //   f(x) = (x*x + x)/2
+  // So we apply both to the bottom n2+1 ranks, then rotate the top
+  // (rank_n-n2-1) to the bottom and apply both again.
+
+  if(r <= n2) {
+    // shuffle bottom n2+1 ranks
+    r = (r*(a|1) + b) & n2; // a|1 forces the multiplier to be odd
+    r = (r*r + r)/2 & n2;
+    // rotate top to bottom
+    r += rank_n - (n2+1);
+  }
+  else
+    r -= n2+1; // rotate top to bottom
+
+  if(r <= n2) {
+    // shuffle bottom n2+1 again
+    r = (r*(b|1) + a) & n2; // same maps as above with the roles of a and b swapped
+    r = (r*r + r)/2 & n2;
+  }
+  return r;
+}
+}
+
+namespace {
+// Generate wild integers x and y such that if every rank submits its x into a
+// summation the result will be y with y <= y_max. Ranks should be shuffled
+// before calling. Requires rank_n > 0; rng is advanced via mixBits as a side effect.
+template<typename Uint>
+__host__ __device__ void genSumXY(
+    int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y, // x, y: outputs; y is also read when avoid_y is set
+    bool avoid_y=false // if true then returned y will not equal given y
+  ) {
+  static_assert(std::is_unsigned<Uint>::value, "Type must be unsigned integral.");
+
+  { // Pick y as a random value in [y_max/2, y_max]
+    Uint d, y_min = (y_max+1)/2;
+    if(8*sizeof(Uint) > 32)
+      d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1));
+    else
+      d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1));
+    Uint y1 = (avoid_y ? y+1 : y_min) + d;
+    y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0);
+  }
+  rng = mixBits(rng);
+
+  unsigned r = unsigned(rank_me);
+  unsigned rn = unsigned(rank_n);
+  // Partition our rn ranks into pn distinct subsets each of size rn/pn. If each
+  // rank submits 1+p (where p is 0-based partition index) then the sum will be:
+  //   (rn/pn) * pn*(pn+1)/2
+  // So set this equal to our desired sum y and solve for pn.
+  //   (rn/pn) * pn*(pn+1)/2 = y
+  //   rn*(pn+1)/2 = y
+  //   pn = 2*(y/rn)-1
+  // If rn > y then y/rn is 0 and 2*(y/rn)-1 would wrap around to a huge unsigned
+  // value (ultimately making s == 0 below), so in that case use only one
+  // partition: the first y ranks submit 1 (since p=0) and the rest submit 0.
+  Uint pn = (rn == 1 || y < rn) ? 1 : 2*(y/rn) - 1;
+  // Can't have more partitions than ranks.
+  pn = rn < pn ? rn : pn;
+  // Compute sum of contribution from pn partitions where each submits p+1.
+  Uint p_sum;
+  if(y_max <= ~uint32_t(0)>>1) // compile time known
+    p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2);
+  else
+    p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2);
+  // Let s be the number of ranks per partition. This is either rn/pn as we
+  // intended, or y/p_sum if that's smaller to prevent overshooting our target y.
+  uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
+  x = (s != 0 && r/s < pn) ? 1 + r/s : 0; // First s*pn ranks contribute partition index +1 (s is 0 only when y is 0).
+  x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
+}
+}
+
+namespace {
+template<typename T> // genInOutFloatSum: per-element float value for a sum test, derived deterministically from (seed, index)
+__host__ __device__ T genInOutFloatSum(
+    bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index, // input_not_output: return this rank's input (true) or the expected sum (false)
+    bool same_sign // false: even and odd shuffled ranks contribute with opposite signs
+  ) {
+  constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits; // lowest exponent field: still >= 1 after the largest normalization shift below
+  constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1; // all-ones exponent field; xy_exp below stays strictly under it
+  using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
+  constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
+  uint64_t rng = hashOf(seed, index);
+
+  int y_sign = rng & 1;
+  int x_sign = y_sign;
+  int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo); // one exponent shared by every input and by the result
+  rng = mixBits(rng);
+  rank_me = shuffleRank(rank_n, rank_me, rng); // from here on rank_me is the shuffled rank
+
+  // If we're using mixed signs then partition into evens and odds.
+  int subrank_n = same_sign ? rank_n : (rank_n+1)/2;
+  int subrank_me = same_sign ? rank_me : rank_me/2;
+  uintmant_t x0_mant, y0_mant;
+  genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant); // integer mantissas: the x0 values of the participating ranks sum to y0
+
+  if (!same_sign && (rank_n+0)/2 != 0) { // second group exists only when there is at least one odd rank
+    uintmant_t x1_mant, y1_mant = y0_mant;
+    // Avoid generating y1_mant == y0_mant so we don't have to worry about
+    // signed zero as the result.
+    genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true);
+    y_sign ^= y0_mant < y1_mant ? 1 : 0; // result takes the sign of whichever group has the larger total
+    y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant); // absolute difference, computed in unsigned arithmetic
+    x_sign ^= rank_me%2; // odd shuffled ranks contribute with the opposite sign
+    x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
+  }
+
+  uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant;
+  if(ans_mant == 0)
+    return T(0.0f);
+  else {
+    int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1); // left shift that moves the top set bit to the implicit-one position
+    int ans_sign = input_not_output ? x_sign : y_sign;
+    int ans_exp = xy_exp - shift; // lower the exponent to compensate for the normalization shift
+    ans_mant <<= shift;
+    return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask); // mask drops the implicit leading 1
+  }
+}
+}
+// Float generator for PreMulSum: rank_me's input if input_not_output, else the expected result of the scaled sum.
+namespace {
+template<typename T>
+__host__ __device__ T genInOutFloatPreMulSum(
+    bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
+  ) {
+  constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
+  constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
+  using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
+  constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
+  uint64_t rng = hashOf(seed, index);
+  // One sign and one exponent, derived from (seed, index) only, are shared by every input and by the output.
+  int y_sign = rng & 1;
+  int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
+  rng = mixBits(rng);
+  int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng);
+  int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng);
+  // Bounds max_mant>>1 and max_mant>>2 give y0 + 2*y1 <= max_mant, provided genSumXY keeps each sum within its 4th argument (not visible here).
+  // when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0)
+  uintmant_t x0_mant, y0_mant;
+  genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant);
+
+  // when ncclVerifiablePremulScalar() = 2.0 (rank_me%2 == 1)
+  uintmant_t x1_mant=0, y1_mant=0;
+  if((rank_n+0)/2 != 0)
+    genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant);
+  // Even ranks submit x0 and odd ranks x1; the expected result counts the odd-rank sum y1 twice.
+  uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
+  uintmant_t y_mant = y0_mant + 2*y1_mant;
+  uintmant_t ans_mant = input_not_output ? x_mant : y_mant;
+  // Normalize the integer mantissa into float form (assumes clz64 counts leading zero bits).
+  if(ans_mant == 0)
+    return T(0.0f);
+  else {
+    int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
+    int ans_sign = y_sign;
+    int ans_exp = y_exp - shift;
+    ans_mant <<= shift;
+    return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
+  }
+}
+}
+// Float generator for products: rank_me's factor if input_not_output, else the expected product.
+namespace {
+template<typename T>
+__host__ __device__ T genInOutFloatProd(
+    bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
+  ) {
+  // Three kinds of contributions (values for x):
+  // 1) x = random value: only one rank does this
+  // 2) x = 2^n: random positive n
+  // 3) x = 1
+  // Since only one rank submits a random value, the result of the product
+  // will have the same mantissa as that value but with an exponent incorporating
+  // the sum of the exponents from case (2)
+
+  uint64_t rng = hashOf(seed, index);
+  rank_me = shuffleRank(rank_n, rank_me, rng);
+  int y_sign = (rank_n/2)%2; // rank_n/2 of the ranks are odd (x_sign 1), so this is the parity of their count
+  int x_sign = rank_me%2;
+  // genSumXY splits an exponent total y_exp (at most max_exp) into per-rank parts x_exp; the bias is added afterwards.
+  constexpr unsigned max_exp = -1 + (1<<(FloatLayout<T>::exponent_bits-1));
+  unsigned x_exp=0, y_exp=0;
+  genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp);
+  x_exp += FloatLayout<T>::exponent_bias;
+  y_exp += FloatLayout<T>::exponent_bias;
+  // The random mantissa is forced non-zero; only shuffled rank 0 and the output carry it (see the final argument below).
+  constexpr uint64_t mant_mask = (uint64_t(1)<<FloatLayout<T>::mantissa_bits)-1;
+  uint64_t y_mant = rng & mant_mask;
+  if (y_mant == 0) y_mant = 1;
+
+  return makeFloat<T>(
+    input_not_output ? x_sign : y_sign,
+    input_not_output ? x_exp : y_exp,
+    !input_not_output || rank_me==0 ? y_mant : 0
+  );
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// What follows is lots of overloads for genInput/genOutput to generate data
+
+namespace {
+// Integral inputs for every op except ReduceNil (excluded by the enable_if): hashed bits truncated to T's width.
+template<typename T, typename ReduceFn,
+         typename = typename std::enable_if<
+             !std::is_same<ReduceFn, ReduceNil>::value
+           >::type>
+__host__ __device__ void genInput(
+    T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::true_type /*integral*/
+  ) {
+  (void)rank_n; // unused: only rank_me, seed and index feed the hash
+  union { uint64_t raw; T value; } pun;
+  pun.raw = hashOf(index ^ index<<16 ^ rank_me, seed) & (uint64_t(-1)>>(64 - 8*sizeof(T)));
+  bool zero_in_product = std::is_same<ReduceFn, ReduceProd>::value && pun.raw == 0;
+  // A zero factor would pin the whole product to zero, so hand out one instead.
+  ans = zero_in_product ? T(1) : pun.value;
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Dumb/generic case for genOutput just reduces results of genInput
+
+namespace {
+template<typename T, typename ReduceFn, bool IsIntegral>
+__host__ __device__ void genOutput(
+    T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index,
+    std::integral_constant<bool, IsIntegral>
+  ) {
+  // Start from rank 0's pre-processed input, then fold in ranks 1..rank_n-1 in rank order.
+  T folded = op.preOp(genInput<T>(op, rank_n, 0, seed, index), 0);
+  for(int rank=1; rank < rank_n; ++rank)
+    folded = op(folded, op.preOp(genInput<T>(op, rank_n, rank, seed, index), rank));
+  ans = op.postOp(folded);
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Nil reduction (byte copy functions). Optimized to assume rank_n=1
+
+namespace {
+template<typename T, bool IsIntegral>
+__host__ __device__ void genInput(
+    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::integral_constant<bool, IsIntegral>
+  ) {
+  (void)rank_n, (void)rank_me; // silence unused warnings
+  union { uint64_t bits; T tmp; };
+  bits = mixBits(seed ^ index); // the value depends only on seed and index, never on the rank
+  bits >>= 64 - 8*sizeof(T); // keep the top 8*sizeof(T) bits of the mixed word
+  bits &= uint64_t(-1)>>(64 - 8*sizeof(T)); // no-op after the unsigned shift above: bits already fits in that width
+  ans = tmp; // read the same bits back as a T through the union
+}
+// NOTE(review): ReduceFn below appears in no function parameter, so it cannot be deduced; calls without explicit template arguments never select this overload - confirm intent.
+template<typename T, typename ReduceFn, bool IsIntegral>
+__host__ __device__ void genOutput(
+    T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
+    std::integral_constant<bool, IsIntegral>
+  ) {
+  ans = genInput<T>(op, rank_n, 0, seed, index); // the expected output is simply rank 0's input
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Sum of float: both overloads delegate to genInOutFloatSum with same_sign=false.
+// The leading flag selects this rank's input (true) or the expected result (false).
+namespace {
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false);
+}
+
+template<typename T>
+__host__ __device__ void genOutput(
+    T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false);
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Product of float: both overloads delegate to genInOutFloatProd.
+// The leading flag selects this rank's factor (true) or the expected product (false).
+namespace {
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatProd<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
+}
+
+template<typename T>
+__host__ __device__ void genOutput(
+    T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatProd<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// PreMulSum of int/float: integral inputs reuse the ReduceSum integral generator,
+// floating-point inputs and outputs come from genInOutFloatPreMulSum.
+namespace {
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::true_type integral
+  ) {
+  genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral);
+}
+
+// No genOutput overload specific to premulsum(int), just use generic case.
+
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
+}
+
+template<typename T>
+__host__ __device__ void genOutput(
+    T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////
+// Average of float: inputs are same-sign values from genInOutFloatSum; the
+// expected output is that generator's sum combined with 1/rank_n via ReduceProd.
+namespace {
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
+}
+
+template<typename T>
+__host__ __device__ void genOutput(
+    T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
+  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type; // float unless T is at least as wide as double
+  ans = ReduceProd()(ans, T1(1)/T1(rank_n)); // the reciprocal is computed in T1, then applied with ReduceProd (presumably a multiply)
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////
+// min/max of float: both ops draw inputs from the same generator (ReduceMin forwards
+// to the ReduceMax overload); the value is hashed from index, rank_me and seed.
+namespace {
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type integral
+  ) {
+  genInput<T>(ans, ReduceMax(), rank_n, rank_me, seed, index, integral);
+}
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  (void)rank_n; // silence unused warnings
+  constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
+  int sign = rng & 1;
+  rng ^= rng>>1;
+  int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1); // low exponent_bits-1 bits: 0 .. 2^(exponent_bits-1)-1
+  exp += 1<<(FloatLayout<T>::exponent_bits-2); // the offset keeps exp above 0 and below the all-ones exponent value
+  rng ^= rng >> FloatLayout<T>::exponent_bits;
+  uint64_t mant = rng & mant_mask;
+  ans = makeFloat<T>(sign, exp, mant);
+}
+
+// No genOutput overload specific to floating point min/max, just use generic case.
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Entry API for genInput/genOutput
+
+namespace {
+template<typename T, typename ReduceFn>
+__host__ __device__ T genInput(
+    ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index
+  ) {
+  using integral_tag = std::integral_constant<bool, IsIntegral<T>::value>; // selects the integral or floating-point overload
+  T value;
+  genInput(value, op, rank_n, rank_me, seed, index, integral_tag());
+  return value;
+}
+// Same tag dispatch as above, for the expected output of the whole reduction.
+template<typename T, typename ReduceFn>
+__host__ __device__ T genOutput(
+    ReduceFn op, int rank_n, uint64_t seed, intptr_t index
+  ) {
+  using integral_tag = std::integral_constant<bool, IsIntegral<T>::value>;
+  T value;
+  genOutput(value, op, rank_n, seed, index, integral_tag());
+  return value;
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if !SELF_TEST
+namespace {
+template<typename T, typename ReduceFn>
+__global__ void prepareInput2(
+    T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
+    uint64_t seed, intptr_t elt_ix0
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); // this block fills elts[i0, i1): an even share of elt_n ...
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; // ... with the first elt_n%gridDim.x blocks taking one extra element each
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  while(i < i1) { // the block's threads step through the slice blockDim.x elements apart
+    elts[i] = genInput<T>(op, rank_n, rank_me, seed, elt_ix0+i); // generated from the global element index elt_ix0+i
+    #if 0
+    T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
+    printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
+      std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
+    #endif
+    i += blockDim.x;
+  }
+}
+// Launches prepareInput2 on `stream` for the element type named by elt_ty, using 1..32 blocks of 512 threads.
+template<typename ReduceOp>
+void prepareInput1(
+    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  int block_n = std::max<intptr_t>(1, std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512))); // at least 1: a zero-block grid (elt_n == 0) is an invalid launch configuration
+  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
+  switch(elt_ty) {
+  case ncclInt8: CASE_TY(int8_t)
+  case ncclUint8: CASE_TY(uint8_t)
+  case ncclInt32: CASE_TY(int32_t)
+  case ncclUint32: CASE_TY(uint32_t)
+  case ncclInt64: CASE_TY(int64_t)
+  case ncclUint64: CASE_TY(uint64_t)
+  case ncclFloat16: CASE_TY(half)
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  #endif
+  case ncclFloat32: CASE_TY(float)
+  case ncclFloat64: CASE_TY(double)
+  default: assert(0);
+  }
+  #undef CASE_TY
+}
+}
+
+void ncclVerifiablePrepareInput(
+    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  // With a single contribution there is nothing to reduce, so ReduceNil replaces whatever red_op names.
+  #define LAUNCH_FOR(op) \
+    if(rank_n != 1) \
+      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
+    else \
+      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream)
+  switch(red_op) {
+  case ncclSum: LAUNCH_FOR(ReduceSum()); break;
+  case ncclMin: LAUNCH_FOR(ReduceMin()); break;
+  case ncclMax: LAUNCH_FOR(ReduceMax()); break;
+  case ncclProd: LAUNCH_FOR(ReduceProd()); break;
+  #if HAVE_ncclAvg
+  case ncclAvg: LAUNCH_FOR(ReduceAvg{rank_n}); break;
+  #endif
+  #if HAVE_ncclPreMulSum
+  default: LAUNCH_FOR(ReducePreMulSum()); break;
+  #endif
+  }
+  #undef LAUNCH_FOR
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if !SELF_TEST
+namespace {
+template<typename T, typename ReduceFn>
+__global__ void prepareExpected2(
+    T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
+    uint64_t seed, intptr_t elt_ix0
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); // this block fills elts[i0, i1): an even share of elt_n ...
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; // ... with the first elt_n%gridDim.x blocks taking one extra element each
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  while(i < i1) { // the block's threads step through the slice blockDim.x elements apart
+    elts[i] = genOutput<T>(op, rank_n, seed, elt_ix0+i); // expected result for the global element index elt_ix0+i
+    #if 0
+    printf("prepareExpected2 seed=0x%llx ix=%lld x=%g elts=%p\n",
+      (long long)seed, (long long)(elt_ix0+i), (float)elts[i], elts);
+    #endif
+    i += blockDim.x;
+  }
+}
+// Launches prepareExpected2 on `stream` for the element type named by elt_ty, using 1..32 blocks of 512 threads.
+template<typename ReduceOp>
+void prepareExpected1(
+    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  int block_n = std::max<intptr_t>(1, std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512))); // at least 1: a zero-block grid (elt_n == 0) is an invalid launch configuration
+  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
+  switch(elt_ty) {
+  case ncclInt8: CASE_TY(int8_t)
+  case ncclUint8: CASE_TY(uint8_t)
+  case ncclInt32: CASE_TY(int32_t)
+  case ncclUint32: CASE_TY(uint32_t)
+  case ncclInt64: CASE_TY(int64_t)
+  case ncclUint64: CASE_TY(uint64_t)
+  case ncclFloat16: CASE_TY(half)
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  #endif
+  case ncclFloat32: CASE_TY(float)
+  case ncclFloat64: CASE_TY(double)
+  default: assert(0);
+  }
+  #undef CASE_TY
+}
+}
+
+void ncclVerifiablePrepareExpected(
+    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  // With a single contribution there is nothing to reduce, so ReduceNil replaces whatever red_op names.
+  #define LAUNCH_FOR(op) \
+    if(rank_n != 1) \
+      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
+    else \
+      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream)
+  switch(red_op) {
+  case ncclSum: LAUNCH_FOR(ReduceSum()); break;
+  case ncclMin: LAUNCH_FOR(ReduceMin()); break;
+  case ncclMax: LAUNCH_FOR(ReduceMax()); break;
+  case ncclProd: LAUNCH_FOR(ReduceProd()); break;
+  #if HAVE_ncclAvg
+  case ncclAvg: LAUNCH_FOR(ReduceAvg{rank_n}); break;
+  #endif
+  #if HAVE_ncclPreMulSum
+  default: LAUNCH_FOR(ReducePreMulSum()); break;
+  #endif
+  }
+  #undef LAUNCH_FOR
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+/* How we compare floating point values when exactness is impossible is interesting.
+ * First, we take note that simply reinterpreting integer bits as floating point
+ * gives us a monotonic mapping which exponentially spaces out floats. Thus
+ * consecutive integers encode consecutive floats. In general, using integer
+ * subtraction on the bitpatterns of two floats gives us an integer which is the
+ * logarithm of their relative difference. But, if the floats always have similar
+ * exponents, then the integer difference is actually proportional to the
+ * relative error (this is because we are counting hops in the mantissa bits only,
+ * not the exponent bits). So a cheap way to compare if two floats are relatively
+ * close is: abs(intBits(a) - intBits(b)) <= tolerance. The following formula
+ * calculates such a tolerance for a summation of n floats. This formula
+ * was derived by inspecting the maximum observed integer difference over many
+ * random runs of summation. The parameter values were computed by the
+ * companion program "inexact_regress.cu".
+ * Any elt_ty not listed in the switch below falls back to the float32/float64 parameters. */
+__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
+  float power = .51f, coef = 1.25f; // initialized so an unlisted elt_ty never reads indeterminate values
+  switch(elt_ty) {
+  case ncclFloat32:
+  case ncclFloat64:
+    power = .51f;
+    coef = 1.25f;
+    break;
+  case ncclFloat16:
+    power = .91f;
+    coef = .75f;
+    break;
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16:
+    power = .91f;
+    coef = .66f;
+    break;
+  #endif
+  }
+  #if __CUDA_ARCH__
+    return 1 + unsigned(coef*powf(float(rank_n), power)); // device compilation pass
+  #else
+    return 1 + unsigned(coef*std::pow(float(rank_n), power)); // host compilation pass
+  #endif
+}
+// Absolute difference between the bit patterns of a and b, read as unsigned integers of sizeof(T) bytes.
+template<typename T>
+__host__ __device__  uint64_t calcDelta(T a, T b) {
+  union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
+  x.t = a;
+  y.t = b;
+  switch(sizeof(T)) { // pick the union member whose width matches T
+  case 1:  return x.i1 < y.i1 ? y.i1 - x.i1 : x.i1 - y.i1;
+  case 2:  return x.i2 < y.i2 ? y.i2 - x.i2 : x.i2 - y.i2;
+  case 4:  return x.i4 < y.i4 ? y.i4 - x.i4 : x.i4 - y.i4;
+  default: return x.i8 < y.i8 ? y.i8 - x.i8 : x.i8 - y.i8;
+  }
+}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if !SELF_TEST
+namespace {
+template<typename T>
+__global__ void verifyPrepared(
+    T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); // this block checks elements [i0, i1): an even share of elt_n ...
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; // ... with the first elt_n%gridDim.x blocks taking one extra element each
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  int64_t bad = 0;
+  // Count the elements of this thread's share whose |results - expected| exceeds tolerance.
+  while(i < i1) {
+    T a = results[i], b = expected[i];
+    T delta = a < b ? b - a : a - b;
+    bad += tolerance < delta ? 1 : 0;
+    #if 0
+      if(tolerance < delta) {
+        printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
+      }
+    #endif
+    i += blockDim.x;
+  }
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); // every thread adds its private count to *bad_elt_n with a PTX red.add
+}
+// Like verifyPrepared, but the expected value is regenerated with genOutput and compared through its Uint bit pattern.
+template<typename T, typename Uint, typename ReduceFn>
+__global__ void verifyInline2(
+    T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
+    intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); // same per-block slicing as verifyPrepared
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  int64_t bad = 0;
+  // Count the elements whose bit-pattern distance from the generated expectation exceeds tolerance.
+  while(i < i1) {
+    union { T t; Uint u; } a, b;
+    a.t = results[i];
+    b.t = genOutput<T>(op, rank_n, seed, elt_ix0+i);
+    Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u;
+    bad += tolerance < delta ? 1 : 0;
+    #if 0
+      T input = genInput<T>(op, rank_n, 0, seed, elt_ix0+i);
+      if(tolerance < delta) {
+        printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n",
+          std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
+      } else {
+        printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n",
+          std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
+      }
+    #endif
+    i += blockDim.x;
+  }
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); // every thread adds its private count to *bad_elt_n with a PTX red.add
+}
+// Host-side dispatch from red_op to a verifyInline2 instantiation; rank_n == 1 always verifies against ReduceNil.
+template<typename T, typename Uint>
+void verifyInline1(
+    T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
+    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+  ) {
+  #define CASE_OP(op) \
+    if(rank_n == 1) \
+    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
+      ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
+    else \
+    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
+      ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
+    break;
+  switch(red_op) {
+  case ncclSum: CASE_OP(ReduceSum())
+  case ncclMin: CASE_OP(ReduceMin())
+  case ncclMax: CASE_OP(ReduceMax())
+  case ncclProd: CASE_OP(ReduceProd())
+  #if HAVE_ncclAvg
+  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
+  #endif
+  #if HAVE_ncclPreMulSum
+  default: CASE_OP(ReducePreMulSum())
+  #endif
+  }
+  #undef CASE_OP
+}
+}
+// Enqueues a kernel on `stream` that counts mismatching elements into *bad_elt_n (see verifiable.h for the contract).
+void ncclVerifiableVerify(
+    void const *results, void const *expected, intptr_t elt_n, int elt_ty,
+    int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
+    int64_t *bad_elt_n, cudaStream_t stream
+  ) {
+  bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
+  #if HAVE_ncclBfloat16
+    floating |= elt_ty == ncclBfloat16;
+  #endif
+  // Only floating-point averages are compared inexactly; every other case must match bit-for-bit (tolerance 0).
+  unsigned tolerance = 0;
+  #if HAVE_ncclAvg
+  if (floating && red_op == ncclAvg)
+    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
+  #endif
+  // Clamp the grid to 1..32 blocks: elt_n == 0 would otherwise request a zero-block grid, which is an invalid launch.
+  int block_n = std::max<intptr_t>(1, std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512)));
+  // The counter is zeroed from the host here; the kernels then add their mismatch counts to it.
+  *bad_elt_n = 0;
+  #define CASE_TY(T, Uint) { \
+      if(expected != nullptr) { \
+        verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
+      } else { \
+        verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
+      } \
+    } break;
+  switch(elt_ty) {
+  case ncclInt8: CASE_TY(int8_t, uint8_t)
+  case ncclUint8: CASE_TY(uint8_t, uint8_t)
+  case ncclInt32: CASE_TY(int32_t, uint32_t)
+  case ncclUint32: CASE_TY(uint32_t, uint32_t)
+  case ncclInt64: CASE_TY(int64_t, uint64_t)
+  case ncclUint64: CASE_TY(uint64_t, uint64_t)
+  case ncclFloat16: CASE_TY(half, uint16_t)
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
+  #endif
+  case ncclFloat32: CASE_TY(float, uint32_t)
+  case ncclFloat64: CASE_TY(double, uint64_t)
+  default: assert(0);
+  }
+  #undef CASE_TY
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if SELF_TEST
+#include <iostream>
+// Self-test for one (T, op, rank_n): reduces the generated inputs and reports every index whose result misses genOutput by more than the tolerance.
+template<typename T, typename Op>
+__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
+  //if(!std::is_same<T,half>::value) return;
+  //if(!std::is_same<Op,ReduceProd>::value) return;
+  //if(rank_n!=3) return;
+
+  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
+  uint64_t seed = 0xc8e2bed69766d533;
+
+  for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
+    //if(ix!=387) continue;
+    T y = genOutput<T>(op, rank_n, seed, ix);
+    T sum;
+    for(int r=0; r < rank_n; r++) {
+      T x = genInput<T>(op, rank_n, r, seed, ix);
+      x = op.preOp(x, r);
+      sum = r==0 ? x : op(sum, inhibit(x));
+      //std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum));
+    }
+    sum = op.postOp(sum);
+    if(tolerance < calcDelta(sum, y)) {
+      // Zero-extend each bit pattern to 64 bits; an 8-byte read through a T* overruns the object when sizeof(T) < 8.
+      union { unsigned long long u; T t; } got, want;
+      got.u = 0; got.t = sum; want.u = 0; want.t = y;
+      std::printf("%llx != %llx  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
+        got.u, want.u, tyname, opname, rank_n, ix);
+    }
+  }
+}
+// Runs sweep2 for each reduction op at ten rank counts of the form 2^k + k, k = 0..9.
+template<typename T>
+__device__ void sweep1(int ty, char const *tyname) {
+  for(int k=0; k != 10; ++k) {
+    int const ranks = k + (1<<k);
+    sweep2<T>(ty, tyname, ReduceSum(), "sum", ranks);
+    sweep2<T>(ty, tyname, ReduceProd(), "prod", ranks);
+    sweep2<T>(ty, tyname, ReduceMin(), "min", ranks);
+    sweep2<T>(ty, tyname, ReduceMax(), "max", ranks);
+    sweep2<T>(ty, tyname, ReducePreMulSum(), "premulsum", ranks);
+    sweep2<T>(ty, tyname, ReduceAvg{ranks}, "avg", ranks);
+  }
+}
+// Self-test kernel: runs sweep1 once per supported element type, passing the matching nccl type id and a printable name.
+__global__ void sweep() {
+  sweep1<int8_t>(ncclInt8, "int8");
+  sweep1<uint8_t>(ncclUint8, "uint8");
+  sweep1<int32_t>(ncclInt32, "int32");
+  sweep1<uint32_t>(ncclUint32, "uint32");
+  sweep1<int64_t>(ncclInt64, "int64");
+  sweep1<uint64_t>(ncclUint64, "uint64");
+  sweep1<half>(ncclFloat16, "half");
+  #if HAVE_ncclBfloat16
+    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
+  #endif
+  sweep1<float>(ncclFloat32, "float");
+  sweep1<double>(ncclFloat64, "double");
+}
+// Self-test entry point: runs the sweep once on device 0; a CUDA failure is reported and turns the exit status non-zero.
+int main(int arg_n, char **args) {
+  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
+  cudaError_t err = cudaSetDevice(0);
+  if(err == cudaSuccess) { sweep<<<1,512>>>(); cudaDeviceSynchronize(); err = cudaGetLastError(); }
+  if(err != cudaSuccess) std::cerr<<"CUDA error: "<<cudaGetErrorString(err)<<std::endl;
+  return err == cudaSuccess ? 0 : 1;
+}
+#endif
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
new file mode 100644
index 0000000000..aca0565a6b
--- /dev/null
+++ b/verifiable/verifiable.h
@@ -0,0 +1,59 @@
+#ifndef _d41d8cd98f00b204e9800998ecf8427e
+#define _d41d8cd98f00b204e9800998ecf8427e
+
+#include <cuda_runtime.h>
+
+#include <stdint.h>
+
+/* Routines for launching kernels that verify reduction results. A significant
+ * feature of these routines is they carefully craft floating point input
+ * to produce exactly predictable output.
+ *
+ * int elt_ty: actually just a ncclDataType_t
+ *
+ * int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
+ * created, these are encoded as the value ncclNumOps and their scalar is
+ * assumed to be `ncclVerifiablePremulScalar(rank_me)`
+ *
+ * uint64_t seed: arbitrary 64-bits to use in seeding the random values
+ *
+ * intptr_t elt_ix0: index of first element pointed to by elts when generating
+ * random values. This makes it possible to generate subsequences independently
+ * as well as in aggregate.
+ *
+ * int rank_n: Number of contributions into the reduction. Non-reduction
+ * collectives like broadcast, gather, etc will always set this to one.
+ *
+ * int rank_me: Index of this contribution
+ */
+
+// Use this as the local scalar for PreMulSum ops: 1 when rank_me is even, 2 otherwise (converted to T).
+template<typename T>
+__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
+  return T(rank_me%2 == 0 ? 1.0f : 2.0f);
+}
+
+// Enqueue kernel to generate data which is to be reduced.
+void ncclVerifiablePrepareInput(
+  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+);
+
+// Enqueue kernel to generate expected results of reduction.
+void ncclVerifiablePrepareExpected(
+  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+);
+
+// Enqueue kernel to verify reduced data matches expectation. The number of
+// failed elements is written to bad_elt_n which must be in cudaHost memory.
+// If `expected == nullptr` then the expected results are generated on-the-fly
+// which can be costly. Thus if you plan to run the same reduction multiple
+// times it is advantageous to precompute the expected values with
+// ncclVerifiablePrepareExpected and pass them as `expected` here.
+void ncclVerifiableVerify(
+  void const *results, void const *expected, intptr_t elt_n, int elt_ty,
+  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
+  int64_t *bad_elt_n, cudaStream_t stream
+);
+#endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
new file mode 100644
index 0000000000..225c32a3c3
--- /dev/null
+++ b/verifiable/verifiable.mk
@@ -0,0 +1,11 @@
+# Both of the following paths must be set before including this makefile:
+# TEST_VERIFIABLE_SRCDIR = <points to this directory>
+# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
+
+TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+# The object lists the header as a prerequisite so that editing verifiable.h triggers a rebuild.
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
+	@printf "Compiling %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<

From bc5f7cfb0aad52af4388f5b4cc6214baf1e1a8ed Mon Sep 17 00:00:00 2001
From: John Bachan <jbachan@nvidia.com>
Date: Thu, 7 Jul 2022 11:42:21 +0200
Subject: [PATCH 106/233] Changed top-level Makefile behavior so that BUILDDIR
 is interpreted as relative to top-level directory. This is done by
 abspath'ing it before passing it to subdirectory Makefiles.

The old behavior had two cases: with and without BUILDDIR being set by
the user. With BUILDDIR not set, the build dir would be named "build"
in the top-level directory. If BUILDDIR was set, then the build dir
would be placed at "src/${BUILDDIR}".

The new behavior is simpler, if BUILDDIR is not set then it defaults
to "build", and the directory holding the final build is always at just
"${BUILDDIR}" in the top level.
---
 Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 29409a8422..43729f897a 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,9 @@
 # See LICENCE.txt for license information
 #
 
+BUILDDIR ?= build
+override BUILDDIR := $(abspath $(BUILDDIR))
+
 .PHONY : all clean
 
 default : src.build
@@ -14,7 +17,7 @@ all:   ${TARGETS:%=%.build}
 clean: ${TARGETS:%=%.clean}
 
 %.build:
-	${MAKE} -C $* build
+	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
 
 %.clean:
-	${MAKE} -C $* clean
+	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}

From a0a14911ee5405353a85a7e345c188514410e10e Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 6 Sep 2022 13:17:15 -0700
Subject: [PATCH 107/233] Display N/A for error count in AlltoAll in-place test

AlltoAll does not support in-place buffers
---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index eaa3318f34..0bc047c4f1 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -467,7 +467,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   int64_t wrongElts = 0;
   static __thread int rep = 0;
   rep++;
-  if (datacheck) {
+  if (args->reportErrors) {
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
@@ -529,7 +529,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
   }
-  if (datacheck) {
+  if (args->reportErrors) {
     PRINT("  %7s  %6.2f  %6.2f  %5g", timeStr, algBw, busBw, (double)wrongElts);
   } else {
     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");

From afa4c56b6aeae3d198dfc30d9d8f26cc5ee75dba Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 7 Sep 2022 11:23:49 -0700
Subject: [PATCH 108/233] Fix an issue with the last commit when data checking
 is disabled

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 0bc047c4f1..8fe9258164 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -467,7 +467,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   int64_t wrongElts = 0;
   static __thread int rep = 0;
   rep++;
-  if (args->reportErrors) {
+  if (datacheck) {
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
@@ -952,7 +952,7 @@ testResult_t run() {
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
 
-    threads[t].args.reportErrors = 1;
+    threads[t].args.reportErrors = datacheck;
 
     threads[t].func = parallel_init ? threadInit : threadRunTests;
     if (t)

From 749573f2d65027859c8ace9d41fabf4b81eda491 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 7 Sep 2022 16:10:41 -0700
Subject: [PATCH 109/233] Fix preprocessor version check for ncclGetLastError()

ncclGetLastError() was added in NCCL 2.13.0
---
 src/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.h b/src/common.h
index 51cf9da276..84967ed6a1 100644
--- a/src/common.h
+++ b/src/common.h
@@ -28,7 +28,7 @@
   }                                                 \
 } while(0)
 
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,12,10)
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
 #define NCCLCHECK(cmd) do {                         \
   ncclResult_t res = cmd;                           \
   if (res != ncclSuccess) {                         \

From d313d20a2695b7a9be9b22bd9417fe2e201fef3f Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 20 Sep 2022 02:21:36 -0700
Subject: [PATCH 110/233] Update NCCL tests

---
 src/Makefile          |   9 ++-
 src/all_gather.cu     |  18 ++---
 src/all_reduce.cu     |   5 +-
 src/alltoall.cu       |   9 +--
 src/broadcast.cu      |   7 +-
 src/common.cu         | 151 ++++++++++++++++++++++++++++--------------
 src/common.h          |  56 ++++------------
 src/gather.cu         |   7 +-
 src/hypercube.cu      |   5 +-
 src/reduce.cu         |   5 +-
 src/reduce_scatter.cu |  16 +++--
 src/scatter.cu        |   7 +-
 src/sendrecv.cu       |   5 +-
 src/timer.cc          |  28 ++++++++
 src/timer.h           |  15 +++++
 15 files changed, 206 insertions(+), 137 deletions(-)
 create mode 100644 src/timer.cc
 create mode 100644 src/timer.h

diff --git a/src/Makefile b/src/Makefile
index 137b9d7925..6d8b1ef40f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -92,7 +92,12 @@ ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS)
+${DST_DIR}/timer.o: timer.cc timer.h
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 1eaafddfab..0831207433 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,12 +7,15 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
+#define ALIGN 4
+
 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = count/nranks;
-  *recvcount = (count/nranks)*nranks;
-  *sendInplaceOffset = count/nranks;
+  size_t base = (count/(ALIGN*nranks))*ALIGN;
+  *sendcount = base;
+  *recvcount = base*nranks;
+  *sendInplaceOffset = base;
   *recvInplaceOffset = 0;
-  *paramcount = *sendcount;
+  *paramcount = base;
 }
 
 testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -21,8 +24,7 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
@@ -78,7 +80,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
   }
 
   for (int i=0; i<type_count; i++) {
-    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
   }
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 9c65f25aba..a38eabe057 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,8 +21,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 0eae1b07c9..41c7c4ae33 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,9 +21,7 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    char* str = getenv("NCCL_TESTS_DEVICE");
-    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
@@ -51,7 +49,6 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   size_t rankOffset = count * wordSize(type);
-  if (count == 0) return testSuccess;
 
 #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
   printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
@@ -97,7 +94,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
   }
 
   for (int i=0; i<type_count; i++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
   }
   return testSuccess;
 }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 40dcb5d885..903066a2b8 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -20,8 +20,7 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
@@ -94,7 +93,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
 
   for (int i=0; i<type_count; i++) {
     for (int j=begin_root; j<=end_root; j++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
     }
   }
   return testSuccess;
diff --git a/src/common.cu b/src/common.cu
index 8fe9258164..5837ed1bcd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -50,6 +50,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
   int test_opnum = 4;
 #endif
 
+// For libnccl's < 2.13
+extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) {
+  return "";
+}
+
+int is_main_proc = 0;
 thread_local int is_main_thread = 0;
 
 // Command line parameter defaults
@@ -68,7 +74,10 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int streamnull = 0;
+static int timeout = 0;
 static int cudaGraphLaunches = 0;
+static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
 
@@ -198,7 +207,7 @@ void Allreduce(struct threadArgs* args, T* value, int average) {
     }
     #endif
 
-    if(average == 1) accumulator[epoch] /= args->nProcs*args->nThreads;
+    if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
     counter[epoch] = 0;
     pthread_cond_broadcast(&cond[epoch]);
   }
@@ -220,10 +229,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
 
   for (int i=0; i<args->nGpus; i++) {
-    int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
 
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
@@ -266,6 +273,8 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
+  timer tim;
+
   while (remaining) {
    int idle = 1;
    for (int i=0; i<ngpus; i++) {
@@ -294,6 +303,19 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
          NCCLCHECK(ncclAsyncErr);
        }
      }
+     double delta = tim.elapsed();
+     if (delta > timeout && timeout > 0) {
+       for (int i=0; i<ngpus; i++)
+         NCCLCHECK(ncclCommAbort(comms[i]));
+       char hostname[1024];
+       getHostName(hostname, 1024);
+       printf("%s: Test timeout (%ds) %s:%d\n",
+           hostname,
+           timeout,
+           __FILE__,__LINE__);
+       free(done);
+       return testTimeout;
+     }
 #endif
    }
 
@@ -315,9 +337,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    int cudaDev;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
-    CUDACHECK(cudaSetDevice(cudaDev));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -411,7 +431,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #endif
 
   // Performance Benchmark
-  auto start = std::chrono::high_resolution_clock::now();
+  timer tim;
   for (int iter = 0; iter < iters; iter++) {
     if (agg_iters>1) NCCLCHECK(ncclGroupStart());
     for (int aiter = 0; aiter < agg_iters; aiter++) {
@@ -432,7 +452,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     // Resync CPU, restart timing, launch cuda graph
     Barrier(args);
-    start = std::chrono::high_resolution_clock::now();
+    tim.reset();
     for (int l=0; l<cudaGraphLaunches; l++) {
       for (int i=0; i<args->nGpus; i++) {
         CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
@@ -441,10 +461,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 #endif
 
+  double cputimeSec = tim.elapsed()/(iters*agg_iters);
   TESTCHECK(completeColl(args));
 
-  auto delta = std::chrono::high_resolution_clock::now() - start;
-  double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  double deltaSec = tim.elapsed();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
   Allreduce(args, &deltaSec, average);
@@ -520,7 +540,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       wrongElts = wrongElts1;
   }
 
-  double timeUsec = deltaSec*1.0E6;
+  double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
   char timeStr[100];
   if (timeUsec >= 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
@@ -555,6 +575,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
 }
 
 testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
+  // Sync to avoid first-call timeout
+  Barrier(args);
+
   // Warm-up for large size
   setupArgs(args->maxbytes, type, args);
   for (int iter = 0; iter < warmup_iters; iter++) {
@@ -586,8 +609,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  CUDACHECK(cudaSetDevice(gpuid));
+  CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -598,13 +620,12 @@ testResult_t threadInit(struct threadArgs* args) {
   int nranks =  args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
-  is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
+  is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;
 
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -679,7 +700,10 @@ int main(int argc, char* argv[]) {
     {"datatype", required_argument, 0, 'd'},
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
+    {"stream_null", required_argument, 0, 'y'},
+    {"timeout", required_argument, 0, 'T'},
     {"cudagraph", required_argument, 0, 'G'},
+    {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
     {"help", no_argument, 0, 'h'},
     {}
@@ -687,7 +711,7 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -752,6 +776,12 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'y':
+        streamnull = strtol(optarg, NULL, 0);
+        break;
+      case 'T':
+        timeout = strtol(optarg, NULL, 0);
+        break;
       case 'G':
 #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
         cudaGraphLaunches = strtol(optarg, NULL, 0);
@@ -759,6 +789,9 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
+      case 'C':
+        report_cputime = strtol(optarg, NULL, 0);
+        break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
@@ -787,11 +820,14 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
+            "[-y,--stream_null <0/1>] \n\t"
+            "[-T,--timeout <time in seconds>] \n\t"
             "[-G,--cudagraph <num graph launches>] \n\t"
+            "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-h,--help]\n",
-	    basename(argv[0]));
-	return 0;
+          basename(argv[0]));
+        return 0;
     }
   }
   if (minBytes > maxBytes) {
@@ -808,23 +844,31 @@ int main(int argc, char* argv[]) {
 }
 
 testResult_t run() {
-  int nProcs = 1, proc = 0;
+  int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
   int localRank = 0;
   char hostname[1024];
   getHostName(hostname, 1024);
 
 #ifdef MPI_SUPPORT
-  MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
+  MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
-  uint64_t hostHashs[nProcs];
+  uint64_t hostHashs[totalProcs];
   hostHashs[proc] = getHostHash(hostname);
   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
-  for (int p=0; p<nProcs; p++) {
+  for (int p=0; p<totalProcs; p++) {
     if (p == proc) break;
     if (hostHashs[p] == hostHashs[proc]) localRank++;
   }
+
+  char* str = getenv("NCCL_TESTS_SPLIT_MASK");
+  uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
+  MPI_Comm mpi_comm;
+  color = proc & mask;
+  MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
+  MPI_Comm_size(mpi_comm, &ncclProcs);
+  MPI_Comm_rank(mpi_comm, &ncclProc);
 #endif
-  is_main_thread = (proc == 0) ? 1 : 0;
+  is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
 
   PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
         nThreads, nGpus, minBytes, maxBytes,
@@ -839,22 +883,24 @@ testResult_t run() {
   char line[MAX_LINE];
   int len = 0;
   size_t maxMem = ~0;
+  char* envstr = getenv("NCCL_TESTS_DEVICE");
+  int gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nThreads*nGpus; i++) {
-    int cudaDev = localRank*nThreads*nGpus+i;
+    int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
+                    rank, color, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
 #if MPI_SUPPORT
-  char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
+  char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
   // Gather all output in rank order to root (0)
   MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
   if (proc == 0) {
-    for (int p = 0; p < nProcs; p++)
+    for (int p = 0; p < totalProcs; p++)
       PRINT("%s", lines+MAX_LINE*p);
     free(lines);
   }
@@ -871,39 +917,43 @@ testResult_t run() {
   }
 
   ncclUniqueId ncclId;
-  if (proc == 0) {
+  if (ncclProc == 0) {
     NCCLCHECK(ncclGetUniqueId(&ncclId));
   }
 #ifdef MPI_SUPPORT
-  MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
 #endif
+  int gpus[nGpus*nThreads];
   cudaStream_t streams[nGpus*nThreads];
   void* sendbuffs[nGpus*nThreads];
   void* recvbuffs[nGpus*nThreads];
   void* expected[nGpus*nThreads];
   size_t sendBytes, recvBytes;
 
-  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads);
 
+  envstr = getenv("NCCL_TESTS_DEVICE");
+  gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+    gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
+    CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
-    CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
+    if (streamnull)
+      streams[i] = NULL;
+    else
+      CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
   if (!parallel_init) {
-     if (nProcs == 1) {
-       int gpuArray[nGpus*nThreads];
-       for (int i=0; i<nGpus*nThreads; i++) gpuArray[i] = i;
-       NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpuArray));
+     if (ncclProcs == 1) {
+       NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int i=0; i<nGpus*nThreads; i++) {
-         CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
-         NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
+         CUDACHECK(cudaSetDevice(gpus[i]));
+         NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
        }
        NCCLCHECK(ncclGroupEnd());
      }
@@ -919,10 +969,11 @@ testResult_t run() {
     errors[t] = bw_count[t] = 0;
   }
 
+  const char* timeStr = report_cputime ? "cputime" : "time";
   PRINT("#\n");
   PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
   PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-      "time", "algbw", "busbw", "#wrong", "time", "algbw", "busbw", "#wrong");
+      timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
   PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
       "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
 
@@ -936,11 +987,13 @@ testResult_t run() {
     threads[t].args.stepfactor=stepFactor;
     threads[t].args.localRank = localRank;
 
-    threads[t].args.nProcs=nProcs;
-    threads[t].args.proc=proc;
+    threads[t].args.totalProcs=totalProcs;
+    threads[t].args.nProcs=ncclProcs;
+    threads[t].args.proc=ncclProc;
     threads[t].args.nThreads=nThreads;
     threads[t].args.thread=t;
     threads[t].args.nGpus=nGpus;
+    threads[t].args.gpus=gpus+t*nGpus;
     threads[t].args.sendbuffs = sendbuffs+t*nGpus;
     threads[t].args.recvbuffs = recvbuffs+t*nGpus;
     threads[t].args.expected = expected+t*nGpus;
@@ -990,8 +1043,8 @@ testResult_t run() {
   }
   CUDACHECK(cudaFreeHost(delta));
 
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
+  envstr = getenv("NCCL_TESTS_MIN_BW");
+  double check_avg_bw = envstr ? atof(envstr) : -1;
   bw[0] /= bw_count[0];
 
   PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
@@ -1001,6 +1054,8 @@ testResult_t run() {
   MPI_Finalize();
 #endif
 
+  PRINT("%s\n", ncclGetLastError(NULL));
+
   // 'cuda-memcheck --leak-check full' requires this
   cudaDeviceReset();
 
diff --git a/src/common.h b/src/common.h
index 84967ed6a1..b69d071606 100644
--- a/src/common.h
+++ b/src/common.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -15,6 +15,10 @@
 #endif
 #include <pthread.h>
 #include "nccl1_compat.h"
+#include "timer.h"
+
+// For nccl.h < 2.13 since we define a weak fallback
+extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
 #define CUDACHECK(cmd) do {                         \
   cudaError_t err = cmd;                            \
@@ -61,6 +65,8 @@ typedef enum {
   testInternalError = 1,
   testCudaError = 2,
   testNcclError = 3,
+  testTimeout = 4,
+  testNumResults = 5
 } testResult_t;
 
 // Relay errors up and trace
@@ -110,11 +116,13 @@ struct threadArgs {
   size_t stepbytes;
   size_t stepfactor;
 
+  int totalProcs;
   int nProcs;
   int proc;
   int nThreads;
   int thread;
   int nGpus;
+  int* gpus;
   int localRank;
   void** sendbuffs;
   size_t sendBytes;
@@ -144,8 +152,6 @@ struct testThread {
   testResult_t ret;
 };
 
-#include <chrono>
-
 // Provided by common.cu
 extern void Barrier(struct threadArgs* args);
 extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
@@ -153,10 +159,6 @@ extern testResult_t InitDataReduce(void* data, const size_t count, const size_t
 extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
 extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
 
-// Provided by each coll
-extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
-extern void print_header();
-
 #include <unistd.h>
 
 static void getHostName(char* hostname, int maxlen) {
@@ -171,46 +173,15 @@ static void getHostName(char* hostname, int maxlen) {
 
 #include <stdint.h>
 
-static uint64_t getHash(const char* string, size_t n) {
-  // Based on DJB2a, result = result * 33 ^ char
+static uint64_t getHostHash(const char* string) {
+  // Based on DJB2, result = result * 33 + char
   uint64_t result = 5381;
-  for (size_t c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
+  for (int c = 0; string[c] != '\0'; c++){
+    result = ((result << 5) + result) + string[c];
   }
   return result;
 }
 
-/* Generate a hash of the unique identifying string for this host
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
- *
- */
-#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
-static uint64_t getHostHash(const char* hostname) {
-  char hostHash[1024];
-
-  // Fall back is the hostname if something fails
-  (void) strncpy(hostHash, hostname, sizeof(hostHash));
-  int offset = strlen(hostHash);
-
-  FILE *file = fopen(HOSTID_FILE, "r");
-  if (file != NULL) {
-    char *p;
-    if (fscanf(file, "%ms", &p) == 1) {
-        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
-        free(p);
-    }
-  }
-  fclose(file);
-
-  // Make sure the string is terminated
-  hostHash[sizeof(hostHash)-1]='\0';
-
-  return getHash(hostHash, strlen(hostHash));
-}
-
 static size_t wordSize(ncclDataType_t type) {
   switch(type) {
     case ncclChar:
@@ -277,6 +248,7 @@ static int ncclstringtoop (char *str) {
     return ncclSum;
 }
 
+extern int is_main_proc;
 extern thread_local int is_main_thread;
 #define PRINT if (is_main_thread) printf
 
diff --git a/src/gather.cu b/src/gather.cu
index 99088528d3..03ef4d9e3f 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,8 +21,7 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
@@ -103,7 +102,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
 
   for (int i=0; i<type_count; i++) {
     for (int j=begin_root; j<=end_root; j++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
     }
   }
   return testSuccess;
diff --git a/src/hypercube.cu b/src/hypercube.cu
index ae9fbd0ad5..5c1456f8c7 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -24,8 +24,7 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
diff --git a/src/reduce.cu b/src/reduce.cu
index c2707c75cc..f2fa80dd95 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,8 +21,7 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index e4a59dc20e..ed372e3b9a 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,12 +7,15 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
+#define ALIGN 4  // granularity (in elements) of the per-rank count computed below
+
 void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = count/nranks;
+  size_t base = (count/(ALIGN*nranks))*ALIGN;  // count/nranks rounded down to a multiple of ALIGN
+  *sendcount = base*nranks;  // total elements sent: one base-sized slice per rank
+  *recvcount = base;  // each rank receives a single slice
   *sendInplaceOffset = 0;
-  *recvInplaceOffset = count/nranks;
-  *paramcount = *recvcount;
+  *recvInplaceOffset = base;  // same value as *recvcount
+  *paramcount = base;  // same value as *recvcount
 }
 
 testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -21,8 +24,7 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/scatter.cu b/src/scatter.cu
index d244b2b8bc..49d20e1601 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -20,8 +20,7 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
@@ -99,7 +98,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
 
   for (int i=0; i<type_count; i++) {
     for (int j=begin_root; j<=end_root; j++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
     }
   }
   return testSuccess;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index e73a92b2d5..c9eb5bb427 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,8 +21,7 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/timer.cc b/src/timer.cc
new file mode 100644
index 0000000000..f65be4dbfe
--- /dev/null
+++ b/src/timer.cc
@@ -0,0 +1,28 @@
+#include "timer.h"
+
+// Make sure to compile this translation unit with the host compiler and not
+// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
+#include <chrono>
+
+namespace {
+  std::uint64_t now() {  // current std::chrono::steady_clock reading, in nanoseconds
+    using clock = std::chrono::steady_clock;
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
+  }
+}
+
+timer::timer() {  // record the start time at construction
+  t0 = now();
+}
+
+double timer::elapsed() const {  // seconds since t0; leaves t0 unchanged
+  std::uint64_t t1 = now();
+  return 1.e-9*(t1 - t0);  // nanoseconds -> seconds
+}
+
+double timer::reset() {  // seconds since t0, then restart timing from this instant
+  std::uint64_t t1 = now();
+  double ans = 1.e-9*(t1 - t0);  // nanoseconds -> seconds
+  t0 = t1;  // reuse the same reading so no time is lost between intervals
+  return ans;
+}
diff --git a/src/timer.h b/src/timer.h
new file mode 100644
index 0000000000..33aed4d7a6
--- /dev/null
+++ b/src/timer.h
@@ -0,0 +1,15 @@
+#ifndef _408319ecdd5b47b28bf8f511c4fdf816
+#define _408319ecdd5b47b28bf8f511c4fdf816
+
+#include <cstdint>
+
+// Simple stopwatch. <chrono> is deliberately not included in this header because of a bug with gcc 10.3.0.
+class timer {
+  std::uint64_t t0;  // start timestamp; NOTE(review): nanoseconds per timer.cc - confirm if the implementation changes
+public:
+  timer();  // starts the stopwatch
+  double elapsed() const;  // time since start/last reset(); timer.cc returns seconds
+  double reset();  // returns the elapsed time and restarts the stopwatch
+};
+
+#endif

From 3fbd3280ce2d747a902176a3c87c8c49c32f3fc2 Mon Sep 17 00:00:00 2001
From: akolliasAMD <99202231+akolliasAMD@users.noreply.github.com>
Date: Thu, 29 Sep 2022 15:36:39 -0600
Subject: [PATCH 111/233] removed hypercube from Makefile (#19)

---
 Makefile     | 2 +-
 src/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 29409a8422..4025f10e06 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,8 @@
 
 default : src.build
 
+# TARGETS holds directory names (cf. src.build above), not files inside them.
 TARGETS=src
 
 all:   ${TARGETS:%=%.build}
 clean: ${TARGETS:%=%.clean}
diff --git a/src/Makefile b/src/Makefile
index ec0301b758..3dbd41ff9a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -57,7 +57,7 @@ HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube alltoallv
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}

From 365b92a1ead1b80077fac0929e2bbfbd25cdcdd0 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Wed, 12 Oct 2022 01:23:46 -0700
Subject: [PATCH 112/233] Fix build on RHEL7 with GCC 4.8

Add -std=c++11 to CXXFLAGS.
Fixes #116.
---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 6d8b1ef40f..6ea07303b5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -35,6 +35,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
 endif
 
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+CXXFLAGS   := -std=c++11
 
 LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
 NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt

From d22281cb3f609246c5d7e2d2467b4423c28d537f Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Wed, 12 Oct 2022 17:28:04 -0700
Subject: [PATCH 113/233] Allow more precise measurements of single operation
 (#20)

---
 src/common.cu | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 332cc3f272..86d62bfd2e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -81,6 +81,7 @@ static int average = 1;
 static int numDevices = 1;
 static int ranksPerGpu = 1;
 static int enable_multiranks = 0;
+static int delay_inout_place = 0;
 
 #define NUM_BLOCKS 32
 
@@ -645,9 +646,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
   }
 
-  // Sync
-  TESTCHECK(startColl(args, type, op, root, in_place, 0));
-  TESTCHECK(completeColl(args));
+  if (warmup_iters) {
+    // Sync
+    TESTCHECK(startColl(args, type, op, root, in_place, 0));
+    TESTCHECK(completeColl(args));
+  }
 
   Barrier(args);
 
@@ -830,6 +833,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
         setupArgs(size, type, args);
         print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
         TESTCHECK(BenchTime(args, type, op, root, 0));
+        usleep(delay_inout_place);
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
     }
@@ -984,9 +988,9 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 #ifdef RCCL_MULTIRANKPERGPU
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:q:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:q:", longopts, &longindex);
 #endif
 
     if (c == -1)
@@ -1086,6 +1090,9 @@ int main(int argc, char* argv[]) {
         ranksPerGpu = (int)strtol(optarg, NULL, 0);
         break;
 #endif
+      case 'q':
+        delay_inout_place = (int)strtol(optarg, NULL, 10);
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1120,6 +1127,7 @@ int main(int argc, char* argv[]) {
             "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
             "[-R,--ranks_per_gpu] \n\t"
 #endif
+            "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-h,--help]\n",
 	    basename(argv[0]));
 	return 0;
@@ -1253,14 +1261,14 @@ testResult_t run() {
       TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
       //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
       if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
-	PRINT("cumask: ");
-	for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
-	PRINT("\n");
-	HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+        PRINT("cumask: ");
+        for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+        PRINT("\n");
+        HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
       } else
-	HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+        HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
       // initialize data buffer to avoid all zero data
-      TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
+      if (datacheck) TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
     }
     HIPCHECK(hipDeviceSynchronize());
   }

From 641e93e99ccb38920154741e84cf9a10ac3da25b Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 17 Oct 2022 14:13:48 +0000
Subject: [PATCH 114/233] make rccl-test compile again.

All files compile now.
MPI tests also pass.
---
 CMakeLists.txt                |   1 +
 src/Makefile                  |   5 +-
 src/alltoallv.cu              |  48 ++++------
 src/common.cu                 | 103 +++++++++-----------
 src/common.h                  |  12 +--
 verifiable/Makefile           |  59 +++++++++++-
 verifiable/inexact_regress.cu |  56 +++++++----
 verifiable/verifiable.cu      | 171 ++++++++++++++--------------------
 verifiable/verifiable.h       |  15 ++-
 verifiable/verifiable.mk      |   9 +-
 10 files changed, 251 insertions(+), 228 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 539a1eae2b..1e22365515 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,7 @@ endif()
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
 
 # Add all of the tests
+add_subdirectory(verifiable)
 add_subdirectory(src)
 
 # Create ROCm standard packages
diff --git a/src/Makefile b/src/Makefile
index 0c3c424616..dd01c484f9 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -20,12 +20,11 @@ LDFLAGS    :=
 HIPLDFLAGS :=
 
 ifneq ($(NCCL_HOME), "")
-HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
 HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
-HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
-HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index cb8fcaff0d..c5818d9ded 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -10,18 +10,6 @@
 
 #define USE_RCCL_GATHER_SCATTER
 
-void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
-}
-
 void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   if (count < nranks*nranks/2) {
     *sendcount = 0;
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
 
   int k=0;
   for (int i=0; i<args->nGpus; i++) {
-    char* str = getenv("NCCL_TESTS_DEVICE");
-    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (args->enable_multiranks)
-      gpuid = gpuid % args->localNumDevices;
-    HIPCHECK(hipSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(args->gpus[i]));
 
     for (int l=0; l<args->nRanks; l++) {
       int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
       HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
       void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
-      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
+
 #if 0
       int *dataHost = (int *)malloc(args->sendBytes);
       hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
       printf("\n");
       free(dataHost);
 #endif
+
       size_t rdisp = 0;
       size_t data_count = sendcount*2/nranks;
       size_t chunksize = data_count/nranks;
       for (int j=0; j<nranks; j++) {
-	size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
-	if ((j+rank)%nranks == 0)
+        size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+        if ((j+rank)%nranks == 0)
           rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	size_t sdisp = 0;
-	for (int k=0; k<nranks; k++) {
-	  scount = ((k+j)%nranks)*chunksize;
-	  if ((k+j)%nranks == 0)
-	    scount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	  if (k == rank)
-	    break;
-	  sdisp += scount;
-	}
-	TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
-	rdisp += rcount;
+        size_t sdisp = 0;
+        for (int kk=0; kk<nranks; kk++) {
+          scount = ((kk+j)%nranks)*chunksize;
+          if ((kk+j)%nranks == 0)
+            scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+          if (kk == rank)
+            break;
+          sdisp += scount;
+        }
+        TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
+        rdisp += rcount;
       }
       k++;
     }
diff --git a/src/common.cu b/src/common.cu
index 5f8d7f58fe..4f80115869 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -165,18 +165,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
-  CUDACHECK(cudaDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
+  HIPCHECK(hipDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
   return testSuccess;
 }
 
@@ -271,7 +271,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->expectedBytes/wordSize(type);
 
   int64_t *wrongPerGpu = nullptr;
-  CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
+  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
   
   for (int i=0; i<args->nGpus*args->nRanks; i++) {
     int device;
@@ -352,7 +352,7 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
      }
      double delta = tim.elapsed();
      if (delta > timeout && timeout > 0) {
-       for (int i=0; i<ngpus; i++)
+       for (int i=0; i<nStreams; i++)
          NCCLCHECK(ncclCommAbort(comms[i]));
        char hostname[1024];
        getHostName(hostname, 1024);
@@ -387,7 +387,6 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int hipDev;
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
     HIPCHECK(hipSetDevice(hipDev));
-    //CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -417,7 +416,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
       #if defined(RCCL_BFLOAT16)
-      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
+      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
       #endif
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -452,7 +451,7 @@ testResult_t completeColl(struct threadArgs* args) {
   return testSuccess;
 }
 
-//EDGAR: Revisit because of cudaGraphLaunches
+//RCCL: Revisit because of cudaGraphLaunches
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
@@ -648,7 +647,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
     // Benchmark
     for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
         setupArgs(size, type, args);
-        print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+	char rootName[100];
+	sprintf(rootName, "%6i", root);	
+	PRINT("%12li  %12li  %8s  %6s  %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
         TESTCHECK(BenchTime(args, type, op, root, 0));
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
@@ -661,10 +662,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  if (enable_multiranks)
-    gpuid = gpuid % numDevices;
-  HIPCHECK(hipSetDevice(gpuid));
+  HIPCHECK(hipSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -679,11 +677,7 @@ testResult_t threadInit(struct threadArgs* args) {
 
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (enable_multiranks)
-      gpuid = gpuid % numDevices;
-    HIPCHECK(hipSetDevice(gpuid));
-    //CUDACHECK(cudaSetDevice(args->gpus[i]));
+    HIPCHECK(hipSetDevice(args->gpus[i]));
 
     for (int j=0; j<args->nRanks; j++) {
       int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
@@ -715,7 +709,7 @@ testResult_t threadLaunch(struct testThread* thread) {
   return testSuccess;
 }
 
-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
   if (memorytype == ncclFine) {
     HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
     HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
@@ -807,12 +801,10 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    // EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
-    // also not sure about G vs. hG (we had G, they have hG)
 #ifdef RCCL_MULTIRANKPERGPU    
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
 #endif
 
     if (c == -1)
@@ -878,7 +870,7 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
-      case 'y':
+      case 'Y':
         memorytype = ncclstringtomtype(optarg);
         break;
       case 's':
@@ -946,7 +938,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
-            "[-y,--memory_type <coarse/fine/host/managed>] \n\t"
+            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
             "[-s,--stress_cycles <number of cycles>] \n\t"
             "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-y,--stream_null <0/1>] \n\t"
@@ -1084,15 +1076,15 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
 #endif
-<<<<<<< HEAD
-  int gpus[nGpus*nThreads*ranksPerGpu];
+
+  int gpus[nGpus*nThreads];
   hipStream_t streams[nGpus*nThreads*ranksPerGpu];
   void* sendbuffs[nGpus*nThreads*ranksPerGpu];
   void* recvbuffs[nGpus*nThreads*ranksPerGpu];
   void* expected[nGpus*nThreads*ranksPerGpu];
   size_t sendBytes, recvBytes;
 
-  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu);
 
   envstr = getenv("NCCL_TESTS_DEVICE");
   gpu0 = envstr ? atoi(envstr) : -1;
@@ -1101,53 +1093,44 @@ testResult_t run() {
     if (enable_multiranks)
       gpuid = gpuid % numDevices;
 
+    gpus[ii] = gpu0 != -1 ? gpu0+ii : gpuid;
+    HIPCHECK(hipSetDevice(gpus[ii]));
+
     for (int j=0; j<ranksPerGpu; j++) {
       int i = ii*ranksPerGpu+j;
-      gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
-      HIPCHECK(hipSetDevice(gpus[i]));
-
-      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
-      //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
-    if (streamnull)
-      streams[i] = NULL;
-    else {
-      if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
-	PRINT("cumask: ");
-	for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
-	PRINT("\n");
-	HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
-      } else
-	HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
+      if (streamnull)
+	streams[i] = NULL;
+      else {
+	if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
+	  PRINT("cumask: ");
+	  for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+	  PRINT("\n");
+	  HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+	} else
+	  HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      }
     }
-#if 0 //EDGAR
-    // initialize data buffer to avoid all zero data
-      TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
-    }
-    HIPCHECK(hipDeviceSynchronize());
-#endif //EDGAR
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
   if (!parallel_init) {
-     if (nProcs == 1 && !enable_multiranks) {
+     if (ncclProcs == 1 && !enable_multiranks) {
        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int ii=0; ii<nGpus*nThreads; ii++) {
-	 int gpuid = localRank*nThreads*nGpus+ii;
-         if (enable_multiranks) {
-	   gpuid = gpuid % numDevices;
-	 }
-         HIPCHECK(hipSetDevice(gpuid));
+         HIPCHECK(hipSetDevice(gpus[ii]));
 	 if (!enable_multiranks) {
-	   NCCLCHECK(ncclCommInitRank(comms+ii, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
+	   NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
 	 }
 #ifdef RCCL_MULTIRANKPERGPU
 	 else
 	   for (int j=0; j<ranksPerGpu; j++) {
 	     int i = ii*ranksPerGpu+j;
-	     NCCLCHECK(ncclCommInitRankMulti(comms+i, nProcs*nThreads*nGpus*ranksPerGpu, ncclId, proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
+	     NCCLCHECK(ncclCommInitRankMulti(comms+i, ncclProcs*nThreads*nGpus*ranksPerGpu, ncclId,
+					     proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
 	   }
 #endif
        }
@@ -1182,6 +1165,8 @@ testResult_t run() {
     threads[t].args.stepbytes=stepBytes;
     threads[t].args.stepfactor=stepFactor;
     threads[t].args.localRank = localRank;
+
+    threads[t].args.totalProcs = totalProcs;
     threads[t].args.localNumDevices = numDevices;
     threads[t].args.enable_multiranks = enable_multiranks;
     threads[t].args.nRanks = ranksPerGpu;
@@ -1190,7 +1175,7 @@ testResult_t run() {
     threads[t].args.nThreads=nThreads;
     threads[t].args.thread=t;
     threads[t].args.nGpus=nGpus;
-    threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
+    threads[t].args.gpus=gpus+t*nGpus;
     threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
     threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
     threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
diff --git a/src/common.h b/src/common.h
index 1d687528b3..cb3bd3f3c2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -7,7 +7,7 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__
 
-#include "rccl.h"
+#include "rccl/rccl.h"
 #include <stdio.h>
 #include <cstdint>
 #include <algorithm>
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
-#define HIPCHECK(cmd) do {                         \
-  hipError_t e = cmd;                              \
-  if( e != hipSuccess ) {                          \
+#define HIPCHECK(cmd) do {                          \
+  hipError_t e = cmd;                               \
+  if( e != hipSuccess ) {                           \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",    \
+    printf("%s: Test HIP failure %s:%d '%s'\n",     \
          hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));   \
+        __FILE__,__LINE__,hipGetErrorString(e));    \
     return testCudaError;                           \
   }                                                 \
 } while(0)
diff --git a/verifiable/Makefile b/verifiable/Makefile
index b141a2a7c5..182d44e727 100644
--- a/verifiable/Makefile
+++ b/verifiable/Makefile
@@ -1,13 +1,62 @@
-include ../../makefiles/common.mk
+#
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+#include ../../makefiles/common.mk
 
 .PHONY: all clean
 
 BUILDDIR := $(abspath ../../build)
-NCCLDIR := $(BUILDDIR)
-NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
 DST_DIR := $(BUILDDIR)/test/verifiable
 
-all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+ROCM_PATH ?= /opt/rocm
+MPI_HOME ?= /usr/lib/openmpi
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+NCCL_HOME ?= ""
+
+HIPCC = $(ROCM_PATH)/bin/hipcc
+CXX = $(HIPCC)
+
+HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+endif
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+
+ifeq ($(DEBUG), 0)
+HIPCUFLAGS += -O3
+else
+HIPCUFLAGS += -O0 -g -ggdb3
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+ifeq ($(MPI), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+else ifeq ($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L/usr/lib -lmpich
+endif
+
+LIBRARIES += rccl
+HIPLDFLAGS   += $(LIBRARIES:%=-l%)
+
+all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test 
 
 clean:
 	rm -rf $(DST_DIR)
@@ -21,4 +70,4 @@ self_test: $(DST_DIR)/self_test
 $(DST_DIR)/self_test: verifiable.cu verifiable.h
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu
index d7bd545f62..973b965412 100644
--- a/verifiable/inexact_regress.cu
+++ b/verifiable/inexact_regress.cu
@@ -1,3 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 /* Generate parameters for our error bound model of floating point average
  * (sum of scaled values) by sampling sums of random sequences for each
  * floating point type.
@@ -16,12 +23,12 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdint>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
 
 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = __nv_bfloat16;
+using bfloat16 = hip_bfloat16;
 
 template<typename T>
 struct float_traits;
@@ -49,26 +56,26 @@ struct float_traits<double> {
   __device__ static double mul(double a, double b) { return a*b; }
 };
 template<>
-struct float_traits<half> {
+struct float_traits<__half> {
   static constexpr int mantissa_bits = 10;
   static constexpr int exponent_bits = 5;
   using uint_t = uint16_t;
-  __device__ static half make(double x) { return __double2half(x); }
-  __device__ static half make(uint64_t x) { return __int2half_rn(x); }
-  __device__ static double todouble(half x) { return __half2float(x); }
-  __device__ static half add(half a, half b) { return __hadd(a, b); }
-  __device__ static half mul(half a, half b) { return __hmul(a, b); }
+  __device__ static __half make(double x) { return __float2half((float)x); }
+  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
+  __device__ static double todouble(__half x) { return __half2float(x); }
+  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
+  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
 };
 template<>
 struct float_traits<bfloat16> {
   static constexpr int mantissa_bits = 7;
   static constexpr int exponent_bits = 8;
   using uint_t = uint16_t;
-  __device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
-  __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
-  __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
-  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
-  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
+  __device__ static bfloat16 make(double x) { return bfloat16(x); }
+  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
+  __device__ static double todouble(bfloat16 x) { return double(x); }
+  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
+  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
 };
 
 template<typename F>
@@ -104,6 +111,17 @@ struct xoshiro256ss {
   }
 };
 
+static __device__ int __reduce_max_sync(unsigned int mask, int value)
+{
+  // HIP stand-in for CUDA's __reduce_max_sync. `mask` is ignored because the
+  // test code below always passes all bits set. An XOR butterfly is used so
+  // every lane (not only lane 0) receives the maximum, like the intrinsic.
+  // NOTE(review): assumes all warpSize lanes are active -- confirm on wave64.
+  for (int offset = warpSize / 2; offset > 0; offset >>= 1)
+    value = max(__shfl_xor(value, offset, warpSize), value);
+  return value;
+}
+
 template<typename F>
 __global__ void kernel() {
   using traits = float_traits<F>;
@@ -123,7 +141,7 @@ __global__ void kernel() {
     for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
     //for(int round=0; round < 2; round++) {
       for(int i=threadIdx.x; i < samps; i += blockDim.x) {
-        accf[i] = 0;
+        accf[i] = (F)0;
         accd[i] = 0;
       }
       __syncthreads();
@@ -157,21 +175,21 @@ __global__ void kernel() {
     if(pass==0)
       expo_avg = expo_sum/expo_n;
     else if(threadIdx.x == 0)
-      std::printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
+      printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
   }
 }
 
 int main() {
   std::printf("type=float:\n");
   kernel<float><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
 
   std::printf("\ntype=half:\n");
   kernel<half><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
 
   std::printf("\ntype=bfloat16:\n");
   kernel<bfloat16><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
   return 0;
 }
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 5f617ee188..9d8e56aba9 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -1,15 +1,23 @@
-#pragma nv_diag_suppress declared_but_not_referenced
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+//#pragma nv_diag_suppress declared_but_not_referenced
 
 #include "verifiable.h"
-#include <nccl.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
 
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#if CUDART_VERSION >= 11000
-#include <cuda_bf16.h>
-#endif
+#include "rccl/rccl.h"
 
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
+
+#define RCCL_BFLOAT16 1  // name must match the RCCL_BFLOAT16 guards used throughout this file
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
   #define HAVE_ncclBfloat16 1
 #else
   #define HAVE_ncclBfloat16 0
@@ -83,10 +91,10 @@ namespace {
 template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
-struct IsIntegral<half>: std::false_type {};
-#ifdef __CUDA_BF16_TYPES_EXIST__
+struct IsIntegral<__half>: std::false_type {};
+#if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<__nv_bfloat16>: std::false_type {};
+struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
 }
 
@@ -116,13 +124,13 @@ namespace {
     return Y(x);
   }
   template<>
-  __host__ __device__ half castTo<half>(float x) {
+  __host__ __device__ half castTo<__half>(float x) {
     return __float2half(x);
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if RCCL_BFLOAT16 == 1
   template<>
-  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
-    return __float2bfloat16(x);
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+    return hip_bfloat16(x);
   }
   #endif
 }
@@ -144,20 +152,12 @@ struct ReduceSum {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()+T())>
   __host__ __device__ T operator()(T a, T b) const { return a + b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hadd(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __float2half(__half2float(a) + __half2float(b));
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hadd(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
   template<typename T>
@@ -168,20 +168,12 @@ struct ReduceProd {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()*T())>
   __host__ __device__ T operator()(T a, T b) const { return a * b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hmul(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __float2half(__half2float(a) * __half2float(b));
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmul(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
   template<typename T>
@@ -192,24 +184,12 @@ struct ReduceMin {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()<T())>
   __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hlt(a, b) ? a : b;
-    #else
-      return __half2float(a) < __half2float(b) ? a : b;
-    #endif
+  __host__ __device__ __half operator()(__half a, __half b) const {
+    return __half2float(a) < __half2float(b) ? a : b;
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hlt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) < __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
   template<typename T>
@@ -220,24 +200,12 @@ struct ReduceMax {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()>T())>
   __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hgt(a, b) ? a : b;
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __half2float(a) > __half2float(b) ? a : b;
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hgt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) > __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
   template<typename T>
@@ -309,13 +277,13 @@ struct FloatLayout<double> {
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
-struct FloatLayout<half> {
+struct FloatLayout<__half> {
   static constexpr int exponent_bits = 5, mantissa_bits = 10;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<__nv_bfloat16> {
+struct FloatLayout<hip_bfloat16> {
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -340,14 +308,14 @@ namespace {
 // from unbounded random values. For instance, given X a totally random 32-bit
 // integer, `umul32hi(X,n)` will be totally random within [0,n).
 __host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __umulhi(a, b);
 #else
   return uint64_t(a)*b >> 32;
 #endif
 }
 __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __umul64hi(a, b);
 #else
   return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
@@ -355,14 +323,14 @@ __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
 }
 
 __host__ __device__ int clz32(int x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __clz(x);
 #else
   return x==0 ? 32 : __builtin_clz(x);
 #endif
 }
 __host__ __device__ int clz64(long long x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __clzll(x);
 #else
   return x==0 ? 64 : __builtin_clzll(x);
@@ -747,8 +715,9 @@ __host__ __device__ void genOutput(
   ) {
   ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
   using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  ans = ReduceProd()(ans, T1(1)/T1(rank_n));
-}
+  //ans = ReduceProd()(ans, T1(1)/T1(rank_n));
+  ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
+ }
 }
 
 /////////////////////////////////////////////////////////////////////////////////
@@ -835,7 +804,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -846,9 +815,9 @@ void prepareInput1(
   case ncclUint32: CASE_TY(uint32_t)
   case ncclInt64: CASE_TY(int64_t)
   case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -860,7 +829,7 @@ void prepareInput1(
 
 void ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -911,7 +880,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -922,9 +891,9 @@ void prepareExpected1(
   case ncclUint32: CASE_TY(uint32_t)
   case ncclInt64: CASE_TY(int64_t)
   case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -936,7 +905,7 @@ void prepareExpected1(
 
 void ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1044,7 +1013,8 @@ __global__ void verifyPrepared(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
 }
 
 template<typename T, typename Uint, typename ReduceFn>
@@ -1077,13 +1047,14 @@ __global__ void verifyInline2(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
 }
 
 template<typename T, typename Uint>
 void verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1112,7 +1083,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, cudaStream_t stream
+    int64_t *bad_elt_n, hipStream_t stream
   ) {
   bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
   #if HAVE_ncclBfloat16
@@ -1142,9 +1113,9 @@ void ncclVerifiableVerify(
   case ncclUint32: CASE_TY(uint32_t, uint32_t)
   case ncclInt64: CASE_TY(int64_t, uint64_t)
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
-  case ncclFloat16: CASE_TY(half, uint16_t)
+  case ncclFloat16: CASE_TY(__half, uint16_t)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1180,7 +1151,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
     }
     sum = op.postOp(sum);
     if(tolerance < calcDelta(sum, y)) {
-      std::printf(
+      printf(
         //"%10g != %10g  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
         "%llx != %llx  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
         *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
@@ -1209,9 +1180,9 @@ __global__ void sweep() {
   sweep1<uint32_t>(ncclUint32, "uint32");
   sweep1<int64_t>(ncclInt64, "int64");
   sweep1<uint64_t>(ncclUint64, "uint64");
-  sweep1<half>(ncclFloat16, "half");
+  sweep1<__half>(ncclFloat16, "half");
   #if HAVE_ncclBfloat16
-    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
@@ -1219,9 +1190,9 @@ __global__ void sweep() {
 
 int main(int arg_n, char **args) {
   std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
+  hipSetDevice(0);
   sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
   return 0;
 }
 #endif
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index aca0565a6b..b41ef1ad12 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -1,7 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e
 
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 
 #include <stdint.h>
 
@@ -36,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );
 
 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -54,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, cudaStream_t stream
+  int64_t *bad_elt_n, hipStream_t stream
 );
 #endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index 225c32a3c3..fba1fbf35c 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -1,3 +1,9 @@
+# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+
+
 # We requires both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
@@ -8,4 +14,5 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu

From 9a89c300b6c2c04c1720d4096f6fe55e17addbb1 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Wed, 12 Oct 2022 17:28:04 -0700
Subject: [PATCH 115/233] Allow more precise measurements of single operation
 (#20)

---
 src/common.cu | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4f80115869..54f62c515f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -93,6 +93,7 @@ static int average = 1;
 static int numDevices = 1;
 static int ranksPerGpu = 1;
 static int enable_multiranks = 0;
+static int delay_inout_place = 0;
 
 #define NUM_BLOCKS 32
 
@@ -459,9 +460,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
   }
 
-  // Sync
-  TESTCHECK(startColl(args, type, op, root, in_place, 0));
-  TESTCHECK(completeColl(args));
+  if (warmup_iters) {
+    // Sync
+    TESTCHECK(startColl(args, type, op, root, in_place, 0));
+    TESTCHECK(completeColl(args));
+  }
 
   Barrier(args);
 
@@ -651,6 +654,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
 	sprintf(rootName, "%6i", root);	
 	PRINT("%12li  %12li  %8s  %6s  %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
         TESTCHECK(BenchTime(args, type, op, root, 0));
+        usleep(delay_inout_place);
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
     }
@@ -802,9 +806,9 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 #ifdef RCCL_MULTIRANKPERGPU    
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex);
 #endif
 
     if (c == -1)
@@ -913,6 +917,9 @@ int main(int argc, char* argv[]) {
         ranksPerGpu = (int)strtol(optarg, NULL, 0);
         break;
 #endif
+      case 'q':
+        delay_inout_place = (int)strtol(optarg, NULL, 10);
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -950,6 +957,7 @@ int main(int argc, char* argv[]) {
             "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
             "[-R,--ranks_per_gpu] \n\t"
 #endif
+            "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -1112,7 +1120,6 @@ testResult_t run() {
       }
     }
   }
-
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
   if (!parallel_init) {

From 8a754f15adcb23a89f0e1ba57a731c6f1997bf41 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Tue, 25 Oct 2022 16:31:57 +0000
Subject: [PATCH 116/233] fix a missing endif statement

error introduced with the web merge-resolution tool :-(
---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index cc8fc5b5f3..ce7f193ae1 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -810,7 +810,7 @@ int main(int argc, char* argv[]) {
     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex);
 #else
     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex);
-
+#endif
 
     if (c == -1)
       break;

From 9c9746739ae9ca57791607ace03ed187fb435d33 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 31 Oct 2022 19:01:22 +0000
Subject: [PATCH 117/233] add the rccl/lib directory to the link path

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index dd01c484f9..ea925ad3fc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -21,7 +21,7 @@ HIPLDFLAGS :=
 
 ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
-HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/hip

From 377b28e5fbd41962a8e10012f3e74ee39e3c9cdc Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 31 Oct 2022 21:39:34 +0000
Subject: [PATCH 118/233] make cmake stage also pass in CI

the subdir entry is not actually required for the compilation.
---
 CMakeLists.txt     | 1 -
 src/CMakeLists.txt | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e22365515..539a1eae2b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,7 +51,6 @@ endif()
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
 
 # Add all of the tests
-add_subdirectory(verifiable)
 add_subdirectory(src)
 
 # Create ROCm standard packages
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b5a40aefc1..6511a419c9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,8 +3,8 @@
 # ########################################################################
 
 # Compile common object library
-set_property(SOURCE common.cu PROPERTY LANGUAGE CXX)
-add_library(rccl_common OBJECT common.cu)
+set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
+add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
 if(USE_MPI)
     target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX)
 else()

From 9d3a53dfa3af22bfdc86b1ed37867f3cdc2762b1 Mon Sep 17 00:00:00 2001
From: akolliasAMD <99202231+akolliasAMD@users.noreply.github.com>
Date: Tue, 1 Nov 2022 11:34:55 -0600
Subject: [PATCH 119/233] added std::max to avoid buffer overflow in printing
 (#25)

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index ce7f193ae1..bdac4b6b11 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -652,7 +652,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
         setupArgs(size, type, args);
 	char rootName[100];
 	sprintf(rootName, "%6i", root);	
-	PRINT("%12li  %12li  %8s  %6s  %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+	PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
         TESTCHECK(BenchTime(args, type, op, root, 0));
         usleep(delay_inout_place);
         TESTCHECK(BenchTime(args, type, op, root, 1));

From 3bd2bd292bd3b249892b63f6342d1ca559c37391 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 22 Nov 2022 11:16:47 -0800
Subject: [PATCH 120/233] Add fflush(stdout) before perf output

---
 src/common.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 5837ed1bcd..41d747905b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -969,6 +969,8 @@ testResult_t run() {
     errors[t] = bw_count[t] = 0;
   }
 
+  fflush(stdout);
+
   const char* timeStr = report_cputime ? "cputime" : "time";
   PRINT("#\n");
   PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");

From 24fcf64ed19bb178aa867b14c1d7f13493656e74 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 22 Nov 2022 11:18:37 -0800
Subject: [PATCH 121/233] Call cudaFreeHost() on wrongPerGpu not cudaFree()

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 41d747905b..48a629ce10 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -262,7 +262,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   *wrongElts = 0;
   for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
-  cudaFree(wrongPerGpu);
+  cudaFreeHost(wrongPerGpu);
 
   if (args->reportErrors && *wrongElts) args->errors[0]++;
   return testSuccess;

From e9f5be184c23c730258ff114119fd6c52cf31be6 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Wed, 30 Nov 2022 23:01:46 +0000
Subject: [PATCH 122/233] fix algorithm assigning values in testsuite

avoid a division by zero which seems to only occur for op=prod and
datatype=half, since the maximum exponent is small (15) and can exceed
the number of ranks.
---
 verifiable/verifiable.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 9d8e56aba9..a375809bcf 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -14,9 +14,6 @@
 
 #include "rccl/rccl.h"
 
-
-#define RCCL_BFLOAT 1
-
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
   #define HAVE_ncclBfloat16 1
 #else
@@ -124,7 +121,7 @@ namespace {
     return Y(x);
   }
   template<>
-  __host__ __device__ half castTo<__half>(float x) {
+  __host__ __device__ __half castTo<__half>(float x) {
     return __float2half(x);
   }
   #if RCCL_BFLOAT16 == 1
@@ -425,7 +422,7 @@ __host__ __device__ void genSumXY(
   // Let s be the number of ranks per partition. This is either rn/pn as we
   // intended, or y/p_sum if that's smaller to prevent overshooting our target y.
   uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
-  x = r/s < pn ? 1 + r/s : 0; //  First s*pn ranks contribute partition index +1.
+  x = (s != 0 && r/s < pn) ? 1 + r/s : 0; //  First s*pn ranks contribute partition index +1.
   x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
 }
 }

From 0aeba157db77cc9e99186639bf71368b74fb90e2 Mon Sep 17 00:00:00 2001
From: Jithin Jose <jijos@microsoft.com>
Date: Fri, 18 Dec 2020 10:12:54 -0800
Subject: [PATCH 123/233] Use DJB2a hash algorithm in getHostHash()

---
 src/common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.h b/src/common.h
index b69d071606..6fc6bfdd69 100644
--- a/src/common.h
+++ b/src/common.h
@@ -174,10 +174,10 @@ static void getHostName(char* hostname, int maxlen) {
 #include <stdint.h>
 
 static uint64_t getHostHash(const char* string) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
   for (int c = 0; string[c] != '\0'; c++){
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }

From 0b4c4cb99fb1381edec1f78c37230688ea1ceb26 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 4 Jan 2021 11:37:32 -0800
Subject: [PATCH 124/233] Add boot_id to the hostname hash due to collisions on
 Azure

Fixes #60
---
 src/common.h | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/common.h b/src/common.h
index 6fc6bfdd69..20fa4612db 100644
--- a/src/common.h
+++ b/src/common.h
@@ -173,15 +173,46 @@ static void getHostName(char* hostname, int maxlen) {
 
 #include <stdint.h>
 
-static uint64_t getHostHash(const char* string) {
+static uint64_t getHash(const char* string, size_t n) {
   // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
-  for (int c = 0; string[c] != '\0'; c++){
+  for (size_t c = 0; c < n; c++) {
     result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }
 
+/* Return a hash of the unique identifying string for this host, chosen
+ * so that it is unique for both bare-metal and container instances.
+ * The hashed string is the equivalent of:
+ *
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ * If boot_id cannot be opened or read, only the hostname is hashed. */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
+static uint64_t getHostHash(const char* hostname) {
+  char hostHash[1024];
+
+  // Fall back is the hostname if something fails
+  (void) strncpy(hostHash, hostname, sizeof(hostHash));
+  int offset = strlen(hostHash);
+
+  FILE *file = fopen(HOSTID_FILE, "r");
+  if (file != NULL) {
+    char *p;
+    if (fscanf(file, "%ms", &p) == 1) {
+        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+        free(p);
+    }
+    fclose(file);  // close only a stream that was opened; fclose(NULL) is undefined
+  }
+
+  // Make sure the string is terminated
+  hostHash[sizeof(hostHash)-1]='\0';
+
+  return getHash(hostHash, strlen(hostHash));
+}
+
 static size_t wordSize(ncclDataType_t type) {
   switch(type) {
     case ncclChar:

From 2cbb968101e2bfc7d3a7f0f1826c0189355de6fe Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 3 Jan 2023 08:47:43 +0100
Subject: [PATCH 125/233] Update README.md

Improve MPI example to avoid confusion of number of processes / total number of GPUs.

https://github.com/NVIDIA/nccl-tests/issues/54#issuecomment-1212023369
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bff6433b89..12be254542 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
-Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
+Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
 ```shell
-$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
+$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
 ```
 
 ### Performance

From 2b2f23f42d7d0cb18594de1314f613da809a56b2 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Tue, 14 Feb 2023 22:31:54 +0000
Subject: [PATCH 126/233] auto-detect and enable MPI

---
 CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++--------
 README.md      | 16 ++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 539a1eae2b..f440060946 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,33 @@
 # ########################################################################
 # Copyright 2022 Advanced Micro Devices, Inc.
 # ########################################################################
+macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so)
+    find_program(MPI_MPICXX ${mpi_compiler})
+    if (MPI_MPICXX)
+        message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
+        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
+            get_filename_component(mpi.tmpdir ${MPI_MPICXX} DIRECTORY)
+            get_filename_component(mpi_base_dir ${mpi.tmpdir} DIRECTORY)
+        else()
+            cmake_path(GET MPI_MPICXX PARENT_PATH mpi.tmpdir)
+            cmake_path(GET mpi.tmpdir PARENT_PATH mpi_base_dir)
+        endif()
+        find_file(MPI_H mpi.h PATHS ${mpi_base_dir} PATH_SUFFIXES include include/x86_64-linux-gnu ${ARGN} {REQUIRED)
+        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
+            get_filename_component(mpi_inc_dir ${MPI_H} DIRECTORY)
+        else()
+            cmake_path(GET MPI_H PARENT_PATH mpi_inc_dir)
+        endif()
+        message ("-- mpi.h is in ${mpi_inc_dir}")
+        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu REQIRED)
+        message ("-- libmpi is ${MPI_LIB}")
+        add_definitions(-DMPI_SUPPORT)
+        include_directories(${mpi_inc_dir})
+        link_libraries(${MPI_LIB})
+    else()
+        message ("-- ${mpi_compiler} not found")
+    endif()
+endmacro()
 
 cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
 
@@ -30,8 +57,7 @@ include(ROCMCheckTargetIds)
 include(ROCMClients)
 
 # Build variables
-option(USE_MPI "Build RCCL-tests with MPI support. Requires the MPI path to be set.")
-set(MPI_PATH "" CACHE PATH "Path to MPI installation")
+option(NO_MPI "Build RCCL-tests without MPI support.")
 ## Get default GPU targets using rocm_check_target_ids
 rocm_check_target_ids(
     DEFAULT_AMDGPU_TARGETS
@@ -39,13 +65,21 @@ rocm_check_target_ids(
 )
 set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
 
-# Find the MPI package if we're using MPI
-if (USE_MPI)
-    if(NOT MPI_PATH STREQUAL "")
-        set(MPI_HOME "${MPI_PATH}")
+if (NOT NO_MPI)
+    # Check for MPICH first
+    check_mpi(mpicxx.mpich libmpich.a libmpich.so include/x86_64-linux-gnu/mpich)
+
+    # Check for MPI in general. If we find mpicxx, we don't know whether its
+    # MPICH or another MPI implementation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so)
     endif()
-    find_package(MPI REQUIRED MODULE)
-    add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT)
+
+    if (NOT MPI_MPICXX)
+        message ("-- no MPI library found")
+    endif()
+else()
+    message ("-- MPI support explicitely disabled")
 endif()
 
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
diff --git a/README.md b/README.md
index c2847232e6..0a88c5d384 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,22 @@ RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If y
 $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
 ```
 
+RCCL tests can also be built using cmake. A typical sequence will be:
+
+```shell
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
+$ make
+```
+
+When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
+for cmake target and config files that are created during the RCCL build.
+
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitley request
+MPI builds. A user can explicitely disable MPI builds by adding the -DNO_MPI=1 flag to the cmake command line.
+
+
 ## Usage
 
 RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).

From 5275aa57159eaed269c63020a8b12defff875cb0 Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh <pmohamma@amd.com>
Date: Fri, 24 Feb 2023 21:39:04 -0500
Subject: [PATCH 127/233] Adding -pthread flag for linking issues into
 src/Makefile (#30)

* Adding -pthread flag for linking issues into src/Makefile

* Adding -pthread flag for linking issues into CMakeLists.txt
---
 CMakeLists.txt | 2 ++
 src/Makefile   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f440060946..bae921038b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 # ########################################################################
 # Copyright 2022 Advanced Micro Devices, Inc.
 # ########################################################################
+#Adding pthread flag for linking
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so)
     find_program(MPI_MPICXX ${mpi_compiler})
     if (MPI_MPICXX)
diff --git a/src/Makefile b/src/Makefile
index ea925ad3fc..500549d2da 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,7 +26,7 @@ endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3

From 255750b094265f89256102b3b68c57e72e3a0e45 Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh <pmohamma@amd.com>
Date: Thu, 2 Mar 2023 11:05:25 -0500
Subject: [PATCH 128/233] Adding -pthread flag for linking issues into
 CMakeLists.txt and src/Makefile (#31)

---
 CMakeLists.txt | 2 ++
 src/Makefile   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 539a1eae2b..d950565e2f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 # ########################################################################
 # Copyright 2022 Advanced Micro Devices, Inc.
 # ########################################################################
+#Adding pthread flag for linking
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 
 cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
 
diff --git a/src/Makefile b/src/Makefile
index 3dbd41ff9a..f01e7b3850 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -27,7 +27,7 @@ HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
 HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3

From bdf58b1656785fa10112a1fc6be7c2d164f77507 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Thu, 2 Mar 2023 18:22:03 +0000
Subject: [PATCH 129/233] revamp cmake MPI detection

we honor user requested MPI installations using MPI_PATH first,
and check afterwards for MPICH and Open MPI in the default
Ubuntu and RHEL installation directories.
---
 CMakeLists.txt | 69 +++++++++++++++++++++++++++++++-------------------
 README.md      |  5 ++--
 2 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bae921038b..5577fb80fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,29 +3,23 @@
 # ########################################################################
 #Adding pthread flag for linking
 set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
-macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so)
-    find_program(MPI_MPICXX ${mpi_compiler})
+macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
+    find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
     if (MPI_MPICXX)
         message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
-        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
-            get_filename_component(mpi.tmpdir ${MPI_MPICXX} DIRECTORY)
-            get_filename_component(mpi_base_dir ${mpi.tmpdir} DIRECTORY)
-        else()
-            cmake_path(GET MPI_MPICXX PARENT_PATH mpi.tmpdir)
-            cmake_path(GET mpi.tmpdir PARENT_PATH mpi_base_dir)
-        endif()
-        find_file(MPI_H mpi.h PATHS ${mpi_base_dir} PATH_SUFFIXES include include/x86_64-linux-gnu ${ARGN} {REQUIRED)
-        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
-            get_filename_component(mpi_inc_dir ${MPI_H} DIRECTORY)
-        else()
-            cmake_path(GET MPI_H PARENT_PATH mpi_inc_dir)
-        endif()
-        message ("-- mpi.h is in ${mpi_inc_dir}")
-        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu REQIRED)
+        find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
+        message ("-- mpi.h is in ${MPI_H}")
+        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
         message ("-- libmpi is ${MPI_LIB}")
-        add_definitions(-DMPI_SUPPORT)
-        include_directories(${mpi_inc_dir})
-        link_libraries(${MPI_LIB})
+	if (NOT MPI_H OR NOT MPI_LIB)
+	    set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
+	    set (MPI_H "MPI_H-NOTFOUND")
+	    set (MPI_LIB "MPI_LIB-NOTFOUND")
+	else()
+            add_definitions(-DMPI_SUPPORT)
+            include_directories(${mpi_inc_dir})
+            link_libraries(${MPI_LIB})
+	endif()
     else()
         message ("-- ${mpi_compiler} not found")
     endif()
@@ -60,6 +54,7 @@ include(ROCMClients)
 
 # Build variables
 option(NO_MPI "Build RCCL-tests without MPI support.")
+option(MPI_PATH "Use MPI in the specified directory.")
 ## Get default GPU targets using rocm_check_target_ids
 rocm_check_target_ids(
     DEFAULT_AMDGPU_TARGETS
@@ -68,13 +63,35 @@ rocm_check_target_ids(
 set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
 
 if (NOT NO_MPI)
-    # Check for MPICH first
-    check_mpi(mpicxx.mpich libmpich.a libmpich.so include/x86_64-linux-gnu/mpich)
+    # CHECK for MPI Path first. User requested this directory explicitely
+    if (MPI_PATH)
+        set(mpi_spec_bin_dir "${MPI_PATH}/bin")
+	set(mpi_spec_inc_dir "${MPI_PATH}/include")
+        check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
+	if (NOT MPI_MPICXX)
+            # Since the user explicitely requested this directory, abort if something went wrong.
+	    MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
+        endif()
+    endif()
 
-    # Check for MPI in general. If we find mpicxx, we don't know whether its
-    # MPICH or another MPI implementation
+    # Check for MPICH Ubuntu installation
     if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so)
+        check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
+    endif()
+
+    # Check for Open MPI Ubuntu installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin  /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
+    endif()
+
+    # Check for MPICH RHEL installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
+    endif()
+
+    # Check for Open MPI RHEL installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x64_64)
     endif()
 
     if (NOT MPI_MPICXX)
@@ -91,7 +108,7 @@ add_subdirectory(src)
 
 # Create ROCm standard packages
 rocm_create_package(
-    NAME rccl-separate-tests
+    NAME rccl-tests
     DESCRIPTION "Tests for the ROCm Communication Collectives Library"
     MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
 )
diff --git a/README.md b/README.md
index 0a88c5d384..74f15515b4 100644
--- a/README.md
+++ b/README.md
@@ -30,8 +30,9 @@ $ make
 When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
 for cmake target and config files that are created during the RCCL build.
 
-Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitley request
-MPI builds. A user can explicitely disable MPI builds by adding the -DNO_MPI=1 flag to the cmake command line.
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitly request
+MPI builds. A user can request to use a particular MPI library by using the MPI_PATH variable. MPI support can be explicitely disabled by adding the -DNO_MPI=1
+flag to the cmake command line.
 
 
 ## Usage

From 17d0a42d5a4328e0e0e0d68440d8821224826d2f Mon Sep 17 00:00:00 2001
From: Felix Abecassis <fabecassis@nvidia.com>
Date: Thu, 23 Mar 2023 09:05:41 -0700
Subject: [PATCH 130/233] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 12be254542..580996b28d 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ All tests support the same set of arguments :
   * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
   * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
   * Increments can be either fixed or a multiplication factor. Only one of those should be used
-    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
+    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
     * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
 * NCCL operations arguments
   * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.

From 7ccda3c97baf6924ff38411e364c0442096fc4be Mon Sep 17 00:00:00 2001
From: "alan.souza" <alan.geof.ba@gmail.com>
Date: Sat, 25 Mar 2023 16:56:16 -0300
Subject: [PATCH 131/233] fix handling of variable NVCC. Permit overriding the
 variable using environment variables

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 6ea07303b5..393de8e41b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -11,7 +11,7 @@ DEBUG ?= 0
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
-NVCC = $(CUDA_HOME)/bin/nvcc
+NVCC ?= $(CUDA_HOME)/bin/nvcc
 CUDARTLIB ?= cudart
 
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))

From e146460810a88b1a47b22308be63702485994fce Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh <pmohamma@amd.com>
Date: Mon, 3 Apr 2023 11:37:13 -0400
Subject: [PATCH 132/233] fixing the error message for mpirun when number of
 requested GPUs exceeds the limits (#33)

---
 src/common.cu | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 332cc3f272..eb0743a52f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1127,11 +1127,13 @@ int main(int argc, char* argv[]) {
   }
 
   HIPCHECK(hipGetDeviceCount(&numDevices));
+#ifndef MPI_SUPPORT  
   if (nGpus > numDevices)
   {
       fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
       return testNcclError;
   }
+#endif
   if (minBytes > maxBytes) {
     fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
            (unsigned long long)minBytes,
@@ -1154,7 +1156,14 @@ int main(int argc, char* argv[]) {
     return -1;
   }
 #ifdef MPI_SUPPORT
+  int nProcs = 1;
   MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
+  if (nGpus * nProcs > numDevices)
+  {
+      fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus*nProcs, numDevices);
+      return testNcclError;
+  }
 #endif
   TESTCHECK(run());
   return 0;

From e856fa720ff45d319225ef1ee4c3f7467e90e823 Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh <pmohamma@amd.com>
Date: Tue, 25 Apr 2023 13:44:43 -0400
Subject: [PATCH 133/233] Revert "fixing the error message for mpirun when
 number of requested GPUs exceeds the limits (#33)" (#36)

This reverts commit e146460810a88b1a47b22308be63702485994fce.
---
 src/common.cu | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index eb0743a52f..332cc3f272 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1127,13 +1127,11 @@ int main(int argc, char* argv[]) {
   }
 
   HIPCHECK(hipGetDeviceCount(&numDevices));
-#ifndef MPI_SUPPORT  
   if (nGpus > numDevices)
   {
       fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
       return testNcclError;
   }
-#endif
   if (minBytes > maxBytes) {
     fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
            (unsigned long long)minBytes,
@@ -1156,14 +1154,7 @@ int main(int argc, char* argv[]) {
     return -1;
   }
 #ifdef MPI_SUPPORT
-  int nProcs = 1;
   MPI_Init(&argc, &argv);
-  MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
-  if (nGpus * nProcs > numDevices)
-  {
-      fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus*nProcs, numDevices);
-      return testNcclError;
-  }
 #endif
   TESTCHECK(run());
   return 0;

From d16d1fb16b2abe1c1c88464097e6f1d8070d1116 Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh <pmohamma@amd.com>
Date: Thu, 27 Apr 2023 14:06:17 -0400
Subject: [PATCH 134/233] fixing the error message for mpirun when number of
 requested GPUs exceeds the limits (#37)

---
 src/common.cu | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 332cc3f272..7107d8b4e6 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1127,11 +1127,13 @@ int main(int argc, char* argv[]) {
   }
 
   HIPCHECK(hipGetDeviceCount(&numDevices));
+#ifndef MPI_SUPPORT
   if (nGpus > numDevices)
   {
       fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
       return testNcclError;
   }
+#endif
   if (minBytes > maxBytes) {
     fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
            (unsigned long long)minBytes,
@@ -1163,6 +1165,7 @@ int main(int argc, char* argv[]) {
 testResult_t run() {
   int nProcs = 1, proc = 0;
   int localRank = 0;
+  int localSize = 0;
   char hostname[1024];
   getHostName(hostname, 1024);
 
@@ -1176,6 +1179,14 @@ testResult_t run() {
     if (p == proc) break;
     if (hostHashs[p] == hostHashs[proc]) localRank++;
   }
+  for (int p=0; p<nProcs; p++) {
+    if (hostHashs[p] == hostHashs[proc]) localSize++;
+  }
+  if (nGpus * localSize > numDevices)
+  {
+      fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d) on node (%s)\n", nGpus*localSize, numDevices, hostname);
+      return testNcclError;
+  }
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 

From 52ea1b214802fc37ef4baa29eb19942dcbf0a187 Mon Sep 17 00:00:00 2001
From: yangxingwu <xingwu.yang@gmail.com>
Date: Tue, 6 Jun 2023 09:47:50 +0000
Subject: [PATCH 135/233] makefile: remove extra space

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 43729f897a..f652b78a99 100644
--- a/Makefile
+++ b/Makefile
@@ -7,9 +7,9 @@
 BUILDDIR ?= build
 override BUILDDIR := $(abspath $(BUILDDIR))
 
-.PHONY : all clean
+.PHONY: all clean
 
-default : src.build
+default: src.build
 
 TARGETS=src
 

From 652a24d38d1187437c62a132fe90d9315ec46b55 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 14 Jun 2023 20:26:33 +0000
Subject: [PATCH 136/233] Fix merge error

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index c71bf00049..d3fb87aea9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1027,7 +1027,7 @@ testResult_t run() {
   MPI_Comm_size(mpi_comm, &ncclProcs);
   MPI_Comm_rank(mpi_comm, &ncclProc);
 
-  for (int p=0; p<nProcs; p++) {
+  for (int p=0; p<totalProcs; p++) {
     if (hostHashs[p] == hostHashs[proc]) localSize++;
   }
   if (nGpus * localSize > numDevices)

From fcd0888d538a68b99ab902e665428a558a31698b Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Sun, 18 Jun 2023 18:07:29 -0700
Subject: [PATCH 137/233] Remove hardcoded number of GPUs limit for alltoallv
 (#41)

---
 src/alltoallv.cu | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index c5818d9ded..3f2204cd49 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -93,11 +93,16 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   NCCLCHECK(ncclCommCount(comm, &nranks));
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
-  #define MAX_ALLTOALLV_RANKS 256
-  static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
+
   if (count == 0) return testSuccess;
-  if (nranks > MAX_ALLTOALLV_RANKS) {
-    printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS);
+
+  size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
+  sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
+  recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
+  sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
+  rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
+  if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
+    printf("failed to allocate buffers for alltoallv\n");
     return testNcclError;
   }
 
@@ -107,10 +112,10 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
       size_t scount = ((i+rank)%nranks)*chunksize;
       if ((i+rank)%nranks == 0)
           scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
-      sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount;
-      sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp;
+      sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
+      sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
       disp += scount;
-      //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type), sdispls[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type));
+      //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
   }
 
 #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
@@ -118,23 +123,23 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   return testNcclError;
 #else
 #if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
-  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, recvbuff, recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, type, comm, stream));
+  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
 #else
   NCCLCHECK(ncclGroupStart());
   for (int r=0; r<nranks; r++) {
-    if (sendcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
+    if (sendcounts[r+rank*nranks] != 0) {
       NCCLCHECK(ncclSend(
-          ((char*)sendbuff) + sdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
-          sendcounts[r+rank*MAX_ALLTOALLV_RANKS],
+          ((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
+          sendcounts[r+rank*nranks],
           type,
           r,
           comm,
           stream));
     }
-    if (recvcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
+    if (recvcounts[r+rank*nranks] != 0) {
       NCCLCHECK(ncclRecv(
-          ((char*)recvbuff) + rdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
-          recvcounts[r+rank*MAX_ALLTOALLV_RANKS],
+          ((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
+          recvcounts[r+rank*nranks],
           type,
           r,
           comm,
@@ -143,8 +148,12 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   }
   NCCLCHECK(ncclGroupEnd());
 #endif
-  return testSuccess;
 #endif
+  free(sendcounts);
+  free(recvcounts);
+  free(sdispls);
+  free(rdispls);
+  return testSuccess;
 }
 
 struct testColl alltoAllTest = {

From 6048078be2cb09b430cdcd622a65b41a0a14d889 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 24 Jul 2023 12:02:44 -0700
Subject: [PATCH 138/233] search SLES install paths for MPI

---
 CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5577fb80fc..8be6480f6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,6 +94,26 @@ if (NOT NO_MPI)
         check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x64_64)
     endif()
 
+    # Check for MPICH SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
+    endif()
+
+    # Check for Open MPI v4 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
+    endif()
+
+    # Check for Open MPI v3 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
+    endif()
+    
+    # Check for Open MPI v2 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
+    endif()
+
     if (NOT MPI_MPICXX)
         message ("-- no MPI library found")
     endif()

From c96ff57ac79522e813b0661bb5831d71012f5366 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Tue, 14 Feb 2023 22:31:54 +0000
Subject: [PATCH 139/233] auto-detect and enable MPI

---
 CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++--------
 README.md      | 16 ++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d950565e2f..f3138fa2fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,33 @@
 # ########################################################################
 #Adding pthread flag for linking
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so)
+    find_program(MPI_MPICXX ${mpi_compiler})
+    if (MPI_MPICXX)
+        message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
+        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
+            get_filename_component(mpi.tmpdir ${MPI_MPICXX} DIRECTORY)
+            get_filename_component(mpi_base_dir ${mpi.tmpdir} DIRECTORY)
+        else()
+            cmake_path(GET MPI_MPICXX PARENT_PATH mpi.tmpdir)
+            cmake_path(GET mpi.tmpdir PARENT_PATH mpi_base_dir)
+        endif()
+        find_file(MPI_H mpi.h PATHS ${mpi_base_dir} PATH_SUFFIXES include include/x86_64-linux-gnu ${ARGN} {REQUIRED)
+        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
+            get_filename_component(mpi_inc_dir ${MPI_H} DIRECTORY)
+        else()
+            cmake_path(GET MPI_H PARENT_PATH mpi_inc_dir)
+        endif()
+        message ("-- mpi.h is in ${mpi_inc_dir}")
+        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu REQIRED)
+        message ("-- libmpi is ${MPI_LIB}")
+        add_definitions(-DMPI_SUPPORT)
+        include_directories(${mpi_inc_dir})
+        link_libraries(${MPI_LIB})
+    else()
+        message ("-- ${mpi_compiler} not found")
+    endif()
+endmacro()
 
 cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
 
@@ -32,8 +59,7 @@ include(ROCMCheckTargetIds)
 include(ROCMClients)
 
 # Build variables
-option(USE_MPI "Build RCCL-tests with MPI support. Requires the MPI path to be set.")
-set(MPI_PATH "" CACHE PATH "Path to MPI installation")
+option(NO_MPI "Build RCCL-tests without MPI support.")
 ## Get default GPU targets using rocm_check_target_ids
 rocm_check_target_ids(
     DEFAULT_AMDGPU_TARGETS
@@ -41,13 +67,21 @@ rocm_check_target_ids(
 )
 set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
 
-# Find the MPI package if we're using MPI
-if (USE_MPI)
-    if(NOT MPI_PATH STREQUAL "")
-        set(MPI_HOME "${MPI_PATH}")
+if (NOT NO_MPI)
+    # Check for MPICH first
+    check_mpi(mpicxx.mpich libmpich.a libmpich.so include/x86_64-linux-gnu/mpich)
+
+    # Check for MPI in general. If we find mpicxx, we don't know whether its
+    # MPICH or another MPI implementation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so)
     endif()
-    find_package(MPI REQUIRED MODULE)
-    add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT)
+
+    if (NOT MPI_MPICXX)
+        message ("-- no MPI library found")
+    endif()
+else()
+    message ("-- MPI support explicitely disabled")
 endif()
 
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
diff --git a/README.md b/README.md
index c2847232e6..0a88c5d384 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,22 @@ RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If y
 $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
 ```
 
+RCCL tests can also be built using cmake. A typical sequence will be:
+
+```shell
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
+$ make
+```
+
+When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
+for cmake target and config files that are created during the RCCL build.
+
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitley request
+MPI builds. A user can explicitely disable MPI builds by adding the -DNO_MPI=1 flag to the cmake command line.
+
+
 ## Usage
 
 RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).

From 8fc00ec32ef9185f277c26d03a4824c75f2072cb Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Thu, 2 Mar 2023 18:22:03 +0000
Subject: [PATCH 140/233] revamp cmake MPI detection

We honor a user-requested MPI installation given via MPI_PATH first,
and afterwards check for MPICH and Open MPI in the default
Ubuntu and RHEL installation directories.
---
 CMakeLists.txt | 69 +++++++++++++++++++++++++++++++-------------------
 README.md      |  5 ++--
 2 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3138fa2fa..3cdbad6f75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,29 +3,23 @@
 # ########################################################################
 #Adding pthread flag for linking
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
-macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so)
-    find_program(MPI_MPICXX ${mpi_compiler})
+macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
+    find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
     if (MPI_MPICXX)
         message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
-        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
-            get_filename_component(mpi.tmpdir ${MPI_MPICXX} DIRECTORY)
-            get_filename_component(mpi_base_dir ${mpi.tmpdir} DIRECTORY)
-        else()
-            cmake_path(GET MPI_MPICXX PARENT_PATH mpi.tmpdir)
-            cmake_path(GET mpi.tmpdir PARENT_PATH mpi_base_dir)
-        endif()
-        find_file(MPI_H mpi.h PATHS ${mpi_base_dir} PATH_SUFFIXES include include/x86_64-linux-gnu ${ARGN} {REQUIRED)
-        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
-            get_filename_component(mpi_inc_dir ${MPI_H} DIRECTORY)
-        else()
-            cmake_path(GET MPI_H PARENT_PATH mpi_inc_dir)
-        endif()
-        message ("-- mpi.h is in ${mpi_inc_dir}")
-        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu REQIRED)
+        find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
+        message ("-- mpi.h is in ${MPI_H}")
+        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
         message ("-- libmpi is ${MPI_LIB}")
-        add_definitions(-DMPI_SUPPORT)
-        include_directories(${mpi_inc_dir})
-        link_libraries(${MPI_LIB})
+	if (NOT MPI_H OR NOT MPI_LIB)
+	    set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
+	    set (MPI_H "MPI_H-NOTFOUND")
+	    set (MPI_LIB "MPI_LIB-NOTFOUND")
+	else()
+            add_definitions(-DMPI_SUPPORT)
+            include_directories(${mpi_inc_dir})
+            link_libraries(${MPI_LIB})
+	endif()
     else()
         message ("-- ${mpi_compiler} not found")
     endif()
@@ -60,6 +54,7 @@ include(ROCMClients)
 
 # Build variables
 option(NO_MPI "Build RCCL-tests without MPI support.")
+option(MPI_PATH "Use MPI in the specified directory.")
 ## Get default GPU targets using rocm_check_target_ids
 rocm_check_target_ids(
     DEFAULT_AMDGPU_TARGETS
@@ -68,13 +63,35 @@ rocm_check_target_ids(
 set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
 
 if (NOT NO_MPI)
-    # Check for MPICH first
-    check_mpi(mpicxx.mpich libmpich.a libmpich.so include/x86_64-linux-gnu/mpich)
+    # CHECK for MPI Path first. User requested this directory explicitely
+    if (MPI_PATH)
+        set(mpi_spec_bin_dir "${MPI_PATH}/bin")
+	set(mpi_spec_inc_dir "${MPI_PATH}/include")
+        check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
+	if (NOT MPI_MPICXX)
+            # Since the user explicitely requested this directory, abort if something went wrong.
+	    MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
+        endif()
+    endif()
 
-    # Check for MPI in general. If we find mpicxx, we don't know whether its
-    # MPICH or another MPI implementation
+    # Check for MPICH Ubuntu installation
     if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so)
+        check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
+    endif()
+
+    # Check for Open MPI Ubuntu installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin  /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
+    endif()
+
+    # Check for MPICH RHEL installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
+    endif()
+
+    # Check for Open MPI RHEL installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x64_64)
     endif()
 
     if (NOT MPI_MPICXX)
@@ -91,7 +108,7 @@ add_subdirectory(src)
 
 # Create ROCm standard packages
 rocm_create_package(
-    NAME rccl-separate-tests
+    NAME rccl-tests
     DESCRIPTION "Tests for the ROCm Communication Collectives Library"
     MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
 )
diff --git a/README.md b/README.md
index 0a88c5d384..74f15515b4 100644
--- a/README.md
+++ b/README.md
@@ -30,8 +30,9 @@ $ make
 When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
 for cmake target and config files that are created during the RCCL build.
 
-Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitley request
-MPI builds. A user can explicitely disable MPI builds by adding the -DNO_MPI=1 flag to the cmake command line.
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitly request
+MPI builds. A user can request to use a particular MPI library by using the MPI_PATH variable. MPI support can be explicitely disabled by adding the -DNO_MPI=1
+flag to the cmake command line.
 
 
 ## Usage

From efdd4ad40bcc59cd2fecae4cd34eed3644db917c Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 24 Jul 2023 12:02:44 -0700
Subject: [PATCH 141/233] search SLES install paths for MPI

---
 CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cdbad6f75..296c01c28b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,6 +94,26 @@ if (NOT NO_MPI)
         check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x64_64)
     endif()
 
+    # Check for MPICH SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
+    endif()
+
+    # Check for Open MPI v4 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
+    endif()
+
+    # Check for Open MPI v3 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
+    endif()
+    
+    # Check for Open MPI v2 SLES installation
+    if (NOT MPI_MPICXX)
+        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
+    endif()
+
     if (NOT MPI_MPICXX)
         message ("-- no MPI library found")
     endif()

From a6593375bceae04df3392ec95e86bb1a2f154458 Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Fri, 4 Aug 2023 19:33:39 -0400
Subject: [PATCH 142/233] Update Makefile - HIPCC Path Updated to latest (#45)

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 500549d2da..42daba2706 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,7 +12,7 @@ VERBOSE ?= 0
 DEBUG ?= 0
 NCCL_HOME ?= ""
 
-HIPCC = $(ROCM_PATH)/hip/bin/hipcc
+HIPCC = $(ROCM_PATH)/bin/hipcc
 CXX = $(HIPCC)
 
 HIPCUFLAGS := -std=c++14

From c1ec0c8aaf15a7a06c168854cedf0eb2cee222a5 Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Fri, 4 Aug 2023 19:42:33 -0400
Subject: [PATCH 143/233] Update Makefile - HIPCC Path Updated to latest (#46)

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index f01e7b3850..e694499ce2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,7 +12,7 @@ VERBOSE ?= 0
 DEBUG ?= 0
 NCCL_HOME ?= ""
 
-HIPCC = $(ROCM_PATH)/hip/bin/hipcc
+HIPCC = $(ROCM_PATH)/bin/hipcc
 CXX = $(HIPCC)
 
 HIPCUFLAGS := -std=c++14
@@ -25,7 +25,7 @@ HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
-HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
 

From 6c46206a478203b6453035fe0d40dc6418acd089 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 13 Sep 2023 11:15:13 -0700
Subject: [PATCH 144/233] Make the -c option be a datacheck iteration count
 parameter

Default is 1
---
 README.md     | 2 +-
 src/common.cu | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 580996b28d..4281799430 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ All tests support the same set of arguments :
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
-  * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
+  * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
 
diff --git a/src/common.cu b/src/common.cu
index 48a629ce10..dcead4ddd4 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -487,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   int64_t wrongElts = 0;
   static __thread int rep = 0;
   rep++;
-  if (datacheck) {
+  for (int c = 0; c < datacheck; c++) {
       // Initialize sendbuffs, recvbuffs and expected
       TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
@@ -536,8 +536,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
       //aggregate delta from all threads and procs
       long long wrongElts1 = wrongElts;
+      //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts);
       Allreduce(args, &wrongElts1, /*sum*/4);
       wrongElts = wrongElts1;
+      if (wrongElts) break;
   }
 
   double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
@@ -809,7 +811,7 @@ int main(int argc, char* argv[]) {
             "[-m,--agg_iters <aggregated iteration count>] \n\t"
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
-            "[-c,--check <0/1>] \n\t"
+            "[-c,--check <check iteration count>] \n\t"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
             "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
 #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)

From 46375b1c527b2e3afe80fdd6dd136151bd939675 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Fri, 22 Sep 2023 14:38:31 -0600
Subject: [PATCH 145/233] Fixing hipcc location for CI (#47)

---
 .jenkins/common.groovy | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 7426d35d75..fe58a3e41a 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -14,7 +14,7 @@ def runCompileCommand(platform, project, jobName)
                 ${auxiliary.exitIfNotSuccess()}
                 cd ${project.paths.project_build_prefix}
                 cmake \
-                    -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
+                    -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
                     -S . -B build
                 make -C build -j\$(nproc)
                 ${auxiliary.exitIfNotSuccess()}
@@ -30,9 +30,9 @@ def runTestCommand (platform, project)
     def command = """#!/usr/bin/env bash
                 set -x
                 cd ${project.paths.project_build_prefix}
-		python3 -m pip install --upgrade pytest
-		python3 -m pytest --version
-		python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
+                python3 -m pip install --upgrade pytest
+                python3 -m pytest --version
+                python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
             """
 
    platform.runCommand(this, command)

From 1292b25553bd0384f2faa2965f9d82b99797a348 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 12 Oct 2023 16:53:32 -0700
Subject: [PATCH 146/233] Added an MPI_Barrier() call after MPI_Bcast() for
 HCOLL issue

---
 src/common.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common.cu b/src/common.cu
index dcead4ddd4..8588047d78 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -924,6 +924,7 @@ testResult_t run() {
   }
 #ifdef MPI_SUPPORT
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
+  MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL
 #endif
   int gpus[nGpus*nThreads];
   cudaStream_t streams[nGpus*nThreads];

From 5ee7a08994ea68e4ab607c2fa912cd838457d1fa Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Mon, 16 Oct 2023 12:13:50 -0700
Subject: [PATCH 147/233] Warm up both out-of-place and in-place collectives
 (#51)

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index d3fb87aea9..4d9e302802 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -641,7 +641,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Warm-up for small size
   setupArgs(args->minbytes, type, args);
   for (int iter = 0; iter < warmup_iters; iter++) {
-    TESTCHECK(startColl(args, type, op, root, 0, iter));
+    TESTCHECK(startColl(args, type, op, root, iter < warmup_iters/2 ? 0 : 1, iter));
   }
   TESTCHECK(completeColl(args));
 

From 8bfb67faf305e4a36c5a2812f5b09b2f825d267a Mon Sep 17 00:00:00 2001
From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com>
Date: Thu, 19 Oct 2023 13:29:42 -0600
Subject: [PATCH 148/233] Fixing hipcc location for develop CI (#52)

---
 .jenkins/common.groovy | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 7426d35d75..fe58a3e41a 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -14,7 +14,7 @@ def runCompileCommand(platform, project, jobName)
                 ${auxiliary.exitIfNotSuccess()}
                 cd ${project.paths.project_build_prefix}
                 cmake \
-                    -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
+                    -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
                     -S . -B build
                 make -C build -j\$(nproc)
                 ${auxiliary.exitIfNotSuccess()}
@@ -30,9 +30,9 @@ def runTestCommand (platform, project)
     def command = """#!/usr/bin/env bash
                 set -x
                 cd ${project.paths.project_build_prefix}
-		python3 -m pip install --upgrade pytest
-		python3 -m pytest --version
-		python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
+                python3 -m pip install --upgrade pytest
+                python3 -m pytest --version
+                python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
             """
 
    platform.runCommand(this, command)

From e1a816b8691cc6fa4fcada2e32db29be2f1650cd Mon Sep 17 00:00:00 2001
From: Lauren Wrubleski <Lauren.Wrubleski@amd.com>
Date: Tue, 5 Dec 2023 18:20:46 -0700
Subject: [PATCH 149/233] Offload arch linking (#54)

* Update CMakeLists.txt

* Update CMakeLists.txt

* Link rccl_common object against hip::device

Previously the tests were compiled with `--amdgpu-target` to compile for multiple architectures. As rccl_common was not compiled against those architectures, this didn't work. Linking it against hip::device automatically links it against all architectures in `AMDGPU_TARGETS`, and the same then holds for the test executables.
---
 src/CMakeLists.txt | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6511a419c9..41d312855f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,10 +5,9 @@
 # Compile common object library
 set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
 add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
+target_link_libraries(rccl_common roc::rccl hip::device)
 if(USE_MPI)
-    target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX)
-else()
-    target_link_libraries(rccl_common roc::rccl)
+    target_link_libraries(rccl_common MPI::MPI_CXX)
 endif()
 
 function(add_relative_test test_name test_target)
@@ -38,11 +37,6 @@ function(add_rccl_test TEST)
         PRIVATE
             rccl_common
     )
-    if (NOT WIN32)
-        foreach(amdgpu_target ${AMDGPU_TARGETS})
-            target_link_libraries(${TEST_TARGET} PRIVATE --amdgpu-target=${amdgpu_target})
-        endforeach()
-    endif()
     set_target_properties(
         ${TEST_TARGET}
         PROPERTIES

From b1f86ea6eb9c60cea3b5bf60d5aa169849a051d8 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Wed, 6 Dec 2023 17:24:37 -0600
Subject: [PATCH 150/233] Update default GPUs and build for AMDGPU_TARGETS
 (#55)

* Update default GPUs and build for AMDGPU_TARGETS
* Make GPU_TARGETS a cache variable
---------
Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 CMakeLists.txt | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8be6480f6e..7bf508d614 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,8 +44,6 @@ set(CMAKE_CXX_STANDARD 14)
 
 # Get additional packages required
 find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
-find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
-
 include(ROCMSetupVersion)
 include(ROCMCreatePackage)
 include(ROCMInstallTargets)
@@ -55,12 +53,37 @@ include(ROCMClients)
 # Build variables
 option(NO_MPI "Build RCCL-tests without MPI support.")
 option(MPI_PATH "Use MPI in the specified directory.")
-## Get default GPU targets using rocm_check_target_ids
-rocm_check_target_ids(
-    DEFAULT_AMDGPU_TARGETS
-    TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
-)
-set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
+
+# Default GPU architectures to build
+#==================================================================================================
+set(DEFAULT_GPUS
+      gfx803
+      gfx900:xnack-
+      gfx906:xnack-
+      gfx908:xnack-
+      gfx90a:xnack-
+      gfx90a:xnack+
+      gfx940
+      gfx941
+      gfx942
+      gfx1030
+      gfx1100
+      gfx1101
+      gfx1102)
+
+set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
+## Determine which GPU architectures to build for
+if (COMMAND rocm_check_target_ids)
+    message(STATUS "Checking for ROCm support for GPU targets:")
+    rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
+else()
+    message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
+    set(SUPPORTED_GPUS ${DEFAULT_GPUS})
+endif()
+set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
+message(STATUS "Compiling for ${GPU_TARGETS}")
+
+find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
 
 if (NOT NO_MPI)
     # CHECK for MPI Path first. User requested this directory explicitely

From a2bec5d2f69b28a47c6cde881028e4b5c258b0c4 Mon Sep 17 00:00:00 2001
From: Nusrat Islam <Nusrat.Islam@amd.com>
Date: Thu, 4 Jan 2024 16:20:42 -0600
Subject: [PATCH 151/233] Add option to disable out-of-place

---
 src/common.cu | 36 ++++++++++++++++++++++++++----------
 src/common.h  |  1 +
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4d9e302802..e49c0d3f52 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -94,6 +94,7 @@ static int numDevices = 1;
 static int ranksPerGpu = 1;
 static int enable_multiranks = 0;
 static int delay_inout_place = 0;
+static int enable_out_of_place = 1;
 
 #define NUM_BLOCKS 32
 
@@ -653,8 +654,10 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
 	char rootName[100];
 	sprintf(rootName, "%6i", root);	
 	PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
-        TESTCHECK(BenchTime(args, type, op, root, 0));
-        usleep(delay_inout_place);
+	if (enable_out_of_place) {
+        	TESTCHECK(BenchTime(args, type, op, root, 0));
+        	usleep(delay_inout_place);
+	}
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
     }
@@ -795,6 +798,7 @@ int main(int argc, char* argv[]) {
     {"cudagraph", required_argument, 0, 'G'},
     {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
+    {"out_of_place", required_argument, 0, 'O'},
 #ifdef RCCL_MULTIRANKPERGPU
     {"enable_multiranks", required_argument, 0, 'x'},
     {"ranks_per_gpu", required_argument, 0, 'R'},
@@ -807,9 +811,9 @@ int main(int argc, char* argv[]) {
     int c;
 
 #ifdef RCCL_MULTIRANKPERGPU    
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:R:x:q:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:q:", longopts, &longindex);
 #endif
 
     if (c == -1)
@@ -907,6 +911,9 @@ int main(int argc, char* argv[]) {
       case 'C':
         report_cputime = strtol(optarg, NULL, 0);
         break;
+      case 'O':
+        enable_out_of_place = strtol(optarg, NULL, 0);
+        break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
@@ -953,6 +960,7 @@ int main(int argc, char* argv[]) {
             "[-T,--timeout <time in seconds>] \n\t"
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
+	    "[-O,--out_of_place <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
 #ifdef RCCL_MULTIRANKPERGPU
             "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
@@ -1173,11 +1181,19 @@ testResult_t run() {
 
   const char* timeStr = report_cputime ? "cputime" : "time";
   PRINT("#\n");
-  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-      timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+  if (enable_out_of_place) {
+  	PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
+  	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
+      	timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
+  	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+      	"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+  } else {
+	PRINT("# %10s  %12s  %8s  %6s  %6s           in-place          \n", "", "", "", "", "");
+        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
+        timeStr, "algbw", "busbw", "#wrong");
+        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "");
+  }
 
   struct testThread threads[nThreads];
   memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1205,7 +1221,7 @@ testResult_t run() {
     threads[t].args.ncclId = ncclId;
     threads[t].args.comms=comms+t*nGpus*ranksPerGpu;
     threads[t].args.streams=streams+t*nGpus*ranksPerGpu;
-
+    threads[t].args.enable_out_of_place=enable_out_of_place;
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
diff --git a/src/common.h b/src/common.h
index 9ed929a905..e9619a7ea8 100644
--- a/src/common.h
+++ b/src/common.h
@@ -127,6 +127,7 @@ struct threadArgs {
   int localRank;
   int localNumDevices;
   int enable_multiranks;
+  int enable_out_of_place;
   int nRanks;
   void** sendbuffs;
   size_t sendBytes;

From c6afef0b6f76ffc55d4172d971be6cf5a08a73a4 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 5 Feb 2024 08:53:54 -0800
Subject: [PATCH 152/233] Added missing MPI_Comm_free() call before
 MPI_Finalize()

---
 src/common.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common.cu b/src/common.cu
index 8588047d78..4ac00fb3d7 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1056,6 +1056,7 @@ testResult_t run() {
   PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
   PRINT("#\n");
 #ifdef MPI_SUPPORT
+  MPI_Comm_free(&mpi_comm);
   MPI_Finalize();
 #endif
 

From 7715a0cf1ff638b81ca75f9166b0e2eff135bb37 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Thu, 15 Feb 2024 12:04:38 -0800
Subject: [PATCH 153/233] Fix typo in rank assignment (#59)

---
 src/common.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index e49c0d3f52..d71f0a7a35 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1152,7 +1152,7 @@ testResult_t run() {
        for (int ii=0; ii<nGpus*nThreads; ii++) {
          HIPCHECK(hipSetDevice(gpus[ii]));
 	 if (!enable_multiranks) {
-	   NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
+	   NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+ii));
 	 }
 #ifdef RCCL_MULTIRANKPERGPU
 	 else

From 88cf7dbf456e91d7d008365cf5e28d85d64d27fb Mon Sep 17 00:00:00 2001
From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com>
Date: Tue, 5 Mar 2024 09:47:18 -0700
Subject: [PATCH 154/233] Add hipify steps prior to build (#62)

* Add hipify steps prior to build
---
 src/CMakeLists.txt            | 122 +++++++++++++++++++++++++-------
 src/Makefile                  |  21 ++++--
 src/all_gather.cu             |  10 +--
 src/all_reduce.cu             |  10 +--
 src/alltoall.cu               |  10 +--
 src/alltoallv.cu              |  12 ++--
 src/broadcast.cu              |  10 +--
 src/common.cu                 | 129 +++++++++++++++++-----------------
 src/common.h                  |  14 ++--
 src/gather.cu                 |  12 ++--
 src/hypercube.cu              |  12 ++--
 src/nccl1_compat.h            |  11 ++-
 src/reduce.cu                 |  12 ++--
 src/reduce_scatter.cu         |  12 ++--
 src/scatter.cu                |  10 +--
 src/sendrecv.cu               |  10 +--
 verifiable/inexact_regress.cu |  10 +--
 verifiable/verifiable.cu      |  42 +++++------
 verifiable/verifiable.h       |   8 +--
 verifiable/verifiable.mk      |  22 ++++--
 20 files changed, 295 insertions(+), 204 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 41d312855f..fb4dc7d5b7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,37 +1,33 @@
 # ########################################################################
-# Copyright 2022 Advanced Micro Devices, Inc.
+# Copyright 2022-2024 Advanced Micro Devices, Inc.
 # ########################################################################
 
-# Compile common object library
-set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
-add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
-target_link_libraries(rccl_common roc::rccl hip::device)
-if(USE_MPI)
-    target_link_libraries(rccl_common MPI::MPI_CXX)
-endif()
-
-function(add_relative_test test_name test_target)
-    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
-    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
-        set(EXE_PATH ".")
-    endif()
-    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
-    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
-        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-            set(EXE_NAME "${test_target}")
-        endif()
-    endif()
-    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
-    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
-endfunction()
-
 function(add_rccl_test TEST)
     set(TEST_SOURCE "${TEST}.cu")
     set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
+
+    # Check that file exists
+    if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE})
+        message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
+    endif()
+
+    # Establish hipified copy of the source file (generated under ${HIPIFY_DIR})
+    set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}")
+    get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
+
+    # Rename the trailing .cu to .cu.cpp so the copy is compiled as C++; anchored so directory names stay untouched
+    string(REGEX REPLACE "\\.cu$" ".cu.cpp" HIP_FILE "${HIP_FILE}")
+
+    # Create a custom command that regenerates the hipified copy whenever the source changes
+    add_custom_command(
+        OUTPUT ${HIP_FILE}
+        COMMAND mkdir -p ${HIP_FILE_DIR} && ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
+        MAIN_DEPENDENCY ${TEST_SOURCE}
+        COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
+    )
+
     set(TEST_TARGET "${TEST}_perf")
-    add_executable(${TEST_TARGET} ${TEST_SOURCE})
+    add_executable(${TEST_TARGET} ${HIP_FILE})
     target_link_libraries(
         ${TEST_TARGET}
         PRIVATE
@@ -52,6 +48,78 @@ function(add_rccl_test TEST)
     )
 endfunction()
 
+function(add_relative_test test_name test_target) # register test_target with CTest under the name test_name
+    get_target_property(out_dir ${test_target} RUNTIME_OUTPUT_DIRECTORY)
+    if(out_dir STREQUAL "out_dir-NOTFOUND") # property unset: the executable lands in the current binary dir
+        set(out_dir ".")
+    endif()
+    get_filename_component(out_dir "${out_dir}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    set(out_name "${test_target}") # default name; RUNTIME_OUTPUT_NAME is checked last below so it wins over OUTPUT_NAME
+    foreach(name_prop OUTPUT_NAME RUNTIME_OUTPUT_NAME)
+        get_target_property(prop_value ${test_target} ${name_prop})
+        if(NOT prop_value STREQUAL "prop_value-NOTFOUND")
+            set(out_name "${prop_value}")
+        endif()
+    endforeach()
+    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${out_dir}/${out_name}")
+    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
+endfunction()
+
+# Sources and headers shared by every test (paths relative to src/); each one is hipified by the loop below
+#==================================================================================================
+set(COMMON_FILES
+  common.h
+  common.cu
+  nccl1_compat.h
+  rccl_bfloat16.h
+  timer.h
+  timer.cc
+  ../verifiable/verifiable.h
+  ../verifiable/verifiable.cu
+)
+
+# Hipify common files (copy of source generated into hipify directory)
+#==================================================================================================
+find_program(hipify-perl_executable hipify-perl)
+set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
+set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
+
+## Loop over each common file and register a command that writes its hipified copy into ${HIPIFY_DIR}
+foreach(COMMON_FILE ${COMMON_FILES})
+  # Fail at configure time if a listed file is missing
+  if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE})
+    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
+  endif()
+
+  # Hipified copies are flattened: only the file name is kept, so ../verifiable/* lands directly in ${HIPIFY_DIR}
+  get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME)
+  set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
+
+  # Rename the trailing .cu to .cu.cpp so the copy is compiled as C++; anchored so directory names stay untouched
+  string(REGEX REPLACE "\\.cu$" ".cu.cpp" HIP_FILE "${HIP_FILE}")
+  list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
+
+  # Create a custom command that regenerates the hipified copy whenever the source changes
+  add_custom_command(
+    OUTPUT ${HIP_FILE}
+    COMMAND mkdir -p ${HIPIFY_DIR} && ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
+    MAIN_DEPENDENCY ${COMMON_FILE}
+    COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
+  )
+endforeach()
+
+# Compile the hipified common sources into the rccl_common object library
+#==================================================================================================
+add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES}) # target whose build generates every hipified common file
+add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
+add_dependencies(rccl_common hipify) # make sure the hipified files are generated before rccl_common is built
+target_link_libraries(rccl_common roc::rccl hip::device)
+if(USE_MPI)
+    target_link_libraries(rccl_common MPI::MPI_CXX)
+endif()
+
+# Compile tests
+#==================================================================================================
 add_rccl_test(all_gather)
 add_rccl_test(all_reduce)
 add_rccl_test(alltoall)
diff --git a/src/Makefile b/src/Makefile
index 42daba2706..00a17b56a8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -41,6 +41,9 @@ endif
 .PHONY: build clean
 
 BUILDDIR ?= ../build
+HIPIFY_DIR ?= $(BUILDDIR)/hipify
+
+.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h
 
 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
@@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk
 
-${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+${HIPIFY_DIR}/%.cu.cpp: %.cu # hipify a CUDA source; written via $@.tmp so a failed run never leaves a stale target
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p $(@D)
+	hipify-perl -quiet-warnings $< > $@.tmp && mv -f $@.tmp $@
+
+${HIPIFY_DIR}/%.h: %.h # hipify a header; written via $@.tmp so a failed run never leaves a stale target
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p $(@D)
+	hipify-perl -quiet-warnings $< > $@.tmp && mv -f $@.tmp $@
+
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
+	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<
 
 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
diff --git a/src/all_gather.cu b/src/all_gather.cu
index f18ce0cb65..7efc8f2c5e 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index de03a206ff..21e3ce3a9e 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index acfeb7d8ee..2773223dc9 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       size_t partcount = sendcount/nranks;
       TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place alltoall
   args->reportErrors = in_place ? 0 : 1;
@@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 73b53d20c3..5bab3071b1 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define USE_RCCL_GATHER_SCATTER
@@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
 
 #if 0
     int *dataHost = (int *)malloc(args->sendBytes);
-    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+    cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
     printf(" Rank [%d] Original: ", rank);
     for(int j=0; j<sendcount; j++) {
 	    printf("%d:%d ", j, dataHost[j]);
@@ -68,7 +68,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
       TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
       rdisp += rcount;
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place alltoall
   args->reportErrors = in_place ? 0 : 1;
@@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nranks;
   NCCLCHECK(ncclCommCount(comm, &nranks));
   int rank;
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 5cd6147f10..9157c4c0c0 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
     TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
diff --git a/src/common.cu b/src/common.cu
index 0096ecb729..0979e6992d 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -6,7 +6,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
@@ -14,10 +14,11 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include "cuda.h"
 
 //#define DEBUG_PRINT
 
-#include "../verifiable/verifiable.h"
+#include "verifiable.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
@@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
-  HIPCHECK(hipDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(cudaDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
   return testSuccess;
 }
 
@@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->expectedBytes/wordSize(type);
 
   int64_t *wrongPerGpu = nullptr;
-  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
   
   for (int i=0; i<args->nGpus; i++) {
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
 
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
@@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       char *expectedHost = (char*)malloc(args->expectedBytes);
       char *dataHost = (char*)malloc(args->expectedBytes);
       int eltsz = wordSize(type);
-      hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
-      hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
+      cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
 
       for(int j=0; j<args->expectedBytes/eltsz; j++) {
         unsigned long long want, got;
@@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   *wrongElts = 0;
   for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
-  hipHostFree(wrongPerGpu);
+  cudaFreeHost(wrongPerGpu);
 
   if (args->reportErrors && *wrongElts) args->errors[0]++;
   return testSuccess;
 }
     
-testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
-  hipError_t hipErr;
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
+  cudaError_t cudaErr;
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
@@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
    for (int i=0; i<ngpus; i++) {
      if (done[i]) continue;
 
-     hipErr = hipStreamQuery(streams[i]);
-     if (hipErr == hipSuccess) {
+     cudaErr = cudaStreamQuery(streams[i]);
+     if (cudaErr == cudaSuccess) {
        done[i] = 1;
        remaining--;
        idle = 0;
        continue;
      }
 
-     if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
+     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
      if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
@@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);
 
 #if HIP_VERSION >= 50221310
-  hipGraph_t graphs[args->nGpus];
-  hipGraphExec_t graphExec[args->nGpus];
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
       // Thread local mdoe is needed for:
       // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
       // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
-      //   Since pre-connect calls hipMalloc, we cannot use global capture mode
-      HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
     }
   }
 #endif
@@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     // End cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
     }
     // Instantiate cuda graph
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
     }
     // Resync CPU, restart timing, launch cuda graph
     Barrier(args);
     tim.reset();
     for (int l=0; l<cudaGraphLaunches; l++) {
       for (int i=0; i<args->nGpus; i++) {
-        HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
       }
     }
   }
@@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-      HIPCHECK(hipGraphDestroy(graphs[i]));
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
     }
   }
 #endif
@@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
+          CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
         }
       }
 #endif
@@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // End cuda graph capture
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
         }
         // Instantiate cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+          CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
         }
         // Launch cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+          CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
         }
       }
 #endif
@@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         //destroy cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-          HIPCHECK(hipGraphDestroy(graphs[i]));
+          CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+          CUDACHECK(cudaGraphDestroy(graphs[i]));
         }
       }
 #endif
@@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  HIPCHECK(hipSetDevice(args->gpus[0]));
+  CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) {
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) {
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
   if (memorytype == ncclFine) {
-    HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
-    HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
-    if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+    if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
   }
   else if (memorytype == ncclHost) {
-    HIPCHECK(hipHostMalloc(sendbuff, nbytes));
-    HIPCHECK(hipHostMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes));
+    CUDACHECK(hipHostMalloc(sendbuff, nbytes));
+    CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
   }
   else if (memorytype == ncclManaged) {
-    HIPCHECK(hipMallocManaged(sendbuff, nbytes));
-    HIPCHECK(hipMallocManaged(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes));
+    CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
+    CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
-    HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
-    HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
-    if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes));
+    CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
+    CUDACHECK(cudaMemset(*recvbuff, 0, nbytes));
+    if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes));
 #endif
   }
   else {
-    HIPCHECK(hipMalloc(sendbuff, nbytes));
-    HIPCHECK(hipMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes));
+    CUDACHECK(cudaMalloc(sendbuff, nbytes));
+    CUDACHECK(cudaMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
   }
   return testSuccess;
 }
@@ -940,7 +941,7 @@ int main(int argc, char* argv[]) {
     }
   }
 
-  HIPCHECK(hipGetDeviceCount(&numDevices));
+  CUDACHECK(cudaGetDeviceCount(&numDevices));
 #ifndef MPI_SUPPORT
   if (nGpus > numDevices)
   {
@@ -1016,10 +1017,10 @@ testResult_t run() {
   for (int i=0; i<nThreads*nGpus; i++) {
     int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     int rank = proc*nThreads*nGpus+i;
-    hipDeviceProp_t prop;
-    HIPCHECK(hipGetDeviceProperties(&prop, cudaDev));
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     char busIdStr[] = "00000000:00:00.0";
-    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+    CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
     len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
       rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
 	  maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1055,7 +1056,7 @@ testResult_t run() {
 #endif
 
   int gpus[nGpus*nThreads];
-  hipStream_t streams[nGpus*nThreads];
+  cudaStream_t streams[nGpus*nThreads];
   void* sendbuffs[nGpus*nThreads];
   void* recvbuffs[nGpus*nThreads];
   void* expected[nGpus*nThreads];
@@ -1067,12 +1068,12 @@ testResult_t run() {
   gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nGpus*nThreads; i++) {
     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
-    HIPCHECK(hipSetDevice(gpus[i]));
+    CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     if (streamnull)
       streams[i] = NULL;
     else
-      HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
@@ -1083,7 +1084,7 @@ testResult_t run() {
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int i=0; i<nGpus*nThreads; i++) {
-         HIPCHECK(hipSetDevice(gpus[i]));
+         CUDACHECK(cudaSetDevice(gpus[i]));
          NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
        }
        NCCLCHECK(ncclGroupEnd());
@@ -1093,7 +1094,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, hipHostMallocPortable | hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
@@ -1178,11 +1179,11 @@ testResult_t run() {
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-    if (sendbuffs[i]) HIPCHECK(hipFree((char*)sendbuffs[i]));
-    if (recvbuffs[i]) HIPCHECK(hipFree((char*)recvbuffs[i]));
-    if (datacheck) HIPCHECK(hipFree(expected[i]));
+    if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
+    if (datacheck) CUDACHECK(cudaFree(expected[i]));
   }
-  HIPCHECK(hipHostFree(delta));
+  CUDACHECK(cudaFreeHost(delta));
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = envstr ? atof(envstr) : -1;
@@ -1197,9 +1198,9 @@ testResult_t run() {
   MPI_Finalize();
 #endif
 
-  // 'hip-memcheck --leak-check full' requires this
+  // 'cuda-memcheck --leak-check full' requires this
   PRINT("%s\n", ncclGetLastError(NULL));
-  hipDeviceReset();
+  cudaDeviceReset();
 
   if (errors[0] || bw[0] < check_avg_bw*(0.9))
     exit(EXIT_FAILURE);
diff --git a/src/common.h b/src/common.h
index efba238755..cabc6a6657 100644
--- a/src/common.h
+++ b/src/common.h
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
-#define HIPCHECK(cmd) do {                          \
-  hipError_t e = cmd;                               \
-  if( e != hipSuccess ) {                           \
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t err = cmd;                            \
+  if( err != cudaSuccess ) {                        \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",     \
+    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
          hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));    \
+        __FILE__,__LINE__,cudaGetErrorString(err)); \
     return testCudaError;                           \
   }                                                 \
 } while(0)
@@ -93,7 +93,7 @@ struct testColl {
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -133,7 +133,7 @@ struct threadArgs {
   size_t recvInplaceOffset;
   ncclUniqueId ncclId;
   ncclComm_t* comms;
-  hipStream_t* streams;
+  cudaStream_t* streams;
 
   void** expected;
   size_t expectedBytes;
diff --git a/src/gather.cu b/src/gather.cu
index 3ac9cfb6b4..784d637c84 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"
 
 void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) {
       TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 2058de1dd3..9c49cd7984 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   char* sbuff = (char*)sendbuff;
   char* rbuff = (char*)recvbuff;
   int nRanks;
@@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
   size_t rankSize = count * wordSize(type);
-  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
 
   // Hypercube AllGather
   for (int mask=1; mask<nRanks; mask<<=1) {
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 3c241d3d14..020a4bc36f 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -1,6 +1,5 @@
 /*************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
 
 static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
 }
 static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
 }
 static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
+    ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclBcast(buff, (int)count, datatype, root, comm, stream);
 }
 static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    hipStream_t stream) {
+    cudaStream_t stream) {
   CHECKCOUNT(recvcount);
   return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
 }
 static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(sendcount);
   return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
 }
diff --git a/src/reduce.cu b/src/reduce.cu
index 62850f8212..dd90c25bf4 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw;
 }
 
-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
   return testSuccess;
 }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index a58d2578af..2f6c8c56d6 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
   *busBw = baseBw * factor;
 }
 
-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/scatter.cu b/src/scatter.cu
index 7445624b71..993289203c 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
     TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index d5b0300cdf..cda6d699ca 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
     int peer = (rank-1+nranks)%nranks;
     TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place sendrecv
   args->reportErrors = in_place ? 0 : 1;
@@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu
index 973b965412..3200ff3918 100644
--- a/verifiable/inexact_regress.cu
+++ b/verifiable/inexact_regress.cu
@@ -24,11 +24,11 @@
 #include <cstdio>
 #include <cstdint>
 #include <hip/hip_bfloat16.h>
-#include <hip/hip_fp16.h>
+#include <cuda_fp16.h>
 
 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = hip_bfloat16;
+using bfloat16 = __nv_bfloat16;
 
 template<typename T>
 struct float_traits;
@@ -182,14 +182,14 @@ __global__ void kernel() {
 int main() {
   std::printf("type=float:\n");
   kernel<float><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
 
   std::printf("\ntype=half:\n");
   kernel<half><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
 
   std::printf("\ntype=bfloat16:\n");
   kernel<bfloat16><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
   return 0;
 }
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index a375809bcf..31fdfe10c8 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -8,8 +8,8 @@
 //#pragma nv_diag_suppress declared_but_not_referenced
 
 #include "verifiable.h"
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
 #include <hip/hip_bfloat16.h>
 
 #include "rccl/rccl.h"
@@ -91,7 +91,7 @@ template<>
 struct IsIntegral<__half>: std::false_type {};
 #if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<hip_bfloat16>: std::false_type {};
+struct IsIntegral<__nv_bfloat16>: std::false_type {};
 #endif
 }
 
@@ -126,7 +126,7 @@ namespace {
   }
   #if RCCL_BFLOAT16 == 1
   template<>
-  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
     return hip_bfloat16(x);
   }
   #endif
@@ -153,7 +153,7 @@ struct ReduceSum {
       return __float2half(__half2float(a) + __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
@@ -169,7 +169,7 @@ struct ReduceProd {
       return __float2half(__half2float(a) * __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
@@ -185,7 +185,7 @@ struct ReduceMin {
     return __half2float(a) < __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
@@ -201,7 +201,7 @@ struct ReduceMax {
       return __half2float(a) > __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
 };
 #if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<hip_bfloat16> {
+struct FloatLayout<__nv_bfloat16> {
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -801,7 +801,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -814,7 +814,7 @@ void prepareInput1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -826,7 +826,7 @@ void prepareInput1(
 
 void ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -877,7 +877,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -890,7 +890,7 @@ void prepareExpected1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -902,7 +902,7 @@ void prepareExpected1(
 
 void ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1051,7 +1051,7 @@ __global__ void verifyInline2(
 template<typename T, typename Uint>
 void verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1080,7 +1080,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, hipStream_t stream
+    int64_t *bad_elt_n, cudaStream_t stream
   ) {
   bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
   #if HAVE_ncclBfloat16
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
   case ncclFloat16: CASE_TY(__half, uint16_t)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
   sweep1<uint64_t>(ncclUint64, "uint64");
   sweep1<__half>(ncclFloat16, "half");
   #if HAVE_ncclBfloat16
-    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
@@ -1187,9 +1187,9 @@ __global__ void sweep() {
 
 int main(int arg_n, char **args) {
   std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  hipSetDevice(0);
+  cudaSetDevice(0);
   sweep<<<1,512>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
   return 0;
 }
 #endif
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index b41ef1ad12..da54778a6f 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -8,7 +8,7 @@
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e
 
-#include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
 
 #include <stdint.h>
 
@@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, hipStream_t stream
+  int64_t *bad_elt_n, cudaStream_t stream
 );
 #endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index fba1fbf35c..c526ffb720 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -1,5 +1,5 @@
 # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 
@@ -8,11 +8,21 @@
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
 
-TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
-TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_HDRS      = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
+${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<

From 7a7a5969d093d376fad8054c408168bf060ba196 Mon Sep 17 00:00:00 2001
From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com>
Date: Wed, 6 Mar 2024 11:11:44 -0700
Subject: [PATCH 155/233] Revert __nv_bfloat16 back to hip_bfloat16 (#64)

---
 verifiable/inexact_regress.cu |  2 +-
 verifiable/verifiable.cu      | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu
index 3200ff3918..49831b6c46 100644
--- a/verifiable/inexact_regress.cu
+++ b/verifiable/inexact_regress.cu
@@ -28,7 +28,7 @@
 
 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = __nv_bfloat16;
+using bfloat16 = hip_bfloat16;
 
 template<typename T>
 struct float_traits;
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 31fdfe10c8..d5eac2b556 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -91,7 +91,7 @@ template<>
 struct IsIntegral<__half>: std::false_type {};
 #if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<__nv_bfloat16>: std::false_type {};
+struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
 }
 
@@ -126,7 +126,7 @@ namespace {
   }
   #if RCCL_BFLOAT16 == 1
   template<>
-  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
     return hip_bfloat16(x);
   }
   #endif
@@ -153,7 +153,7 @@ struct ReduceSum {
       return __float2half(__half2float(a) + __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
@@ -169,7 +169,7 @@ struct ReduceProd {
       return __float2half(__half2float(a) * __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
@@ -185,7 +185,7 @@ struct ReduceMin {
     return __half2float(a) < __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
       return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
@@ -201,7 +201,7 @@ struct ReduceMax {
       return __half2float(a) > __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
       return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
 };
 #if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<__nv_bfloat16> {
+struct FloatLayout<hip_bfloat16> {
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -814,7 +814,7 @@ void prepareInput1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -890,7 +890,7 @@ void prepareExpected1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
   case ncclFloat16: CASE_TY(__half, uint16_t)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
   sweep1<uint64_t>(ncclUint64, "uint64");
   sweep1<__half>(ncclFloat16, "half");
   #if HAVE_ncclBfloat16
-    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");

From 21e59fb283ee88abee852d9595f16ac893b2fc29 Mon Sep 17 00:00:00 2001
From: Andy li <liand@microsoft.com>
Date: Fri, 8 Mar 2024 08:54:41 +0800
Subject: [PATCH 156/233] Enable fp8 support (#63)

* initial checkin

* rename the fp8 datatype name

* update based on cr comments

* resolve the build issue

* resolve fp8 compatibility issue

* fix minor bug and catch up to reflect latest develop branch changes

* add fp8 + operator support

* update fp8 header file

* resolve merge issue from develop branch
---
 src/CMakeLists.txt       |    1 +
 src/common.cu            |   26 +-
 src/common.h             |    5 +
 src/rccl_bfloat8.h       | 1021 ++++++++++++++++++++++++++++++++++++++
 verifiable/verifiable.cu |  104 +++-
 verifiable/verifiable.mk |    7 +-
 6 files changed, 1158 insertions(+), 6 deletions(-)
 create mode 100644 src/rccl_bfloat8.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fb4dc7d5b7..e27a0cf74b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,6 +71,7 @@ set(COMMON_FILES
   common.h
   common.cu
   nccl1_compat.h
+  rccl_bfloat8.h
   rccl_bfloat16.h
   timer.h
   timer.cc
diff --git a/src/common.cu b/src/common.cu
index 0979e6992d..531cc0bea8 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -2,11 +2,13 @@
 /*************************************************************************
  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "cuda_runtime.h"
+#include "rccl_bfloat8.h"
 #include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
@@ -28,12 +30,18 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
   #if RCCL_BFLOAT16 == 1
     , ncclBfloat16
   #endif
+  #if RCCL_FLOAT8 == 1
+    , ncclFp8E4M3, ncclFp8E5M2
+  #endif
   };
   const char *test_typenames[ncclNumTypes] = {
     "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
   #if RCCL_BFLOAT16 == 1
     , "bfloat16"
   #endif
+  #if RCCL_FLOAT8 == 1
+    , "fp8_e4m3", "fp8_e5m2"
+  #endif
   };
   int test_typenum = -1;
 
@@ -100,13 +108,13 @@ static int enable_out_of_place = 1;
 static double parsesize(const char *value) {
     long long int units;
     double size;
-    char size_lit;
+    char size_lit[2];
 
-    int count = sscanf(value, "%lf %1s", &size, &size_lit);
+    int count = sscanf(value, "%lf %1s", &size, size_lit);
 
     switch (count) {
     case 2:
-      switch (size_lit) {
+      switch (size_lit[0]) {
       case 'G':
       case 'g':
         units = 1024*1024*1024;
@@ -401,6 +409,9 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
         #if defined(RCCL_BFLOAT16)
         rccl_bfloat16 bf16;
         #endif
+        #if defined(RCCL_FLOAT8)
+        rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
+        #endif
       };
       switch(type) {
       case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
@@ -415,6 +426,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       #if defined(RCCL_BFLOAT16)
       case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
       #endif
+      #if defined(RCCL_FLOAT8)
+      case ncclFp8E4M3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
+      case ncclFp8E5M2: fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
+      #endif
+      case ncclNumTypes: break;
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
     }
@@ -753,6 +769,10 @@ int main(int argc, char* argv[]) {
       #if defined(RCCL_BFLOAT16)
         test_typenum++; // bfloat16
       #endif
+      #if defined(RCCL_FLOAT8)
+        test_typenum++; // fp8_e4m3
+        test_typenum++; // fp8_e5m2
+      #endif
     }
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
       test_opnum++; // PreMulSum
diff --git a/src/common.h b/src/common.h
index cabc6a6657..23dccebc7d 100644
--- a/src/common.h
+++ b/src/common.h
@@ -1,6 +1,7 @@
 /*************************************************************************
  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -221,6 +222,10 @@ static size_t wordSize(ncclDataType_t type) {
 #if NCCL_MAJOR >= 2
     //case ncclInt8:
     case ncclUint8:
+#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
+    case ncclFp8E4M3:
+    case ncclFp8E5M2:
+#endif
 #endif
       return 1;
     case ncclHalf:
diff --git a/src/rccl_bfloat8.h b/src/rccl_bfloat8.h
new file mode 100644
index 0000000000..01cab41f71
--- /dev/null
+++ b/src/rccl_bfloat8.h
@@ -0,0 +1,1021 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef ROCBLAS_FLOAT8_H
+#define ROCBLAS_FLOAT8_H
+
+#include <stdint.h>
+
+#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+/*! \brief Structs representing an 8-bit floating-point number (storage only: a single raw byte). */
+
+typedef struct
+{
+    uint8_t data;
+} rccl_float8;
+
+typedef struct
+{
+    uint8_t data;
+} rccl_bfloat8;
+
+#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#define HIP_HOST_DEVICE __host__ __device__
+#define HIP_HOST __host__
+#define HIP_DEVICE __device__
+
+// We are clipping in down conversion by default
+#define rccl_float8_downcast_clipping 1
+
+namespace rocblas_hip_f8_impl
+{
+    __host__ inline int clz(uint32_t x)
+    {
+        return __builtin_clz(x);
+    }
+    __device__ inline int clz(uint32_t x)
+    {
+        return __clz(x);
+    }
+
+    template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
+    HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch = false, uint32_t rng = 0)
+    {
+        constexpr bool is_half  = std::is_same<T, _Float16>::value;
+        constexpr bool is_float = std::is_same<T, float>::value;
+        static_assert(wm + we == 7, "wm+we==7");
+        static_assert(is_half || is_float, "Only half and float can be cast to f8");
+
+        const int mfmt = (sizeof(T) == 4) ? 23 : 10;
+        uint32_t  x;
+        if(sizeof(T) == 4)
+            x = reinterpret_cast<uint32_t&>(_x);
+        else
+            x = reinterpret_cast<uint16_t&>(_x);
+
+        uint32_t y, head, mantissa;
+        int      exponent, bias;
+        uint32_t sign;
+
+        if(sizeof(T) == 4)
+        {
+            head     = x & 0xFF800000;
+            mantissa = x & 0x7FFFFF;
+            exponent = (head >> 23) & 0xFF;
+            sign     = head >> 31;
+            bias     = 127;
+        }
+        else
+        {
+            head     = x & 0xFC00;
+            mantissa = x & 0x3FF;
+            exponent = (head >> 10) & 0x1F;
+            sign     = head >> 15;
+            bias     = 15;
+        }
+
+        uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+        // Deal with inf and NaNs
+        if(negative_zero_nan)
+        {
+            if(sizeof(T) == 4)
+            {
+                if((x & 0x7F800000) == 0x7F800000)
+                    return 0x80;
+            }
+            else
+            {
+                //if(__hisinf(x) || __hisnan(x))
+                if((x & 0x7C00) == 0x7C00)
+                    return 0x80;
+            }
+        }
+        else
+        {
+            if(sizeof(T) == 4)
+            {
+                if((x & 0x7F800000) == 0x7F800000)
+                    return signed_inf + (mantissa != 0 ? 1 : 0);
+            }
+            else
+            {
+                if((x & 0x7C00) == 0x7C00)
+                    return signed_inf + (mantissa != 0 ? 1 : 0);
+            }
+        }
+        if(x == 0)
+            return 0;
+
+        // First check whether the input is normal or denormal, since only normals carry an implicit 1.
+        // Then adjust the exponent to align with the F8 exponent and, at the same time, shift
+        // the mantissa. For stochastic rounding, add rng to the mantissa and truncate; for
+        // RNE, no rng is added. Finally, check whether rounding produced a carry and adjust
+        // the exponent and mantissa again.
+
+        // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+        const int f8_bias                  = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+        const int f8_denormal_act_exponent = 1 - f8_bias; //actual exponent of f8 denormal
+        // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+        // f8_exponent is the converted f8 exponent with bias encoding
+        // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+        // the difference needs to be adjusted and mantissa shifted
+        int act_exponent, f8_exponent, exponent_diff;
+
+        if(exponent == 0)
+        { // fp32/fp16 is in denormal.
+            /* fp32 denormals are below 2^-126, so they are usually not a concern here; we are mostly concerned with fp16.
+   In this case, f8 is usually in denormal. But there could be exceptions.
+   fp16 denormal has exponent bias 15 while bf8 with NANOO has exponent bias 16.
+   It means that there are some numbers in fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15.
+   fp16 numbers where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal.
+   In this case, the fp16 mantissa should be shifted left by 1  */
+            act_exponent  = exponent - bias + 1;
+            exponent_diff = f8_denormal_act_exponent
+                            - act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+        }
+        else
+        { // fp32/fp16 is normal with implicit 1
+            act_exponent = exponent - bias;
+            if(act_exponent <= f8_denormal_act_exponent)
+            {
+                /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+       For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+       actual exponent is -7, it is actually larger due to the implicit 1,
+       Therefore it needs to be adjusted to -6 and the mantissa shifted right by 1.
+       So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+                exponent_diff = f8_denormal_act_exponent - act_exponent;
+            }
+            else
+            { //both fp32/fp16 and f8 are in normal range
+                exponent_diff
+                    = 0; // exponent_diff=0 does not mean there is no difference for this case,
+                //act_exponent could be larger. Just that it does not need shift mantissa
+            }
+            mantissa += (1 << mfmt); //Add the implicit 1 into mantissa
+        }
+
+        bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1))
+                        == (1 << (mfmt - wm + exponent_diff - 1));
+        /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift right
+     as shift right could rip off some residual part and make something not midpoint look like midpoint.
+     For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than midpoint,
+     but after shift right by 4 bits, it would look like midpoint.
+  */
+
+        if(exponent_diff > 0)
+            mantissa >>= exponent_diff;
+        else if(exponent_diff == -1)
+            mantissa <<= -exponent_diff;
+        bool implicit_one = mantissa & (1 << mfmt);
+        // if there is no implicit 1, the f8 is denormal and the exponent needs to be adjusted to the denormal exponent
+        f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias
+                      - (implicit_one ? 0 : 1);
+
+        //Now we have the exponent and mantissa adjusted
+        uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+        bool     odd       = mantissa
+                   & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+        mantissa
+            += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
+
+        //Now we deal with overflow
+        if(f8_exponent == 0)
+        {
+            if((1 << mfmt) & mantissa)
+            {
+                f8_exponent = 1; //denormal overflow to become normal, promote exponent
+            }
+        }
+        else
+        {
+            if((1 << (mfmt + 1)) & mantissa)
+            {
+                mantissa >>= 1;
+                f8_exponent++;
+            }
+        }
+
+        mantissa >>= (mfmt - wm);
+
+        // above range: quantize to maximum possible float of the same sign
+        const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
+        if(f8_exponent > max_exp)
+        {
+            if(clip)
+            {
+                mantissa    = (1 << wm) - 1;
+                f8_exponent = max_exp;
+            }
+            else
+            {
+                return signed_inf;
+            }
+        }
+
+        if(f8_exponent == 0 && mantissa == 0)
+            return negative_zero_nan ? 0 : (sign << 7);
+        mantissa &= (1 << wm) - 1;
+        return (sign << 7) | (f8_exponent << wm) | mantissa;
+    }
+
+    template <int wm, int we, typename T, bool negative_zero_nan>
+    HIP_HOST_DEVICE T cast_from_f8(uint8_t x)
+    {
+        constexpr bool is_half  = std::is_same<T, _Float16>::value;
+        constexpr bool is_float = std::is_same<T, float>::value;
+        static_assert(is_half || is_float, "only half and float are supported");
+
+        constexpr int weo = is_half ? 5 : 8;
+        constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
+
+        T fInf, fNegInf, fNaN, fNeg0;
+        if(is_half)
+        {
+            const uint16_t ihInf    = 0x7C00;
+            const uint16_t ihNegInf = 0xFC00;
+            const uint16_t ihNaN    = 0x7C01;
+            const uint16_t ihNeg0   = 0x8000;
+            fInf                    = reinterpret_cast<const _Float16&>(ihInf);
+            fNegInf                 = reinterpret_cast<const _Float16&>(ihNegInf);
+            fNaN                    = reinterpret_cast<const _Float16&>(ihNaN);
+            fNeg0                   = reinterpret_cast<const _Float16&>(ihNeg0);
+        }
+        else if(is_float)
+        {
+            const uint32_t ifInf    = 0x7F800000;
+            const uint32_t ifNegInf = 0xFF800000;
+            const uint32_t ifNaN    = 0x7F800001;
+            const uint32_t ifNeg0   = 0x80000000;
+            fInf                    = reinterpret_cast<const float&>(ifInf);
+            fNegInf                 = reinterpret_cast<const float&>(ifNegInf);
+            fNaN                    = reinterpret_cast<const float&>(ifNaN);
+            fNeg0                   = reinterpret_cast<const float&>(ifNeg0);
+        }
+
+        if(x == 0)
+            return 0;
+
+        uint32_t sign     = x >> 7;
+        uint32_t mantissa = x & ((1 << wm) - 1);
+        int      exponent = (x & 0x7F) >> wm;
+        if(negative_zero_nan)
+        {
+            if(x == 0x80)
+                return fNaN;
+        }
+        else
+        {
+            if(x == 0x80)
+                return fNeg0;
+            if(exponent == ((1 << we) - 1))
+                return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
+        }
+        typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
+        if(we == 5 && is_half && !negative_zero_nan)
+        {
+            retval = x << 8;
+            return reinterpret_cast<const T&>(retval);
+        }
+
+        const int exp_low_cutoff
+            = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+        //subnormal input
+        if(exponent == 0)
+        {
+            //guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+            int sh = 1 + clz(mantissa) - (32 - wm);
+            mantissa <<= sh;
+            exponent += 1 - sh;
+            mantissa &= ((1 << wm) - 1);
+        }
+        exponent += exp_low_cutoff - 1;
+        mantissa <<= wmo - wm;
+
+        // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+        if(exponent <= 0)
+        {
+            mantissa |= 1 << wmo;
+            mantissa >>= 1 - exponent;
+            exponent = 0;
+        }
+
+        if(sizeof(T) == 2)
+            retval = (sign << 15) | (exponent << 10) | mantissa;
+        else
+            retval = (sign << 31) | (exponent << 23) | mantissa;
+        return reinterpret_cast<const T&>(retval);
+    }
+} // namespace rocblas_hip_f8_impl
+
+static __device__ bool rocblas_hip_f8_bias_mode_bit_device = true;
+static bool            rocblas_hip_f8_bias_mode_bit_host   = true;
+
+struct rccl_float8
+{
+    uint8_t data;
+    enum class rocblas_hip_f8_rounding_mode
+    {
+        standard,
+        stochastic
+    };
+
+    // default constructor
+    HIP_HOST_DEVICE rccl_float8() = default;
+
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // device specific optimized F8 down-conversion code
+
+    template <bool stochastic_rounding = false>
+    static HIP_DEVICE uint8_t cast_to_f8_from_f32(float v, uint32_t rng = 0)
+    {
+        uint8_t i8data;
+        union
+        {
+            float    fval;
+            uint32_t i32val;
+            uint8_t  i8val[4]; // NOTE: not endian independent
+        } val;
+
+        uint32_t ival = 0;
+        val.fval      = v;
+
+#ifdef rccl_float8_downcast_clipping
+        if((val.i32val & 0x7F800000) != 0x7F800000) /// propagate NAN/INF, no clipping
+            val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
+#endif
+        if(stochastic_rounding)
+        {
+            ival       = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
+            val.i32val = ival;
+            i8data     = val.i8val[0]; // little endian
+        }
+        else // RNE CVT
+        {
+            ival = __builtin_amdgcn_cvt_pk_fp8_f32(
+                val.fval, val.fval, ival, false); // false -> WORD0
+            val.i32val = ival;
+            i8data     = val.i8val[0];
+        }
+        return i8data;
+    }
+
+#endif // __gfx940__
+
+    // constructor from float
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+
+    // NOTE: ON-DEVICE... always optimal bias
+    explicit HIP_DEVICE rccl_float8(float                        v,
+                                   rocblas_hip_f8_rounding_mode rm
+                                   = rocblas_hip_f8_rounding_mode::standard,
+                                   uint32_t rng = 0)
+    {
+        // runtime branch, use cast_to_f8_from_f32 if want to avoid it
+        if(rm == rocblas_hip_f8_rounding_mode::stochastic)
+            data = cast_to_f8_from_f32<true>(v, rng);
+        else
+            data = cast_to_f8_from_f32<false>(v);
+    }
+
+    // Host only implementation using s/w simulation
+    explicit HIP_HOST
+#else
+    // both Host and DEVICE for non-gfx940 using s/w simulation
+    explicit HIP_HOST_DEVICE
+#endif
+        rccl_float8(float                        v,
+                   rocblas_hip_f8_rounding_mode rm  = rocblas_hip_f8_rounding_mode::standard,
+                   uint32_t                     rng = 0)
+    {
+#ifdef rccl_float8_downcast_clipping
+        data = rocblas_hip_f8_impl::
+            cast_to_f8<3, 4, float, true /*negative_zero_nan*/, true /*clip*/>(
+                v, (rm == rocblas_hip_f8_rounding_mode::stochastic), rng);
+#else // rccl_float8_downcast_clipping
+        data = rocblas_hip_f8_impl::
+            cast_to_f8<3, 4, float, true /*negative_zero_nan*/, false /*clip*/>(
+                v, (rm == rocblas_hip_f8_rounding_mode::stochastic), rng);
+#endif // rccl_float8_downcast_clipping
+    }
+
+    // Constructor from half
+    explicit HIP_HOST_DEVICE rccl_float8(_Float16                     v,
+                                        rocblas_hip_f8_rounding_mode rm
+                                        = rocblas_hip_f8_rounding_mode::standard,
+                                        uint32_t rng = 0)
+        : rccl_float8((float)v, rm, rng)
+    {
+    }
+    // constructor from int
+    explicit HIP_HOST_DEVICE rccl_float8(int                          v,
+                                        rocblas_hip_f8_rounding_mode rm
+                                        = rocblas_hip_f8_rounding_mode::standard,
+                                        uint32_t rng = 0)
+        : rccl_float8((float)v, rm, rng)
+    {
+    }
+    // constructor from double
+    explicit HIP_HOST_DEVICE rccl_float8(double                       v,
+                                        rocblas_hip_f8_rounding_mode rm
+                                        = rocblas_hip_f8_rounding_mode::standard,
+                                        uint32_t rng = 0)
+        : rccl_float8((float)v, rm, rng)
+    {
+    }
+
+    // convert to float
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // upcast using device specific intrinsic
+    explicit inline HIP_DEVICE operator float() const
+    {
+        float    fval;
+        uint32_t i32val = static_cast<uint32_t>(data);
+
+        // upcast
+        asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
+
+        return fval;
+    }
+
+    explicit inline HIP_HOST operator float() const
+#else // non gfx940
+    explicit inline HIP_HOST_DEVICE operator float() const
+#endif
+    {
+        return rocblas_hip_f8_impl::cast_from_f8<3, 4, float, true /*negative_zero_nan*/>(data);
+    }
+
+    // convert to half
+    explicit inline HIP_HOST_DEVICE operator _Float16() const
+    {
+        return _Float16(float(*this)); // convert to float, then convert to f16
+    }
+
+    // check for zero
+    inline HIP_HOST_DEVICE bool is_zero() const
+    {
+        return data == 0x00;
+    }
+
+    // check for nan
+    inline HIP_HOST_DEVICE bool is_nan() const
+    {
+        return data == 0x80;
+    }
+
+    // check for inf
+    inline HIP_HOST_DEVICE bool is_inf() const
+    {
+        return data == 0x80;
+    }
+
+    // assignment overloading only from the same F8 types
+    inline __host__ __device__ rccl_float8& operator=(const rccl_float8& a)
+    {
+        data = a.data;
+        return *this;
+    }
+};
+
+struct rccl_bfloat8
+{
+    uint8_t data;
+    enum class rocblas_hip_f8_rounding_mode
+    {
+        standard,
+        stochastic
+    };
+
+    // default constructor
+    HIP_HOST_DEVICE rccl_bfloat8() = default;
+
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // device specific optimized F8 down-conversion code
+
+    template <bool stochastic_rounding = false>
+    static HIP_DEVICE uint8_t cast_to_bf8_from_f32(float v, uint32_t rng = 0)
+    {
+        uint8_t i8data;
+        union
+        {
+            float    fval;
+            uint32_t i32val;
+            uint8_t  i8val[4]; // NOTE: not endian independent
+        } val;
+
+        uint32_t ival = 0;
+        val.fval      = v;
+
+#ifdef rccl_float8_downcast_clipping
+        if((val.i32val & 0x7F800000) != 0x7F800000) // propagate NAN/INF, no clipping
+            val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0);
+#endif
+        if(stochastic_rounding)
+        {
+            ival       = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+            val.i32val = ival;
+            i8data     = val.i8val[0]; // little endian
+        }
+        else // RNE CVT
+        {
+            ival = __builtin_amdgcn_cvt_pk_bf8_f32(
+                val.fval, val.fval, ival, false); // false -> WORD0
+            val.i32val = ival;
+            i8data     = val.i8val[0];
+        }
+        return i8data;
+    }
+
+#endif // __gfx940__
+
+    // constructor from float
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+
+    // NOTE: ON-DEVICE... always optimal bias
+    explicit HIP_DEVICE rccl_bfloat8(float                        v,
+                                    rocblas_hip_f8_rounding_mode rm
+                                    = rocblas_hip_f8_rounding_mode::standard,
+                                    uint32_t rng = 0)
+    {
+        // runtime branch, use cast_to_bf8_from_f32 directly if you want to avoid it
+        if(rm == rocblas_hip_f8_rounding_mode::stochastic)
+            data = cast_to_bf8_from_f32<true>(v, rng);
+        else
+            data = cast_to_bf8_from_f32<false>(v);
+    }
+
+    // Host only implementation using s/w simulation
+    explicit HIP_HOST
+#else
+    // both Host and DEVICE for non-gfx940 using s/w simulation
+    explicit HIP_HOST_DEVICE
+#endif
+        rccl_bfloat8(float                        v,
+                    rocblas_hip_f8_rounding_mode rm  = rocblas_hip_f8_rounding_mode::standard,
+                    uint32_t                     rng = 0)
+    {
+#ifdef rccl_float8_downcast_clipping
+        data = rocblas_hip_f8_impl::
+            cast_to_f8<2, 5, float, true /*negative_zero_nan*/, true /*clip*/>(
+                v, (rm == rocblas_hip_f8_rounding_mode::stochastic), rng);
+#else
+        data = rocblas_hip_f8_impl::
+            cast_to_f8<2, 5, float, true /*negative_zero_nan*/, false /*clip*/>(
+                v, (rm == rocblas_hip_f8_rounding_mode::stochastic), rng);
+#endif // rccl_float8_downcast_clipping
+    }
+
+    // Constructor from half
+    explicit HIP_HOST_DEVICE rccl_bfloat8(_Float16                     v,
+                                         rocblas_hip_f8_rounding_mode rm
+                                         = rocblas_hip_f8_rounding_mode::standard,
+                                         uint32_t rng = 0)
+        : rccl_bfloat8((float)v, rm, rng)
+    {
+    }
+    // constructor from int
+    explicit HIP_HOST_DEVICE rccl_bfloat8(int                          v,
+                                         rocblas_hip_f8_rounding_mode rm
+                                         = rocblas_hip_f8_rounding_mode::standard,
+                                         uint32_t rng = 0)
+        : rccl_bfloat8((float)v, rm, rng)
+    {
+    }
+    // constructor from double
+    explicit HIP_HOST_DEVICE rccl_bfloat8(double                       v,
+                                         rocblas_hip_f8_rounding_mode rm
+                                         = rocblas_hip_f8_rounding_mode::standard,
+                                         uint32_t rng = 0)
+        : rccl_bfloat8((float)v, rm, rng)
+    {
+    }
+
+    // convert to float
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // upcast using device specific intrinsic
+    explicit inline HIP_DEVICE operator float() const
+    {
+        float    fval;
+        uint32_t i32val = static_cast<uint32_t>(data);
+
+        // upcast
+        asm volatile("v_cvt_f32_bf8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
+
+        return fval;
+    }
+
+    explicit inline HIP_HOST operator float() const
+#else // non gfx940
+    explicit inline HIP_HOST_DEVICE operator float() const
+#endif
+    {
+        return rocblas_hip_f8_impl::cast_from_f8<2, 5, float, true /*negative_zero_nan*/>(data);
+    }
+
+    // convert to half
+    explicit inline HIP_HOST_DEVICE operator _Float16() const
+    {
+        return _Float16(float(*this)); // convert to float, then convert to f16
+    }
+
+    // check for zero
+    inline HIP_HOST_DEVICE bool is_zero() const
+    {
+        return data == 0x00;
+    }
+
+    // check for nan
+    inline HIP_HOST_DEVICE bool is_nan() const
+    {
+        return data == 0x80;
+    }
+
+    // check for inf (NOTE(review): tests the same 0x80 pattern as is_nan() -- confirm intended)
+    inline HIP_HOST_DEVICE bool is_inf() const
+    {
+        return data == 0x80;
+    }
+
+    // assignment overloading only from the same F8 types
+    inline __host__ __device__ rccl_bfloat8& operator=(const rccl_bfloat8& a)
+    {
+        data = a.data;
+        return *this;
+    }
+};
+
+namespace std
+{
+    inline rccl_float8 sin(rccl_float8 a)
+    {
+        return rccl_float8(sinf(float(a)));
+    }
+    inline rccl_float8 cos(rccl_float8 a)
+    {
+        return rccl_float8(cosf(float(a)));
+    }
+    inline rccl_bfloat8 sin(rccl_bfloat8 a)
+    {
+        return rccl_bfloat8(sinf(float(a)));
+    }
+    inline rccl_bfloat8 cos(rccl_bfloat8 a)
+    {
+        return rccl_bfloat8(cosf(float(a)));
+    }
+    __device__ __host__ constexpr rccl_float8 real(const rccl_float8& a)
+    {
+        return a;
+    }
+    __device__ __host__ constexpr rccl_bfloat8 real(const rccl_bfloat8& a)
+    {
+        return a;
+    }
+}
+
+// Special operator overloading
+inline std::ostream& operator<<(std::ostream& os, const rccl_float8& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat8& bf8)
+{
+    return os << float(bf8);
+}
+
+// all + operator overloading with mixed types
+// mixed types, always converts to f32, does computation in f32, and returns float
+inline __host__ __device__ float operator+(const float fa, rccl_float8 b)
+{
+    return (fa + float(b));
+}
+
+inline __host__ __device__ float operator+(const float fa, rccl_bfloat8 b)
+{
+    return (fa + float(b));
+}
+
+inline __host__ __device__ float operator+(rccl_float8 a, const float fb)
+{
+    return (float(a) + fb);
+}
+
+inline __host__ __device__ float operator+(rccl_bfloat8 a, const float fb)
+{
+    return (float(a) + fb);
+}
+
+inline __host__ __device__ float operator+(rccl_float8 a, rccl_bfloat8 b)
+{
+    return (float(a) + float(b));
+}
+
+inline __host__ __device__ float operator+(rccl_bfloat8 a, rccl_float8 b)
+{
+    return (float(a) + float(b));
+}
+
+inline __host__ __device__ rccl_float8 operator+(rccl_float8 a, rccl_float8 b)
+{
+    return rccl_float8(float(a) + float(b));
+}
+
+inline __host__ __device__ rccl_bfloat8 operator+(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return rccl_bfloat8(float(a) + float(b));
+}
+
+inline __host__ __device__ rccl_float8& operator+=(rccl_float8& a, rccl_float8 b)
+{
+    return a = rccl_float8(float(a) + float(b));
+}
+
+inline __host__ __device__ rccl_bfloat8& operator+=(rccl_bfloat8& a, rccl_bfloat8 b)
+{
+    return a = rccl_bfloat8(float(a) + float(b));
+}
+
+// overloading multiplication, always returns float,
+inline __host__ __device__ float operator*(rccl_float8 a, rccl_float8 b)
+{
+    return float(a) * float(b);
+}
+
+inline __host__ __device__ float operator*(float a, rccl_float8 b)
+{
+    return (a * float(b));
+}
+
+inline __host__ __device__ float operator*(rccl_float8 a, float b)
+{
+    return (float(a) * b);
+}
+
+inline __host__ __device__ float operator*(int32_t a, rccl_float8 b)
+{
+    return ((float)a * float(b));
+}
+
+inline __host__ __device__ float operator*(double a, rccl_float8 b)
+{
+    return ((float)a * float(b));
+}
+
+inline __host__ __device__ float operator*(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return float(a) * float(b);
+}
+
+inline __host__ __device__ float operator*(float a, rccl_bfloat8 b)
+{
+    return (a * float(b));
+}
+
+inline __host__ __device__ float operator*(rccl_bfloat8 a, float b)
+{
+    return (float(a) * b);
+}
+
+inline __host__ __device__ float operator*(int32_t a, rccl_bfloat8 b)
+{
+    return ((float)a * float(b));
+}
+
+inline __host__ __device__ float operator*(double a, rccl_bfloat8 b)
+{
+    return ((float)a * float(b));
+}
+
+// overloading for mixed f8 and bf8 types
+inline __host__ __device__ float operator*(rccl_float8 a, rccl_bfloat8 b)
+{
+    return float(a) * float(b);
+}
+
+inline __host__ __device__ float operator*(rccl_bfloat8 a, rccl_float8 b)
+{
+    return float(a) * float(b);
+}
+
+// all - operator overloading with mixed types
+// mixed types, always converts to f32, does computation in f32, and returns float
+inline __host__ __device__ float operator-(const float fa, rccl_float8 b)
+{
+    return (fa - float(b));
+}
+
+inline __host__ __device__ float operator-(const float fa, rccl_bfloat8 b)
+{
+    return (fa - float(b));
+}
+
+inline __host__ __device__ float operator-(rccl_float8 a, const float fb)
+{
+    return (float(a) - fb);
+}
+
+inline __host__ __device__ float operator-(rccl_bfloat8 a, const float fb)
+{
+    return (float(a) - fb);
+}
+
+inline __host__ __device__ float operator-(rccl_float8 a, rccl_bfloat8 b)
+{
+    return (float(a) - float(b));
+}
+
+inline __host__ __device__ float operator-(rccl_bfloat8 a, rccl_float8 b)
+{
+    return (float(a) - float(b));
+}
+
+inline __host__ __device__ rccl_float8 operator-(rccl_float8 a, rccl_float8 b)
+{
+    return rccl_float8(float(a) - float(b));
+}
+
+inline __host__ __device__ rccl_bfloat8 operator-(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return rccl_bfloat8(float(a) - float(b));
+}
+
+inline __host__ __device__ rccl_float8& operator-=(rccl_float8& a, rccl_float8 b)
+{
+    return a = rccl_float8(float(a) - float(b));
+}
+
+inline __host__ __device__ rccl_bfloat8& operator-=(rccl_bfloat8& a, rccl_bfloat8 b)
+{
+    return a = rccl_bfloat8(float(a) - float(b));
+}
+
+// overloading division, always returns float,
+inline __host__ __device__ float operator/(rccl_float8 a, rccl_float8 b)
+{
+    return float(a) / float(b);
+}
+
+inline __host__ __device__ float operator/(float a, rccl_float8 b)
+{
+    return (a / float(b));
+}
+
+inline __host__ __device__ float operator/(rccl_float8 a, float b)
+{
+    return (float(a) / b);
+}
+
+inline __host__ __device__ float operator/(int32_t a, rccl_float8 b)
+{
+    return ((float)a / float(b));
+}
+
+inline __host__ __device__ float operator/(double a, rccl_float8 b)
+{
+    return ((float)a / float(b));
+}
+
+inline __host__ __device__ float operator/(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return float(a) / float(b);
+}
+
+inline __host__ __device__ float operator/(float a, rccl_bfloat8 b)
+{
+    return (a / float(b));
+}
+
+inline __host__ __device__ float operator/(rccl_bfloat8 a, float b)
+{
+    return (float(a) / b);
+}
+
+inline __host__ __device__ float operator/(int32_t a, rccl_bfloat8 b)
+{
+    return ((float)a / float(b));
+}
+
+inline __host__ __device__ float operator/(double a, rccl_bfloat8 b)
+{
+    return ((float)a / float(b));
+}
+
+// overloading for mixed f8 and bf8 types
+inline __host__ __device__ float operator/(rccl_float8 a, rccl_bfloat8 b)
+{
+    return float(a) / float(b);
+}
+
+inline __host__ __device__ float operator/(rccl_bfloat8 a, rccl_float8 b)
+{
+    return float(a) / float(b);
+}
+
+// overloading for compare
+inline __host__ __device__ bool operator==(rccl_float8 a, rccl_float8 b)
+{
+    return (a.data == b.data);
+}
+
+inline __host__ __device__ bool operator==(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return (a.data == b.data);
+}
+
+inline __host__ __device__ bool operator!=(rccl_float8 a, rccl_float8 b)
+{
+    return (a.data != b.data);
+}
+
+inline __host__ __device__ bool operator!=(rccl_bfloat8 a, rccl_bfloat8 b)
+{
+    return (a.data != b.data);
+}
+
+// ================ Explicit downcasting to support different rounding (RNE, SR) ===============
+// NOTE: we are going to remove all assignment operator overloading from other types and enforce
+// this explicit_downcast function to make any rounding behavior default
+// We have to explicitly call this function with the SR flag
+
+template <typename T,
+          typename Ta,
+          bool stochastic_rounding,
+          typename std::enable_if<std::is_same<T, Ta>{}, int>::type = 0>
+inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng = 0)
+{
+    // same type, no conversion
+    return a;
+}
+
+// Use h/w intrinsic and optimized version when __gfx940__
+template <
+    typename T,
+    typename Ta,
+    bool stochastic_rounding,
+    typename std::enable_if<(!(std::is_same<T, Ta>{})
+                             && (std::is_same<T, rccl_float8>{} || std::is_same<T, rccl_bfloat8>{})),
+                            int>::type
+    = 0>
+inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
+{
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
+    T val;
+    if(std::is_same<T, rccl_float8>::value)
+        val.data = rccl_float8::cast_to_f8_from_f32<stochastic_rounding>(float(a), rng);
+    else
+        val.data = rccl_bfloat8::cast_to_bf8_from_f32<stochastic_rounding>(float(a), rng);
+    return val;
+#else // non gfx940
+    return T(float(a),
+             stochastic_rounding ? T::rocblas_hip_f8_rounding_mode::stochastic
+                                 : T::rocblas_hip_f8_rounding_mode::standard,
+             rng);
+#endif // __gfx940__
+}
+
+// NOTE: The above code is good if we don't consider HIP-GEMM code and only consider the quantization.
+// However, if we need HIP-GEMM for fall-back, explicit_downcast must also handle Tacc=f32 to To=f16/bf16 conversion
+template <
+    typename T,
+    typename Ta,
+    bool stochastic_rounding,
+    typename std::enable_if<(!(std::is_same<T, Ta>{})
+                             && !(std::is_same<T, rccl_float8>{} || std::is_same<T, rccl_bfloat8>{})),
+                            int>::type
+    = 0>
+inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
+{
+    // the return type is not an F8 type, so no SR is applied for it
+    // not sure if we have direct conversion, so converting to float first
+    // no effect if the input type is float
+    return T(float(a));
+}
+
+// =================================================================================================
+
+#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#endif // ROCBLAS_FLOAT8_H
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index d5eac2b556..e6fd1dfc35 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -1,7 +1,7 @@
 /*************************************************************************
  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
  * See LICENSE.txt for license information
  ************************************************************************/
 
@@ -13,13 +13,20 @@
 #include <hip/hip_bfloat16.h>
 
 #include "rccl/rccl.h"
+#include "rccl_bfloat8.h"
 
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 == 1
   #define HAVE_ncclBfloat16 1
 #else
   #define HAVE_ncclBfloat16 0
 #endif
 
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_FLOAT8 == 1
+  #define HAVE_ncclfp8 1
+#else
+  #define HAVE_ncclfp8 0
+#endif
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   #define HAVE_ncclAvg 1
 #else
@@ -93,6 +100,12 @@ struct IsIntegral<__half>: std::false_type {};
 template<>
 struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
+#if RCCL_FLOAT8 == 1
+template<>
+struct IsIntegral<rccl_float8>: std::false_type {};
+template<>
+struct IsIntegral<rccl_bfloat8>: std::false_type {};
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -130,6 +143,16 @@ namespace {
     return hip_bfloat16(x);
   }
   #endif
+  #if RCCL_FLOAT8 == 1
+  template<>
+  __host__ __device__ rccl_float8 castTo<rccl_float8>(float x) {
+    return static_cast<rccl_float8>(x);
+  }
+  template<>
+  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(float x) {
+    return static_cast<rccl_bfloat8>(x);
+  }
+  #endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -157,6 +180,14 @@ struct ReduceSum {
       return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
+  #if RCCL_FLOAT8 == 1
+  __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
+      return rccl_float8(static_cast<float>(a) + static_cast<float>(b));
+  }
+  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
+      return rccl_bfloat8(static_cast<float>(a) + static_cast<float>(b));
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -173,6 +204,20 @@ struct ReduceProd {
       return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
+  #if RCCL_FLOAT8 == 1
+  __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
+      return static_cast<rccl_float8>(a * b);
+  }
+  __host__ __device__ rccl_float8 operator()(rccl_float8 a, float b) const {
+      return static_cast<rccl_float8>(a * b);
+  }
+  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
+      return static_cast<rccl_bfloat8>(a * b);
+  }
+  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, float b) const {
+      return static_cast<rccl_bfloat8>(a * b);
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -189,6 +234,14 @@ struct ReduceMin {
       return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
+  #if RCCL_FLOAT8 == 1
+  __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
+      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
+  }
+  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
+      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -205,6 +258,14 @@ struct ReduceMax {
       return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
+  #if RCCL_FLOAT8 == 1
+  __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
+      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
+  }
+  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
+      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -285,6 +346,18 @@ struct FloatLayout<hip_bfloat16> {
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #endif
+#if RCCL_FLOAT8 == 1
+template<>
+struct FloatLayout<rccl_float8> {
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<rccl_bfloat8> {
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+#endif
 
 template<typename T>
 __host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
@@ -816,6 +889,10 @@ void prepareInput1(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3: CASE_TY(rccl_float8)
+  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
   default: assert(0);
@@ -892,6 +969,10 @@ void prepareExpected1(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3: CASE_TY(rccl_float8)
+  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
   default: assert(0);
@@ -962,6 +1043,13 @@ __host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
     coef = .66f;
     break;
   #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3:
+  case ncclFp8E5M2:
+    power = .91f;
+    coef = .66f;
+    break;
+  #endif
   }
   #if __CUDA_ARCH__
     return 1 + unsigned(coef*powf(float(rank_n), power));
@@ -1086,6 +1174,10 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
     floating |= elt_ty == ncclBfloat16;
   #endif
+  #if HAVE_ncclfp8
+    floating |= elt_ty == ncclFp8E4M3;
+    floating |= elt_ty == ncclFp8E5M2;
+  #endif
 
   unsigned tolerance = 0;
   #if HAVE_ncclAvg
@@ -1114,6 +1206,10 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3: CASE_TY(rccl_float8, uint8_t)
+  case ncclFp8E5M2: CASE_TY(rccl_bfloat8, uint8_t)
+  #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
   default: assert(0);
@@ -1181,6 +1277,10 @@ __global__ void sweep() {
   #if HAVE_ncclBfloat16
     sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
+  #if HAVE_ncclfp8
+    sweep1<rccl_float8>(ncclFp8E4M3, "fp8_e4m3");
+    sweep1<rccl_bfloat8>(ncclFp8E5M2, "fp8_e5m2");
+  #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
 }
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index c526ffb720..98ffc6ecba 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -21,7 +21,12 @@ ${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 	@mkdir -p ${HIPIFY_DIR}
 	hipify-perl -quiet-warnings $< > $@
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
+${HIPIFY_DIR}/rccl_bfloat8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_bfloat8.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_bfloat8.h
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
 	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"

From e447c173822e5e6ae47681f3e8386b72036f0bce Mon Sep 17 00:00:00 2001
From: Andy li <liand@microsoft.com>
Date: Sat, 9 Mar 2024 02:02:40 +0800
Subject: [PATCH 157/233] update the fp8 header file name (#65)

* update the fp8 header name
---
 src/CMakeLists.txt                    | 2 +-
 src/common.cu                         | 2 +-
 src/{rccl_bfloat8.h => rccl_float8.h} | 0
 verifiable/verifiable.cu              | 2 +-
 verifiable/verifiable.mk              | 4 ++--
 5 files changed, 5 insertions(+), 5 deletions(-)
 rename src/{rccl_bfloat8.h => rccl_float8.h} (100%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e27a0cf74b..2ae943b13e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,7 +71,7 @@ set(COMMON_FILES
   common.h
   common.cu
   nccl1_compat.h
-  rccl_bfloat8.h
+  rccl_float8.h
   rccl_bfloat16.h
   timer.h
   timer.cc
diff --git a/src/common.cu b/src/common.cu
index 531cc0bea8..dd9add2383 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -8,7 +8,7 @@
  ************************************************************************/
 
 #include "cuda_runtime.h"
-#include "rccl_bfloat8.h"
+#include "rccl_float8.h"
 #include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
diff --git a/src/rccl_bfloat8.h b/src/rccl_float8.h
similarity index 100%
rename from src/rccl_bfloat8.h
rename to src/rccl_float8.h
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index e6fd1dfc35..32c13b048e 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -13,7 +13,7 @@
 #include <hip/hip_bfloat16.h>
 
 #include "rccl/rccl.h"
-#include "rccl_bfloat8.h"
+#include "rccl_float8.h"
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 == 1
   #define HAVE_ncclBfloat16 1
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index 98ffc6ecba..8eb160b12a 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -21,12 +21,12 @@ ${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 	@mkdir -p ${HIPIFY_DIR}
 	hipify-perl -quiet-warnings $< > $@
 
-${HIPIFY_DIR}/rccl_bfloat8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_bfloat8.h
+${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
 	hipify-perl -quiet-warnings $< > $@
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_bfloat8.h
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
 	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"

From a31679775ce521825b3baab8aeed7af1d08d3560 Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Wed, 20 Mar 2024 10:04:12 -0500
Subject: [PATCH 158/233] adding git version to rccl-test (#66)

* adding git version to rccl-test

---------

Co-authored-by: mberenjk <mberenjk@banff-cyxtera-s74-2.ctr.dcgpu>
---
 src/CMakeLists.txt          | 15 ++++++++-
 src/cmake/git_version.cmake | 62 +++++++++++++++++++++++++++++++++++++
 src/common.cu               |  2 ++
 src/git_version.h           |  7 +++++
 4 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 src/cmake/git_version.cmake
 create mode 100644 src/git_version.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2ae943b13e..9b99ac1067 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -68,6 +68,7 @@ endfunction()
 # Collect list of common source files
 #==================================================================================================
 set(COMMON_FILES
+  git_version.h
   common.h
   common.cu
   nccl1_compat.h
@@ -112,8 +113,20 @@ endforeach()
 # Compile common object library
 #==================================================================================================
 add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
+
+# Create an initial git_version.cpp file (that will be updated with latest git version)
+#==================================================================================================
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")
+list(APPEND HIP_COMMON_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+
+#Create a custom target that updates git_version.cpp and executes whenever rccl is built
+add_custom_target(git_version_check
+  COMMENT "Updating git_version.cpp if necessary"
+  COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/git_version.cmake
+  VERBATIM
+)
 add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
-add_dependencies(rccl_common hipify)
+add_dependencies(rccl_common hipify git_version_check)
 target_link_libraries(rccl_common roc::rccl hip::device)
 if(USE_MPI)
     target_link_libraries(rccl_common MPI::MPI_CXX)
diff --git a/src/cmake/git_version.cmake b/src/cmake/git_version.cmake
new file mode 100644
index 0000000000..c320f0ca60
--- /dev/null
+++ b/src/cmake/git_version.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Attempt to collect the latest git hash
+execute_process(COMMAND git log --pretty=format:'%h' -n 1
+                OUTPUT_VARIABLE GIT_REV
+                ERROR_QUIET)
+
+# Check if git information was found
+if ("${GIT_REV}" STREQUAL "")
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"Unknown \";")
+else()
+  # Check for changes (denote with a '+') after hash
+  execute_process(
+    COMMAND bash -c "git diff --quiet --exit-code || echo +"
+    OUTPUT_VARIABLE GIT_DIFF)
+  # Collect branch information
+  execute_process(
+    COMMAND git rev-parse --abbrev-ref HEAD
+    OUTPUT_VARIABLE GIT_BRANCH)
+
+  string(STRIP "${GIT_REV}" GIT_REV)
+  string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
+  string(STRIP "${GIT_DIFF}" GIT_DIFF)
+  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
+
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
+endif()
+
+# Compare file with older git version file (git_version.cpp)
+if (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+  #MESSAGE(STATUS "Found ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp")
+  file(READ ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
+  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
+  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
+  if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
+    message(STATUS "Updating git_version.cpp")
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+  else()
+    message(STATUS "No changes to git_version.cpp required")
+  endif()
+else()
+  # Create git_version.cpp if it doesn't exist yet
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+endif()
diff --git a/src/common.cu b/src/common.cu
index dd9add2383..4534d10131 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -21,6 +21,7 @@
 //#define DEBUG_PRINT
 
 #include "verifiable.h"
+#include "git_version.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
@@ -754,6 +755,7 @@ testResult_t run(); // Main function
 int main(int argc, char* argv[]) {
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
+  printf("rccl-tests: Version %s\n", rcclTestsGitHash);
 
   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
     ncclGetVersion(&test_ncclVersion);
diff --git a/src/git_version.h b/src/git_version.h
new file mode 100644
index 0000000000..861c0fe970
--- /dev/null
+++ b/src/git_version.h
@@ -0,0 +1,7 @@
+#ifndef RCCL_TESTS_GIT_VERSION_H_
+#define RCCL_TESTS_GIT_VERSION_H_
+
+extern const char *rcclTestsGitHash;
+
+#endif
+

From 91609be0ef155801396863c145b191543365c6b6 Mon Sep 17 00:00:00 2001
From: akolliasAMD <99202231+akolliasAMD@users.noreply.github.com>
Date: Fri, 22 Mar 2024 10:21:37 -0600
Subject: [PATCH 159/233] Revert "adding git version to rccl-test (#66)"

This reverts commit a31679775ce521825b3baab8aeed7af1d08d3560.
---
 src/CMakeLists.txt          | 15 +--------
 src/cmake/git_version.cmake | 62 -------------------------------------
 src/common.cu               |  2 --
 src/git_version.h           |  7 -----
 4 files changed, 1 insertion(+), 85 deletions(-)
 delete mode 100644 src/cmake/git_version.cmake
 delete mode 100644 src/git_version.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9b99ac1067..2ae943b13e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -68,7 +68,6 @@ endfunction()
 # Collect list of common source files
 #==================================================================================================
 set(COMMON_FILES
-  git_version.h
   common.h
   common.cu
   nccl1_compat.h
@@ -113,20 +112,8 @@ endforeach()
 # Compile common object library
 #==================================================================================================
 add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
-
-# Create an initial git_version.cpp file (that will be updated with latest git version)
-#==================================================================================================
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")
-list(APPEND HIP_COMMON_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
-
-#Create a custom target that updates git_version.cpp and executes whenever rccl is built
-add_custom_target(git_version_check
-  COMMENT "Updating git_version.cpp if necessary"
-  COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/git_version.cmake
-  VERBATIM
-)
 add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
-add_dependencies(rccl_common hipify git_version_check)
+add_dependencies(rccl_common hipify)
 target_link_libraries(rccl_common roc::rccl hip::device)
 if(USE_MPI)
     target_link_libraries(rccl_common MPI::MPI_CXX)
diff --git a/src/cmake/git_version.cmake b/src/cmake/git_version.cmake
deleted file mode 100644
index c320f0ca60..0000000000
--- a/src/cmake/git_version.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# Attempt to collect the latest git hash
-execute_process(COMMAND git log --pretty=format:'%h' -n 1
-                OUTPUT_VARIABLE GIT_REV
-                ERROR_QUIET)
-
-# Check if git information was found
-if ("${GIT_REV}" STREQUAL "")
-  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"Unknown \";")
-else()
-  # Check for changes (denote with a '+') after hash
-  execute_process(
-    COMMAND bash -c "git diff --quiet --exit-code || echo +"
-    OUTPUT_VARIABLE GIT_DIFF)
-  # Collect branch information
-  execute_process(
-    COMMAND git rev-parse --abbrev-ref HEAD
-    OUTPUT_VARIABLE GIT_BRANCH)
-
-  string(STRIP "${GIT_REV}" GIT_REV)
-  string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
-  string(STRIP "${GIT_DIFF}" GIT_DIFF)
-  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
-
-  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
-endif()
-
-# Compare file with older git version file (git_version.cpp)
-if (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
-  #MESSAGE(STATUS "Found ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp")
-  file(READ ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
-  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
-  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
-  if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
-    message(STATUS "Updating git_version.cpp")
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
-  else()
-    message(STATUS "No changes to git_version.cpp required")
-  endif()
-else()
-  # Create git_version.cpp if it doesn't exist yet
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
-endif()
diff --git a/src/common.cu b/src/common.cu
index 4534d10131..dd9add2383 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -21,7 +21,6 @@
 //#define DEBUG_PRINT
 
 #include "verifiable.h"
-#include "git_version.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
@@ -755,7 +754,6 @@ testResult_t run(); // Main function
 int main(int argc, char* argv[]) {
   // Make sure everyline is flushed so that we see the progress of the test
   setlinebuf(stdout);
-  printf("rccl-tests: Version %s\n", rcclTestsGitHash);
 
   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
     ncclGetVersion(&test_ncclVersion);
diff --git a/src/git_version.h b/src/git_version.h
deleted file mode 100644
index 861c0fe970..0000000000
--- a/src/git_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef RCCL_TESTS_GIT_VERSION_H_
-#define RCCL_TESTS_GIT_VERSION_H_
-
-extern const char *rcclTestsGitHash;
-
-#endif
-

From 3f7f7859bfd863415f1671b723d9969f1207af9e Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Thu, 28 Mar 2024 14:03:59 -0500
Subject: [PATCH 160/233] adding git version to rccl-tests (#69)

Co-authored-by: mberenjk <mberenjk@amd.com>
---
 src/CMakeLists.txt          | 15 ++++++++-
 src/Makefile                | 14 +++++++--
 src/cmake/git_version.cmake | 62 +++++++++++++++++++++++++++++++++++++
 src/common.cu               |  3 +-
 src/git_version.h           |  6 ++++
 verifiable/Makefile         |  2 +-
 6 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 src/cmake/git_version.cmake
 create mode 100644 src/git_version.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2ae943b13e..f0aae7cc10 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -68,6 +68,7 @@ endfunction()
 # Collect list of common source files
 #==================================================================================================
 set(COMMON_FILES
+  git_version.h
   common.h
   common.cu
   nccl1_compat.h
@@ -109,11 +110,23 @@ foreach(COMMON_FILE ${COMMON_FILES})
   )
 endforeach()
 
+# Create an initial git_version.cpp file (that will be updated with latest git version)
+#==================================================================================================
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")
+list(APPEND HIP_COMMON_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+
+#Create a custom target that updates git_version.cpp and executes whenever rccl is built
+add_custom_target(git_version_check
+  COMMENT "Updating git_version.cpp if necessary"
+  COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/git_version.cmake
+  VERBATIM
+)
+
 # Compile common object library
 #==================================================================================================
 add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
 add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
-add_dependencies(rccl_common hipify)
+add_dependencies(rccl_common hipify git_version_check)
 target_link_libraries(rccl_common roc::rccl hip::device)
 if(USE_MPI)
     target_link_libraries(rccl_common MPI::MPI_CXX)
diff --git a/src/Makefile b/src/Makefile
index 00a17b56a8..6810e1ac2b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -62,6 +62,11 @@ OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
 BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
+GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
+GIT_REV          := $(shell git log --pretty=format:'%h' -n 1)
+GIT_DIFF         := $(shell git diff --quiet --exit-code || echo +)
+GIT_BRANCH       := $(shell git rev-parse --abbrev-ref HEAD)
+
 build: ${BIN_FILES}
 
 clean:
@@ -71,6 +76,11 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk
 
+# Rule to create git_version.cpp
+$(GIT_VERSION_FILE):
+	@mkdir -p ${DST_DIR}/src
+	@echo 'const char* rcclTestsGitHash = "$(GIT_BRANCH):$(GIT_REV)$(GIT_DIFF)";' > $@
+
 ${HIPIFY_DIR}/%.cu.cpp: %.cu
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
@@ -81,7 +91,7 @@ ${HIPIFY_DIR}/%.h: %.h
 	@mkdir -p ${HIPIFY_DIR}
 	hipify-perl -quiet-warnings $< > $@
 
-${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
@@ -92,7 +102,7 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 	@mkdir -p ${DST_DIR}
 	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
diff --git a/src/cmake/git_version.cmake b/src/cmake/git_version.cmake
new file mode 100644
index 0000000000..c320f0ca60
--- /dev/null
+++ b/src/cmake/git_version.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Attempt to collect the latest git hash
+execute_process(COMMAND git log --pretty=format:'%h' -n 1
+                OUTPUT_VARIABLE GIT_REV
+                ERROR_QUIET)
+
+# Check if git information was found
+if ("${GIT_REV}" STREQUAL "")
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"Unknown \";")
+else()
+  # Check for changes (denote with a '+') after hash
+  execute_process(
+    COMMAND bash -c "git diff --quiet --exit-code || echo +"
+    OUTPUT_VARIABLE GIT_DIFF)
+  # Collect branch information
+  execute_process(
+    COMMAND git rev-parse --abbrev-ref HEAD
+    OUTPUT_VARIABLE GIT_BRANCH)
+
+  string(STRIP "${GIT_REV}" GIT_REV)
+  string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
+  string(STRIP "${GIT_DIFF}" GIT_DIFF)
+  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
+
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
+endif()
+
+# Compare file with older git version file (git_version.cpp)
+if (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+  #MESSAGE(STATUS "Found ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp")
+  file(READ ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
+  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
+  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
+  if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
+    message(STATUS "Updating git_version.cpp")
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+  else()
+    message(STATUS "No changes to git_version.cpp required")
+  endif()
+else()
+  # Create git_version.cpp if it doesn't exist yet
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+endif()
diff --git a/src/common.cu b/src/common.cu
index dd9add2383..b034ca768c 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -21,6 +21,7 @@
 //#define DEBUG_PRINT
 
 #include "verifiable.h"
+#include "git_version.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
@@ -1026,7 +1027,7 @@ testResult_t run() {
   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
   if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
   PRINT("#\n");
-
+  PRINT("rccl-tests: Version %s\n", rcclTestsGitHash);
   PRINT("# Using devices\n");
 #define MAX_LINE 2048
   char line[MAX_LINE];
diff --git a/src/git_version.h b/src/git_version.h
new file mode 100644
index 0000000000..dda3d44c18
--- /dev/null
+++ b/src/git_version.h
@@ -0,0 +1,6 @@
+#ifndef RCCL_TESTS_GIT_VERSION_H_
+#define RCCL_TESTS_GIT_VERSION_H_
+
+extern const char *rcclTestsGitHash;
+
+#endif
diff --git a/verifiable/Makefile b/verifiable/Makefile
index 182d44e727..4b71c03e2a 100644
--- a/verifiable/Makefile
+++ b/verifiable/Makefile
@@ -56,7 +56,7 @@ endif
 LIBRARIES += rccl
 HIPLDFLAGS   += $(LIBRARIES:%=-l%)
 
-all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test 
+all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
 
 clean:
 	rm -rf $(DST_DIR)

From 990f88cbaa16b1de86111453b969db78731f0d4f Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Fri, 12 Apr 2024 12:01:32 -0500
Subject: [PATCH 161/233] Amend use of CUSTOM_RCCL_LIB to avoid build error
 (#71)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 6810e1ac2b..2814646b45 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,7 +26,7 @@ endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
+HIPLDFLAGS += -L$(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
 
 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3

From e8650b184412d3589d50a19d411f50fdfaf4a95b Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Tue, 23 Apr 2024 14:30:06 -0500
Subject: [PATCH 162/233] [DOCS] Update README for performance-oriented runs
 (#73)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index c56eac9fcb..1eadbe768e 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,12 @@ Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, f
 $ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
 ```
 
+For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like :
+```shell
+$ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1
+```
+Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like Pytorch and TensorFlow.
+
 ### Performance
 
 See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.

From eb65dadfc5e054c53a7aafa6af750bd7f6fec4db Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Tue, 23 Apr 2024 17:00:20 -0500
Subject: [PATCH 163/233] replacing rccl_bfloat16 with hip_bfloat16 (#70)

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
---
 src/CMakeLists.txt  |   1 -
 src/common.cu       |   6 +-
 src/rccl_bfloat16.h | 274 --------------------------------------------
 3 files changed, 3 insertions(+), 278 deletions(-)
 delete mode 100644 src/rccl_bfloat16.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f0aae7cc10..12aeb641a7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,7 +73,6 @@ set(COMMON_FILES
   common.cu
   nccl1_compat.h
   rccl_float8.h
-  rccl_bfloat16.h
   timer.h
   timer.cc
   ../verifiable/verifiable.h
diff --git a/src/common.cu b/src/common.cu
index b034ca768c..6e1306830f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -9,7 +9,7 @@
 
 #include "cuda_runtime.h"
 #include "rccl_float8.h"
-#include "rccl_bfloat16.h"
+#include <hip/hip_bfloat16.h>
 #include "common.h"
 #include <pthread.h>
 #include <cstdio>
@@ -408,7 +408,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
         int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
         half f16; float f32; double f64;
         #if defined(RCCL_BFLOAT16)
-        rccl_bfloat16 bf16;
+        hip_bfloat16 bf16;
         #endif
         #if defined(RCCL_FLOAT8)
         rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
@@ -425,7 +425,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
       #if defined(RCCL_BFLOAT16)
-      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
+      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
       #endif
       #if defined(RCCL_FLOAT8)
       case ncclFp8E4M3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
diff --git a/src/rccl_bfloat16.h b/src/rccl_bfloat16.h
deleted file mode 100644
index cbc6e059a5..0000000000
--- a/src/rccl_bfloat16.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/**
- * MIT License
- *
- * Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*!\file
- * \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
- */
-
-#ifndef _RCCL_BFLOAT16_H_
-#define _RCCL_BFLOAT16_H_
-
-#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
-
-// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
-// include a minimal definition of rccl_bfloat16
-
-#include <stdint.h>
-/*! \brief Struct to represent a 16 bit brain floating point number. */
-typedef struct
-{
-    uint16_t data;
-} rccl_bfloat16;
-
-#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
-
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <hip/hip_runtime.h>
-#include <ostream>
-#include <type_traits>
-
-struct rccl_bfloat16
-{
-    uint16_t data;
-
-    enum truncate_t
-    {
-        truncate
-    };
-
-    __host__ __device__ rccl_bfloat16() = default;
-
-    // round upper 16 bits of IEEE float to convert to bfloat16
-    explicit __host__ __device__ rccl_bfloat16(float f)
-        : data(float_to_bfloat16(f))
-    {
-    }
-
-    explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
-        : data(truncate_float_to_bfloat16(f))
-    {
-    }
-
-    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
-    __host__ __device__ operator float() const
-    {
-        union
-        {
-            uint32_t int32;
-            float    fp32;
-        } u = {uint32_t(data) << 16};
-        return u.fp32;
-    }
-
-private:
-    static __host__ __device__ uint16_t float_to_bfloat16(float f)
-    {
-        union
-        {
-            float    fp32;
-            uint32_t int32;
-        } u = {f};
-        if(~u.int32 & 0x7f800000)
-        {
-            // When the exponent bits are not all 1s, then the value is zero, normal,
-            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-            // least significant bits of the float mantissa are greater than 0x8000,
-            // or if they are equal to 0x8000 and the least significant bit of the
-            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-            // has the value 0x7f, then incrementing it causes it to become 0x00 and
-            // the exponent is incremented by one, which is the next higher FP value
-            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
-            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-            // incrementing it causes it to become an exponent of 0xFF and a mantissa
-            // of 0x00, which is Inf, the next higher value to the unrounded value.
-            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
-        }
-        else if(u.int32 & 0xffff)
-        {
-            // When all of the exponent bits are 1, the value is Inf or NaN.
-            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-            // bit being 1. Signaling NaN is indicated by the most significant
-            // mantissa bit being 0 but some other bit(s) being 1. If any of the
-            // lower 16 bits of the mantissa are 1, we set the least significant bit
-            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-            // the bloat16's mantissa bits are all 0.
-            u.int32 |= 0x10000; // Preserve signaling NaN
-        }
-        return uint16_t(u.int32 >> 16);
-    }
-
-    // Truncate instead of rounding, preserving SNaN
-    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
-    {
-        union
-        {
-            float    fp32;
-            uint32_t int32;
-        } u = {f};
-        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
-    }
-};
-
-typedef struct
-{
-    uint16_t data;
-} rccl_bfloat16_public;
-
-static_assert(std::is_standard_layout<rccl_bfloat16>{},
-              "rccl_bfloat16 is not a standard layout type, and thus is "
-              "incompatible with C.");
-
-static_assert(std::is_trivial<rccl_bfloat16>{},
-              "rccl_bfloat16 is not a trivial type, and thus is "
-              "incompatible with C.");
-
-static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
-                  && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
-              "internal rccl_bfloat16 does not match public rccl_bfloat16");
-
-inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
-{
-    return os << float(bf16);
-}
-inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
-{
-    return a;
-}
-inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
-{
-    a.data ^= 0x8000;
-    return a;
-}
-inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return rccl_bfloat16(float(a) + float(b));
-}
-inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return rccl_bfloat16(float(a) - float(b));
-}
-inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return rccl_bfloat16(float(a) * float(b));
-}
-inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return rccl_bfloat16(float(a) / float(b));
-}
-inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return float(a) < float(b);
-}
-inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return float(a) == float(b);
-}
-inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return b < a;
-}
-inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return !(a > b);
-}
-inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return !(a == b);
-}
-inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
-{
-    return !(a < b);
-}
-inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
-{
-    return a = a + b;
-}
-inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
-{
-    return a = a - b;
-}
-inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
-{
-    return a = a * b;
-}
-inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
-{
-    return a = a / b;
-}
-inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
-{
-    return a += rccl_bfloat16(1.0f);
-}
-inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
-{
-    return a -= rccl_bfloat16(1.0f);
-}
-inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
-{
-    rccl_bfloat16 orig = a;
-    ++a;
-    return orig;
-}
-inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
-{
-    rccl_bfloat16 orig = a;
-    --a;
-    return orig;
-}
-
-namespace std
-{
-    constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
-    {
-        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
-    }
-    constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
-    {
-        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
-    }
-    constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
-    {
-        return !(a.data & 0x7fff);
-    }
-    inline rccl_bfloat16 sin(rccl_bfloat16 a)
-    {
-        return rccl_bfloat16(sinf(float(a)));
-    }
-    inline rccl_bfloat16 cos(rccl_bfloat16 a)
-    {
-        return rccl_bfloat16(cosf(float(a)));
-    }
-}
-
-#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
-
-#endif // _RCCL_BFLOAT16_H_

From 0c762d210c29988c4572dc7ffb738103bec5c759 Mon Sep 17 00:00:00 2001
From: Corey Derochie <corey.derochie@amd.com>
Date: Wed, 1 May 2024 20:41:12 -0500
Subject: [PATCH 164/233] Wrapped the warmup iters in captures when doing graph
 mode to do a proper warmup.

---
 src/common.cu | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 6e1306830f..bae774abc9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -647,18 +647,106 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
 
   // Warm-up for large size
   setupArgs(args->maxbytes, type, args);
+#if HIP_VERSION >= 50221310
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      // Thread local mdoe is needed for:
+      // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
+      // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+    }
+  }
+#endif
   for (int iter = 0; iter < warmup_iters; iter++) {
     TESTCHECK(startColl(args, type, op, root, 0, iter));
   }
+
+#if HIP_VERSION >= 50221310
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Resync CPU, restart timing, launch cuda graph
+    Barrier(args);
+    for (int l=0; l<cudaGraphLaunches; l++) {
+      for (int i=0; i<args->nGpus; i++) {
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+      }
+    }
+  }
+#endif
+
   TESTCHECK(completeColl(args));
 
+#if HIP_VERSION >= 50221310
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
+#endif
+
   // Warm-up for small size
   setupArgs(args->minbytes, type, args);
+#if HIP_VERSION >= 50221310
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      // Thread local mdoe is needed for:
+      // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
+      // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+    }
+  }
+#endif
   for (int iter = 0; iter < warmup_iters; iter++) {
     TESTCHECK(startColl(args, type, op, root, iter < warmup_iters/2 ? 0 : 1, iter));
   }
+
+#if HIP_VERSION >= 50221310
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Resync CPU, restart timing, launch cuda graph
+    Barrier(args);
+    for (int l=0; l<cudaGraphLaunches; l++) {
+      for (int i=0; i<args->nGpus; i++) {
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+      }
+    }
+  }
+#endif
+
   TESTCHECK(completeColl(args));
 
+#if HIP_VERSION >= 50221310
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
+#endif
+
   for (size_t iter = 0; iter < stress_cycles; iter++) {
     if (iter > 0) PRINT("# Testing %lu cycle.\n", iter+1);
     // Benchmark

From f74c04b6863ff727ac8b000ed68bbb0df0c79f0c Mon Sep 17 00:00:00 2001
From: corey-derochie-amd
 <161367113+corey-derochie-amd@users.noreply.github.com>
Date: Thu, 2 May 2024 09:18:25 -0600
Subject: [PATCH 165/233] Fixed spelling

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index bae774abc9..def6ca029e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -653,7 +653,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      // Thread local mdoe is needed for:
+      // Thread local mode is needed for:
       // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
       // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
       //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
@@ -703,7 +703,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      // Thread local mdoe is needed for:
+      // Thread local mode is needed for:
       // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
       // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
       //   Since pre-connect calls cudaMalloc, we cannot use global capture mode

From 16dfeaf89b3f0203c64cbdea7769884d6bb8f744 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Thu, 2 May 2024 11:14:57 -0700
Subject: [PATCH 166/233] Fix incorrect device ordinal with limited device
 visibility (#74)

---
 src/common.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index def6ca029e..691c2402dd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1100,7 +1100,7 @@ testResult_t run() {
   for (int p=0; p<totalProcs; p++) {
     if (hostHashs[p] == hostHashs[proc]) localSize++;
   }
-  if (nGpus * localSize > numDevices)
+  if (nGpus * localSize > numDevices && numDevices != 1)
   {
       fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d) on node (%s)\n", nGpus*localSize, numDevices, hostname);
       return testNcclError;
@@ -1124,7 +1124,7 @@ testResult_t run() {
   char* envstr = getenv("NCCL_TESTS_DEVICE");
   int gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nThreads*nGpus; i++) {
-    int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
+    int cudaDev = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
@@ -1176,7 +1176,7 @@ testResult_t run() {
   envstr = getenv("NCCL_TESTS_DEVICE");
   gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nGpus*nThreads; i++) {
-    gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
+    gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
     CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     if (streamnull)

From 3c0728e8ebecac1f2935806404da3b32bf083910 Mon Sep 17 00:00:00 2001
From: saurabhAMD <Saurabh.Roychowdhury@amd.com>
Date: Tue, 7 May 2024 11:09:32 -0500
Subject: [PATCH 167/233] Cache flush

---
 src/common.cu | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/common.h  |  1 +
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 691c2402dd..19c60913d8 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -103,9 +103,47 @@ static int average = 1;
 static int numDevices = 1;
 static int delay_inout_place = 0;
 static int enable_out_of_place = 1;
+static int enable_cache_flush = 0;
 
 #define NUM_BLOCKS 32
 
+#ifndef CHECK_HIP_ERROR
+#define CHECK_HIP_ERROR(error)                    \
+    if(error != hipSuccess)                       \
+    {                                             \
+        fprintf(stderr,                           \
+                "Hip error: '%s'(%d) at %s:%d\n", \
+                hipGetErrorString(error),         \
+                error,                            \
+                __FILE__,                         \
+                __LINE__);                        \
+        exit(EXIT_FAILURE);                       \
+    }
+#endif
+
+extern "C" __global__ void flush_icache()
+{
+    printf("flush_icache called \n");
+    asm __volatile__("s_icache_inv \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t" ::
+                         :);
+}
+
 static double parsesize(const char *value) {
     long long int units;
     double size;
@@ -437,6 +475,13 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     #endif
 
+    if(enable_cache_flush > 0 && (enable_cache_flush==1 || ((iter % enable_cache_flush) == 0))) {
+      hipDeviceProp_t deviceProps;
+      CHECK_HIP_ERROR(hipGetDeviceProperties(&deviceProps, 0));
+      int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+      hipLaunchKernelGGL(flush_icache, dim3(gpu_block3), dim3(64), 0, args->streams[i]);
+    }
+
     TESTCHECK(args->collTest->runColl(
           (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
           (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
@@ -896,6 +941,7 @@ int main(int argc, char* argv[]) {
     {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
     {"out_of_place", required_argument, 0, 'O'},
+    {"cache_flush", required_argument, 0, 'F'},
     {"help", no_argument, 0, 'h'},
     {}
   };
@@ -903,7 +949,7 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:q:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:a:y:s:u:h:q:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -1003,6 +1049,9 @@ int main(int argc, char* argv[]) {
       case 'O':
         enable_out_of_place = strtol(optarg, NULL, 0);
         break;
+      case 'F':
+        enable_cache_flush = strtol(optarg, NULL, 0);
+        break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
@@ -1042,6 +1091,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
 	    "[-O,--out_of_place <0/1>] \n\t"
+      "[-F,--cache_flush <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-h,--help]\n",
@@ -1252,6 +1302,7 @@ testResult_t run() {
     threads[t].args.comms=comms+t*nGpus;
     threads[t].args.streams=streams+t*nGpus;
     threads[t].args.enable_out_of_place=enable_out_of_place;
+    threads[t].args.enable_cache_flush = enable_cache_flush;
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
diff --git a/src/common.h b/src/common.h
index 23dccebc7d..dc4aa4a7a2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -127,6 +127,7 @@ struct threadArgs {
   int* gpus;
   int localRank;
   int enable_out_of_place;
+  int enable_cache_flush;
   void** sendbuffs;
   size_t sendBytes;
   size_t sendInplaceOffset;

From 699478dadf35a3a7644704efadf2198f503ed8d5 Mon Sep 17 00:00:00 2001
From: saurabhAMD <Saurabh.Roychowdhury@amd.com>
Date: Tue, 7 May 2024 11:32:30 -0500
Subject: [PATCH 168/233] Enable cache flush after every -F iteration. Default
 : 0 (No cache flush)

---
 README.md     | 1 +
 src/common.cu | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1eadbe768e..9bc34de49e 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ All tests support the same set of arguments :
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
 
 ## Unit tests
 
diff --git a/src/common.cu b/src/common.cu
index 19c60913d8..4dcac1cbd9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -123,7 +123,6 @@ static int enable_cache_flush = 0;
 
 extern "C" __global__ void flush_icache()
 {
-    printf("flush_icache called \n");
     asm __volatile__("s_icache_inv \n\t"
                      "s_nop 0 \n\t"
                      "s_nop 0 \n\t"

From 74c4177f58113a61638bccd44df1b7d2d7915923 Mon Sep 17 00:00:00 2001
From: saurabhAMD <Saurabh.Roychowdhury@amd.com>
Date: Fri, 10 May 2024 08:46:13 -0700
Subject: [PATCH 169/233] updating cache flush on functionality

---
 src/common.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4dcac1cbd9..e545870545 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -24,6 +24,7 @@
 #include "git_version.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
+int32_t gpu_block3;
 
 #if NCCL_MAJOR >= 2
   ncclDataType_t test_types[ncclNumTypes] = {
@@ -474,10 +475,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     #endif
 
-    if(enable_cache_flush > 0 && (enable_cache_flush==1 || ((iter % enable_cache_flush) == 0))) {
-      hipDeviceProp_t deviceProps;
-      CHECK_HIP_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-      int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+    if(enable_cache_flush > 0 && ((iter % enable_cache_flush) == 0)) {
       hipLaunchKernelGGL(flush_icache, dim3(gpu_block3), dim3(64), 0, args->streams[i]);
     }
 
@@ -1050,6 +1048,11 @@ int main(int argc, char* argv[]) {
         break;
       case 'F':
         enable_cache_flush = strtol(optarg, NULL, 0);
+        if (enable_cache_flush > 0) {
+          hipDeviceProp_t deviceProps;
+          CHECK_HIP_ERROR(hipGetDeviceProperties(&deviceProps, 0));
+          gpu_block3 = deviceProps.multiProcessorCount * 60;
+        }
         break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
@@ -1090,7 +1093,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
 	    "[-O,--out_of_place <0/1>] \n\t"
-      "[-F,--cache_flush <0/1>] \n\t"
+      "[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-h,--help]\n",

From a1efb427e764241bc43d2d91be875c9f55da03a5 Mon Sep 17 00:00:00 2001
From: Giuseppe Congiu <gcongiu@nvidia.com>
Date: Wed, 28 Feb 2024 05:18:40 -0800
Subject: [PATCH 170/233] Add -R option to register user buffers

---
 src/common.cu | 63 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 4ac00fb3d7..fc5af1e014 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -80,6 +80,9 @@ static int cudaGraphLaunches = 0;
 static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+static int local_register = 0;
+#endif
 
 #define NUM_BLOCKS 32
 
@@ -631,10 +634,22 @@ testResult_t threadInit(struct threadArgs* args) {
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
+  void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
+  for (int i=0; i<args->nGpus; i++) {
+    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
+    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+  }
+#endif
 
   TESTCHECK(threadRunTests(args));
 
   for (int i=0; i<args->nGpus; i++) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
+    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+#endif
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
   return testSuccess;
@@ -651,9 +666,15 @@ testResult_t threadLaunch(struct testThread* thread) {
 }
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
+    NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
+    if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
+#else
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
+#endif
     return testSuccess;
 }
 
@@ -707,13 +728,14 @@ int main(int argc, char* argv[]) {
     {"cudagraph", required_argument, 0, 'G'},
     {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
+    {"local_register", required_argument, 0, 'R'},
     {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -797,6 +819,15 @@ int main(int argc, char* argv[]) {
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
+      case 'R':
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+        if ((int)strtol(optarg, NULL, 0)) {
+          local_register = 1;
+        }
+#else
+        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
+#endif
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -827,6 +858,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -949,6 +981,10 @@ testResult_t run() {
 
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  void **sendRegHandles = NULL;
+  void **recvRegHandles = NULL;
+#endif
   if (!parallel_init) {
      if (ncclProcs == 1) {
        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
@@ -960,6 +996,14 @@ testResult_t run() {
        }
        NCCLCHECK(ncclGroupEnd());
      }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+     sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
+     recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
+     for (int i=0; i<nGpus*nThreads; i++) {
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+     }
+#endif
   }
 
   int errors[nThreads];
@@ -1035,18 +1079,33 @@ testResult_t run() {
 #endif
 
   if (!parallel_init) {
-    for(int i=0; i<nGpus*nThreads; ++i)
+    for(int i=0; i<nGpus*nThreads; ++i) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+#endif
       NCCLCHECK(ncclCommDestroy(comms[i]));
+    }
     free(comms);
   }
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
+    if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
+#else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
+#endif
   }
   CUDACHECK(cudaFreeHost(delta));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  free(sendRegHandles);
+  free(recvRegHandles);
+#endif
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = envstr ? atof(envstr) : -1;

From 36a2c372acf5d15293e87505532d4c646def92ce Mon Sep 17 00:00:00 2001
From: saurabhAMD <saurabh.roychowdhury@amd.com>
Date: Tue, 4 Jun 2024 11:35:39 -0500
Subject: [PATCH 171/233] Rotating tensor -R (default:off)

---
 src/common.cu | 28 +++++++++++++++++++++++-----
 src/common.h  |  1 +
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index e545870545..f7ab2f4372 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -25,6 +25,7 @@
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 int32_t gpu_block3;
+size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
 
 #if NCCL_MAJOR >= 2
   ncclDataType_t test_types[ncclNumTypes] = {
@@ -105,6 +106,7 @@ static int numDevices = 1;
 static int delay_inout_place = 0;
 static int enable_out_of_place = 1;
 static int enable_cache_flush = 0;
+static int enable_rotating_tensor = 0;
 
 #define NUM_BLOCKS 32
 
@@ -423,10 +425,16 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->nbytes / wordSize(type);
 
   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
-  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
-  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
-  size_t shift = totalnbytes * (iter % steps);
-
+  size_t shift = 0;
+  if(enable_rotating_tensor) {
+    shift = cache_bytes * (iter % 2);
+  }
+  else {
+    size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
+    size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+    shift = totalnbytes * (iter % steps);
+  }
+  
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
@@ -852,6 +860,10 @@ testResult_t threadLaunch(struct testThread* thread) {
 }
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
+  if(enable_rotating_tensor) {
+    recvBytes = recvBytes + cache_bytes;
+    nbytes = nbytes + cache_bytes;
+  }
   if (memorytype == ncclFine) {
     CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
     CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
@@ -939,6 +951,7 @@ int main(int argc, char* argv[]) {
     {"average", required_argument, 0, 'a'},
     {"out_of_place", required_argument, 0, 'O'},
     {"cache_flush", required_argument, 0, 'F'},
+    {"rotating_tensor", required_argument, 0, 'R'},
     {"help", no_argument, 0, 'h'},
     {}
   };
@@ -946,7 +959,7 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:a:y:s:u:h:q:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:R:a:y:s:u:h:q:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -1054,6 +1067,9 @@ int main(int argc, char* argv[]) {
           gpu_block3 = deviceProps.multiProcessorCount * 60;
         }
         break;
+      case 'R':
+        enable_rotating_tensor = strtol(optarg, NULL, 0);
+        break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
@@ -1094,6 +1110,7 @@ int main(int argc, char* argv[]) {
             "[-C,--report_cputime <0/1>] \n\t"
 	    "[-O,--out_of_place <0/1>] \n\t"
       "[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
+      "[-R,--rotating_tensor <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-h,--help]\n",
@@ -1305,6 +1322,7 @@ testResult_t run() {
     threads[t].args.streams=streams+t*nGpus;
     threads[t].args.enable_out_of_place=enable_out_of_place;
     threads[t].args.enable_cache_flush = enable_cache_flush;
+    threads[t].args.enable_rotating_tensor = enable_rotating_tensor;
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
diff --git a/src/common.h b/src/common.h
index dc4aa4a7a2..e14648dc97 100644
--- a/src/common.h
+++ b/src/common.h
@@ -128,6 +128,7 @@ struct threadArgs {
   int localRank;
   int enable_out_of_place;
   int enable_cache_flush;
+  int enable_rotating_tensor;
   void** sendbuffs;
   size_t sendBytes;
   size_t sendInplaceOffset;

From d028efcf35101c6663ae8c5f33ad41bad00efb4d Mon Sep 17 00:00:00 2001
From: Kaiming Ouyang <kouyang@nvidia.com>
Date: Thu, 6 Jun 2024 04:59:28 -0700
Subject: [PATCH 172/233] Change ncclCommRegister size to maxBytes in serial
 comm init

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index fc5af1e014..04e81422f0 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1000,8 +1000,8 @@ testResult_t run() {
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
      }
 #endif
   }

From 746549b28d3b654e0670feca0065f51affdb7db8 Mon Sep 17 00:00:00 2001
From: Stefano Salsano <stefano.salsano@uniroma2.it>
Date: Fri, 14 Jun 2024 11:28:55 +0200
Subject: [PATCH 173/233] improve parsing of stepbytes (increment size)
 argument

---
 src/common.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 04e81422f0..7706dd9a54 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -764,7 +764,12 @@ int main(int argc, char* argv[]) {
         maxBytes = (size_t)parsed;
         break;
       case 'i':
-        stepBytes = strtol(optarg, NULL, 0);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'stepBytes'\n");
+          return -1;
+        }
+        stepBytes = (size_t)parsed;
         break;
       case 'f':
         stepFactor = strtol(optarg, NULL, 0);

From c5cae38bb87c85ca1386e3c6f52cd1b03959ef37 Mon Sep 17 00:00:00 2001
From: Rahul Vaidya <ravaidya@amd.com>
Date: Fri, 14 Jun 2024 11:46:08 -0500
Subject: [PATCH 174/233] Fix --root all issue. (#83)

Signed-off-by: rahulvaidya20 <ravaidya@amd.com>
---
 src/common.cu | 4 ++--
 src/common.h  | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index f7ab2f4372..02c03bace5 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1019,7 +1019,7 @@ int main(int argc, char* argv[]) {
         nccltype = ncclstringtotype(optarg);
         break;
       case 'r':
-        ncclroot = strtol(optarg, NULL, 0);
+        ncclroot = ncclstringtoroot(optarg);
         break;
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
@@ -1099,7 +1099,7 @@ int main(int argc, char* argv[]) {
             "[-o,--op <sum/prod/min/max/all>] \n\t"
 #endif
             "[-d,--datatype <nccltype/all>] \n\t"
-            "[-r,--root <root>] \n\t"
+            "[-r,--root <root/all>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
             "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
             "[-s,--stress_cycles <number of cycles>] \n\t"
diff --git a/src/common.h b/src/common.h
index e14648dc97..e5c8b0fbe0 100644
--- a/src/common.h
+++ b/src/common.h
@@ -294,6 +294,13 @@ static int ncclstringtoop (char *str) {
     return ncclSum;
 }
 
+static int ncclstringtoroot (char *str) {
+    if (strcmp(str, "all") == 0) {
+      return -1;
+    }
+    return strtol(str, NULL, 0);
+}
+
 static int ncclstringtomtype (char *str) {
     for (int o=0; o<nccl_NUM_MTYPES; o++) {
       if (strcmp(str, test_memorytypes[o]) == 0) {

From e635e9c9befce49773071a9ac7d97435e348a858 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Tue, 23 Jul 2024 22:21:26 -0500
Subject: [PATCH 175/233] [CI] Add static analysis CI (#85)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 .jenkins/staticanalysis.groovy | 76 ++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 .jenkins/staticanalysis.groovy

diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
new file mode 100644
index 0000000000..adc4f07779
--- /dev/null
+++ b/.jenkins/staticanalysis.groovy
@@ -0,0 +1,76 @@
+#!/usr/bin/env groovy
+// This shared library is available at https://github.com/ROCm/rocJENKINS/
+@Library('rocJenkins@pong') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj  = new rocProject('rccl-tests', 'StaticAnalysis')
+    prj.paths.build_command = './install.sh'
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    boolean formatCheck = false
+    boolean staticAnalysis = true
+
+    def commonGroovy
+
+    def compileCommand =
+    {
+        platform, project->
+
+        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
+        commonGroovy.runCompileCommand(platform, project, jobName)
+    }
+
+    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, null, null)
+}
+
+ci: { 
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
+                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
+                        "rocm-docker":[]]
+    propertyList = auxiliary.appendPropertyList(propertyList)
+
+    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu22:['cpu']]),
+                       "rocm-docker":([ubuntu22:['cpu']])]
+
+    jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu22:['cpu']]
+    jobNameList = auxiliary.appendJobNameList(jobNameList)
+
+    propertyList.each
+    {
+        jobName, property->
+        if (urlJobName == jobName)
+            properties(auxiliary.addCommonProperties(property))
+    }
+
+    jobNameList.each
+    {
+        jobName, nodeDetails->
+        if (urlJobName == jobName)
+            stage(jobName) {
+                runCI(nodeDetails, jobName)
+            }
+    }
+
+    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
+    if(!jobNameList.keySet().contains(urlJobName))
+    {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        stage(urlJobName) {
+            runCI([ubuntu22:['cpu']], urlJobName)
+        }
+    }
+}

From c6eb15875f508076f3f26de4f7da3899701bc4db Mon Sep 17 00:00:00 2001
From: Oren <47992694+OrenLeung@users.noreply.github.com>
Date: Wed, 24 Jul 2024 22:55:00 -0400
Subject: [PATCH 176/233] doc: add all2all factor

---
 doc/PERFORMANCE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index 21fef609af..942f054968 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
 * AllGather : (_n_-1)/_n_
 * Broadcast : 1
 * Reduce : 1
+* AlltoAll: (_n_-1)/_n_
 
 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.

From d2d40cc8249378efa4d7e2c949528c15eeb7d8e7 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 25 Jul 2024 21:47:40 -0700
Subject: [PATCH 177/233] Added -N,--run_cycles option

---
 src/common.cu | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 04e81422f0..872a18a1b6 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -69,6 +69,7 @@ static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
 static int agg_iters = 1;
+static int run_cycles = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
@@ -598,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   TESTCHECK(completeColl(args));
 
   // Benchmark
-  for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
+  long repeat = run_cycles;
+  do {
+    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
       char rootName[100];
       sprintf(rootName, "%6i", root);
@@ -606,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
       TESTCHECK(BenchTime(args, type, op, root, 0));
       TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
-  }
+    }
+  } while (--repeat);
+
   return testSuccess;
 }
 
@@ -717,6 +722,7 @@ int main(int argc, char* argv[]) {
     {"iters", required_argument, 0, 'n'},
     {"agg_iters", required_argument, 0, 'm'},
     {"warmup_iters", required_argument, 0, 'w'},
+    {"run_cycles", required_argument, 0, 'N'},
     {"parallel_init", required_argument, 0, 'p'},
     {"check", required_argument, 0, 'c'},
     {"op", required_argument, 0, 'o'},
@@ -735,7 +741,7 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -782,6 +788,9 @@ int main(int argc, char* argv[]) {
       case 'w':
         warmup_iters = (int)strtol(optarg, NULL, 0);
         break;
+      case 'N':
+        run_cycles = (int)strtol(optarg, NULL, 0);
+        break;
       case 'c':
         datacheck = (int)strtol(optarg, NULL, 0);
         break;
@@ -841,6 +850,7 @@ int main(int argc, char* argv[]) {
             "[-n,--iters <iteration count>] \n\t"
             "[-m,--agg_iters <aggregated iteration count>] \n\t"
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-N,--run_cycles <cycle count> run & print each cycle (default: 1; 0=infinite)] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <check iteration count>] \n\t"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)

From 0d86b5a6e755c52be6f23ef3f4792385f5e255b1 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 30 Jul 2024 14:50:45 -0700
Subject: [PATCH 178/233] Added some missing command line options to README.md
 Also updated single and multi-node examples.

---
 README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4281799430..44e406a633 100644
--- a/README.md
+++ b/README.md
@@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de
 
 ### Quick examples
 
-Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
-Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
 ```shell
-$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
+$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
 
 ### Performance
@@ -59,14 +60,18 @@ All tests support the same set of arguments :
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-N,--run_cycles <cycle count>` number of times to repeat the whole size sweep, printing results for each cycle. Default : 1; 0=infinite.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 

From 71355df9592741843eb205b52cb1a370dc75d70e Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Wed, 31 Jul 2024 14:59:47 +0000
Subject: [PATCH 179/233] Fixing typo in readme

Signed-off-by: AtlantaPepsi <timhu102@amd.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9bc34de49e..9470ef8348 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ $ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rc
 RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
 
 ```shell
-$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
+$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
 ```
 
 RCCL tests can also be built using cmake. A typical sequence will be:

From ae3e6357cb3514d5649b3b5dfaa80cb2d2af3532 Mon Sep 17 00:00:00 2001
From: Tim <43156029+AtlantaPepsi@users.noreply.github.com>
Date: Tue, 10 Sep 2024 19:05:22 -0400
Subject: [PATCH 180/233] Scaling tests to #ngpus (#81)

* scaling tests to #ngpus

Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>

* switching to rocminfo

---------

Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
---
 test/test_AllGather.py     | 14 ++++++++++++--
 test/test_AllReduce.py     | 14 ++++++++++++--
 test/test_Broadcast.py     | 14 ++++++++++++--
 test/test_Reduce.py        | 14 ++++++++++++--
 test/test_ReduceScatter.py | 14 ++++++++++++--
 5 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/test/test_AllGather.py b/test/test_AllGather.py
index 2d3d74bcef..1213de78c7 100644
--- a/test/test_AllGather.py
+++ b/test/test_AllGather.py
@@ -22,12 +22,22 @@
 import os
 import subprocess
 import itertools
+import math
 
 import pytest
 
+ngpus = 0  # number of GPUs visible to the tests
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l',shell=True))  # raw string: "\s" must reach grep verbatim
+log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1  # -1 gives an empty ngpus_single instead of a log2(0) ValueError
+
 nthreads = ["1"]
 nprocs = ["2"]
-ngpus_single = ["1","2","4"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
 ngpus_mpi = ["1","2"]
 byte_range = [("4", "128M")]
 op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
         print(rccl_test.stdout)
         pytest.fail("AllGather test error(s) detected.")
 
-    assert rccl_test.returncode == 0
\ No newline at end of file
+    assert rccl_test.returncode == 0
diff --git a/test/test_AllReduce.py b/test/test_AllReduce.py
index b3cb5f99ff..34d22493f4 100644
--- a/test/test_AllReduce.py
+++ b/test/test_AllReduce.py
@@ -22,12 +22,22 @@
 import os
 import subprocess
 import itertools
+import math
 
 import pytest
 
+ngpus = 0  # number of GPUs visible to the tests
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l',shell=True))  # raw string: "\s" must reach grep verbatim
+log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1  # -1 gives an empty ngpus_single instead of a log2(0) ValueError
+
 nthreads = ["1"]
 nprocs = ["2"]
-ngpus_single = ["1","2","4"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
 ngpus_mpi = ["1","2"]
 byte_range = [("4", "128M")]
 op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
         print(rccl_test.stdout)
         pytest.fail("AllReduce test error(s) detected.")
 
-    assert rccl_test.returncode == 0
\ No newline at end of file
+    assert rccl_test.returncode == 0
diff --git a/test/test_Broadcast.py b/test/test_Broadcast.py
index f4b8b38363..f6bd9003a9 100644
--- a/test/test_Broadcast.py
+++ b/test/test_Broadcast.py
@@ -22,12 +22,22 @@
 import os
 import subprocess
 import itertools
+import math
 
 import pytest
 
+ngpus = 0  # number of GPUs visible to the tests
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l',shell=True))  # raw string: "\s" must reach grep verbatim
+log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1  # -1 gives an empty ngpus_single instead of a log2(0) ValueError
+
 nthreads = ["1"]
 nprocs = ["2"]
-ngpus_single = ["1","2","4"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
 ngpus_mpi = ["1","2"]
 byte_range = [("4", "128M")]
 op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
         print(rccl_test.stdout)
         pytest.fail("Broadcast test error(s) detected.")
 
-    assert rccl_test.returncode == 0
\ No newline at end of file
+    assert rccl_test.returncode == 0
diff --git a/test/test_Reduce.py b/test/test_Reduce.py
index 5df694490d..0e6671e84f 100644
--- a/test/test_Reduce.py
+++ b/test/test_Reduce.py
@@ -22,12 +22,22 @@
 import os
 import subprocess
 import itertools
+import math
 
 import pytest
 
+ngpus = 0  # number of GPUs visible to the tests
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l',shell=True))  # raw string: "\s" must reach grep verbatim
+log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1  # -1 gives an empty ngpus_single instead of a log2(0) ValueError
+
 nthreads = ["1"]
 nprocs = ["2"]
-ngpus_single = ["1","2","4"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
 ngpus_mpi = ["1","2"]
 byte_range = [("4", "128M")]
 op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_fa
         print(rccl_test.stdout)
         pytest.fail("Reduce test error(s) detected.")
 
-    assert rccl_test.returncode == 0
\ No newline at end of file
+    assert rccl_test.returncode == 0
diff --git a/test/test_ReduceScatter.py b/test/test_ReduceScatter.py
index 66b431b00a..58dd709abf 100644
--- a/test/test_ReduceScatter.py
+++ b/test/test_ReduceScatter.py
@@ -22,12 +22,22 @@
 import os
 import subprocess
 import itertools
+import math
 
 import pytest
 
+ngpus = 0  # number of GPUs visible to the tests
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l',shell=True))  # raw string: "\s" must reach grep verbatim
+log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1  # -1 gives an empty ngpus_single instead of a log2(0) ValueError
+
 nthreads = ["1"]
 nprocs = ["2"]
-ngpus_single = ["1","2","4"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
 ngpus_mpi = ["1","2"]
 byte_range = [("4", "128M")]
 op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op,
         print(rccl_test.stdout)
         pytest.fail("ReduceScatter test error(s) detected.")
 
-    assert rccl_test.returncode == 0
\ No newline at end of file
+    assert rccl_test.returncode == 0

From 34d6d5391084d30d7698e347497c2ebcc2d82b78 Mon Sep 17 00:00:00 2001
From: Kamil Iskra <kiskra@nvidia.com>
Date: Thu, 24 Oct 2024 09:21:37 -0700
Subject: [PATCH 181/233] Future-proof ncclstringtotype

Ensure that ncclstringtotype iterates only over data types known to
nccl-tests (as indicated by test_typenum), not over a potentially larger
set of all NCCL types.
---
 src/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common.h b/src/common.h
index 20fa4612db..e6762e1c97 100644
--- a/src/common.h
+++ b/src/common.h
@@ -254,7 +254,7 @@ extern ncclRedOp_t test_ops[];
 extern const char *test_opnames[];
 
 static int ncclstringtotype(char *str) {
-    for (int t=0; t<ncclNumTypes; t++) {
+    for (int t=0; t<test_typenum; t++) {
       if (strcmp(str, test_typenames[t]) == 0) {
         return t;
       }

From 29f4114f027fed903649a3c81babc5d52e8d41ae Mon Sep 17 00:00:00 2001
From: John Bachan <jbachan@nvidia.com>
Date: Wed, 18 Dec 2024 11:14:18 -0800
Subject: [PATCH 182/233] Fixes to all tests that divide buffers by nranks so
 that they trim buffer sizes to be multiples of 16 bytes. This ensures
 non-pow2 ranks have buffer addresses aligned suitably for performance.

---
 src/all_gather.cu     |  8 +++-----
 src/all_reduce.cu     |  4 ++--
 src/alltoall.cu       | 10 +++++-----
 src/broadcast.cu      |  4 ++--
 src/common.cu         |  2 +-
 src/common.h          |  2 +-
 src/gather.cu         | 12 ++++++------
 src/hypercube.cu      |  6 +++---
 src/reduce.cu         |  4 ++--
 src/reduce_scatter.cu |  8 +++-----
 src/scatter.cu        | 12 ++++++------
 src/sendrecv.cu       |  4 ++--
 12 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index 0831207433..6db67e6d00 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -7,10 +7,8 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-#define ALIGN 4
-
-void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base;
   *recvcount = base*nranks;
   *sendInplaceOffset = base;
@@ -60,7 +58,7 @@ struct testColl allGatherTest = {
 
 void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index a38eabe057..4aa1feead7 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -55,7 +55,7 @@ struct testColl allReduceTest = {
 
 void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 41c7c4ae33..dd085e54a9 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = (count/nranks)*nranks;
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *paramcount = (count/nranks) & -(16/eltSize);
+  *sendcount = nranks*(*paramcount);
+  *recvcount = *sendcount;
   *sendInplaceOffset = 0;
   *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
 }
 
 testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -74,7 +74,7 @@ struct testColl alltoAllTest = {
 
 void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 903066a2b8..67e9af2f36 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -64,7 +64,7 @@ struct testColl broadcastTest = {
 
 void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/common.cu b/src/common.cu
index e1f8a85f16..6d103d797d 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -571,7 +571,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
   size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
 
   count = size / wordSize(type);
-  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks);
 
   args->nbytes = paramCount * wordSize(type);
   args->sendBytes = sendCount * wordSize(type);
diff --git a/src/common.h b/src/common.h
index e6762e1c97..478d7fb1c0 100644
--- a/src/common.h
+++ b/src/common.h
@@ -87,7 +87,7 @@ struct testColl {
   void (*getCollByteCount)(
       size_t *sendcount, size_t *recvcount, size_t *paramcount,
       size_t *sendInplaceOffset, size_t *recvInplaceOffset,
-      size_t count, int nranks);
+      size_t count, size_t eltSize, int nranks);
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
diff --git a/src/gather.cu b/src/gather.cu
index 03ef4d9e3f..a4a7a30bcd 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = count/nranks;
-  *recvcount = (count/nranks)*nranks;
-  *sendInplaceOffset = count/nranks;
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *sendcount = (count/nranks) & -(16/eltSize);
+  *recvcount = (*sendcount)*nranks;
+  *sendInplaceOffset = *sendcount;
   *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
+  *paramcount = *sendcount;
 }
 
 testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -73,7 +73,7 @@ struct testColl gatherTest = {
 
 void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 5c1456f8c7..b3459c91f4 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -9,8 +9,8 @@
 
 #define ALIGN 4
 
-void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base;
   *recvcount = base*nranks;
   *sendInplaceOffset = base;
@@ -78,7 +78,7 @@ struct testColl hyperCubeTest = {
 
 void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/reduce.cu b/src/reduce.cu
index f2fa80dd95..731abfa141 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -54,7 +54,7 @@ struct testColl reduceTest = {
 
 void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index ed372e3b9a..35cfdd4929 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -7,10 +7,8 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-#define ALIGN 4
-
-void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
   *sendcount = base*nranks;
   *recvcount = base;
   *sendInplaceOffset = 0;
@@ -59,7 +57,7 @@ struct testColl reduceScatterTest = {
 
 void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/scatter.cu b/src/scatter.cu
index 49d20e1601..d1eec71282 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -7,12 +7,12 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = count/nranks;
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = (count/nranks) & -(16/eltSize);
+  *sendcount = (*recvcount)*nranks;
   *sendInplaceOffset = 0;
-  *recvInplaceOffset = count/nranks;
-  *paramcount = count/nranks;
+  *recvInplaceOffset = *recvcount;
+  *paramcount = *recvcount;
 }
 
 testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -69,7 +69,7 @@ struct testColl scatterTest = {
 
 void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index c9eb5bb427..67a4898b27 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -7,7 +7,7 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
-void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
   *recvcount = count;
   *sendInplaceOffset = 0;
@@ -68,7 +68,7 @@ struct testColl sendRecvTest = {
 
 void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
   size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }
 
 testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

From 77ae744c181c02fade85b8b0536b5dba8928c461 Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Mon, 6 Jan 2025 14:05:38 -0600
Subject: [PATCH 183/233] removing FP8 product from allReduce test cases (#97)

* removing FP8 product from allReduce test cases

---------

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
---
 src/all_reduce.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 21e3ce3a9e..5302f86833 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -65,6 +65,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
+  if((type == ncclFp8E4M3 || type == ncclFp8E5M2) && op == ncclProd)
+    return testSuccess;
 
   if ((int)type != -1) {
     type_count = 1;
@@ -88,6 +90,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
 
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
+      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)  // skip FP8 product: test the selected type/op values, not the loop indices (index 0 when a single type or op is requested)
+        continue;
       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
     }
   }

From f7a5df7fc430527a362716d9ba596cb33752b990 Mon Sep 17 00:00:00 2001
From: Tim <43156029+AtlantaPepsi@users.noreply.github.com>
Date: Thu, 9 Jan 2025 12:03:52 -0500
Subject: [PATCH 184/233] hot fixing ncclMemFree for mscclpp (#100)

---
 src/common.cu | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 13d430af42..96c191d8b9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1401,15 +1401,9 @@ testResult_t run() {
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-    if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
-    if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
-    if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
-#else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
-#endif
   }
   CUDACHECK(cudaFreeHost(delta));
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)

From df26b3268785e2e51e5bd5988f94d9b685232d61 Mon Sep 17 00:00:00 2001
From: Sam Wu <22262939+samjwu@users.noreply.github.com>
Date: Fri, 10 Jan 2025 16:41:43 -0700
Subject: [PATCH 185/233] Remove precheckin steps from staticanalysis (#101)

---
 .jenkins/staticanalysis.groovy | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
index adc4f07779..52702f9c3b 100644
--- a/.jenkins/staticanalysis.groovy
+++ b/.jenkins/staticanalysis.groovy
@@ -14,7 +14,6 @@ def runCI =
     nodeDetails, jobName->
 
     def prj  = new rocProject('rccl-tests', 'StaticAnalysis')
-    prj.paths.build_command = './install.sh'
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
@@ -22,31 +21,20 @@ def runCI =
     boolean formatCheck = false
     boolean staticAnalysis = true
 
-    def commonGroovy
-
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, null, null)
+    buildProject(prj, formatCheck, nodes.dockerArray, null, null, null, staticAnalysis)
 }
 
 ci: { 
     String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
 
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
+    def propertyList = [
+        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]
+    ]
     propertyList = auxiliary.appendPropertyList(propertyList)
 
-    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu22:['cpu']]),
-                       "rocm-docker":([ubuntu22:['cpu']])]
-
-    jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu22:['cpu']]
+    def jobNameList = [
+        "compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['cpu']])
+    ]
     jobNameList = auxiliary.appendJobNameList(jobNameList)
 
     propertyList.each
@@ -68,7 +56,7 @@ ci: {
     // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
     if(!jobNameList.keySet().contains(urlJobName))
     {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 0')])]))
         stage(urlJobName) {
             runCI([ubuntu22:['cpu']], urlJobName)
         }

From 5c41a915c88a4afeab628be2f58fe990aa4b6cd4 Mon Sep 17 00:00:00 2001
From: Sam Wu <22262939+samjwu@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:49:50 -0700
Subject: [PATCH 186/233] [CI] Clone rccl and build from tip of develop (#99)

- Set cron to weekly
- Remove unused properties
- Try rccl install as sudo
- Clear existing rccl repo
- Run install with sudo and env vars
- Fix path
- Add rccl to path
- Attempt to fix build and install of rccl during compile stage.
- Remove existing clone from workspace
- Fix path when install rccl
- Fix path for install rccl-tests
- Install rccl local only
- Set RCCL_DIR
- Build rccl and rccl-tests with cmake
- Add extra env vars
- Use installer instead of cmake for rccl
- Update .jenkins/common.groovy
- Get librccl.so from rccl/build/release
- Switching job command to build rccl and rccl-tests using install.sh because those work properly together.
---
 .jenkins/common.groovy     | 14 ++++++++------
 .jenkins/precheckin.groovy | 18 ++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index fe58a3e41a..b0846f62ab 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -6,17 +6,19 @@ def runCompileCommand(platform, project, jobName)
     project.paths.construct_build_prefix()
 
     String hipclangArgs = jobName.contains('hipclang') ? '--hip-clang' : ''
-    def getRCCL = auxiliary.getLibrary('rccl',platform.jenkinsLabel,'develop')
 
     def command = """#!/usr/bin/env bash
                 set -x
-                ${getRCCL}
+                cd ${project.paths.build_prefix}
+                git clone --recursive https://github.com/ROCm/rccl.git
+                cd rccl
+                ./install.sh -l
+                cd ../..
                 ${auxiliary.exitIfNotSuccess()}
+                
                 cd ${project.paths.project_build_prefix}
-                cmake \
-                    -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-                    -S . -B build
-                make -C build -j\$(nproc)
+                export RCCL_DIR=\$(pwd)/../rccl/build/release
+                ./install.sh --rccl_home \$RCCL_DIR
                 ${auxiliary.exitIfNotSuccess()}
             """
 
diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
index d316d47929..ac672a665f 100644
--- a/.jenkins/precheckin.groovy
+++ b/.jenkins/precheckin.groovy
@@ -44,18 +44,16 @@ def runCI =
 ci: { 
     String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
 
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
+    def propertyList = [
+        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 * * * 6')])]
+    ]
     propertyList = auxiliary.appendPropertyList(propertyList)
 
-    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['rccl906']]), 
-                       "rocm-docker":([ubuntu16:['rccl906']])]
-                       
-    jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu16:['rccl906']]
+    def jobNameList = [
+        "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['rccl906']])
+    ]
     jobNameList = auxiliary.appendJobNameList(jobNameList)
     
-    
     propertyList.each 
     {
         jobName, property->
@@ -75,9 +73,9 @@ ci: {
     // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
     if(!jobNameList.keySet().contains(urlJobName))
     {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 * * * 6')])]))
         stage(urlJobName) {
             runCI([ubuntu16:['rccl906']], urlJobName)
         }
     }
-}
\ No newline at end of file
+}

From f2a48983ae675f64f2338517aa9746695f4df6e9 Mon Sep 17 00:00:00 2001
From: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
Date: Sat, 11 Jan 2025 17:24:17 -0500
Subject: [PATCH 187/233] Memset to fix inflated performance when GPU is reset
 (#94)

* Memset to fix inflated performance when GPU is reset
* use hipMemset for both memsets
---
 src/common.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 96c191d8b9..c72b1a05c9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -904,6 +904,8 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
   }
+  CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
+  if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
   return testSuccess;
 }
 

From fc9917e0dab6e032779d01624cc91c4a5a47cb30 Mon Sep 17 00:00:00 2001
From: saurabhAMD <160164138+saurabhAMD@users.noreply.github.com>
Date: Sat, 11 Jan 2025 19:23:04 -0600
Subject: [PATCH 188/233] Updating to use hipDeviceMallocUncached (#95)

Use hipDeviceMallocUncached instead of hipDeviceMallocFinegrained on newer ROCm versions.
---
 src/common.cu | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index c72b1a05c9..19db735c5f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -880,9 +880,16 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     nbytes = nbytes + cache_bytes;
   }
   if (memorytype == ncclFine) {
-    CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
-    CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
-    if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+    if(HIP_VERSION >= 50700000) {
+      CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
+      CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
+      if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
+    }
+    else {
+      CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
+      CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+      if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+    }
   }
   else if (memorytype == ncclHost) {
     CUDACHECK(hipHostMalloc(sendbuff, nbytes));

From 959cc19920c2f94c4f43171fbf77c5da4de77d3f Mon Sep 17 00:00:00 2001
From: David Sidler <dasidler@amd.com>
Date: Mon, 13 Jan 2025 15:28:29 -0800
Subject: [PATCH 189/233] Add option to output results to a file (#93)

* Use find_package for MPI
* Add functionality to output results to file
* fix compilation
* report num gpus
* Revert "Use find_package for MPI"
This reverts commit c8fa253724ef4d0beac0d9c72f968062fbc6908e.
* Change inplace key
* remove dependency on json library
* Print "ranks, ranksPerNode, gpusPerRank"
* Add "nodes" field
---------

Co-authored-by: nileshnegi <Nilesh.Negi@amd.com>
---
 src/common.cu | 127 ++++++++++++++++++++++++++++++++++++++++++++++----
 src/common.h  |  25 ++++++++++
 2 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 19db735c5f..83e06597d5 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -17,6 +17,8 @@
 #include <getopt.h>
 #include <libgen.h>
 #include "cuda.h"
+#include <vector>
+#include <utility>
 
 //#define DEBUG_PRINT
 
@@ -99,6 +101,8 @@ static uint32_t cumask[4];
 static int streamnull = 0;
 static int timeout = 0;
 static int cudaGraphLaunches = 0;
+std::string output_file;
+std::string output_format;
 static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
@@ -111,6 +115,82 @@ static int enable_rotating_tensor = 0;
 static int local_register = 0;
 #endif
 
+Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) {
+  if (!fileName.empty()) {
+    if (isMainThread()) {
+      _out = std::ofstream(fileName, std::ios_base::out);
+      _outputValid = true;
+      if (_outputFormat == "csv") {
+        _out << "collective, ";
+#ifdef MPI_SUPPORT
+        _out << "ranks, rankspernode, gpusperrank, ";
+#else
+        _out << "gpus, ";
+#endif
+        _out << "size, type, redop, inplace, time, algbw, busbw, #wrong\n";
+      }
+    }
+  }
+}
+
+void Reporter::setParameters(const char* name, const char* typeName, const char* opName) {
+  if (!isMainThread() || !_outputValid)
+    return;
+
+  _collectiveName = name;
+  _typeName = typeName;
+  _opName = opName;
+}
+
+void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts) {
+  if (!isMainThread() || !_outputValid)
+    return;
+
+  std::vector<std::pair<std::string, std::string>> outputValuesKeys;
+  std::string wrongEltsStr = (wrongElts == -1) ? "N/A" : std::to_string(wrongElts);
+  int nodes = totalRanks / ranksPerNode;
+
+  outputValuesKeys.push_back(makeValueKeyPair(_collectiveName, "name"));
+#ifdef MPI_SUPPORT
+  outputValuesKeys.push_back(makeValueKeyPair(nodes, "nodes"));
+  outputValuesKeys.push_back(makeValueKeyPair(totalRanks, "ranks"));
+  outputValuesKeys.push_back(makeValueKeyPair(ranksPerNode, "ranksPerNode"));
+  outputValuesKeys.push_back(makeValueKeyPair(gpusPerRank, "gpusPerRank"));
+#else
+  outputValuesKeys.push_back(makeValueKeyPair(gpusPerRank, "gpus"));
+#endif
+  outputValuesKeys.push_back(makeValueKeyPair(numBytes, "size"));
+  outputValuesKeys.push_back(makeValueKeyPair(_typeName, "type"));
+  outputValuesKeys.push_back(makeValueKeyPair(_opName, "redop"));
+  outputValuesKeys.push_back(makeValueKeyPair(inPlace, "inPlace"));
+  outputValuesKeys.push_back(makeValueKeyPair(timeUsec, "time"));
+  outputValuesKeys.push_back(makeValueKeyPair(algBw, "algBw"));
+  outputValuesKeys.push_back(makeValueKeyPair(busBw, "busBw"));
+  outputValuesKeys.push_back(makeValueKeyPair(wrongEltsStr, "wrong"));
+
+  for (auto iter = outputValuesKeys.begin(); iter != outputValuesKeys.end(); ++iter) {
+    if (_outputFormat == "csv") {
+      _out << iter->first;
+      if (std::next(iter) != outputValuesKeys.end()) {
+        _out << ", ";
+      }
+    } else { //json
+      if (iter == outputValuesKeys.begin()) {
+        _out << "{";
+      }
+      _out << "\"" << iter->second << "\":" << iter->first;
+      if (std::next(iter) != outputValuesKeys.end()) {
+        _out << ", ";
+      } else {
+        _out << "}";
+      }
+    }
+  }
+  _out << std::endl;
+}
+
+bool Reporter::isMainThread() { return is_main_thread == 1; }
+
 #define NUM_BLOCKS 32
 
 #ifndef CHECK_HIP_ERROR
@@ -675,6 +755,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
   }
 
+  if (args->reporter) {
+    if (args->reportErrors) {
+      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, args->expectedBytes, in_place, timeUsec, algBw, busBw, wrongElts);
+    }
+    else {
+      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, args->expectedBytes, in_place, timeUsec, algBw, busBw);
+    }
+  }
+
   args->bw[0] += busBw;
   args->bw_count[0]++;
   return testSuccess;
@@ -800,18 +889,22 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   }
 #endif
 
+  if (args->reporter) {
+    args->reporter->setParameters(args->collTest->name, typeName, opName);
+  }
+
   for (size_t iter = 0; iter < stress_cycles; iter++) {
     if (iter > 0) PRINT("# Testing %lu cycle.\n", iter+1);
     // Benchmark
     for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
         setupArgs(size, type, args);
-	char rootName[100];
-	sprintf(rootName, "%6i", root);	
-	PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
-	if (enable_out_of_place) {
-        	TESTCHECK(BenchTime(args, type, op, root, 0));
-        	usleep(delay_inout_place);
-	}
+        char rootName[100];
+        sprintf(rootName, "%6i", root);
+        PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+        if (enable_out_of_place) {
+          TESTCHECK(BenchTime(args, type, op, root, 0));
+          usleep(delay_inout_place);
+        }
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
     }
@@ -977,6 +1070,8 @@ int main(int argc, char* argv[]) {
     {"cache_flush", required_argument, 0, 'F'},
     {"rotating_tensor", required_argument, 0, 'E'},
     {"local_register", required_argument, 0, 'R'},
+    {"output_file", required_argument, 0, 'x'},
+    {"output_format", required_argument, 0, 'Z'},
     {"help", no_argument, 0, 'h'},
     {}
   };
@@ -984,7 +1079,7 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:E:R:a:y:s:u:h:q:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:E:R:a:y:s:u:h:q:x:Z:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -1109,6 +1204,12 @@ int main(int argc, char* argv[]) {
         printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
 #endif
         break;
+      case 'x':
+        output_file = optarg;
+        break;
+      case 'Z':
+        output_format = optarg;
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1147,6 +1248,8 @@ int main(int argc, char* argv[]) {
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
             "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
+            "[-x,--output_file <output file name>] \n\t"
+            "[-Z,--output_format <output format <csv|json>] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -1167,6 +1270,12 @@ int main(int argc, char* argv[]) {
            (unsigned long long)maxBytes);
     return -1;
   }
+  if (!output_format.empty()) {
+    if (!(output_format == "csv" || output_format == "json")) {
+      std::cerr << "Invalid --output_format: " << output_format << "\n";
+      return -1;
+    }
+  }
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif
@@ -1342,6 +1451,7 @@ testResult_t run() {
         PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
         "(us)", "(GB/s)", "(GB/s)", "");
   }
+  Reporter reporter(output_file, output_format);
 
   struct testThread threads[nThreads];
   memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1374,6 +1484,7 @@ testResult_t run() {
     threads[t].args.bw_count=bw_count+t;
 
     threads[t].args.reportErrors = datacheck;
+    threads[t].args.reporter = &reporter;
 
     threads[t].func = parallel_init ? threadInit : threadRunTests;
     if (t)
diff --git a/src/common.h b/src/common.h
index e5c8b0fbe0..2f2082c029 100644
--- a/src/common.h
+++ b/src/common.h
@@ -18,6 +18,9 @@
 #include <pthread.h>
 #include "nccl1_compat.h"
 #include "timer.h"
+#include <string>
+#include <fstream>
+#include <iostream>
 
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
@@ -103,6 +106,26 @@ extern struct testColl broadcastTest;
 extern struct testColl reduceTest;
 extern struct testColl alltoAllTest;
 
+class Reporter {
+  public:
+    Reporter(std::string fileName, std::string outputFormat);
+    ~Reporter() { if (_outputValid) { _out.close(); } };
+    void setParameters(const char* name, const char* typeName, const char* opName);// {
+    void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts = -1);
+
+  private:
+    bool isMainThread();
+    template<typename T> std::pair<std::string, std::string> makeValueKeyPair(T v, std::string k) { return std::make_pair(std::to_string(v), k); };
+    template <> std::pair<std::string, std::string> makeValueKeyPair<std::string>(std::string v, std::string k) { return std::make_pair("\"" + v + "\"", k); };
+
+    bool _outputValid = false;
+    std::ofstream _out;
+    std::string _outputFormat;
+    std::string _collectiveName;
+    std::string _typeName;
+    std::string _opName;
+};
+
 struct testEngine {
   void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
   testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
@@ -147,6 +170,8 @@ struct threadArgs {
   int reportErrors;
 
   struct testColl* collTest;
+
+  Reporter* reporter;
 };
 
 typedef testResult_t (*threadFunc_t)(struct threadArgs* args);

From 46152785f0fc67b8aa9538a63b1e931b2a79b338 Mon Sep 17 00:00:00 2001
From: David Sidler <dasidler@amd.com>
Date: Tue, 14 Jan 2025 09:49:20 -0800
Subject: [PATCH 190/233] Use find_package for MPI (#92)

* Use find_package for MPI
* Minor fixes
---
 CMakeLists.txt | 91 ++++++++------------------------------------------
 README.md      |  4 +--
 2 files changed, 15 insertions(+), 80 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5800ef84b..a772522ca4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,27 +3,6 @@
 # ########################################################################
 #Adding pthread flag for linking
 set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
-macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
-    find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
-    if (MPI_MPICXX)
-        message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
-        find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
-        message ("-- mpi.h is in ${MPI_H}")
-        find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
-        message ("-- libmpi is ${MPI_LIB}")
-	if (NOT MPI_H OR NOT MPI_LIB)
-	    set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
-	    set (MPI_H "MPI_H-NOTFOUND")
-	    set (MPI_LIB "MPI_LIB-NOTFOUND")
-	else()
-            add_definitions(-DMPI_SUPPORT)
-            include_directories(${mpi_inc_dir})
-            link_libraries(${MPI_LIB})
-	endif()
-    else()
-        message ("-- ${mpi_compiler} not found")
-    endif()
-endmacro()
 
 cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
 
@@ -51,8 +30,7 @@ include(ROCMCheckTargetIds)
 include(ROCMClients)
 
 # Build variables
-option(NO_MPI "Build RCCL-tests without MPI support.")
-option(MPI_PATH "Use MPI in the specified directory.")
+option(USE_MPI "Build RCCL-tests with MPI support.")
 
 # Default GPU architectures to build
 #==================================================================================================
@@ -84,64 +62,23 @@ set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architect
 message(STATUS "Compiling for ${GPU_TARGETS}")
 
 find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
+if (RCCL_FOUND)
+    message(STATUS "RCCL version : ${RCCL_VERSION}")
+    message(STATUS "RCCL include path : ${RCCL_INCLUDE_DIRS}")
+    message(STATUS "RCCL libraries : ${RCCL_LIBRARIES}")
+endif()
 
-if (NOT NO_MPI)
-    # CHECK for MPI Path first. User requested this directory explicitely
-    if (MPI_PATH)
-        set(mpi_spec_bin_dir "${MPI_PATH}/bin")
-	set(mpi_spec_inc_dir "${MPI_PATH}/include")
-        check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
-	if (NOT MPI_MPICXX)
-            # Since the user explicitely requested this directory, abort if something went wrong.
-	    MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
-        endif()
-    endif()
-
-    # Check for MPICH Ubuntu installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
-    endif()
-
-    # Check for Open MPI Ubuntu installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin  /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
-    endif()
-
-    # Check for MPICH RHEL installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
-    endif()
-
-    # Check for Open MPI RHEL installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x64_64)
-    endif()
-
-    # Check for MPICH SLES installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
-    endif()
-
-    # Check for Open MPI v4 SLES installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
-    endif()
-
-    # Check for Open MPI v3 SLES installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
-    endif()
-
-    # Check for Open MPI v2 SLES installation
-    if (NOT MPI_MPICXX)
-        check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
-    endif()
-
-    if (NOT MPI_MPICXX)
+if (USE_MPI)
+    find_package(MPI REQUIRED)
+    if (MPI_FOUND)
+        message(STATUS "MPI include path : ${MPI_CXX_INCLUDE_PATH}")
+        message(STATUS "MPI libraries : ${MPI_CXX_LIBRARIES}")
+        add_definitions(-DMPI_SUPPORT)
+    else()
         message ("-- no MPI library found")
     endif()
 else()
-    message ("-- MPI support explicitely disabled")
+    message ("-- MPI support disabled")
 endif()
 
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
diff --git a/README.md b/README.md
index 9470ef8348..02a82ee71d 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,7 @@ $ make
 When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
 for cmake target and config files that are created during the RCCL build.
 
-Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitly request
-MPI builds. A user can request to use a particular MPI library by using the MPI_PATH variable. MPI support can be explicitely disabled by adding the -DNO_MPI=1
-flag to the cmake command line.
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line. A user can request to use a particular MPI library by setting the environment variable `MPI_HOME` or add the path of the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`.
 
 
 ## Usage

From 448c4c7269375f41514525bf65c3a77f65dbc476 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Thu, 16 Jan 2025 18:05:48 -0600
Subject: [PATCH 191/233] [GIT] Add CODEOWNERS and PR Template (#102)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 .github/CODEOWNERS               |  6 ++++++
 .github/PULL_REQUEST_TEMPLATE.md | 16 ++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100755 .github/CODEOWNERS
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100755
index 0000000000..072f7d1a07
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,6 @@
+* @wenkaidu @gilbertlee-amd @akolliasAMD @edgargabriel @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @haripriya-amd
+
+# Documentation files
+doc/ @ROCm/rocm-documentation
+*.md @ROCm/rocm-documentation
+*.rst @ROCm/rocm-documentation
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000000..426bf3d7db
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,16 @@
+## Details
+___Do not mention proprietary info or link to internal work items in this PR.___
+
+**Work item:** _"Internal", or link to GitHub issue (if applicable)._
+
+**What were the changes?**  
+_One sentence describing the work done._
+
+**Why were the changes made?**  
+_Explain the motivation behind the work. Provide any publicly-available historical context._
+
+**How was the outcome achieved?**  
+_Technical details behind the work. Explain any publicly-available hardware peculiarities._
+
+**Additional Documentation:**  
+_What else should the reviewer know?_

From cb6a46fdd677783eec470e3df09aa138891bfebf Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 23 Jan 2025 12:57:51 -0800
Subject: [PATCH 192/233] Update CUDA gencodes

Add support for Blackwell sm100 and sm120 from CUDA 12.8

Add support for Hopper sm90 from CUDA 12.0
---
 src/Makefile | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 393de8e41b..5737092a86 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,15 +16,30 @@ CUDARTLIB ?= cudart
 
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
 NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
                 -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_80,code=sm_80 \
-                -gencode=arch=compute_80,code=compute_80
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
 else
 NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \

From a89cf07fe879e1c0187a4f617f873ae47d69af6b Mon Sep 17 00:00:00 2001
From: Junyu Ma <junyum@nvidia.com>
Date: Thu, 23 Jan 2025 11:09:09 -0800
Subject: [PATCH 193/233] Perftests: Introduce NCCL_TESTS_SPLIT env

`NCCL_TESTS_SPLIT` provides a new way of computing the color used for splitting communicators.

It is overridden by `NCCL_TESTS_SPLIT_MASK` when both are set.

Examples:

NCCL_TESTS_SPLIT_MASK="0x7" # color = rank & 0x7. What we do today to run on a DGX with one GPU per node.
NCCL_TESTS_SPLIT="AND 0x7"  # color = rank & 0x7. New way to run on one GPU per node on a DGX, equivalent to NCCL_TESTS_SPLIT_MASK=0x7
NCCL_TESTS_SPLIT="MOD 72"   # color = rank % 72.  One GPU per NVLink domain on an NVL72 system.
NCCL_TESTS_SPLIT="DIV 72"   # color = rank / 72.  Intra NVLink domain on NVL72.

You can also use the short forms "&" (AND), "|" (OR), "%" (MOD) and "/" (DIV).
Spaces between the operator and the number are automatically ignored.
The operator name is not case sensitive.

The following are all equivalent:

NCCL_TESTS_SPLIT="&0x7"
NCCL_TESTS_SPLIT="&0b111"
NCCL_TESTS_SPLIT="AND 7"
NCCL_TESTS_SPLIT="and 0x7"
---
 src/common.cu | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 6d103d797d..9277ea2b15 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -10,6 +10,8 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include <string.h>
+#include <ctype.h>
 #include "cuda.h"
 
 #include "../verifiable/verifiable.h"
@@ -892,6 +894,26 @@ int main(int argc, char* argv[]) {
   return 0;
 }
 
+#ifdef MPI_SUPPORT
+// parse int for base 2/10/16, will ignore first whitespaces
+static bool parseInt(char *s, int *num) {
+  char *p = NULL;
+  if (!s || !num)
+    return false;
+  while (*s && isspace(*s)) ++s;
+  if (!*s) return false;
+
+  if (strncasecmp(s, "0b", 2) == 0)
+    *num = (int)strtoul(s + 2, &p, 2);
+  else
+    *num = (int)strtoul(s, &p, 0);
+
+  if (p == s)
+    return false;
+  return true;
+}
+#endif
+
 testResult_t run() {
   int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
   int localRank = 0;
@@ -909,10 +931,33 @@ testResult_t run() {
     if (hostHashs[p] == hostHashs[proc]) localRank++;
   }
 
-  char* str = getenv("NCCL_TESTS_SPLIT_MASK");
-  uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
+  char *splitMaskEnv = NULL;
+  if ((splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK"))) {  // legacy hex mask; checked first, so it wins
+    color = proc & strtoul(splitMaskEnv, NULL, 16);
+  } else if ((splitMaskEnv = getenv("NCCL_TESTS_SPLIT"))) {  // "<op><value>"; op is matched case-insensitively
+    if (
+      (strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) ||
+      (strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color))
+    )
+        color = proc & color;
+    if (
+      (strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) ||
+      (strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color))
+    )
+        color = proc | color;
+    if (
+      (strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) ||
+      (strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color))
+    )
+        color = color ? proc % color : 0;  // "MOD 0" would divide by zero; keep every rank in group 0
+    if (
+      (strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) ||
+      (strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color))
+    )
+        color = color ? proc / color : 0;  // "DIV 0" would divide by zero; keep every rank in group 0
+  }
+
   MPI_Comm mpi_comm;
-  color = proc & mask;
   MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
   MPI_Comm_size(mpi_comm, &ncclProcs);
   MPI_Comm_rank(mpi_comm, &ncclProc);

From 903918fc545fff518adf5411a8d5b3c99f5aceab Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Thu, 6 Feb 2025 14:10:07 +0100
Subject: [PATCH 194/233] Add NCCL_TESTS_SPLIT documentation in the README

---
 README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/README.md b/README.md
index 44e406a633..957f6afb90 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,23 @@ All tests support the same set of arguments :
   * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
+### Running multiple operations in parallel
+
+NCCL tests can partition the set of GPUs into smaller sets, each executing the same operation in parallel.
+To split the GPUs, NCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
+with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
+
+`NCCL_TESTS_SPLIT` takes the following syntax: `<operation><value>`. Operation can be `AND`, `OR`, `MOD` or `DIV`. The `&`, `|`, `%`, and `/` symbols are also supported. The value can be either decimal, hexadecimal (prefixed by `0x`) or binary (prefixed by `0b`).
+
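+`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`, except that the mask value is always parsed as hexadecimal and it takes precedence if both variables are set.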
+
+Here are a few examples:
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
+ - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+ - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
+
 ## Copyright
 
 NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.

From b4300cc79d05dd35e26f13afcdb4938f29aa31a5 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 28 Feb 2025 13:23:26 -0800
Subject: [PATCH 195/233] Add PCI domain and device ID for GPU device BDF
 display

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 9277ea2b15..0d4dfc1944 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -984,8 +984,8 @@ testResult_t run() {
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
-                    rank, color, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%02x:%02x] %s\n",
+                    rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 

From 284ff2ac84d38456ce5ab837edf70d01c48f926c Mon Sep 17 00:00:00 2001
From: Alex Breslow <alex.breslow@amd.com>
Date: Tue, 8 Apr 2025 11:19:45 -0500
Subject: [PATCH 196/233] Add instructions to README regarding benchmarking on
 pre ROCm 6.4.x versions with HSA_NO_SCRATCH_RECLAIM=1 (#114)

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 02a82ee71d..c89c15bb28 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,18 @@ Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, whic
 
 See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.
 
+### Environment variables
+On some older versions of ROCm before 6.4.0, setting `HSA_NO_SCRATCH_RECLAIM=1`
+ as part of the environment might be necessary to achieve better performance.  When running without MPI, a command similar to the following one should be sufficient:
+```shell
+HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
+```
+
+For MPI, you might need to use a command similar to the following:
+```shell
+mpirun.mpich -np 8 -env NCCL_DEBUG=VERSION -env HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8M -e 128M -i 8388608 -g 1 -d bfloat16
+```
+
 ### Arguments
 
 All tests support the same set of arguments :

From 5e838ad9df47079e0e586ed38049f4f579ea462d Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Tue, 15 Apr 2025 09:38:33 -0500
Subject: [PATCH 197/233] skipping the prod test for FP8 types in reduce and
 reduce-scatter (#111)

* skipping the prod test for FP8 types in reduce and reduce-scatter
---------

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
---
 src/all_reduce.cu     | 6 +++---
 src/reduce.cu         | 4 ++++
 src/reduce_scatter.cu | 4 ++++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 5302f86833..9385c817f7 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -65,8 +65,6 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
-  if((type == ncclFp8E4M3 || type == ncclFp8E5M2) && op == ncclProd)
-    return testSuccess;
 
   if ((int)type != -1) {
     type_count = 1;
@@ -90,8 +88,10 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
 
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
-      if((i == ncclFp8E4M3 || i == ncclFp8E5M2) && j == ncclProd)
+#if defined(RCCL_FLOAT8)
+      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
         continue;
+#endif
       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
     }
   }
diff --git a/src/reduce.cu b/src/reduce.cu
index dd90c25bf4..f08521e7f6 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -95,6 +95,10 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
 
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
+#if defined(RCCL_FLOAT8)
+      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
+        continue;
+#endif
       for (int k=begin_root; k<=end_root; k++) {
         TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
       }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 2f6c8c56d6..76e5ef0164 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -92,6 +92,10 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
 
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
+#if defined(RCCL_FLOAT8)
+      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
+        continue;
+#endif
       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
     }
   }

From 501a149d575fa62b7d8bea5b2bd20304dba55025 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 18 Apr 2025 19:20:59 -0700
Subject: [PATCH 198/233] Add support for FP8 datatypes

Added new datatypes: f8e4m3, f8e5m2

Only supported on H100+ architectures and NCCL versions >= 2.24.0
---
 src/common.cu            |  71 +++++-
 src/common.h             |  20 +-
 verifiable/verifiable.cu | 473 +++++++++++++++++++++++++++------------
 verifiable/verifiable.h  |   7 +-
 4 files changed, 415 insertions(+), 156 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 0d4dfc1944..f83cdf009a 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -21,15 +21,21 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
 #if NCCL_MAJOR >= 2
   ncclDataType_t test_types[ncclNumTypes] = {
     ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
-  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #if HAVE_BF16
     , ncclBfloat16
   #endif
+  #if HAVE_FP8
+    , ncclFloat8e4m3, ncclFloat8e5m2
+  #endif
   };
   const char *test_typenames[ncclNumTypes] = {
     "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
-  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #if HAVE_BF16
     , "bfloat16"
   #endif
+  #if HAVE_FP8
+    , "f8e4m3", "f8e5m2"
+  #endif
   };
   int test_typenum = -1;
 
@@ -86,6 +92,7 @@ static int average = 1;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
 static int local_register = 0;
 #endif
+static int minCudaArch = 1<<30;
 
 #define NUM_BLOCKS 32
 
@@ -126,18 +133,18 @@ static double parsesize(const char *value) {
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault));
   CUDACHECK(cudaDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
@@ -358,9 +365,12 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       union {
         int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
         half f16; float f32; double f64;
-        #if defined(__CUDA_BF16_TYPES_EXIST__)
+        #if HAVE_BF16
         __nv_bfloat16 bf16;
         #endif
+        #if HAVE_FP8
+        __nv_fp8_e4m3 f8e4m3; __nv_fp8_e5m2 f8e5m2;
+        #endif
       };
       switch(type) {
       case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
@@ -372,9 +382,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
+      #if HAVE_BF16
       case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
       #endif
+      #if HAVE_FP8
+      case ncclFloat8e4m3: f8e4m3 = ncclVerifiablePremulScalar<__nv_fp8_e4m3>(rank); break;
+      case ncclFloat8e5m2: f8e5m2 = ncclVerifiablePremulScalar<__nv_fp8_e5m2>(rank); break;
+      #endif
+      default: break; // Just to silence clang
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
     }
@@ -702,13 +717,20 @@ int main(int argc, char* argv[]) {
     test_typenum = 9;
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
       test_opnum++; // ncclAvg
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
-        test_typenum++; // bfloat16
-      #endif
     }
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
       test_opnum++; // PreMulSum
     }
+    #if defined(__CUDA_BF16_TYPES_EXIST__)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
+      test_typenum++; // bfloat16
+    }
+    #endif
+    #if defined(__CUDA_FP8_TYPES_EXIST__)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
+      test_typenum += 2; // fp8 e4m3,e5m2
+    }
+    #endif
   #endif
 
   // Parse args
@@ -1033,12 +1055,37 @@ testResult_t run() {
     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
-    if (streamnull)
+    if (streamnull) {
       streams[i] = NULL;
-    else
+    }
+    else {
       CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
+    }
+    int archMajor, archMinor;
+    CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i]));
+    CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i]));
+    minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor);
   }
 
+#ifdef MPI_SUPPORT
+  MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+#if HAVE_FP8  // the body names ncclFloat8e4m3/e5m2, so guard on the same macro that adds them to test_types
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
+    if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
+      int n = 0;
+      for (int i=0; i < test_typenum; i++) {  // compact both arrays in place, dropping the fp8 entries
+        if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
+          test_types[n] = test_types[i];
+          test_typenames[n] = test_typenames[i];
+          n += 1;
+        }
+      }
+      test_typenum = n;
+    }
+  }
+#endif
+
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
diff --git a/src/common.h b/src/common.h
index 478d7fb1c0..ff834f699d 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,16 +213,34 @@ static uint64_t getHostHash(const char* hostname) {
   return getHash(hostHash, strlen(hostHash));
 }
 
+#define HAVE_BF16 0
+#define HAVE_FP8 0
+
+#if NCCL_MAJOR >= 2
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    #undef HAVE_BF16
+    #define HAVE_BF16 1
+    #if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
+      #undef HAVE_FP8
+      #define HAVE_FP8 1
+    #endif
+  #endif
+#endif
+
 static size_t wordSize(ncclDataType_t type) {
   switch(type) {
     case ncclChar:
 #if NCCL_MAJOR >= 2
     //case ncclInt8:
     case ncclUint8:
+#endif
+#if HAVE_FP8
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
 #endif
       return 1;
     case ncclHalf:
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if HAVE_BF16
     case ncclBfloat16:
 #endif
     //case ncclFloat16:
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 5f617ee188..dcd6e6c939 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -8,6 +8,15 @@
 #if CUDART_VERSION >= 11000
 #include <cuda_bf16.h>
 #endif
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && defined(__CUDA_FP8_TYPES_EXIST__)
+  #define HAVE_ncclFloat8 1
+#else
+  #define HAVE_ncclFloat8 0
+#endif
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
   #define HAVE_ncclBfloat16 1
@@ -84,10 +93,16 @@ template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
 struct IsIntegral<half>: std::false_type {};
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if HAVE_ncclBfloat16
 template<>
 struct IsIntegral<__nv_bfloat16>: std::false_type {};
 #endif
+#if HAVE_ncclFloat8
+template<>
+struct IsIntegral<__nv_fp8_e4m3>: std::false_type {};
+template<>
+struct IsIntegral<__nv_fp8_e5m2>: std::false_type {};
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -107,23 +122,72 @@ __host__ __device__ T inhibit(T x) {
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-  template<typename Y, typename X>
-  __host__ __device__ Y castTo(X x) {
+  template<typename Y>
+  __host__ __device__ Y castTo(uint64_t x) {
     return Y(x);
   }
   template<typename Y>
   __host__ __device__ Y castTo(float x) {
     return Y(x);
   }
+  template<typename Y>
+  __host__ __device__ Y castTo(double x) {
+    return Y(x);
+  }
+
   template<>
   __host__ __device__ half castTo<half>(float x) {
     return __float2half(x);
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  template<>
+  __host__ __device__ half castTo<half>(double x) {
+    return __double2half(x);
+  }
+  template<>
+  __host__ __device__ half castTo<half>(uint64_t x) {
+    return __ull2half_rn(x);
+  }
+
+  #if HAVE_ncclBfloat16
   template<>
   __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
     return __float2bfloat16(x);
   }
+  template<>
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(double x) {
+    return __double2bfloat16(x);
+  }
+  template<>
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(uint64_t x) {
+    return __double2bfloat16((double)x);
+  }
+  #endif
+
+  #if HAVE_ncclFloat8
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(float x) {
+    return __nv_fp8_e4m3(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(double x) {
+    return __nv_fp8_e4m3(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(uint64_t x) {
+    return __nv_fp8_e4m3((double)x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(float x) {
+    return __nv_fp8_e5m2(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(double x) {
+    return __nv_fp8_e5m2(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(uint64_t x) {
+    return __nv_fp8_e5m2((double)x);
+  }
   #endif
 }
 
@@ -151,7 +215,7 @@ struct ReduceSum {
       return __float2half(__half2float(a) + __half2float(b));
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hadd(a, b);
@@ -160,6 +224,22 @@ struct ReduceSum {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hadd(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) + float(b));
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hadd(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) + float(b));
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -175,7 +255,7 @@ struct ReduceProd {
       return __float2half(__half2float(a) * __half2float(b));
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmul(a, b);
@@ -184,6 +264,22 @@ struct ReduceProd {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmul(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) * float(b));
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmul(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) * float(b));
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -201,7 +297,7 @@ struct ReduceMin {
       return __half2float(a) < __half2float(b) ? a : b;
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmin(a, b);
@@ -212,6 +308,22 @@ struct ReduceMin {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmin(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) < float(b) ? a : b);
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmin(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) < float(b) ? a : b);
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -229,7 +341,7 @@ struct ReduceMax {
       return __half2float(a) > __half2float(b) ? a : b;
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmax(a, b);
@@ -240,6 +352,22 @@ struct ReduceMax {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmax(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) > float(b) ? a : b);
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmax(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) > float(b) ? a : b);
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -297,29 +425,47 @@ struct ReduceAvg {
 
 namespace {
 template<typename T>
-struct FloatLayout;
+struct FloatLayout { static constexpr bool is_floating_point = false; };
 template<>
 struct FloatLayout<float> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 23;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<double> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 11, mantissa_bits = 52;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<half> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 5, mantissa_bits = 10;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if HAVE_ncclBfloat16
 template<>
 struct FloatLayout<__nv_bfloat16> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #endif
+#if HAVE_ncclFloat8
+template<>
+struct FloatLayout<__nv_fp8_e4m3> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<__nv_fp8_e5m2> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+#endif
 
 template<typename T>
 __host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
@@ -632,11 +778,12 @@ __host__ __device__ void genOutput(
 ////////////////////////////////////////////////////////////////////////////////
 // Nil reduction (byte copy functions). Optimized to assume rank_n=1
 
+// genInput specialization for integer ReduceNil.
 namespace {
-template<typename T, bool IsIntegral>
+template<typename T>
 __host__ __device__ void genInput(
     T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
-    std::integral_constant<bool, IsIntegral>
+    std::true_type /*integral*/
   ) {
   (void)rank_n, (void)rank_me; // silence unused warnings
   union { uint64_t bits; T tmp; };
@@ -646,6 +793,24 @@ __host__ __device__ void genInput(
   ans = tmp;
 }
 
+// genInput specialization for floating point ReduceNil.
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  (void)rank_n; // silence unused warnings
+  constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
+  int sign = rng & 1;
+  rng ^= rng>>1;
+  int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
+  exp += 1<<(FloatLayout<T>::exponent_bits-2);
+  rng ^= rng >> FloatLayout<T>::exponent_bits;
+  uint64_t mant = rng & mant_mask;
+  ans = makeFloat<T>(sign, exp, mant);
+}
+
 template<typename T, typename ReduceFn, bool IsIntegral>
 __host__ __device__ void genOutput(
     T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
@@ -734,20 +899,34 @@ __host__ __device__ void genOutput(
 namespace {
 template<typename T>
 __host__ __device__ void genInput(
-    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
+  // We can't control the nranks divisor in averages so to control error we
+  // limit to two ranks contributing non-zero values. This way there is no ambiguity
+  // of summation.
+  int r = shuffleRank(rank_n, rank_me, rng);
+  uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  ans = r < 2 ? castTo<T>(1+m) : castTo<T>((uint64_t)0);
 }
 
 template<typename T>
 __host__ __device__ void genOutput(
-    T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
-  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  ans = ReduceProd()(ans, T1(1)/T1(rank_n));
+  shuffleRank(rank_n, -1, rng);
+  uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  if (rank_n == 1) {
+    ans = castTo<T>(1+m0);
+  } else {
+    // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__.
+    // We account for this by using a tolerance of 2 ulps during the verification.
+    using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type;
+    ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)),
+                      (T)(TMul(1+m1)*TMul(1.0/rank_n)));
+  }
 }
 }
 
@@ -809,10 +988,9 @@ __host__ __device__ T genOutput(
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareInput2(
+__global__ void __launch_bounds__(512, 1) prepareInput2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -833,40 +1011,49 @@ __global__ void prepareInput2(
 }
 
 template<typename ReduceOp>
-void prepareInput1(
+cudaError_t prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclInt8: fn = (void const*)&prepareInput2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareInput2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareInput2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareInput2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareInput2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareInput2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareInput2<half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareInput2<__nv_bfloat16, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: fn = (void const*)&prepareInput2<__nv_fp8_e4m3, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareInput2<__nv_fp8_e5m2, ReduceOp>; break;
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareInput2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareInput2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareInput(
+cudaError_t ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
     else \
-      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -882,14 +1069,12 @@ void ncclVerifiablePrepareInput(
   }
   #undef CASE_OP
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareExpected2(
+__global__ void __launch_bounds__(512, 1) prepareExpected2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -909,40 +1094,49 @@ __global__ void prepareExpected2(
 }
 
 template<typename ReduceOp>
-void prepareExpected1(
+cudaError_t prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclInt8: fn = (void const*)&prepareExpected2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareExpected2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareExpected2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareExpected2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareExpected2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareExpected2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareExpected2<half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareExpected2<__nv_bfloat16, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<__nv_fp8_e4m3, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<__nv_fp8_e5m2, ReduceOp>; break;
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareExpected2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareExpected2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareExpected(
+cudaError_t ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
     else \
-      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -958,52 +1152,10 @@ void ncclVerifiablePrepareExpected(
   }
   #undef CASE_OP
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-/* How we compare floating point values when exactness is impossible is interesting.
- * First, we take note that simply reinterpreting integer bits as floating point
- * gives us a monotonic mapping which exponentially spaces out floats. Thus
- * consecutive integers encode consecutive floats. In general, using integer
- * subraction on the bitpatterns of two floats gives us an integer which is the
- * logarithm of their relative difference. But, if the floats always have similar
- * exponents, than the integer difference is actually proportional to the
- * relative error (this is because we are counting hops in the mantissa bits only,
- * not the exponent bits). So a cheap way to compare if two floats are relatively
- * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula
- * calculates such a tolerance for a summation of n floats. This formula
- * was derived by inspecting the maximum observed integer difference over many
- * random runs of summation. The parameter values were computed by the
- * companion program "inexact_regress.cu".
- */
-__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
-  float power, coef;
-  switch(elt_ty) {
-  case ncclFloat32:
-  case ncclFloat64:
-    power = .51f;
-    coef = 1.25f;
-    break;
-  case ncclFloat16:
-    power = .91f;
-    coef = .75f;
-    break;
-  #if HAVE_ncclBfloat16
-  case ncclBfloat16:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  }
-  #if __CUDA_ARCH__
-    return 1 + unsigned(coef*powf(float(rank_n), power));
-  #else
-    return 1 + unsigned(coef*std::pow(float(rank_n), power));
-  #endif
-}
-
 template<typename T>
 __host__ __device__  uint64_t calcDelta(T a, T b) {
   union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
@@ -1020,10 +1172,9 @@ __host__ __device__  uint64_t calcDelta(T a, T b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T>
-__global__ void verifyPrepared(
+__global__ void __launch_bounds__(512, 1) verifyPrepared(
     T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
   ) {
   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
@@ -1039,16 +1190,34 @@ __global__ void verifyPrepared(
     bad += tolerance < delta ? 1 : 0;
     #if 0
       if(tolerance < delta) {
-        printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
+        printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance);
       }
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
+}
+
+cudaError_t verifyPrepared1(int bytePerElt,
+    void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+  ) {
+  void const *fn = nullptr;
+  switch(bytePerElt) {
+  case 1: fn = (void const*)&verifyPrepared<uint8_t>; break;
+  case 2: fn = (void const*)&verifyPrepared<uint16_t>; break;
+  case 4: fn = (void const*)&verifyPrepared<uint32_t>; break;
+  case 8: fn = (void const*)&verifyPrepared<uint64_t>; break;
+  default: assert(0); return cudaErrorInvalidValue;
+  }
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 
 template<typename T, typename Uint, typename ReduceFn>
-__global__ void verifyInline2(
+__global__ void __launch_bounds__(512, 1) verifyInline2(
     T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
     intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
   ) {
@@ -1077,39 +1246,52 @@ __global__ void verifyInline2(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
 }
 
 template<typename T, typename Uint>
-void verifyInline1(
+cudaError_t verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
   ) {
+  void const *fn = nullptr;
+  ReduceNil opnil;
+  ReduceSum opsum;
+  ReduceMin opmin;
+  ReduceMax opmax;
+  ReduceProd opprod;
+  ReduceAvg opavg{rank_n};
+  ReducePreMulSum oppremulsum;
+  void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
   #define CASE_OP(op) \
-    if(rank_n == 1) \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    else \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    break;
+    if(rank_n == 1) { \
+      fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
+      args[2] = &opnil; \
+    } else { \
+      fn = (void const*)&verifyInline2<T, Uint, decltype(op)>; \
+      args[2] = &op; \
+    } break;
   switch(red_op) {
-  case ncclSum: CASE_OP(ReduceSum())
-  case ncclMin: CASE_OP(ReduceMin())
-  case ncclMax: CASE_OP(ReduceMax())
-  case ncclProd: CASE_OP(ReduceProd())
+  case ncclSum: CASE_OP(opsum)
+  case ncclMin: CASE_OP(opmin)
+  case ncclMax: CASE_OP(opmax)
+  case ncclProd: CASE_OP(opprod)
   #if HAVE_ncclAvg
-  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
+  case ncclAvg: CASE_OP(opavg)
   #endif
   #if HAVE_ncclPreMulSum
-  default: CASE_OP(ReducePreMulSum())
+  default: CASE_OP(oppremulsum)
   #endif
   }
   #undef CASE_OP
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiableVerify(
+cudaError_t ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     int64_t *bad_elt_n, cudaStream_t stream
@@ -1118,11 +1300,21 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
     floating |= elt_ty == ncclBfloat16;
   #endif
-
+  #if HAVE_ncclFloat8
+    floating |= elt_ty == ncclFloat8e4m3;
+    floating |= elt_ty == ncclFloat8e5m2;
+  #endif
+  
   unsigned tolerance = 0;
   #if HAVE_ncclAvg
-  if (floating && red_op == ncclAvg)
-    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
+  if (floating && red_op == ncclAvg) {
+    // Average does it's pre-multiplies in an unspecified floating point format
+    // (could be the actual type T or float or half). That means the premultiply
+    // verify does could generate a discrepancy in the least mantissa digit. After
+    // adding those two (since avg only has two non-zero contributions) we could
+    // be off by a distance of 2 units.
+    tolerance = 2;
+  }
   #endif
 
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
@@ -1130,9 +1322,9 @@ void ncclVerifiableVerify(
   *bad_elt_n = 0;
   #define CASE_TY(T, Uint) { \
       if(expected != nullptr) { \
-        verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
+        return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \
       } else { \
-        verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
+        return verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
       } \
     } break;
   switch(elt_ty) {
@@ -1143,29 +1335,30 @@ void ncclVerifiableVerify(
   case ncclInt64: CASE_TY(int64_t, uint64_t)
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
   case ncclFloat16: CASE_TY(half, uint16_t)
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: CASE_TY(__nv_fp8_e4m3, uint8_t)
+  case ncclFloat8e5m2: CASE_TY(__nv_fp8_e5m2, uint8_t)
+  #endif
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
-  default: assert(0);
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if SELF_TEST
-#include <iostream>
-
+namespace {
 template<typename T, typename Op>
 __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
   //if(!std::is_same<T,half>::value) return;
   //if(!std::is_same<Op,ReduceProd>::value) return;
   //if(rank_n!=3) return;
 
-  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
+  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? 2 : 0;
   uint64_t seed = 0xc8e2bed69766d533;
 
   for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
@@ -1202,7 +1395,7 @@ __device__ void sweep1(int ty, char const *tyname) {
   }
 }
 
-__global__ void sweep() {
+__global__ void __launch_bounds__(512, 1) sweep() {
   sweep1<int8_t>(ncclInt8, "int8");
   sweep1<uint8_t>(ncclUint8, "uint8");
   sweep1<int32_t>(ncclInt32, "int32");
@@ -1210,18 +1403,18 @@ __global__ void sweep() {
   sweep1<int64_t>(ncclInt64, "int64");
   sweep1<uint64_t>(ncclUint64, "uint64");
   sweep1<half>(ncclFloat16, "half");
+  #if HAVE_ncclFloat8
+    sweep1<__nv_fp8_e4m3>(ncclBfloat16, "float8e4m3");
+    sweep1<__nv_fp8_e5m2>(ncclBfloat16, "float8e5m2");
+  #endif
   #if HAVE_ncclBfloat16
     sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
 }
-
-int main(int arg_n, char **args) {
-  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
-  sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
-  return 0;
 }
-#endif
+
+void ncclVerifiableLaunchSelfTest() {
+  sweep<<<1,512>>>();
+}
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index aca0565a6b..71d5ef6649 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -34,13 +34,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 }
 
 // Enqueue kernel to generate data which is to be reduced.
-void ncclVerifiablePrepareInput(
+cudaError_t ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
-void ncclVerifiablePrepareExpected(
+cudaError_t ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
@@ -51,9 +51,10 @@ void ncclVerifiablePrepareExpected(
 // which can be costly. Thus if you plan to run the same reduction multiple
 // times it is advantageous to precompute the expected values with
 // ncclVerifiablePrepareExpected and pass them as `expected` here.
-void ncclVerifiableVerify(
+cudaError_t ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
   int64_t *bad_elt_n, cudaStream_t stream
 );
+
 #endif

From 1021260ca94ea73dcedc8a15ffc6dbfb12504b65 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 21 Apr 2025 11:26:35 -0700
Subject: [PATCH 199/233] Make verifiable a DSO and add NAME_SUFFIX support

Build option DSO=1 generates libverifiable.so which can be
used to reduce the combined binary size.

Build option NAME_SUFFIX can be used to add a suffix to all
generated binaries. e.g. NAME_SUFFIX=_mpi

Added new make target: clean_intermediates
---
 README.md                | 32 +++++++++-----
 src/Makefile             | 93 +++++++++++-----------------------------
 src/common.mk            | 69 +++++++++++++++++++++++++++++
 verifiable/Makefile      | 17 +++++---
 verifiable/main.cu       | 14 ++++++
 verifiable/verifiable.h  |  4 ++
 verifiable/verifiable.mk | 15 +++++--
 7 files changed, 156 insertions(+), 88 deletions(-)
 create mode 100644 src/common.mk
 create mode 100644 verifiable/main.cu

diff --git a/README.md b/README.md
index 957f6afb90..bdafbe5a16 100644
--- a/README.md
+++ b/README.md
@@ -4,33 +4,43 @@ These tests check both the performance and the correctness of [NCCL](http://gith
 
 ## Build
 
-To build the tests, just type `make`.
+To build the tests, just type `make` or `make -j`
 
-If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
+If CUDA is not installed in `/usr/local/cuda`, you may specify `CUDA_HOME`. Similarly, if NCCL is not installed in `/usr`, you may specify `NCCL_HOME`.
 
 ```shell
 $ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
 ```
 
-NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
+NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed.
 
 ```shell
 $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
 ```
 
+You can also add a suffix to the name of the generated binaries with `NAME_SUFFIX`. For example when compiling with the MPI versions you could use:
+
+```shell
+$ make MPI=1 NAME_SUFFIX=_mpi MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+```
+
+This will generate test binaries with names such as `all_reduce_perf_mpi`.
+
 ## Usage
 
-NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
+NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to `(number of processes)*(number of threads)*(number of GPUs per thread)`.
 
 ### Quick examples
 
 Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
 Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
 (NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
+
 ```shell
 $ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
@@ -73,7 +83,7 @@ All tests support the same set of arguments :
 
 ### Running multiple operations in parallel
 
-NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel. 
+NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel.
 To split the GPUs, NCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
 with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
 
@@ -82,13 +92,15 @@ with the same color will end up in the same group. The resulting group is printe
 `NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
 
 Here are a few examples:
- - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
- - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
- - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
+
+- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+
+- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
 
 Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
-
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
diff --git a/src/Makefile b/src/Makefile
index 5737092a86..612395f645 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,73 +1,13 @@
 #
-# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
+include common.mk
 
-CUDA_HOME ?= /usr/local/cuda
-PREFIX ?= /usr/local
-VERBOSE ?= 0
-DEBUG ?= 0
-
-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC ?= $(CUDA_HOME)/bin/nvcc
-CUDARTLIB ?= cudart
-
-CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
-CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-
-# Better define NVCC_GENCODE in your environment to the minimal set
-# of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
-# Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_90,code=sm_90 \
-		-gencode=arch=compute_100,code=sm_100 \
-		-gencode=arch=compute_120,code=sm_120 \
-		-gencode=arch=compute_120,code=compute_120
-else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-		-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_90,code=sm_90 \
-		-gencode=arch=compute_90,code=compute_90
-else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-		-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_80,code=compute_80
-else
-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_70,code=compute_70
-endif
-
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
-CXXFLAGS   := -std=c++11
-
-LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
-NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
-
-ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3 -g
-CXXFLAGS  += -O3 -g
-else
-NVCUFLAGS += -O0 -G -g
-CXXFLAGS  += -O0 -g -ggdb3
-endif
-
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
-else
-.SILENT:
-endif
+MPI ?= 0        # Set to 1 to enable MPI support (multi-process/multi-node)
+NAME_SUFFIX ?=  # e.g. _mpi when using MPI=1
+DSO ?= 0        # Set to 1 to create and use libverifiable.so to reduce binary size
 
 .PHONY: build clean
 
@@ -92,7 +32,7 @@ DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
 BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
-BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
 
 build: ${BIN_FILES}
 
@@ -103,18 +43,35 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk
 
+.PRECIOUS: ${DST_DIR}/%.o
+
 ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 
+${DST_DIR}/%$(NAME_SUFFIX).o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+
 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
+ifeq ($(DSO), 1)
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS)
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable ${NVLDFLAGS} -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
+else
+${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+endif
+
+clean_intermediates:
+	rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
 
diff --git a/src/common.mk b/src/common.mk
new file mode 100644
index 0000000000..2bc7e358a0
--- /dev/null
+++ b/src/common.mk
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC ?= $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
+else
+NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=compute_70
+endif
+
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+CXXFLAGS   := -std=c++11
+
+LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
diff --git a/verifiable/Makefile b/verifiable/Makefile
index b141a2a7c5..bb90001e1e 100644
--- a/verifiable/Makefile
+++ b/verifiable/Makefile
@@ -1,13 +1,18 @@
-include ../../makefiles/common.mk
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+include ../src/common.mk
 
 .PHONY: all clean
 
-BUILDDIR := $(abspath ../../build)
+BUILDDIR := $(abspath ../build)
 NCCLDIR := $(BUILDDIR)
 NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
-DST_DIR := $(BUILDDIR)/test/verifiable
+DST_DIR := $(BUILDDIR)/verifiable
 
-all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+all: $(DST_DIR)/self_test
 
 clean:
 	rm -rf $(DST_DIR)
@@ -18,7 +23,7 @@ include verifiable.mk
 
 self_test: $(DST_DIR)/self_test
 
-$(DST_DIR)/self_test: verifiable.cu verifiable.h
+$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+	$(NVCC) -o $@ $(NVCUFLAGS) $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(NVLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
diff --git a/verifiable/main.cu b/verifiable/main.cu
new file mode 100644
index 0000000000..4e4aef6713
--- /dev/null
+++ b/verifiable/main.cu
@@ -0,0 +1,14 @@
+
+#include <cuda_runtime.h>
+#include <iostream>
+
+#define NCCL_VERIFIABLE_SELF_TEST 1
+#include "verifiable.h"
+
+int main(int arg_n, char **args) {
+  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
+  cudaSetDevice(0);
+  ncclVerifiableLaunchSelfTest();
+  cudaDeviceSynchronize();
+  return 0;
+}
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index 71d5ef6649..62f65c8bed 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -57,4 +57,8 @@ cudaError_t ncclVerifiableVerify(
   int64_t *bad_elt_n, cudaStream_t stream
 );
 
+#ifdef NCCL_VERIFIABLE_SELF_TEST
+void ncclVerifiableLaunchSelfTest();
+#endif
+
 #endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index 225c32a3c3..2bff174050 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -1,11 +1,18 @@
-# We requires both of the following paths to be set upon including this makefile
+# We require both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
-# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
+# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>
 
 TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_LIBS = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	$(NVCC) -Xcompiler "-fPIC" -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+
+$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
+	@printf "Creating DSO %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
+	ln -sf $(notdir $@).0 $@

From f611dbd49a5d8cf17d42c4b1ef24ba4191bb7aae Mon Sep 17 00:00:00 2001
From: Grant Pinkert <gpinkert@amd.com>
Date: Fri, 25 Apr 2025 06:05:21 -1000
Subject: [PATCH 200/233] Fix message size logging (#115)

Previously, the logger was logging the number of expected bytes a node was to receive.
This differs from the stdout logging, where the reported message size is the total size of a message.

Signed-off-by: Grant Pinkert <gpinkert@amd.com>
---
 src/common.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 5db78ea886..1ae2a788d4 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -760,12 +760,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
   }
 
+  auto largestMessageSize = std::max(args->sendBytes, args->expectedBytes);
   if (args->reporter) {
     if (args->reportErrors) {
-      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, args->expectedBytes, in_place, timeUsec, algBw, busBw, wrongElts);
-    }
-    else {
-      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, args->expectedBytes, in_place, timeUsec, algBw, busBw);
+      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, largestMessageSize, in_place, timeUsec, algBw, busBw, wrongElts);
+    } else {
+      args->reporter->addResult((args->nThreads * args->nGpus), args->nProcs, args->totalProcs, largestMessageSize, in_place, timeUsec, algBw, busBw);
     }
   }
 

From a4fd8f4667e9ce50e6d25a1578000583deed4c04 Mon Sep 17 00:00:00 2001
From: Rahul Vaidya <ravaidya@amd.com>
Date: Mon, 28 Apr 2025 10:22:38 -0500
Subject: [PATCH 201/233] Fix build issues caused by 2.24.3 sync (#118)

---
 .github/CODEOWNERS       | 2 +-
 src/common.h             | 6 ++++++
 verifiable/verifiable.cu | 5 +++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 072f7d1a07..cd1b82fa42 100755
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,4 @@
-* @wenkaidu @gilbertlee-amd @akolliasAMD @edgargabriel @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @haripriya-amd
+* @wenkaidu @gilbertlee-amd @akolliasAMD @edgargabriel @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @rahulvaidya20 @haripriya-amd
 
 # Documentation files
 doc/ @ROCm/rocm-documentation
diff --git a/src/common.h b/src/common.h
index 7982c09ccc..961059c578 100644
--- a/src/common.h
+++ b/src/common.h
@@ -22,6 +22,12 @@
 #include <fstream>
 #include <iostream>
 
+// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
+  #define ncclFp8E4M3 ncclFloat8e4m3
+  #define ncclFp8E5M2 ncclFloat8e5m2
+#endif
+
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 32c13b048e..d157f8aa68 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -23,6 +23,11 @@
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_FLOAT8 == 1
   #define HAVE_ncclfp8 1
+  // Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
+    #define ncclFp8E4M3 ncclFloat8e4m3
+    #define ncclFp8E5M2 ncclFloat8e5m2
+  #endif
 #else
   #define HAVE_ncclfp8 0
 #endif

From c96deb13cd5fce13a5e21fe4a0170f050eb7f359 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Tue, 29 Apr 2025 08:51:43 -0500
Subject: [PATCH 202/233] [BUILD] Fix rccl-tests version string for packaging
 (#117)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4ae1c76ac..e39d38b82b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,6 +203,8 @@ set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
 # Add all of the tests
 add_subdirectory(src)
 
+rocm_setup_version(VERSION "2.14.1")
+
 # Create ROCm standard packages
 rocm_create_package(
     NAME rccl-tests

From 5b27b961b2543b3af2bb1cf5ca8ee0505226ba92 Mon Sep 17 00:00:00 2001
From: Marius Brehler <marius.brehler@gmail.com>
Date: Tue, 29 Apr 2025 23:18:51 +0200
Subject: [PATCH 203/233] Link `Threads::Threads` (#119)

`pthread.h` is included in `src/common.h` but lib is not properly
linked, resulting in the build failing with unresolved symbols when
trying to link.
---
 CMakeLists.txt     | 3 ---
 src/CMakeLists.txt | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e39d38b82b..cb08498032 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,9 +124,6 @@ endif()
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
-##Adding pthread flag for linking
-#set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
-
 ## Check for HIP
 find_package(hip REQUIRED)
 message(STATUS "HIP compiler:     ${HIP_COMPILER}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 05824a42d0..6fe68236fd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -126,7 +126,7 @@ add_custom_target(git_version_check
 add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
 add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
 add_dependencies(rccl_common hipify git_version_check)
-target_link_libraries(rccl_common roc::rccl hip::device)
+target_link_libraries(rccl_common roc::rccl hip::device Threads::Threads)
 if(USE_MPI)
   target_link_libraries(rccl_common MPI::MPI_CXX)
 endif()

From e041d901e6d3dabb67a22905cba77d9ba2689898 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 7 May 2025 10:30:59 -0700
Subject: [PATCH 204/233] Re-add sm_70 support for CUDA 12.8+ and 13.0 builds

---
 src/common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 2bc7e358a0..5fd9418860 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -21,7 +21,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
 		-gencode=arch=compute_120,code=sm_120 \

From 41b383a0d490b367d48a33af996787c31bd546c8 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Wed, 7 May 2025 13:19:10 -0500
Subject: [PATCH 205/233] [BUILD] Add options to install script for compiler
 and GPU targets (#121)

* [BUILD] Add options to install script for compiler and GPU targets
* Fix GPU_TARGETS field and add option for custom ROCm path
* Check for ROCM_PATH

---------

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 install.sh   | 96 +++++++++++++++++++++++++++++++++++++---------------
 src/Makefile |  2 +-
 2 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/install.sh b/install.sh
index c56a6bfdde..fead2f1bac 100755
--- a/install.sh
+++ b/install.sh
@@ -10,8 +10,12 @@ function display_help()
     echo "./install [-h|--help] "
     echo "    [-h|--help] Prints this help message."
     echo "    [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
-    echo "    [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
+    echo "    [-t|--test] Run unit-tests after building RCCL-Tests."
+    echo "    [--rocm_home] Specify custom path for ROCm installation (default: /opt/rocm)"
+    echo "    [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm)"
     echo "    [--mpi_home] Specify path to your MPI installation."
+    echo "    [--hip_compiler] Specify path to HIP compiler (default: /opt/rocm/bin/amdclang++)"
+    echo "    [--gpu_targets] Specify GPU targets (default:gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201)"
 }
 
 # #################################################
@@ -20,8 +24,12 @@ function display_help()
 run_tests=false
 build_release=true
 mpi_enabled=false
-rccl_dir=/opt/rocm/rccl
+rocm_dir=${ROCM_PATH:-/opt/rocm}  # documented default when ROCM_PATH is unset or empty
+rccl_dir=${rocm_dir}
 mpi_dir=""
+hip_compiler=${rocm_dir}/bin/amdclang++
+gpu_targets=""
+
 # #################################################
 # Parameter parsing
 # #################################################
@@ -29,7 +37,7 @@ mpi_dir=""
 # check if we have a modern version of getopt that can handle whitespace and long parameters
 getopt -T
 if [[ $? -eq 4 ]]; then
-    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
+    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rocm_home:,rccl_home:,mpi_home:,hip_compiler:,gpu_targets: --options hmt -- "$@")
 else
     echo "Need a new version of getopt"
     exit 1
@@ -44,28 +52,35 @@ eval set -- "${GETOPT_PARSE}"
 
 while true; do
     case "${1}" in
-	-h|--help)
-        display_help
-        exit 0
-        ;;
-	-m|--mpi)
-	    mpi_enabled=true
-	    shift ;;
-	-t|--test)
-	    run_tests=true
-	    shift ;;
-    --rccl_home)
-        rccl_dir=${2}
-        shift 2 ;;
-    --mpi_home)
-        mpi_dir=${2}
-        shift 2 ;;
-	--) shift ; break ;;
-	*)  echo "Unexpected command line parameter received; aborting";
-	    exit 1
-	    ;;
+        -h|--help)
+            display_help
+            exit 0 ;;
+        -m|--mpi)
+            mpi_enabled=true
+            shift ;;
+        -t|--test)
+            run_tests=true
+            shift ;;
+        --rocm_home)
+            rocm_dir=${2}
+            shift 2 ;;
+        --rccl_home)
+            rccl_dir=${2}
+            shift 2 ;;
+        --mpi_home)
+            mpi_dir=${2}
+            shift 2 ;;
+        --hip_compiler)
+            hip_compiler=${2}
+            shift 2 ;;
+        --gpu_targets)
+            gpu_targets=${2}
+            shift 2 ;;
+        --) shift ; break ;;
+        *)  echo "Unexpected command line parameter received; aborting";
+	    exit 1 ;;
     esac
-    done
+done
 
 # throw error code after running a command in the install script
 check_exit_code( )
@@ -85,15 +100,42 @@ build_dir=./build
 # ensure a clean build environment
 rm -rf ${build_dir}
 
+if [[ ! -d "${rocm_dir}" ]]; then  # also true when rocm_dir is empty (e.g. ROCM_PATH unset)
+    echo "ROCM_PATH does not exist at ${rocm_dir}. Defaulting to /opt/rocm"
+    rocm_dir=/opt/rocm
+fi
+
+if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then  # try the requested compiler first
+    echo "HIP Compiler does not exist at ${hip_compiler}. Please check the path."
+    echo "Defaulting to ${rocm_dir}/bin/amdclang++"
+    hip_compiler=${rocm_dir}/bin/amdclang++
+
+    if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then  # second choice: amdclang++ under rocm_dir
+        echo "${hip_compiler} does not exist. Please be advised."
+        echo "Defaulting to ${rocm_dir}/bin/hipcc"
+        hip_compiler=${rocm_dir}/bin/hipcc
+
+        if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then  # last resort: hipcc; give up if missing
+            echo "${hip_compiler} does not exist!. Please check your ROCm installation."
+            echo "Cannot proceed with building rccl-tests!"
+            exit 1
+        fi
+    fi
+fi
+
+if [[ -n ${gpu_targets} ]]; then
+    GPU_TARGETS="GPU_TARGETS=${gpu_targets}"
+fi
+
 if ($mpi_enabled); then
     if [[ ${mpi_dir} == "" ]]; then
         echo "MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument."
         exit 1
     else
-        make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
+        make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} HIPCC=${hip_compiler} ${GPU_TARGETS} -j$(nproc)
     fi
 else
-    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
+    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so HIPCC=${hip_compiler} ${GPU_TARGETS} -j$(nproc)  # HIPCC is the variable src/Makefile reads
 fi
 check_exit_code "$?"
 
@@ -102,6 +144,6 @@ if ($run_tests); then
     if ($mpi_enabled); then
         cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
     else
-        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
+        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest -k "not MPI"
     fi
 fi
diff --git a/src/Makefile b/src/Makefile
index f23ab65e29..7809da6978 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,7 +13,7 @@ DEBUG ?= 0
 NCCL_HOME ?= ""
 CUSTOM_RCCL_LIB ?= ""
 
-HIPCC = $(ROCM_PATH)/bin/amdclang++
+HIPCC ?= $(ROCM_PATH)/bin/amdclang++
 HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
 CXX = $(HIPCC)
 

From cac33a8c2f757e20b3e9219abb3210cff3245364 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 3 Apr 2025 17:31:54 -0500
Subject: [PATCH 206/233] Automatically set in-place option from out-of-place
 (#123)

---
 README.md     |  3 ++-
 src/common.cu | 26 ++++++++++++++++++--------
 src/common.h  |  1 +
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4fdb74ae0d..f0f6347c39 100644
--- a/README.md
+++ b/README.md
@@ -139,8 +139,9 @@ All tests support the same set of arguments :
   * `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
   * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
   * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
-  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default: disabled.
   * `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
+  * `-O,--out_of_place <0=in-place only, 1=out-of-place only>` Restrict the run to in-place or out-of-place tests. Default: both.
   * `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
 * Parsing RCCL-Tests output
   * `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
diff --git a/src/common.cu b/src/common.cu
index 1ae2a788d4..cb546db011 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -111,6 +111,7 @@ static int average = 1;
 static int numDevices = 1;
 static int delay_inout_place = 0;
 static int enable_out_of_place = 1;
+static int enable_in_place = 1;
 static int enable_cache_flush = 0;
 static int enable_rotating_tensor = 0;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
@@ -410,7 +411,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   int64_t *wrongPerGpu = nullptr;
   CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
-  
+
   for (int i=0; i<args->nGpus; i++) {
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaSetDevice(args->gpus[i]));
@@ -450,14 +451,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->reportErrors && *wrongElts) args->errors[0]++;
   return testSuccess;
 }
-    
+
 testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
   cudaError_t cudaErr;
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
   timer tim;
-  
+
   while (remaining) {
    int idle = 1;
    for (int i=0; i<ngpus; i++) {
@@ -522,7 +523,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
     shift = totalnbytes * (iter % steps);
   }
-  
+
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
@@ -912,7 +913,8 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
         TESTCHECK(BenchTime(args, type, op, root, 0));
         usleep(delay_inout_place);
       }
-      TESTCHECK(BenchTime(args, type, op, root, 1));
+        if (enable_in_place)
+        TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
     }
     --repeat;
@@ -1206,10 +1208,11 @@ int main(int argc, char* argv[]) {
 	break;
       case 'O':
         enable_out_of_place = strtol(optarg, NULL, 0);
+        enable_in_place = enable_out_of_place ? 0 : 1;
         break;
       case 'q':
         delay_inout_place = (int)strtol(optarg, NULL, 10);
-	break;
+      	break;
       case 'F':
         enable_cache_flush = strtol(optarg, NULL, 0);
         if (enable_cache_flush > 0) {
@@ -1500,14 +1503,20 @@ testResult_t run() {
 
   const char* timeStr = report_cputime ? "cputime" : "time";
   PRINT("#\n");
-  if (enable_out_of_place) {
+  if (enable_out_of_place && enable_in_place) {
   	PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
   	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
       	timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
   	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
       	"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+  } else if (enable_out_of_place) {
+	  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place      \n", "", "", "", "", "");
+        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
+        timeStr, "algbw", "busbw", "#wrong");
+        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "");
   } else {
-	PRINT("# %10s  %12s  %8s  %6s  %6s           in-place          \n", "", "", "", "", "");
+    PRINT("# %10s  %12s  %8s  %6s  %6s           in-place          \n", "", "", "", "", "");
         PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
         timeStr, "algbw", "busbw", "#wrong");
         PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
@@ -1539,6 +1548,7 @@ testResult_t run() {
     threads[t].args.comms=comms+t*nGpus;
     threads[t].args.streams=streams.data()+t*nGpus;
     threads[t].args.enable_out_of_place=enable_out_of_place;
+    threads[t].args.enable_in_place=enable_in_place;
     threads[t].args.enable_cache_flush = enable_cache_flush;
     threads[t].args.enable_rotating_tensor = enable_rotating_tensor;
     threads[t].args.errors=errors.data()+t;
diff --git a/src/common.h b/src/common.h
index 961059c578..1b368cd28a 100644
--- a/src/common.h
+++ b/src/common.h
@@ -157,6 +157,7 @@ struct threadArgs {
   int* gpus;
   int localRank;
   int enable_out_of_place;
+  int enable_in_place;
   int enable_cache_flush;
   int enable_rotating_tensor;
   void** sendbuffs;

From 4b2b635766b483cef88a2b32986bd4f27998711f Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Wed, 14 May 2025 15:30:07 -0500
Subject: [PATCH 207/233] Switched to using the hip_fp8 header instead of
 rccl_float8, resolving compatibility issues.(#109)

* addressing hip_fp8 support compatibility issue

* skipping mulsum and avg test for fp8, using hip_fp8 for product

* syncing with nccl-tests

removing the fp8 filter for pre-hopper gpus and resolving the merge conflict

---------

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
---
 src/all_reduce.cu        |   5 +-
 src/common.cu            |  40 +++-
 src/common.h             |   4 +-
 src/rccl_float8.h        |  92 +++++++--
 src/reduce.cu            |   4 +-
 src/reduce_scatter.cu    |   4 +-
 verifiable/verifiable.cu | 423 +++++++++++++++++++++++++--------------
 verifiable/verifiable.h  |   6 +-
 8 files changed, 385 insertions(+), 193 deletions(-)

diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index cdf781cd7a..d64371bbb2 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -65,7 +65,6 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
   ncclRedOp_t *run_ops;
   const char **run_typenames, **run_opnames;
   int type_count, op_count;
-
   if ((int)type != -1) {
     type_count = 1;
     run_types = &type;
@@ -89,8 +88,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+  if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
     }
diff --git a/src/common.cu b/src/common.cu
index cb546db011..26a2f8000e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -31,6 +31,13 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
 int32_t gpu_block3;
 size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
 
+// RCCL_FLOAT8 support
+bool rccl_float8_useFnuz = false;
+bool IsArchMatch(char const* arch, char const* target) {
+  // helper function to reduce clutter in code elsewhere.  Returns true on match.
+  return (strncmp(arch, target, strlen(target)) == 0);
+}
+
 #if NCCL_MAJOR >= 2
   ncclDataType_t test_types[ncclNumTypes] = {
     ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
@@ -38,7 +45,7 @@ size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
     , ncclBfloat16
   #endif
   #if RCCL_FLOAT8 == 1
-    , ncclFp8E4M3, ncclFp8E5M2
+    , ncclFloat8e4m3, ncclFloat8e5m2
   #endif
   };
   const char *test_typenames[ncclNumTypes] = {
@@ -196,6 +203,7 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
 }
 
 bool Reporter::isMainThread() { return is_main_thread == 1; }
+static int minCudaArch = 1<<30;
 
 #define NUM_BLOCKS 32
 
@@ -304,18 +312,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault));
   CUDACHECK(cudaDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
@@ -563,8 +571,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
       #endif
       #if defined(RCCL_FLOAT8)
-      case ncclFp8E4M3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
-      case ncclFp8E5M2: fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
+      case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
+      case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
       #endif
       case ncclNumTypes: break;
       }
@@ -1330,6 +1338,13 @@ testResult_t run() {
   char hostname[1024];
   getHostName(hostname, 1024);
 
+  hipDeviceProp_t devProp;
+  CUDACHECK(hipGetDeviceProperties(&devProp, 0));  // NOTE(review): only device 0 is queried — confirm all GPUs share its arch
+  if (IsArchMatch(devProp.gcnArchName, "gfx942")) {  // prefix match on the arch name
+    PRINT("On gfx942 architecture, using FNUZ FP8 types\n");
+    rccl_float8_useFnuz = true;  // global flag declared extern in rccl_float8.h
+  }
+
 #ifdef MPI_SUPPORT
   MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
@@ -1456,12 +1471,21 @@ testResult_t run() {
     gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
     CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
-    if (streamnull)
+    if (streamnull) {
       streams[i] = NULL;
-    else
+    }
+    else {
       CUDACHECK(cudaStreamCreateWithFlags(streams.data()+i, cudaStreamNonBlocking));
+    }
+    int archMajor, archMinor;
+    CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i]));
+    CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i]));
+    minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor);
   }
 
+#ifdef MPI_SUPPORT
+  MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
diff --git a/src/common.h b/src/common.h
index 1b368cd28a..5a3623cddb 100644
--- a/src/common.h
+++ b/src/common.h
@@ -258,8 +258,8 @@ static size_t wordSize(ncclDataType_t type) {
     //case ncclInt8:
     case ncclUint8:
 #if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
-    case ncclFp8E4M3:
-    case ncclFp8E5M2:
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
 #endif
 #endif
       return 1;
diff --git a/src/rccl_float8.h b/src/rccl_float8.h
index 01cab41f71..76bd4f35a1 100644
--- a/src/rccl_float8.h
+++ b/src/rccl_float8.h
@@ -24,8 +24,9 @@
 #define ROCBLAS_FLOAT8_H
 
 #include <stdint.h>
+#include <hip/hip_version.h>
 
-#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+#if __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
 /*! \brief Struct to represent a 8 bit floating-point number. */
 
 typedef struct
@@ -38,7 +39,60 @@ typedef struct
     uint8_t data;
 } rccl_bfloat8;
 
-#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+// __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
+#elif HIP_VERSION >= 60200000
+
+#include <hip/hip_fp8.h>
+
+#if   __HIP_DEVICE_COMPILE__ && (defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) ||  (defined(__gfx1100__) || defined(__gfx1101__)))//HIP_FP8_TYPE_OCP is enabled.
+typedef __hip_fp8_e4m3 rccl_float8;
+typedef __hip_fp8_e5m2 rccl_bfloat8;
+#elif __HIP_DEVICE_COMPILE__ && (defined(__gfx942__))
+typedef __hip_fp8_e4m3_fnuz rccl_float8;
+typedef __hip_fp8_e5m2_fnuz rccl_bfloat8;
+#else
+typedef __hip_fp8_e4m3 rccl_float8;
+typedef __hip_fp8_e5m2 rccl_bfloat8;
+#endif
+
+#if   __HIP_DEVICE_COMPILE__
+inline std::ostream& operator<<(std::ostream& os, const rccl_float8& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat8& bf8)
+{
+    return os << float(bf8);
+}
+
+#else
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2& bf8)
+{
+    return os << float(bf8);
+}
+
+//adding support for those operators on the host side
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3_fnuz& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2_fnuz& bf8)
+{
+    return os << float(bf8);
+}
+#endif
+
+extern bool rccl_float8_useFnuz;
+// For older versions of ROCm that do not include hip_fp8.h,
+// we provide a local version of the header file as a fallback.
+#else
 
 #define HIP_HOST_DEVICE __host__ __device__
 #define HIP_HOST __host__
@@ -344,7 +398,7 @@ struct rccl_float8
     // default constructor
     HIP_HOST_DEVICE rccl_float8() = default;
 
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
     // device specific optimized F8 down-conversion code
 
     template <bool stochastic_rounding = false>
@@ -381,10 +435,10 @@ struct rccl_float8
         return i8data;
     }
 
-#endif // __gfx940__
+#endif // __gfx942__
 
     // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
 
     // NOTE: ON-DEVICE... always optimal bias
     explicit HIP_DEVICE rccl_float8(float                        v,
@@ -402,7 +456,7 @@ struct rccl_float8
     // Host only implementation using s/w simulation
     explicit HIP_HOST
 #else
-    // both Host and DEVICE for non-gfx940 using s/w simulation
+    // both Host and DEVICE for non-gfx942 using s/w simulation
     explicit HIP_HOST_DEVICE
 #endif
         rccl_float8(float                        v,
@@ -446,7 +500,7 @@ struct rccl_float8
     }
 
     // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
     // upcast using device specific intrinsic
     explicit inline HIP_DEVICE operator float() const
     {
@@ -460,7 +514,7 @@ struct rccl_float8
     }
 
     explicit inline HIP_HOST operator float() const
-#else // non gfx940
+#else // non gfx942
     explicit inline HIP_HOST_DEVICE operator float() const
 #endif
     {
@@ -511,7 +565,7 @@ struct rccl_bfloat8
     // default constructor
     HIP_HOST_DEVICE rccl_bfloat8() = default;
 
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
     // device specific optimized F8 down-conversion code
 
     template <bool stochastic_rounding = false>
@@ -548,10 +602,10 @@ struct rccl_bfloat8
         return i8data;
     }
 
-#endif // __gfx940__
+#endif // __gfx942__
 
     // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
 
     // NOTE: ON-DEVICE... always optimal bias
     explicit HIP_DEVICE rccl_bfloat8(float                        v,
@@ -569,7 +623,7 @@ struct rccl_bfloat8
     // Host only implementation using s/w simulation
     explicit HIP_HOST
 #else
-    // both Host and DEVICE for non-gfx940 using s/w simulation
+    // both Host and DEVICE for non-gfx942 using s/w simulation
     explicit HIP_HOST_DEVICE
 #endif
         rccl_bfloat8(float                        v,
@@ -613,7 +667,7 @@ struct rccl_bfloat8
     }
 
     // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
     // upcast using device specific intrinsic
     explicit inline HIP_DEVICE operator float() const
     {
@@ -627,7 +681,7 @@ struct rccl_bfloat8
     }
 
     explicit inline HIP_HOST operator float() const
-#else // non gfx940
+#else // non gfx942
     explicit inline HIP_HOST_DEVICE operator float() const
 #endif
     {
@@ -969,7 +1023,7 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng = 0)
     return a;
 }
 
-// Use h/w intrinsic and optimized version when __gfx940__
+// Use h/w intrinsic and optimized version when __gfx942__
 template <
     typename T,
     typename Ta,
@@ -980,7 +1034,7 @@ template <
     = 0>
 inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
     // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
     T val;
     if(std::is_same<T, rccl_float8>::value)
@@ -988,12 +1042,12 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
     else
         val.data = rccl_bfloat8::cast_to_bf8_from_f32<stochastic_rounding>(float(a), rng);
     return val;
-#else // non gfx940
+#else // non gfx942
     return T(float(a),
              stochastic_rounding ? T::rocblas_hip_f8_rounding_mode::stochastic
                                  : T::rocblas_hip_f8_rounding_mode::standard,
              rng);
-#endif // __gfx940__
+#endif // __gfx942__
 }
 
 // NOTE NOTE: The above code is good if we don't consider HIP-GEMM code and only consider the quantization
@@ -1016,6 +1070,6 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
 
 // =================================================================================================
 
-#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+#endif
 
 #endif // ROCBLAS_FLOAT8_H
diff --git a/src/reduce.cu b/src/reduce.cu
index c8ee2f84a6..f8c059e140 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -96,8 +96,8 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
       for (int k=begin_root; k<=end_root; k++) {
         TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 3d296ea926..2e04cc7456 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -91,8 +91,8 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
   for (int i=0; i<type_count; i++) {
     for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
       TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
     }
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index d157f8aa68..631e19feea 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -22,12 +22,16 @@
 #endif
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_FLOAT8 == 1
-  #define HAVE_ncclfp8 1
-  // Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
-  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
-    #define ncclFp8E4M3 ncclFloat8e4m3
-    #define ncclFp8E5M2 ncclFloat8e5m2
-  #endif
+#if __HIP_DEVICE_COMPILE__
+  #define HAVE_ncclfp8_DEVICE 1
+#else
+  #define HAVE_ncclfp8_HOST 1
+#endif
+// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
+  #define ncclFp8E4M3 ncclFloat8e4m3
+  #define ncclFp8E5M2 ncclFloat8e5m2
+#endif
 #else
   #define HAVE_ncclfp8 0
 #endif
@@ -130,23 +134,39 @@ __host__ __device__ T inhibit(T x) {
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-  template<typename Y, typename X>
-  __host__ __device__ Y castTo(X x) {
+  template<typename Y>
+  __host__ __device__ Y castTo(uint64_t x) {
     return Y(x);
   }
   template<typename Y>
   __host__ __device__ Y castTo(float x) {
     return Y(x);
   }
+  template<typename Y>
+  __host__ __device__ Y castTo(double x) {
+    return Y(x);
+  }
   template<>
   __host__ __device__ __half castTo<__half>(float x) {
     return __float2half(x);
   }
+  template<>
+  __host__ __device__ half castTo<__half>(uint64_t x) {
+    return __ull2half_rn(x);
+  }
   #if RCCL_BFLOAT16 == 1
   template<>
   __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
     return hip_bfloat16(x);
   }
+  template<>
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(double x) {
+    return hip_bfloat16(x);
+  }
+  template<>
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(uint64_t x) {
+    return hip_bfloat16((double)x);
+  }
   #endif
   #if RCCL_FLOAT8 == 1
   template<>
@@ -157,6 +177,22 @@ namespace {
   __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(float x) {
     return static_cast<rccl_bfloat8>(x);
   }
+  template<>
+  __host__ __device__ rccl_float8 castTo<rccl_float8>(double x) {
+    return static_cast<rccl_float8>(x);
+  }
+  template<>
+  __host__ __device__ rccl_float8 castTo<rccl_float8>(uint64_t x) {
+    return static_cast<rccl_float8>((double)x);
+  }
+  template<>
+  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(double x) {
+    return static_cast<rccl_bfloat8>(x);
+  }
+  template<>
+  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(uint64_t x) {
+    return static_cast<rccl_bfloat8>((double)x);
+  }
   #endif
 }
 
@@ -211,16 +247,16 @@ struct ReduceProd {
   #endif
   #if RCCL_FLOAT8 == 1
   __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
-      return static_cast<rccl_float8>(a * b);
+      return static_cast<rccl_float8>(float(a) * float(b));
   }
   __host__ __device__ rccl_float8 operator()(rccl_float8 a, float b) const {
-      return static_cast<rccl_float8>(a * b);
+      return static_cast<rccl_float8>(float(a) * float(b));
   }
   __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
-      return static_cast<rccl_bfloat8>(a * b);
+      return static_cast<rccl_bfloat8>(float(a) * float(b));
   }
   __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, float b) const {
-      return static_cast<rccl_bfloat8>(a * b);
+      return static_cast<rccl_bfloat8>(float(a) * float(b));
   }
   #endif
   template<typename T>
@@ -328,40 +364,72 @@ struct ReduceAvg {
 
 namespace {
 template<typename T>
-struct FloatLayout;
+struct FloatLayout { static constexpr bool is_floating_point = false; };
 template<>
 struct FloatLayout<float> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 23;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<double> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 11, mantissa_bits = 52;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<__half> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 5, mantissa_bits = 10;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #if RCCL_BFLOAT16 == 1
 template<>
 struct FloatLayout<hip_bfloat16> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #endif
 #if RCCL_FLOAT8 == 1
+#if __HIP_DEVICE_COMPILE__
 template<>
 struct FloatLayout<rccl_float8> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 4, mantissa_bits = 3;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<rccl_bfloat8> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 5, mantissa_bits = 2;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
+#else
+template<>
+struct FloatLayout<__hip_fp8_e4m3> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<__hip_fp8_e5m2> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+
+template<>
+struct FloatLayout<__hip_fp8_e4m3_fnuz> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 4, mantissa_bits = 3, exponent_bias = (1<<(exponent_bits-1)); // no "-1": one above the non-fnuz bias
+};
+template<>
+struct FloatLayout<__hip_fp8_e5m2_fnuz> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 5, mantissa_bits = 2, exponent_bias = (1<<(exponent_bits-1)); // no "-1": one above the non-fnuz bias
+};
+#endif
 #endif
 
 template<typename T>
@@ -675,11 +743,12 @@ __host__ __device__ void genOutput(
 ////////////////////////////////////////////////////////////////////////////////
 // Nil reduction (byte copy functions). Optimized to assume rank_n=1
 
+// genInput specialization for integer ReduceNil.
 namespace {
-template<typename T, bool IsIntegral>
+template<typename T>
 __host__ __device__ void genInput(
     T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
-    std::integral_constant<bool, IsIntegral>
+    std::true_type /*integral*/
   ) {
   (void)rank_n, (void)rank_me; // silence unused warnings
   union { uint64_t bits; T tmp; };
@@ -689,6 +758,24 @@ __host__ __device__ void genInput(
   ans = tmp;
 }
 
+// genInput specialization for floating point ReduceNil.
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  (void)rank_n; // silence unused warnings
+  constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
+  int sign = rng & 1;
+  rng ^= rng>>1;
+  int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
+  exp += 1<<(FloatLayout<T>::exponent_bits-2);
+  rng ^= rng >> FloatLayout<T>::exponent_bits;
+  uint64_t mant = rng & mant_mask;
+  ans = makeFloat<T>(sign, exp, mant);
+}
+
 template<typename T, typename ReduceFn, bool IsIntegral>
 __host__ __device__ void genOutput(
     T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
@@ -777,22 +864,35 @@ __host__ __device__ void genOutput(
 namespace {
 template<typename T>
 __host__ __device__ void genInput(
-    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+  T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
+  // We can't control the nranks divisor in averages, so to control error we
+  // limit to two ranks contributing non-zero values. This way there is no ambiguity
+  // of summation.
+  int r = shuffleRank(rank_n, rank_me, rng);
+  uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  ans = r < 2 ? castTo<T>(1+m) : castTo<T>((uint64_t)0);
 }
 
 template<typename T>
 __host__ __device__ void genOutput(
-    T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
-  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  //ans = ReduceProd()(ans, T1(1)/T1(rank_n));
-  ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
- }
+  shuffleRank(rank_n, -1, rng);
+  uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  if (rank_n == 1) {
+    ans = castTo<T>(1+m0);
+  } else {
+    // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__.
+    // We account for this by using a tolerance of 2 ulps during the verification.
+    using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type;
+    ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)),
+                      (T)(TMul(1+m1)*TMul(1.0/rank_n)));
+  }
+}
 }
 
 /////////////////////////////////////////////////////////////////////////////////
@@ -856,7 +956,7 @@ __host__ __device__ T genOutput(
 #if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareInput2(
+__global__ void __launch_bounds__(512, 1) prepareInput2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -877,44 +977,55 @@ __global__ void prepareInput2(
 }
 
 template<typename ReduceOp>
-void prepareInput1(
+cudaError_t prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(__half)
+  case ncclInt8: fn = (void const*)&prepareInput2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareInput2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareInput2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareInput2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareInput2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareInput2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareInput2<__half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareInput2<hip_bfloat16, ReduceOp>; break;
   #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: fn = (void const*)&prepareInput2<rccl_float8, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareInput2<rccl_bfloat8, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e4m3_fnuz, ReduceOp>; break;}
+  else { fn = (void const*)&prepareInput2<__hip_fp8_e4m3, ReduceOp>; break;}
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e5m2_fnuz, ReduceOp>; break;}
+  else { fn = (void const*)&prepareInput2<__hip_fp8_e5m2, ReduceOp>; break;}
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareInput2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareInput2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareInput(
+hipError_t ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
     else \
-      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -937,7 +1048,7 @@ void ncclVerifiablePrepareInput(
 #if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareExpected2(
+__global__ void __launch_bounds__(512, 1) prepareExpected2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -957,44 +1068,55 @@ __global__ void prepareExpected2(
 }
 
 template<typename ReduceOp>
-void prepareExpected1(
+cudaError_t prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(__half)
+  case ncclInt8: fn = (void const*)&prepareExpected2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareExpected2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareExpected2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareExpected2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareExpected2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareExpected2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareExpected2<__half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareExpected2<hip_bfloat16, ReduceOp>; break;
   #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<rccl_float8, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<rccl_bfloat8, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3_fnuz, ReduceOp>; break; }
+  else { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3, ReduceOp>; break; }
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2_fnuz, ReduceOp>; break; }
+  else { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2, ReduceOp>; break; }
+  #endif
+  case ncclFloat32: { fn = (void const*)&prepareExpected2<float, ReduceOp>; break; }
+  case ncclFloat64: { fn = (void const*)&prepareExpected2<double, ReduceOp>; break; }
+  default: assert(0); return cudaErrorInvalidValue;
   }
-  #undef CASE_TY
+
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareExpected(
+hipError_t ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
     else \
-      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -1015,54 +1137,6 @@ void ncclVerifiablePrepareExpected(
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-/* How we compare floating point values when exactness is impossible is interesting.
- * First, we take note that simply reinterpreting integer bits as floating point
- * gives us a monotonic mapping which exponentially spaces out floats. Thus
- * consecutive integers encode consecutive floats. In general, using integer
- * subraction on the bitpatterns of two floats gives us an integer which is the
- * logarithm of their relative difference. But, if the floats always have similar
- * exponents, than the integer difference is actually proportional to the
- * relative error (this is because we are counting hops in the mantissa bits only,
- * not the exponent bits). So a cheap way to compare if two floats are relatively
- * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula
- * calculates such a tolerance for a summation of n floats. This formula
- * was derived by inspecting the maximum observed integer difference over many
- * random runs of summation. The parameter values were computed by the
- * companion program "inexact_regress.cu".
- */
-__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
-  float power, coef;
-  switch(elt_ty) {
-  case ncclFloat32:
-  case ncclFloat64:
-    power = .51f;
-    coef = 1.25f;
-    break;
-  case ncclFloat16:
-    power = .91f;
-    coef = .75f;
-    break;
-  #if HAVE_ncclBfloat16
-  case ncclBfloat16:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3:
-  case ncclFp8E5M2:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  }
-  #if __CUDA_ARCH__
-    return 1 + unsigned(coef*powf(float(rank_n), power));
-  #else
-    return 1 + unsigned(coef*std::pow(float(rank_n), power));
-  #endif
-}
-
 template<typename T>
 __host__ __device__  uint64_t calcDelta(T a, T b) {
   union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
@@ -1082,7 +1156,7 @@ __host__ __device__  uint64_t calcDelta(T a, T b) {
 #if !SELF_TEST
 namespace {
 template<typename T>
-__global__ void verifyPrepared(
+__global__ void __launch_bounds__(512, 1) verifyPrepared(
     T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
   ) {
   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
@@ -1098,17 +1172,35 @@ __global__ void verifyPrepared(
     bad += tolerance < delta ? 1 : 0;
     #if 0
       if(tolerance < delta) {
-        printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
+        printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance);
       }
     #endif
     i += blockDim.x;
   }
-  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
   atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
 }
 
+hipError_t verifyPrepared1(int bytePerElt,
+  void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+) {
+  void const *fn = nullptr;
+  switch(bytePerElt) {
+  case 1: fn = (void const*)&verifyPrepared<uint8_t>; break;
+  case 2: fn = (void const*)&verifyPrepared<uint16_t>; break;
+  case 4: fn = (void const*)&verifyPrepared<uint32_t>; break;
+  case 8: fn = (void const*)&verifyPrepared<uint64_t>; break;
+  default: assert(0); return cudaErrorInvalidValue;
+  }
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
+}
+
 template<typename T, typename Uint, typename ReduceFn>
-__global__ void verifyInline2(
+__global__ void __launch_bounds__(512, 1) verifyInline2(
     T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
     intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
   ) {
@@ -1142,35 +1234,49 @@ __global__ void verifyInline2(
 }
 
 template<typename T, typename Uint>
-void verifyInline1(
+hipError_t verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
   ) {
+  void const *fn = nullptr;
+  ReduceNil opnil;
+  ReduceSum opsum;
+  ReduceMin opmin;
+  ReduceMax opmax;
+  ReduceProd opprod;
+  ReduceAvg opavg{rank_n};
+  ReducePreMulSum oppremulsum;
+  void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
+
   #define CASE_OP(op) \
-    if(rank_n == 1) \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    else \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    break;
+    if(rank_n == 1) { \
+      fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
+      args[2] = &opnil; \
+    } else { \
+      fn = (void const*)&verifyInline2<T, Uint, decltype(op)>; \
+      args[2] = &op; \
+    } break;
   switch(red_op) {
-  case ncclSum: CASE_OP(ReduceSum())
-  case ncclMin: CASE_OP(ReduceMin())
-  case ncclMax: CASE_OP(ReduceMax())
-  case ncclProd: CASE_OP(ReduceProd())
+  case ncclSum: CASE_OP(opsum)
+  case ncclMin: CASE_OP(opmin)
+  case ncclMax: CASE_OP(opmax)
+  case ncclProd: CASE_OP(opprod)
   #if HAVE_ncclAvg
-  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
+  case ncclAvg: CASE_OP(opavg)
   #endif
   #if HAVE_ncclPreMulSum
-  default: CASE_OP(ReducePreMulSum())
+  default: CASE_OP(oppremulsum)
   #endif
   }
   #undef CASE_OP
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiableVerify(
+hipError_t ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     int64_t *bad_elt_n, cudaStream_t stream
@@ -1179,15 +1285,21 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
     floating |= elt_ty == ncclBfloat16;
   #endif
-  #if HAVE_ncclfp8
-    floating |= elt_ty == ncclFp8E4M3;
-    floating |= elt_ty == ncclFp8E5M2;
+  #if HAVE_ncclfp8_DEVICE || HAVE_ncclfp8_HOST
+    floating |= elt_ty == ncclFloat8e4m3;
+    floating |= elt_ty == ncclFloat8e5m2;
   #endif
 
   unsigned tolerance = 0;
   #if HAVE_ncclAvg
-  if (floating && red_op == ncclAvg)
-    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
+  if (floating && red_op == ncclAvg) {
+    // Average does its pre-multiplies in an unspecified floating point format
+    // (could be the actual type T or float or half). That means the premultiply
+    // the verifier does could generate a discrepancy in the least mantissa digit.
+    // After adding those two (since avg only has two non-zero contributions) we
+    // could be off by a distance of 2 units.
+    tolerance = 2;
+  }
   #endif
 
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
@@ -1195,9 +1307,9 @@ void ncclVerifiableVerify(
   *bad_elt_n = 0;
   #define CASE_TY(T, Uint) { \
       if(expected != nullptr) { \
-        verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
+        return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \
       } else { \
-        verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
+        return verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
       } \
     } break;
   switch(elt_ty) {
@@ -1211,13 +1323,19 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8, uint8_t)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8, uint8_t)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: CASE_TY(rccl_float8, uint8_t)
+  case ncclFloat8e5m2: CASE_TY(rccl_bfloat8, uint8_t)
+  #endif
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e4m3_fnuz, uint8_t);}
+  else { CASE_TY(__hip_fp8_e4m3, uint8_t);}
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e5m2_fnuz, uint8_t);}
+  else { CASE_TY(__hip_fp8_e5m2, uint8_t);}
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
-  default: assert(0);
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
 }
@@ -1234,7 +1352,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
   //if(!std::is_same<Op,ReduceProd>::value) return;
   //if(rank_n!=3) return;
 
-  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
+  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? 2 : 0;
   uint64_t seed = 0xc8e2bed69766d533;
 
   for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
@@ -1271,7 +1389,7 @@ __device__ void sweep1(int ty, char const *tyname) {
   }
 }
 
-__global__ void sweep() {
+__global__ void __launch_bounds__(512, 1) sweep() {
   sweep1<int8_t>(ncclInt8, "int8");
   sweep1<uint8_t>(ncclUint8, "uint8");
   sweep1<int32_t>(ncclInt32, "int32");
@@ -1282,19 +1400,16 @@ __global__ void sweep() {
   #if HAVE_ncclBfloat16
     sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
-  #if HAVE_ncclfp8
-    sweep1<rccl_float8>(ncclFp8E4M3, "fp8_e4m3");
-    sweep1<rccl_bfloat8>(ncclFp8E5M2, "fp8_e5m2");
+  #if HAVE_ncclfp8_DEVICE  // set only in the device compile pass when FP8 is enabled; plain HAVE_ncclfp8 is never 1 anymore
+    sweep1<rccl_float8>(ncclFloat8e4m3, "fp8_e4m3");
+    sweep1<rccl_bfloat8>(ncclFloat8e5m2, "fp8_e5m2");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
 }
 
-int main(int arg_n, char **args) {
-  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
+void ncclVerifiableLaunchSelfTest() {
+  // Enqueue the self-test sweep once: one block of 512 threads, no explicit stream.
   sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
-  return 0;
 }
 #endif
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index da54778a6f..64b4e22514 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -41,13 +41,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 }
 
 // Enqueue kernel to generate data which is to be reduced.
-void ncclVerifiablePrepareInput(
+hipError_t ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
-void ncclVerifiablePrepareExpected(
+hipError_t ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
@@ -58,7 +58,7 @@ void ncclVerifiablePrepareExpected(
 // which can be costly. Thus if you plan to run the same reduction multiple
 // times it is advantageous to precompute the expected values with
 // ncclVerifiablePrepareExpected and pass them as `expected` here.
-void ncclVerifiableVerify(
+hipError_t ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
   int64_t *bad_elt_n, cudaStream_t stream

From 0abe3c80bb080179b2ebc2d8a5d112181cee4cc4 Mon Sep 17 00:00:00 2001
From: Rahul Vaidya <ravaidya@amd.com>
Date: Thu, 15 May 2025 13:56:40 -0500
Subject: [PATCH 208/233] Ensure backward compatibility for fp8 datatypes
 (#126)

* Ensure backward compatibility for fp8 datatypes

Signed-off-by: ravaidya <ravaidya@amd.com>

* Update code comments

Signed-off-by: ravaidya <ravaidya@amd.com>

---------

Signed-off-by: ravaidya <ravaidya@amd.com>
---
 src/common.h             | 8 ++++----
 verifiable/verifiable.cu | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/common.h b/src/common.h
index 5a3623cddb..8e7cfdf027 100644
--- a/src/common.h
+++ b/src/common.h
@@ -22,10 +22,10 @@
 #include <fstream>
 #include <iostream>
 
-// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
-  #define ncclFp8E4M3 ncclFloat8e4m3
-  #define ncclFp8E5M2 ncclFloat8e5m2
+// Ensures backward compatibility for FP8 datatypes
+#if NCCL_VERSION_CODE < NCCL_VERSION(2,24,3)
+  #define ncclFloat8e4m3 ncclFp8E4M3
+  #define ncclFloat8e5m2 ncclFp8E5M2
 #endif
 
 // For nccl.h < 2.13 since we define a weak fallback
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 631e19feea..7611a6b491 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -27,10 +27,10 @@
 #else
   #define HAVE_ncclfp8_HOST 1
 #endif
-// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
-  #define ncclFp8E4M3 ncclFloat8e4m3
-  #define ncclFp8E5M2 ncclFloat8e5m2
+// Ensures backward compatibility for FP8 datatypes
+#if NCCL_VERSION_CODE < NCCL_VERSION(2,24,3)
+  #define ncclFloat8e4m3 ncclFp8E4M3
+  #define ncclFloat8e5m2 ncclFp8E5M2
 #endif
 #else
   #define HAVE_ncclfp8 0

From 90760916025b4bea1db0fd5d7f8ab2497efaefc0 Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Fri, 16 May 2025 09:14:46 -0500
Subject: [PATCH 209/233] Switching to old version of rccl_float8 for ROCm
 versions earlier than 6.3 for backward compatibility. (#128)

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
---
 src/rccl_float8.h        |  2 +-
 verifiable/verifiable.cu | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/rccl_float8.h b/src/rccl_float8.h
index 76bd4f35a1..5540f1e1e3 100644
--- a/src/rccl_float8.h
+++ b/src/rccl_float8.h
@@ -40,7 +40,7 @@ typedef struct
 } rccl_bfloat8;
 
 // __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
-#elif HIP_VERSION >= 60200000
+#elif HIP_VERSION >= 60300000
 
 #include <hip/hip_fp8.h>
 
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 7611a6b491..e875c3238b 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -392,7 +392,7 @@ struct FloatLayout<hip_bfloat16> {
 };
 #endif
 #if RCCL_FLOAT8 == 1
-#if __HIP_DEVICE_COMPILE__
+#if __HIP_DEVICE_COMPILE__ || HIP_VERSION < 60300000
 template<>
 struct FloatLayout<rccl_float8> {
   static constexpr bool is_floating_point = true;
@@ -993,11 +993,10 @@ cudaError_t prepareInput1(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: fn = (void const*)&prepareInput2<hip_bfloat16, ReduceOp>; break;
   #endif
-  #if HAVE_ncclfp8_DEVICE
+  #if HAVE_ncclfp8_DEVICE || HIP_VERSION < 60300000
   case ncclFloat8e4m3: fn = (void const*)&prepareInput2<rccl_float8, ReduceOp>; break;
   case ncclFloat8e5m2: fn = (void const*)&prepareInput2<rccl_bfloat8, ReduceOp>; break;
-  #endif
-  #if HAVE_ncclfp8_HOST
+  #elif HAVE_ncclfp8_HOST
   case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e4m3_fnuz, ReduceOp>; break;}
   else { fn = (void const*)&prepareInput2<__hip_fp8_e4m3, ReduceOp>; break;}
   case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e5m2_fnuz, ReduceOp>; break;}
@@ -1084,11 +1083,10 @@ cudaError_t prepareExpected1(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: fn = (void const*)&prepareExpected2<hip_bfloat16, ReduceOp>; break;
   #endif
-  #if HAVE_ncclfp8_DEVICE
+  #if HAVE_ncclfp8_DEVICE || HIP_VERSION < 60300000 //for backward compatibility
   case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<rccl_float8, ReduceOp>; break;
   case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<rccl_bfloat8, ReduceOp>; break;
-  #endif
-  #if HAVE_ncclfp8_HOST
+  #elif HAVE_ncclfp8_HOST
   case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3_fnuz, ReduceOp>; break; }
   else { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3, ReduceOp>; break; }
   case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2_fnuz, ReduceOp>; break; }
@@ -1323,11 +1321,10 @@ hipError_t ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
-  #if HAVE_ncclfp8_DEVICE
+  #if HAVE_ncclfp8_DEVICE || HIP_VERSION < 60300000
   case ncclFloat8e4m3: CASE_TY(rccl_float8, uint8_t)
   case ncclFloat8e5m2: CASE_TY(rccl_bfloat8, uint8_t)
-  #endif
-  #if HAVE_ncclfp8_HOST
+  #elif HAVE_ncclfp8_HOST
   case ncclFloat8e4m3: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e4m3_fnuz, uint8_t);}
   else { CASE_TY(__hip_fp8_e4m3, uint8_t);}
   case ncclFloat8e5m2: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e5m2_fnuz, uint8_t);}

From b0a3841b353be0bc3f471a5e7eff5a1ab20e92ef Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Thu, 22 May 2025 22:27:09 -0500
Subject: [PATCH 210/233] [BUILD] Fix logic for rocm-cmake dependency (#129)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 cmake/Dependencies.cmake | 44 +++++++++++++---------------------------
 install.sh               |  2 +-
 2 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index ecf361e434..275d567860 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -28,41 +28,25 @@
 # GIT
 
 # Test dependencies
-
+include(FetchContent)
 
 # Find or download/install rocm-cmake project
-set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
-find_package(ROCmCMakeBuildTools 0.7.3 QUIET CONFIG PATHS "${ROCM_PATH}")
+find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
 if(NOT ROCmCMakeBuildTools_FOUND)
-    set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
-    file(
-        DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
-        ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
-        STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
-    )
-    list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
-    if(rocm_cmake_download_error_code)
-        message(FATAL_ERROR "Error: downloading "
-            "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
-            "error_code: ${rocm_cmake_download_error_code} "
-            "log: ${rocm_cmake_download_log} "
+    find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
+    if(NOT ROCM_FOUND)
+        message(STATUS "ROCmCMakeBuildTools not found. Fetching...")
+        set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
+        set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
+        FetchContent_Declare(
+            rocm-cmake
+            GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
+            GIT_TAG ${rocm_cmake_tag}
+            SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
         )
+        FetchContent_MakeAvailable(rocm-cmake)
+        find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
     endif()
-
-    execute_process(
-        COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
-        WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
-        RESULT_VARIABLE rocm_cmake_unpack_error_code
-    )
-    execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
-      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
-    execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
-      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
-
-    if(rocm_cmake_unpack_error_code)
-        message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
-    endif()
-    find_package(ROCmCMakeBuildTools 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
 endif()
 
 # Find available local ROCM targets
diff --git a/install.sh b/install.sh
index fead2f1bac..98882c2eef 100755
--- a/install.sh
+++ b/install.sh
@@ -100,7 +100,7 @@ build_dir=./build
 # ensure a clean build environment
 rm -rf ${build_dir}
 
-if [[ -n ${rocm_dir} ]]; then
+if [[ -z ${rocm_dir} ]]; then
     echo "ROCM_PATH does not exist at ${rocm_dir}. Defaulting to /opt/rocm"
     rocm_dir=/opt/rocm
 fi

From a5c539e68bb7263304997012498b0cd0667b99e8 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 19 May 2025 18:20:22 -0700
Subject: [PATCH 211/233] Add support for Symmetric Memory Registration

From NCCL 2.27.x we can now use the Symmetric Memory APIs (-R 2)
---
 README.md     |  2 +-
 src/common.cu | 58 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index bdafbe5a16..22687d4771 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ All tests support the same set of arguments :
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
   * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
-  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ### Running multiple operations in parallel
diff --git a/src/common.cu b/src/common.cu
index f83cdf009a..3987d89081 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -90,6 +90,8 @@ static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+#define LOCAL_REGISTER 1
+#define SYMMETRIC_REGISTER 2
 static int local_register = 0;
 #endif
 static int minCudaArch = 1<<30;
@@ -660,8 +662,16 @@ testResult_t threadInit(struct threadArgs* args) {
   void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
   void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
   for (int i=0; i<args->nGpus; i++) {
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+    }
   }
 #endif
 
@@ -669,8 +679,16 @@ testResult_t threadInit(struct threadArgs* args) {
 
   for (int i=0; i<args->nGpus; i++) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+    }
 #endif
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
@@ -859,8 +877,10 @@ int main(int argc, char* argv[]) {
         break;
       case 'R':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-        if ((int)strtol(optarg, NULL, 0)) {
-          local_register = 1;
+        local_register = (int)strtol(optarg, NULL, 0);
+        if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
+          printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
+          local_register = LOCAL_REGISTER;
         }
 #else
         printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
@@ -897,7 +917,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
-            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
+            "[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -1107,8 +1127,16 @@ testResult_t run() {
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+       if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+         NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+         NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+       } else
+#endif
+       {
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
+       }
      }
 #endif
   }
@@ -1188,8 +1216,16 @@ testResult_t run() {
   if (!parallel_init) {
     for(int i=0; i<nGpus*nThreads; ++i) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+      if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
+      } else
+#endif
+      {
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+      }
 #endif
       NCCLCHECK(ncclCommDestroy(comms[i]));
     }

From 0c60e6a8e42a95f50e20328b119ded5add2392f3 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 30 May 2025 17:43:30 -0700
Subject: [PATCH 212/233] Fix formatting errors in README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 22687d4771..ce952ef89d 100644
--- a/README.md
+++ b/README.md
@@ -77,8 +77,8 @@ All tests support the same set of arguments :
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
-  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
-  * `-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
+  * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ### Running multiple operations in parallel

From 8bc16f4e012ed27ba23b13b8d76e0509becaa389 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 30 May 2025 18:04:25 -0700
Subject: [PATCH 213/233] Need to drop Volta (sm_70) support from CUDA 13.0

---
 src/common.mk | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 5fd9418860..d6e5c18f39 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -19,7 +19,14 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Add Blackwell but drop Volta support if we're using CUDA13.0 or above
+NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
 NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
 		-gencode=arch=compute_80,code=sm_80 \

From 5290298ab669aa5212687d7b71e2b47888c2a8f3 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 2 Jun 2025 09:29:52 -0700
Subject: [PATCH 214/233] Reinstate Pascal support for CUDA 12.8+ builds

---
 src/common.mk | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/common.mk b/src/common.mk
index d6e5c18f39..3c4a483b2b 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -20,7 +20,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
-# Add Blackwell but drop Volta support if we're using CUDA13.0 or above
+# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
 NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
@@ -28,7 +28,9 @@ NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_120,code=compute_120
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+		-gencode=arch=compute_61,code=sm_61 \
+		-gencode=arch=compute_70,code=sm_70 \
 		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \

From dafb70408d761454eeaf82f58b46d78a90eb3020 Mon Sep 17 00:00:00 2001
From: Martin Belanger <martin.belanger@dell.com>
Date: Tue, 3 Jun 2025 11:43:02 -0400
Subject: [PATCH 215/233] Print the name of the program being executed

One thing missing from the stdout of each performance test is
the name of the test that is actually being run.

This patch adds 2 new messages to the stdout. At the beginning
of the execution of a test (e.g. sendrecv_perf) we will now
see this message:

  Collective test starting: sendrecv_perf

And at the end, we will now see this:

  Collective test concluded: sendrecv_perf

This is needed when running several tests consecutively and we're
trying to parse the stdout to collect the results.

For example, using a Python script to parse the stdout, one could
retrieve the results for each test and plot them on a graph. This
patch makes it easier to implement such a script.

Signed-off-by: Martin Belanger <martin.belanger@dell.com>
---
 src/common.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 3987d89081..a4a92d51fe 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <ctype.h>
 #include "cuda.h"
+#include <errno.h>     /* program_invocation_short_name */
 
 #include "../verifiable/verifiable.h"
 
@@ -1006,6 +1007,7 @@ testResult_t run() {
 #endif
   is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
 
+  PRINT("# Collective test starting: %s\n", program_invocation_short_name);
   PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
         nThreads, nGpus, minBytes, maxBytes,
         (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
@@ -1257,6 +1259,7 @@ testResult_t run() {
   PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
   PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
   PRINT("#\n");
+  PRINT("# Collective test concluded: %s\n", program_invocation_short_name);
 #ifdef MPI_SUPPORT
   MPI_Comm_free(&mpi_comm);
   MPI_Finalize();

From e7c8825b0b5344c2b3d3317986b7b9ef257df928 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 3 Jun 2025 10:36:53 -0700
Subject: [PATCH 216/233] Wrap ncclCommWindowRegister() calls within ncclGroup

---
 src/common.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 3987d89081..69b892ac2b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -659,6 +659,7 @@ testResult_t threadInit(struct threadArgs* args) {
   }
   NCCLCHECK(ncclGroupEnd());
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  NCCLCHECK(ncclGroupStart());
   void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
   void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
   for (int i=0; i<args->nGpus; i++) {
@@ -673,6 +674,7 @@ testResult_t threadInit(struct threadArgs* args) {
       if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
     }
   }
+  NCCLCHECK(ncclGroupEnd());
 #endif
 
   TESTCHECK(threadRunTests(args));
@@ -1124,6 +1126,7 @@ testResult_t run() {
        NCCLCHECK(ncclGroupEnd());
      }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+     NCCLCHECK(ncclGroupStart());
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
@@ -1138,6 +1141,7 @@ testResult_t run() {
          if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
        }
      }
+     NCCLCHECK(ncclGroupEnd());
 #endif
   }
 

From 97ee0985165722717d340f1adb7c162bc8a7491d Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 4 Jun 2025 17:54:58 -0700
Subject: [PATCH 217/233] Add Turing (SM75) support to CUDA 13.0 builds

---
 src/common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 3c4a483b2b..e1747b5232 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -21,7 +21,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
 # Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
-NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
+NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
+		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
 		-gencode=arch=compute_120,code=sm_120 \

From 0039629ac529ac9951ec9df5e243ed76c4cfb060 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <53337087+satyanveshd@users.noreply.github.com>
Date: Wed, 25 Jun 2025 01:39:23 +0530
Subject: [PATCH 218/233] Add cstring header explicitly as it is removed from
 HIP (#132)

---
 src/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common.h b/src/common.h
index 8e7cfdf027..462beba61f 100644
--- a/src/common.h
+++ b/src/common.h
@@ -11,6 +11,7 @@
 #include "rccl/rccl.h"
 #include <stdio.h>
 #include <cstdint>
+#include <cstring>
 #include <algorithm>
 #ifdef MPI_SUPPORT
 #include "mpi.h"

From aac5f2b56c1de08570152e0e457f234a5c1cc307 Mon Sep 17 00:00:00 2001
From: Sam Wu <22262939+samjwu@users.noreply.github.com>
Date: Fri, 4 Jul 2025 11:54:11 -0600
Subject: [PATCH 219/233] Remove call to junit in math ci (#124)

---
 .jenkins/common.groovy | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index b0846f62ab..c709f06566 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -38,7 +38,6 @@ def runTestCommand (platform, project)
             """
 
    platform.runCommand(this, command)
-   junit "${project.paths.project_build_prefix}/*.xml"
 }
 
 return this

From 66e513c24ff42394f5a0c1781f5868da7e094dd1 Mon Sep 17 00:00:00 2001
From: Sam Wu <22262939+samjwu@users.noreply.github.com>
Date: Fri, 11 Jul 2025 13:49:38 -0600
Subject: [PATCH 220/233] Remove precheckin script (#88)

---
 .jenkins/precheckin.groovy | 81 --------------------------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 .jenkins/precheckin.groovy

diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
deleted file mode 100644
index ac672a665f..0000000000
--- a/.jenkins/precheckin.groovy
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env groovy
-// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
-@Library('rocJenkins@pong') _
-
-// This is file for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj  = new rocProject('rccl-tests', 'PreCheckin')
-    prj.paths.build_command = './install.sh'
-
-    // Define test architectures, optional rocm version argument is available
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    boolean formatCheck = false
-
-    def commonGroovy
-
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName)
-    }
-    
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = [
-        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 * * * 6')])]
-    ]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = [
-        "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['rccl906']])
-    ]
-    jobNameList = auxiliary.appendJobNameList(jobNameList)
-    
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 * * * 6')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['rccl906']], urlJobName)
-        }
-    }
-}

From 2c255c476335749f867b568090e31214052515ba Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Wed, 16 Jul 2025 09:38:33 -0500
Subject: [PATCH 221/233] [BUILD] Fix GPU_TARGETS in Makefile for ROCm 7.0
 (#136)

---
 src/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 7809da6978..a4378eb3a5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -30,7 +30,9 @@ HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
 # Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
 ifndef GPU_TARGETS
 GPU_TARGETS = gfx906 gfx908 gfx90a
-  ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
+  ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0)
+    GPU_TARGETS += gfx942 gfx950
+  else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
     # Include gfx942 support if we're using ROCm 6.0 or above
     GPU_TARGETS += gfx942
     ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)

From def2d3689c4dc2390d903ecea1cb1d9314134661 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 23 Jul 2025 16:04:30 -0700
Subject: [PATCH 222/233] Minor fix to Makefile

Move comments to separate lines
---
 src/Makefile | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 612395f645..b097765ccd 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -5,9 +5,12 @@
 #
 include common.mk
 
-MPI ?= 0        # Set to 1 to enable MPI support (multi-process/multi-node)
-NAME_SUFFIX ?=  # e.g. _mpi when using MPI=1
-DSO ?= 0        # Set to 1 to create and use libverifiable.so to reduce binary size
+# Set to 1 to enable MPI support (multi-process/multi-node)
+MPI ?= 0
+# e.g. Set to _mpi when using MPI=1
+NAME_SUFFIX ?=
+# Set to 1 to create and use libverifiable.so to reduce binary size
+DSO ?= 0
 
 .PHONY: build clean
 

From 6edafa0a9ca5964e2236afea0951a0f2d7df23cd Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 21 May 2025 09:40:26 -0700
Subject: [PATCH 223/233] Add extra reserved space during maxBytes calculation

Also, don't allow minBytes > maxBytes
---
 src/common.cu | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 69b892ac2b..b7c3e0c0a8 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -16,6 +16,9 @@
 
 #include "../verifiable/verifiable.h"
 
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
 #if NCCL_MAJOR >= 2
@@ -1047,10 +1050,14 @@ testResult_t run() {
   PRINT("%s", line);
 #endif
 
+  // Reserve 1GiB of memory for each 16GiB installed, but limit to a max of 4GiB
+  const size_t GB = (1ULL << 30);
+  size_t reserveMem =  std::min(DIVUP(maxMem, 16*GB) * 1*GB, 4*GB);
   // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
-  size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
+  size_t memMaxBytes = (maxMem - reserveMem - 1*GB) / (datacheck ? 3 : 2);
   if (maxBytes > memMaxBytes) {
     maxBytes = memMaxBytes;
+    if (minBytes > maxBytes) minBytes = maxBytes;
     if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
   }
 

From 645be0eb4586ccb54cbf751d6cb9fbd049bbf4ac Mon Sep 17 00:00:00 2001
From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com>
Date: Thu, 24 Jul 2025 11:14:49 -0400
Subject: [PATCH 224/233] [Common] Use NCCL API to allocate/free memory (#144)

---
 src/common.cu | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 0990621e88..edcb1d389c 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1040,9 +1040,15 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
 #endif
   }
   else {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
+    NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
+    if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
+#else
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
+#endif
   }
   CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
   if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
@@ -1676,9 +1682,15 @@ testResult_t run() {
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
+    if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
+#else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
+#endif
   }
   CUDACHECK(cudaFreeHost(delta));
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)

From a7809b324354223c16c9b82b4a228859fe335032 Mon Sep 17 00:00:00 2001
From: Kajsa Arnold <157725674+karnoldAMD@users.noreply.github.com>
Date: Wed, 30 Jul 2025 17:28:04 -0500
Subject: [PATCH 225/233] Standardize output formats (#140)

* remove spaces from csv
* consistently set redop to none when applicable
* write output file after test finishes
---
 src/alltoallv.cu |  2 +-
 src/common.cu    | 65 +++++++++++++++++++++++++++++++-----------------
 src/common.h     |  4 +++
 src/hypercube.cu |  2 +-
 4 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 34b6462619..bb335eda87 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -181,7 +181,7 @@ testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t
   }
 
   for (int i=0; i<type_count; i++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
   }
   return testSuccess;
 }
diff --git a/src/common.cu b/src/common.cu
index edcb1d389c..89d9a2bbbc 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -133,16 +133,6 @@ Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputForm
     if (isMainThread()) {
       _out = std::ofstream(fileName, std::ios_base::out);
       _outputValid = true;
-      if (_outputFormat == "csv") {
-        _out << "numCycle, ";
-        _out << "collective, ";
-#ifdef MPI_SUPPORT
-        _out << "ranks, rankspernode, gpusperrank, ";
-#else
-        _out << "gpus, ";
-#endif
-        _out << "size, type, redop, inplace, time, algbw, busbw, #wrong\n";
-      }
     }
   }
 }
@@ -184,27 +174,54 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
   outputValuesKeys.push_back(makeValueKeyPair(busBw, "busBw"));
   outputValuesKeys.push_back(makeValueKeyPair(wrongEltsStr, "wrong"));
 
-  for (auto iter = outputValuesKeys.begin(); iter != outputValuesKeys.end(); ++iter) {
-    if (_outputFormat == "csv") {
-      _out << iter->first;
-      if (std::next(iter) != outputValuesKeys.end()) {
-        _out << ", ";
+  _outputData.push_back(outputValuesKeys);
+}
+
+void Reporter::writeFile() {
+  if (!isMainThread() || !_outputValid)
+    return;
+
+  if (_outputFormat == "csv") {
+    _out << "numCycle,";
+    _out << "collective,";
+#ifdef MPI_SUPPORT
+    _out << "ranks,rankspernode,gpusperrank,";
+#else
+    _out << "gpus,";
+#endif
+    _out << "size,type,redop,inplace,time,algbw,busbw,#wrong\n";
+    for (auto iterEntries = _outputData.begin(); iterEntries != _outputData.end(); ++iterEntries) {
+      for (auto iterVals = (*iterEntries).begin(); iterVals != (*iterEntries).end(); ++iterVals) {
+	_out << iterVals->first;
+	if (std::next(iterVals) != (*iterEntries).end()) {
+	  _out << ",";
+	}
       }
-    } else { //json
-      if (iter == outputValuesKeys.begin()) {
-        _out << "{";
+      _out << std::endl;
+    }
+  } else { //json
+    _out << "[" << std::endl;
+    for (auto iterEntries = _outputData.begin(); iterEntries != _outputData.end(); ++iterEntries) {
+      for (auto iterVals = (*iterEntries).begin(); iterVals != (*iterEntries).end(); ++iterVals) {
+        if (iterVals == (*iterEntries).begin()) {
+          _out << "{";
+        }
+        _out << "\"" << iterVals->second << "\":" << iterVals->first;
+        if (std::next(iterVals) != (*iterEntries).end()) {
+          _out << ", ";
+	}
       }
-      _out << "\"" << iter->second << "\":" << iter->first;
-      if (std::next(iter) != outputValuesKeys.end()) {
-        _out << ", ";
+      if (std::next(iterEntries) != _outputData.end()) {
+        _out << "}," << std::endl;
       } else {
-        _out << "}";
+	_out << "}" << std::endl;
       }
     }
+    _out << "]" << std::endl;
   }
-  _out << std::endl;
 }
 
+
 bool Reporter::isMainThread() { return is_main_thread == 1; }
 
 #define NUM_BLOCKS 32
@@ -1711,6 +1728,8 @@ testResult_t run() {
   MPI_Finalize();
 #endif
 
+  reporter.writeFile();
+
   // 'cuda-memcheck --leak-check full' requires this
   PRINT("%s\n", ncclGetLastError(NULL));
   cudaDeviceReset();
diff --git a/src/common.h b/src/common.h
index 7a961c15a5..97ea1829f5 100644
--- a/src/common.h
+++ b/src/common.h
@@ -22,6 +22,8 @@
 #include <string>
 #include <fstream>
 #include <iostream>
+#include <utility>
+#include <vector>
 
 // Ensures backward compatibility for FP8 datatypes
 #if NCCL_VERSION_CODE < NCCL_VERSION(2,24,3)
@@ -119,6 +121,7 @@ class Reporter {
     ~Reporter() { if (_outputValid) { _out.close(); } };
     void setParameters(const size_t numCycle, const char* name, const char* typeName, const char* opName);
     void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts = -1);
+    void writeFile();
 
   private:
     bool isMainThread();
@@ -132,6 +135,7 @@ class Reporter {
     std::string _collectiveName;
     std::string _typeName;
     std::string _opName;
+    std::vector<std::vector<std::pair<std::string, std::string>>> _outputData;
 };
 
 struct testEngine {
diff --git a/src/hypercube.cu b/src/hypercube.cu
index ff9201045f..f5d94f026d 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -101,7 +101,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
   int nRanks = args->nProcs*args->nThreads*args->nGpus;
   if (nRanks && !(nRanks & (nRanks - 1))) {
     for (int i=0; i<type_count; i++) {
-      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
     }
   } else {
     printf("nRanks %d is not a power of 2, skipping\n", nRanks);

From e1b8a3aefcdf6482f0f5a35224e19eed2acc685d Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Mon, 15 Sep 2025 14:16:09 -0400
Subject: [PATCH 226/233] Dependency removal with hipify_perl symlink (#150)

---
 src/Makefile             | 6 ++++--
 verifiable/verifiable.mk | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index dff56a445b..225bba53b7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,6 +16,8 @@ CUSTOM_RCCL_LIB ?= ""
 
 HIPCC ?= $(ROCM_PATH)/bin/amdclang++
 HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
+HIPIFY_PL_EXE=$(ROCM_PATH)/bin/hipify-perl
+HIPIFY_PL_FLAGS = -quiet-warnings
 CXX = $(HIPCC)
 
 HIPCUFLAGS := -std=c++14
@@ -158,12 +160,12 @@ $(GIT_VERSION_FILE):
 ${HIPIFY_DIR}/%.cu.cpp: %.cu
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
 
 ${HIPIFY_DIR}/%.h: %.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
 
 .PRECIOUS: ${DST_DIR}/%.o
 
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index e82817f874..15b7232cc7 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -15,17 +15,17 @@ TEST_VERIFIABLE_LIBS      = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
 ${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
 
 ${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
 
 ${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
 
 $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
 	@printf "Compiling %s\n" $@

From 0c94d4d2b3fda463ec30d2b469ae25375dbad2a8 Mon Sep 17 00:00:00 2001
From: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
Date: Fri, 26 Sep 2025 18:09:01 -0400
Subject: [PATCH 227/233] Enable viewing algo/proto/channels used in rccl-tests
 output (#151)

* Enable algo/proto/channel viewing

* Use dynamic symbol loading to avoid build/runtime issues with non-compatible RCCL versions

* Reduce code duplication
---
 README.md             |   1 +
 src/all_gather.cu     |  11 ++++-
 src/all_reduce.cu     |  10 +++-
 src/alltoall.cu       |   4 +-
 src/alltoallv.cu      |   4 +-
 src/broadcast.cu      |  10 +++-
 src/common.cu         | 107 ++++++++++++++++++++++++++++++------------
 src/common.h          |  22 ++++++++-
 src/gather.cu         |   4 +-
 src/rccl_compat.h     |  30 ++++++++++++
 src/reduce.cu         |  11 ++++-
 src/reduce_scatter.cu |  11 ++++-
 src/scatter.cu        |   4 +-
 src/sendrecv.cu       |   4 +-
 14 files changed, 193 insertions(+), 40 deletions(-)
 create mode 100644 src/rccl_compat.h

diff --git a/README.md b/README.md
index a3a4336870..6ae8c81db9 100644
--- a/README.md
+++ b/README.md
@@ -148,6 +148,7 @@ All tests support the same set of arguments :
 * Parsing RCCL-Tests output
   * `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
   * `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
+  * `-M,--output_algo_proto_channels <0/1>` Report Algorithm/Protocol/Channels for each message size. Default : 0.
 
 ### Running multiple operations in parallel
 
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 6fe7a9214d..dbbd977ec0 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   size_t base = (count/nranks) & -(16/eltSize);
@@ -36,6 +37,13 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   return testSuccess;
 }
 
+testResult_t  AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFunc_t::ncclFuncAllGather , count, type , 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
+
 void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
 
@@ -54,7 +62,8 @@ struct testColl allGatherTest = {
   AllGatherGetCollByteCount,
   AllGatherInitData,
   AllGatherGetBw,
-  AllGatherRunColl
+  AllGatherRunColl,
+  AllGatherGetAlgoProtoChannels
 };
 
 void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index d64371bbb2..038188a74e 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
@@ -33,6 +34,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   return testSuccess;
 }
 
+testResult_t  AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce , count, type , 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
 void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
@@ -51,7 +58,8 @@ struct testColl allReduceTest = {
   AllReduceGetCollByteCount,
   AllReduceInitData,
   AllReduceGetBw,
-  AllReduceRunColl
+  AllReduceRunColl,
+  AllReduceGetAlgoProtoChannels
 };
 
 void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 02b0ae77f3..eeab700902 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *paramcount = (count/nranks) & -(16/eltSize);
@@ -56,7 +57,8 @@ struct testColl alltoAllTest = {
   AlltoAllGetCollByteCount,
   AlltoAllInitData,
   AlltoAllGetBw,
-  AlltoAllRunColl
+  AlltoAllRunColl,
+  NULL
 };
 
 void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index bb335eda87..8195826f31 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 #define USE_RCCL_GATHER_SCATTER
 
@@ -156,7 +157,8 @@ struct testColl alltoAllTest = {
   AlltoAllvGetCollByteCount,
   AlltoAllvInitData,
   AlltoAllvGetBw,
-  AlltoAllvRunColl
+  AlltoAllvRunColl,
+  NULL
 };
 
 void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/broadcast.cu b/src/broadcast.cu
index bc8c5512ff..18d09b7285 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
@@ -32,6 +33,12 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   return testSuccess;
 }
 
+testResult_t  BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast , count, type , 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
 void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
 
@@ -60,7 +67,8 @@ struct testColl broadcastTest = {
   BroadcastGetCollByteCount,
   BroadcastInitData,
   BroadcastGetBw,
-  BroadcastRunColl
+  BroadcastRunColl,
+  BroadcastGetAlgoProtoChannels
 };
 
 void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/common.cu b/src/common.cu
index 0b16556203..3934d22463 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -22,7 +22,7 @@
 #include <vector>
 #include <utility>
 #include <errno.h>     /* program_invocation_short_name */
-
+#include <dlfcn.h>
 //#define DEBUG_PRINT
 
 #include "verifiable.h"
@@ -35,6 +35,24 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
 int32_t gpu_block3;
 size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
 
+rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
+rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
+rcclTestsGetAlgoName_t rcclTestsGetAlgoName= NULL;
+static void loadRcclSyms() {
+  static void* handle = NULL;
+  const char* libname = "librccl.so";
+  if (!handle) {
+    handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
+      if (!handle) {
+        fprintf(stderr, "dlopen failed: %s\n", dlerror());
+        return;
+      }
+  }
+  rcclTestsGetAlgoInfo      = (rcclTestsGetAlgoInfo_t)     dlsym(handle, "rcclGetAlgoInfo");
+  rcclTestsGetAlgoName      = (rcclTestsGetAlgoName_t)     dlsym(handle,  "rcclGetAlgoName");
+  rcclTestsGetProtocolName  = (rcclTestsGetProtocolName_t) dlsym(handle,  "rcclGetProtocolName");
+}
+
 // RCCL_FLOAT8 support
 bool rccl_float8_useFnuz = false;
 bool IsArchMatch(char const* arch, char const* target) {
@@ -109,6 +127,7 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int output_algo_proto_channels = 0;
 static int memorytype = 0;
 static uint32_t cumask[4];
 static int streamnull = 0;
@@ -944,8 +963,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
         TESTCHECK(BenchTime(args, type, op, root, 0));
         usleep(delay_inout_place);
       }
-        if (enable_in_place)
+      if (enable_in_place)
         TESTCHECK(BenchTime(args, type, op, root, 1));
+      if(output_algo_proto_channels) {
+        if(args->collTest->getAlgoProtoChannels) {
+          int algo, proto, nchannels;
+          const char* algoName = NULL;
+          const char* protoName = NULL;
+          TESTCHECK(args->collTest->getAlgoProtoChannels(args->comms[0], args->nbytes / wordSize(type), type, &algo, &proto, &nchannels));
+          NCCLCHECK(rcclTestsGetAlgoName(algo, &algoName));
+          NCCLCHECK(rcclTestsGetProtocolName(proto, &protoName));
+          PRINT("%8s  %8s  %10d", algoName, protoName, nchannels);
+        } else {
+          PRINT("%8s  %8s  %10s","N/A", "N/A", "N/A");
+        }
+      }
       PRINT("\n");
     }
     --repeat;
@@ -1108,7 +1140,7 @@ int main(int argc, char* argv[]) {
     }
     #endif
   #endif
-
+  loadRcclSyms();
   // Parse args
   double parsed;
   int longindex;
@@ -1135,14 +1167,15 @@ int main(int argc, char* argv[]) {
     {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
     {"local_register", required_argument, 0, 'R'},
-    {"memory_type", required_argument, 0, 'y'},       //RCCL
-    {"cumask", required_argument, 0, 'u'},            //RCCL
-    {"out_of_place", required_argument, 0, 'O'},      //RCCL
-    {"delay_inout_place", required_argument, 0, 'q'}, //RCCL
-    {"cache_flush", required_argument, 0, 'F'},       //RCCL
-    {"rotating_tensor", required_argument, 0, 'E'},   //RCCL
-    {"output_file", required_argument, 0, 'x'},       //RCCL
-    {"output_format", required_argument, 0, 'Z'},     //RCCL
+    {"memory_type", required_argument, 0, 'y'},                     //RCCL
+    {"cumask", required_argument, 0, 'u'},                          //RCCL
+    {"out_of_place", required_argument, 0, 'O'},                    //RCCL
+    {"delay_inout_place", required_argument, 0, 'q'},               //RCCL
+    {"cache_flush", required_argument, 0, 'F'},                     //RCCL
+    {"rotating_tensor", required_argument, 0, 'E'},                 //RCCL
+    {"output_file", required_argument, 0, 'x'},                     //RCCL
+    {"output_format", required_argument, 0, 'Z'},                   //RCCL
+    {"output_algo_proto_channels", required_argument, 0, 'M'},      //RCCL
     {"help", no_argument, 0, 'h'},
     {}
   };
@@ -1150,7 +1183,7 @@ int main(int argc, char* argv[]) {
   while(1) {
     int c;
 
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:M:h", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -1290,6 +1323,10 @@ int main(int argc, char* argv[]) {
       case 'Z':
         output_format = optarg;
         break;
+      case 'M':
+        output_algo_proto_channels = strtol(optarg, NULL, 0);
+        if(rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL) output_algo_proto_channels = 0;
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1607,27 +1644,39 @@ testResult_t run() {
   }
 
   fflush(stdout);
-
+  const char* extra_col_str[3] = {"", "", ""};
+  if (output_algo_proto_channels) {
+    extra_col_str[0] = "algo";
+    extra_col_str[1] = "proto";
+    extra_col_str[2] = "nchannels";
+  }
+  const char* header_col_str[3] = {"           out-of-place                       in-place          ",
+                                   "           out-of-place         ","           in-place          "};
+  int header_index =(enable_out_of_place && enable_in_place) ? 0 : (enable_out_of_place ? 1 : 2);
   const char* timeStr = report_cputime ? "cputime" : "time";
+
   PRINT("#\n");
+  PRINT("# %10s  %12s  %8s  %6s  %6s%s\n", "", "", "", "", "", header_col_str[header_index]);
   if (enable_out_of_place && enable_in_place) {
-  	PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
-  	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-      	timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
-  	PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-      	"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-  } else if (enable_out_of_place) {
-	  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place      \n", "", "", "", "", "");
-        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-        timeStr, "algbw", "busbw", "#wrong");
-        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "");
+      PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s %8s  %8s  %10s\n",
+            "size", "count", "type", "redop", "root",
+            timeStr, "algbw", "busbw", "#wrong",
+            timeStr, "algbw", "busbw", "#wrong",
+            extra_col_str[0], extra_col_str[1],  extra_col_str[2]);
+      PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s %8s  %8s  %10s\n",
+            "(B)", "(elements)", "", "", "",
+            "(us)", "(GB/s)", "(GB/s)", "",
+            "(us)", "(GB/s)", "(GB/s)", "",
+            "", "", "");
   } else {
-    PRINT("# %10s  %12s  %8s  %6s  %6s           in-place          \n", "", "", "", "", "");
-        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-        timeStr, "algbw", "busbw", "#wrong");
-        PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "");
+    PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s %8s  %8s  %10s\n",
+          "size", "count", "type", "redop", "root",
+          timeStr, "algbw", "busbw", "#wrong",
+          extra_col_str[0], extra_col_str[1],  extra_col_str[2]);
+    PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %8s  %8s  %10s\n",
+          "(B)", "(elements)", "", "", "",
+          "(us)", "(GB/s)", "(GB/s)", "",
+          "", "", "");
   }
   Reporter reporter(output_file, output_format);
 
diff --git a/src/common.h b/src/common.h
index 97ea1829f5..645d3a1f0a 100644
--- a/src/common.h
+++ b/src/common.h
@@ -7,7 +7,6 @@
  ************************************************************************/
 #ifndef __COMMON_H__
 #define __COMMON_H__
-
 #include "rccl/rccl.h"
 #include <stdio.h>
 #include <cstdint>
@@ -107,6 +106,7 @@ struct testColl {
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
       ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+  testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -375,4 +375,24 @@ extern int is_main_proc;
 extern thread_local int is_main_thread;
 #define PRINT if (is_main_thread) printf
 
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncAllReduceWithBias = 5,
+  ncclFuncSendRecv = 6,
+  ncclFuncSend = 7,
+  ncclFuncRecv = 8,
+  ncclFuncAllToAllPivot = 9,
+  ncclNumFuncs = 10
+} ncclFunc_t;
+
+typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
+                                          int collNetSupport, int nvlsSupport, int numPipeOps,
+                                          int* algo, int* protocol, int* maxChannels);
+typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
+typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName);
+
 #endif
diff --git a/src/gather.cu b/src/gather.cu
index 662e2d47e8..a0dc00de56 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = (count/nranks) & -(16/eltSize);
@@ -69,7 +70,8 @@ struct testColl gatherTest = {
   GatherGetCollByteCount,
   GatherInitData,
   GatherGetBw,
-  GatherRunColl
+  GatherRunColl,
+  NULL
 };
 
 void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/rccl_compat.h b/src/rccl_compat.h
new file mode 100644
index 0000000000..4e132774d1
--- /dev/null
+++ b/src/rccl_compat.h
@@ -0,0 +1,30 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef RCCL_COMPAT_H
+#define RCCL_COMPAT_H
+
+extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
+extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
+extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
+
+#endif
\ No newline at end of file
diff --git a/src/reduce.cu b/src/reduce.cu
index f8c059e140..c2353c3fc0 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
@@ -34,6 +35,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   return testSuccess;
 }
 
+testResult_t  ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce , count, type , 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
+
 void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize) / 1.0E9 / sec;
   *algBw = baseBw;
@@ -50,7 +58,8 @@ struct testColl reduceTest = {
   ReduceGetCollByteCount,
   ReduceInitData,
   ReduceGetBw,
-  ReduceRunColl
+  ReduceRunColl,
+  ReduceGetAlgoProtoChannels
 };
 
 void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index 2e04cc7456..fe906ce372 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   size_t base = (count/nranks) & -(16/eltSize);
@@ -35,6 +36,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
   return testSuccess;
 }
 
+testResult_t ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {  // fills algo/proto/nchannels through rcclTestsGetAlgoInfo
+  if (rcclTestsGetAlgoInfo == nullptr) { return testInternalError; }  // null pointer: nothing to query
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
+
 void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
   double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
 
@@ -53,7 +61,8 @@ struct testColl reduceScatterTest = {
   ReduceScatterGetCollByteCount,
   ReduceScatterInitData,
   ReduceScatterGetBw,
-  ReduceScatterRunColl
+  ReduceScatterRunColl,
+  ReduceScatterGetAlgoProtoChannels
 };
 
 void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/scatter.cu b/src/scatter.cu
index d93663ced7..d0323fa36d 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *recvcount = (count/nranks) & -(16/eltSize);
@@ -65,7 +66,8 @@ struct testColl scatterTest = {
   ScatterGetCollByteCount,
   ScatterInitData,
   ScatterGetBw,
-  ScatterRunColl
+  ScatterRunColl,
+  NULL
 };
 
 void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index 3f84dcffc9..4f5f6b8a7b 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -7,6 +7,7 @@
 
 #include "cuda_runtime.h"
 #include "common.h"
+#include "rccl_compat.h"
 
 void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
   *sendcount = count;
@@ -64,7 +65,8 @@ struct testColl sendRecvTest = {
   SendRecvGetCollByteCount,
   SendRecvInitData,
   SendRecvGetBw,
-  SendRecvRunColl
+  SendRecvRunColl,
+  NULL
 };
 
 void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

From a15d1edaa3847607efa6172c8ccd0aa13022bba4 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Sun, 28 Sep 2025 13:33:33 -0500
Subject: [PATCH 228/233] [BUILD] Add rccl_compat.h to src/CMakeLists.txt
 (#152)

---
 src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6fe68236fd..ec96b65412 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,7 @@ set(COMMON_FILES
   common.h
   common.cu
   nccl1_compat.h
+  rccl_compat.h
   rccl_float8.h
   timer.h
   timer.cc

From a4943c512e3d9b7f15d9e12bf5529bc3d36ec77a Mon Sep 17 00:00:00 2001
From: David DeBonis <ddebonis@amd.com>
Date: Wed, 1 Oct 2025 07:07:28 -0600
Subject: [PATCH 229/233] Update CODEOWNERS (#154)

* Update CODEOWNERS

Adding me as a reviewer

* Update .github/CODEOWNERS

Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>

* Update CODEOWNERS

Added Alex

---------

Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
---
 .github/CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index cd1b82fa42..23ddf51b30 100755
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,4 @@
-* @wenkaidu @gilbertlee-amd @akolliasAMD @edgargabriel @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @rahulvaidya20 @haripriya-amd
+* @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd @atulkulk @alex-breslow-amd @ddebonis-amd @amd-mengshwu @Kapil-Shyam-Pawar @weilewei @nawrinsu @speriaswamy-amd
 
 # Documentation files
 doc/ @ROCm/rocm-documentation

From d0a99b18474b85c99f06b981ff3838f92bca9861 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Sun, 5 Oct 2025 04:12:05 -0500
Subject: [PATCH 230/233] [BUILD] Add link to `libdl` for RCCL-Tests builds
 (#153)

---
 src/CMakeLists.txt | 2 +-
 src/Makefile       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ec96b65412..387bccfc97 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -127,7 +127,7 @@ add_custom_target(git_version_check
 add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
 add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
 add_dependencies(rccl_common hipify git_version_check)
-target_link_libraries(rccl_common roc::rccl hip::device Threads::Threads)
+target_link_libraries(rccl_common roc::rccl hip::device Threads::Threads dl)
 if(USE_MPI)
   target_link_libraries(rccl_common MPI::MPI_CXX)
 endif()
diff --git a/src/Makefile b/src/Makefile
index 225bba53b7..fcc020f7c2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -129,7 +129,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I
 HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
 endif
 
-LIBRARIES += rccl
+LIBRARIES += rccl dl
 HIPLDFLAGS += $(LIBRARIES:%=-l%)
 
 DST_DIR := $(BUILDDIR)

From db6ea5a5947c620f26712a9dc003fa67b3e3f0d7 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Mon, 13 Oct 2025 14:09:10 -0700
Subject: [PATCH 231/233] Add all_reduce_bias_perf to support All Reduce with
 Bias (#130)

Use dynamic symbol loading of ncclAllReduceWithBias

Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com>
---
 src/CMakeLists.txt       |   1 +
 src/Makefile             |   2 +-
 src/all_gather.cu        |   2 +-
 src/all_reduce.cu        |   2 +-
 src/all_reduce_bias.cu   | 123 +++++++++++++++++++++++++++++++++++++++
 src/alltoall.cu          |   2 +-
 src/alltoallv.cu         |   2 +-
 src/broadcast.cu         |   2 +-
 src/common.cu            |  24 ++++++--
 src/common.h             |   8 ++-
 src/gather.cu            |   2 +-
 src/hypercube.cu         |   2 +-
 src/reduce.cu            |   2 +-
 src/reduce_scatter.cu    |   2 +-
 src/scatter.cu           |   2 +-
 src/sendrecv.cu          |   2 +-
 verifiable/verifiable.cu |  70 ++++++++++++++++++++++
 verifiable/verifiable.h  |   6 ++
 18 files changed, 237 insertions(+), 19 deletions(-)
 create mode 100644 src/all_reduce_bias.cu

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 387bccfc97..8de04365ae 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -145,3 +145,4 @@ add_rccl_test(reduce_scatter)
 add_rccl_test(reduce)
 add_rccl_test(scatter)
 add_rccl_test(sendrecv)
+add_rccl_test(all_reduce_bias)
diff --git a/src/Makefile b/src/Makefile
index fcc020f7c2..7f6ea37279 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -135,7 +135,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
 
 GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
diff --git a/src/all_gather.cu b/src/all_gather.cu
index dbbd977ec0..54ca880d8b 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -52,7 +52,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 038188a74e..43c75032c7 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -48,7 +48,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce_bias.cu b/src/all_reduce_bias.cu
new file mode 100644
index 0000000000..8d49c67483
--- /dev/null
+++ b/src/all_reduce_bias.cu
@@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include <dlfcn.h>
+
+typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
+#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+DECLARE_RCCL_PFN(ncclAllReduceWithBias);
+static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
+
+static void initOnceFunc() {  // pthread_once callback: looks up ncclAllReduceWithBias and stores it in pfn_ncclAllReduceWithBias (nullptr if the symbol is absent)
+  void *librccl = dlopen("librccl.so", RTLD_NOLOAD | RTLD_LAZY);  // dlopen() requires RTLD_LAZY or RTLD_NOW; RTLD_NOLOAD alone is rejected as an invalid mode
+  pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl ? librccl : RTLD_DEFAULT, "ncclAllReduceWithBias");  // no handle (e.g. name mismatch): search the global symbol scope instead
+}
+
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  pthread_once(&initOnceControl, initOnceFunc);  // run the one-time ncclAllReduceWithBias lookup
+  *sendcount = count;  // send, receive and parameter counts all equal count
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count;
+}
+
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {  // fills the input, bias and expected buffers of every GPU in args; root is unused
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);  // global rank of this thread's i-th GPU
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];  // in-place runs take their input from the receive buffer
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
+    TESTCHECK(InitData(args->bias[i], sendcount, 0, type, op, rep+0x12345678, nranks, rank));  // bias is generated with a different seed than the input (rep)
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    TESTCHECK(InitDataApplyBias(args->expected[i], args->bias[i], recvcount, 0, type, op));  // must come after InitDataReduce: combines bias into the expected result in place
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  // Algorithm bandwidth: count*typesize bytes over sec seconds, in units of 1e9 bytes/s.
+  const double gbPerSec = (double)(count * typesize) / 1.0E9 / sec;
+  *algBw = gbPerSec;
+  const double busScale = ((double)(2*(nranks - 1)))/((double)nranks);  // bus bandwidth scales by 2*(nranks-1)/nranks
+  *busBw = gbPerSec * busScale;
+}
+
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  if (pfn_ncclAllReduceWithBias != nullptr) {  // symbol was resolved: issue the collective through it
+    NCCLCHECK(pfn_ncclAllReduceWithBias(sendbuff, recvbuff, count, type, op, comm, stream, bias));
+    return testSuccess;
+  }
+  fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n");
+  return testNcclError;
+}
+
+struct testColl allReduceTest = {  // initializers are positional, in struct testColl member order
+  "AllReduce",
+  AllReduceGetCollByteCount,
+  AllReduceInitData,
+  AllReduceGetBw,
+  AllReduceRunColl, NULL  // last member (getAlgoProtoChannels) left unset, now spelled out
+};
+
+void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t ignoredParamCount, ignoredSendOffset, ignoredRecvOffset;  // outputs required by the callee but not reported to this caller
+  AllReduceGetCollByteCount(sendcount, recvcount, &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset, count, /*eltSize=*/1, nranks);
+}
+
+testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {  // runs TimeTest once per selected (type, op) pair
+  args->collTest = &allReduceTest;
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) {  // a specific type was requested: run only that one
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {  // type == -1: iterate over the whole test_types table
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) {  // a specific reduction op was requested: run only that one
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else {  // op == -1: iterate over the whole test_ops table
+    op_count = test_opnum;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+#if defined(RCCL_FLOAT8)
+      if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+      continue;  // body of the if above: prod, avg and "mulsum" are skipped for the two float8 types
+#endif
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));  // root is passed as -1; the root argument is not used
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  AllReduceGetBuffSize,
+  AllReduceRunTest
+};
diff --git a/src/alltoall.cu b/src/alltoall.cu
index eeab700902..c81f2ff285 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -47,7 +47,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 8195826f31..1ca0fe64c4 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -84,7 +84,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nranks;
   NCCLCHECK(ncclCommCount(comm, &nranks));
   int rank;
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 18d09b7285..6c57b0d177 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -47,7 +47,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
diff --git a/src/common.cu b/src/common.cu
index 3934d22463..f9ae4caf58 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -364,6 +364,11 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset,
   return testSuccess;
 }
 
+testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op) {  // combines bias into expected, element by element, using op
+  ncclVerifiableApplyBias(expected, bias, count, (int)type, (int)op, offset, cudaStreamDefault);  // callee returns void, so no launch status is checked here
+  return testSuccess;
+}
+
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
   CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
   return testSuccess;
@@ -469,7 +474,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
 
-#if 1 && DEBUG_PRINT
+#if 1 && defined(DEBUG_PRINT)
     if (args->reportErrors && wrongPerGpu[i] != 0) {
       printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
       char *expectedHost = (char*)malloc(args->expectedBytes);
@@ -582,6 +587,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
     char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
+    char* bias = ((char*)args->bias[i]) + shift;
     ncclRedOp_t op;
 
     if(opIndex < ncclNumOps) {
@@ -629,7 +635,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     TESTCHECK(args->collTest->runColl(
           (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
           (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
-        count, type, op, root, args->comms[i], args->streams[i]));
+        count, type, op, root, args->comms[i], args->streams[i], bias));
 
     #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
     if(opIndex >= ncclNumOps) {
@@ -1060,7 +1066,7 @@ testResult_t threadLaunch(struct testThread* thread) {
   return testSuccess;
 }
 
-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias) {
   if(enable_rotating_tensor) {
     recvBytes = recvBytes + cache_bytes;
     nbytes = nbytes + cache_bytes;
@@ -1069,22 +1075,26 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     if(HIP_VERSION >= 50700000) {
       CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
       CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
+      if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
       if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
     }
     else {
       CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
       CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+      if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
       if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
     }
   }
   else if (memorytype == ncclHost) {
     CUDACHECK(hipHostMalloc(sendbuff, nbytes));
     CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(hipHostMalloc(bias, nbytes));
     if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
   }
   else if (memorytype == ncclManaged) {
     CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
     CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
     if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
     CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
@@ -1096,14 +1106,17 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
     NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
     NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
     if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
 #else
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
 #endif
   }
   CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
+  if (bias) CUDACHECK(hipMemset(*bias, 1, nbytes));
   if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
   return testSuccess;
 }
@@ -1554,6 +1567,7 @@ testResult_t run() {
   std::vector<cudaStream_t> streams(nGpus*nThreads);
   std::vector<void*> sendbuffs(nGpus*nThreads);
   std::vector<void*> recvbuffs(nGpus*nThreads);
+  std::vector<void*> bias(nGpus*nThreads);
   std::vector<void*> expected(nGpus*nThreads);
   size_t sendBytes, recvBytes;
 
@@ -1564,7 +1578,7 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
     CUDACHECK(cudaSetDevice(gpus[i]));
-    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
+    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes, bias.data()+i));
     if (streamnull) {
       streams[i] = NULL;
     }
@@ -1699,6 +1713,7 @@ testResult_t run() {
     threads[t].args.gpus=gpus.data()+t*nGpus;
     threads[t].args.sendbuffs = sendbuffs.data()+t*nGpus;
     threads[t].args.recvbuffs = recvbuffs.data()+t*nGpus;
+    threads[t].args.bias = bias.data()+t*nGpus;
     threads[t].args.expected = expected.data()+t*nGpus;
     threads[t].args.ncclId = ncclId;
     threads[t].args.comms=comms+t*nGpus;
@@ -1764,6 +1779,7 @@ testResult_t run() {
 #else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
+    if (bias[i]) CUDACHECK(cudaFree((char*)bias[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
 #endif
   }
diff --git a/src/common.h b/src/common.h
index 645d3a1f0a..4c6af89f1e 100644
--- a/src/common.h
+++ b/src/common.h
@@ -104,9 +104,9 @@ struct testColl {
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
-  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias);
   testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
+
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -173,6 +173,7 @@ struct threadArgs {
   ncclUniqueId ncclId;
   ncclComm_t* comms;
   cudaStream_t* streams;
+  void** bias;
 
   void** expected;
   size_t expectedBytes;
@@ -199,8 +200,9 @@ struct testThread {
 extern void Barrier(struct threadArgs* args);
 extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
 extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
+extern testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op);
 extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
-extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
+extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias);  // matches the definition in common.cu
 
 #include <unistd.h>
 
diff --git a/src/gather.cu b/src/gather.cu
index a0dc00de56..24156de7bd 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -45,7 +45,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/hypercube.cu b/src/hypercube.cu
index f5d94f026d..c35cc765cc 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   char* sbuff = (char*)sendbuff;
   char* rbuff = (char*)recvbuff;
   int nRanks;
diff --git a/src/reduce.cu b/src/reduce.cu
index c2353c3fc0..bf37a2b016 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -48,7 +48,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw;
 }
 
-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
   return testSuccess;
 }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index fe906ce372..fd589e2738 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -51,7 +51,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
   *busBw = baseBw * factor;
 }
 
-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/scatter.cu b/src/scatter.cu
index d0323fa36d..ca0f6c10ad 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -41,7 +41,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index 4f5f6b8a7b..c9c4c4bc0a 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -45,7 +45,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 20df5907a4..0896c724d0 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -1044,6 +1044,76 @@ hipError_t ncclVerifiablePrepareInput(
   #undef CASE_OP
 }
 
+namespace {
+template<typename T, typename ReduceFn>
+__global__ void applyBias2(  // kernel: elts[i] = op(elts[i], bias[i]) for every i in [0, elt_n)
+    T *elts, T *bias, intptr_t elt_n, ReduceFn op, intptr_t elt_ix0
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);  // [i0, i1) is this block's slice: elt_n/gridDim.x elements per block ...
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;  // ... plus one extra for each of the first elt_n%gridDim.x blocks
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  while(i < i1) {  // each thread walks the slice in steps of blockDim.x
+    elts[i] = op(elts[i], bias[i]);
+    #if 0  // disabled debug print; it names rank_n, seed and rank_me, which are not parameters of this kernel, and is the only use of elt_ix0
+    T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
+    printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
+      std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
+    #endif
+    i += blockDim.x;
+  }
+}
+
+template<typename ReduceOp>
+void applyBias1(  // host helper: launches applyBias2 instantiated for the element type selected by elt_ty
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, ReduceOp op,
+    intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));  // ceil(elt_n/2048) blocks, capped at 32; NOTE(review): elt_n == 0 gives 0 blocks — confirm callers never pass an empty buffer
+  #define CASE_TY(T) applyBias2<<<block_n, 512, 0, stream>>>((T*)elts, (T*)bias, elt_n, op, elt_ix0); break;
+  switch(elt_ty) {
+  case ncclInt8: CASE_TY(int8_t)
+  case ncclUint8: CASE_TY(uint8_t)
+  case ncclInt32: CASE_TY(int32_t)
+  case ncclUint32: CASE_TY(uint32_t)
+  case ncclInt64: CASE_TY(int64_t)
+  case ncclUint64: CASE_TY(uint64_t)
+  case ncclFloat16: CASE_TY(__half)
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3: CASE_TY(rccl_float8)
+  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #endif
+  case ncclFloat32: CASE_TY(float)
+  case ncclFloat64: CASE_TY(double)
+  default: assert(0);  // element type with no case above
+  }
+  #undef CASE_TY
+}
+}
+
+void ncclVerifiableApplyBias(  // host entry point: picks the reduction functor for red_op, then applyBias1 dispatches on elt_ty
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
+    cudaStream_t stream
+  ) {
+  #define CASE_OP(op) \
+    applyBias1(elts, bias, elt_n, elt_ty, op, elt_ix0, stream); \
+    break;
+  switch(red_op) {
+  case ncclSum: CASE_OP(ReduceSum())
+  case ncclMin: CASE_OP(ReduceMin())
+  case ncclMax: CASE_OP(ReduceMax())
+  case ncclProd: CASE_OP(ReduceProd())
+  #if HAVE_ncclPreMulSum
+  default: CASE_OP(ReducePreMulSum())  // NOTE(review): every red_op not listed above (e.g. ncclAvg) lands here — confirm that is intended
+  #endif
+  }  // without HAVE_ncclPreMulSum an unlisted red_op matches no case and nothing is launched
+  #undef CASE_OP
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index f4452d6d32..b248492aee 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -64,6 +64,12 @@ hipError_t ncclVerifiableVerify(
   int64_t *bad_elt_n, cudaStream_t stream
 );
 
+// Enqueue kernel that applies bias to expected results
+void ncclVerifiableApplyBias(
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
+    cudaStream_t stream
+);
+
 #ifdef NCCL_VERIFIABLE_SELF_TEST
 void ncclVerifiableLaunchSelfTest();
 #endif

From 33cc4df1e4631d98a7a9ff1b1e0221f77ec81470 Mon Sep 17 00:00:00 2001
From: mberenjk <146776561+mberenjk@users.noreply.github.com>
Date: Sat, 18 Oct 2025 12:46:31 -0700
Subject: [PATCH 232/233] Fixing the AR_Bias issue for FP8 (#155)

Authored-by: Marzieh Berenjkoub <146776561+mberenjk@users.noreply.github.com>
Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
---
 verifiable/verifiable.cu | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 0896c724d0..b845bd99e9 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -1083,9 +1083,14 @@ void applyBias1(
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #if HAVE_ncclfp8_DEVICE || HIP_VERSION < 60300000
+  case ncclFloat8e4m3: CASE_TY(rccl_float8)
+  case ncclFloat8e5m2: CASE_TY(rccl_bfloat8)
+  #elif HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e4m3_fnuz) }
+  else { CASE_TY(__hip_fp8_e4m3) }
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e5m2_fnuz) }
+  else { CASE_TY(__hip_fp8_e5m2) }
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)

From 6405c76e6826663bbb67bd40aeee8c70aa5d3094 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Wed, 29 Oct 2025 10:57:56 -0600
Subject: [PATCH 233/233] Fixing install script hip_compiler bug and improving
 logging on fallback (#156)

* Fixing install script hip_compiler bug and improving logging on fallback
---
 install.sh | 127 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 61 deletions(-)

diff --git a/install.sh b/install.sh
index 98882c2eef..23a3c5df69 100755
--- a/install.sh
+++ b/install.sh
@@ -37,49 +37,49 @@ gpu_targets=""
 # check if we have a modern version of getopt that can handle whitespace and long parameters
 getopt -T
 if [[ $? -eq 4 ]]; then
-    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rocm_home:,rccl_home:,mpi_home:,hip_compiler:,gpu_targets: --options hmt -- "$@")
+  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rocm_home:,rccl_home:,mpi_home:,hip_compiler:,gpu_targets: --options hmt -- "$@")
 else
-    echo "Need a new version of getopt"
-    exit 1
+  echo "Need a new version of getopt"
+  exit 1
 fi
 
 if [[ $? -ne 0 ]]; then
-    echo "getopt invocation failed; could not parse the command line";
-    exit 1
+  echo "getopt invocation failed; could not parse the command line";
+  exit 1
 fi
 
 eval set -- "${GETOPT_PARSE}"
 
 while true; do
-    case "${1}" in
-        -h|--help)
-            display_help
-            exit 0 ;;
-        -m|--mpi)
-            mpi_enabled=true
-            shift ;;
-        -t|--test)
-            run_tests=true
-            shift ;;
-        --rocm_home)
-            rocm_dir=${2}
-            shift 2 ;;
-        --rccl_home)
-            rccl_dir=${2}
-            shift 2 ;;
-        --mpi_home)
-            mpi_dir=${2}
-            shift 2 ;;
-        --hip_compiler)
-            hip_compiler=${2}
-            shift 2 ;;
-        --gpu_targets)
-            gpu_targets=${2}
-            shift 2 ;;
-        --) shift ; break ;;
-        *)  echo "Unexpected command line parameter received; aborting";
-	    exit 1 ;;
-    esac
+  case "${1}" in
+    -h|--help)
+       display_help
+       exit 0 ;;
+    -m|--mpi)
+       mpi_enabled=true
+       shift ;;
+    -t|--test)
+       run_tests=true
+       shift ;;
+    --rocm_home)
+       rocm_dir=${2}
+       shift 2 ;;
+    --rccl_home)
+       rccl_dir=${2}
+       shift 2 ;;
+    --mpi_home)
+       mpi_dir=${2}
+       shift 2 ;;
+    --hip_compiler)
+       hip_compiler=${2}
+       shift 2 ;;
+    --gpu_targets)
+       gpu_targets=${2}
+       shift 2 ;;
+    --) shift ; break ;;  # end-of-options marker: stop parsing
+    *)  echo "[ERROR] Unexpected command line parameter received; aborting" >&2
+        exit 1 ;;  # diagnostics go to stderr, tagged like the other fatal errors in this script
+  esac
 done
 
 # throw error code after running a command in the install script
@@ -101,49 +101,54 @@ build_dir=./build
 rm -rf ${build_dir}
 
 if [[ -z ${rocm_dir} ]]; then
-    echo "ROCM_PATH does not exist at ${rocm_dir}. Defaulting to /opt/rocm"
-    rocm_dir=/opt/rocm
+  echo "[WARN] ROCm path is empty (set it with --rocm_home). Defaulting to /opt/rocm"  # this branch only runs when rocm_dir is empty, so do not print it
+  rocm_dir=/opt/rocm
 fi
 
 if ! command -v ${hip_compiler} 2>&1 >/dev/null ; then
-    echo "HIP Compiler does not exist at ${hip_compiler}. Please check the path."
-    echo "Defaulting to /opt/rocm/bin/amdclang++"
-    hip_compiler=${rocm_dir}/bin/amdclang++
+  echo "[WARN] HIP Compiler does not exist at ${hip_compiler}. Please check the path."
+  echo "[WARN] - Falling back to ${rocm_dir}/bin/amdclang++"
+  hip_compiler=${rocm_dir}/bin/amdclang++
+
+  if ! command -v ${hip_compiler} 2>&1 >/dev/null ; then
+    echo "[WARN] ${hip_compiler} does not exist. Please be advised."
+    echo "[WARN] - Falling back to ${rocm_dir}/bin/hipcc"
+    hip_compiler=${rocm_dir}/bin/hipcc
 
     if ! command -v ${hip_compiler} 2>&1 >/dev/null ; then
-        echo "${hip_compiler} does not exist. Please be advised."
-	echo "Defaulting to /opt/rocm/bin/hipcc"
-	hip_compiler=${rocm_dir}/bin/hipcc
-
-	if ! command -v ${hip_compiler} 2>&1 >/dev/null ; then
-            echo "${hip_compiler} does not exist!. Please check your ROCm installation."
-	    echo "Cannot proceed with building rccl-tests!"
-	    exit 1
-	fi
+      echo "[ERROR] ${hip_compiler} does not exist! Please check your ROCm installation." >&2
+      echo "[ERROR] Cannot proceed with building rccl-tests!" >&2
+      exit 1  # no candidate compiler was found; abort before invoking make
     fi
+  fi
 fi
+echo "[INFO] Compiling with ${hip_compiler}"
 
 if [[ -n ${gpu_targets} ]]; then
-    GPU_TARGETS="GPU_TARGETS=${gpu_targets}"
+  GPU_TARGETS="GPU_TARGETS=${gpu_targets}"
 fi
 
 if ($mpi_enabled); then
-    if [[ ${mpi_dir} == "" ]]; then
-        echo "MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument."
-        exit 1
-    else
-        make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} HIPCC=${hip_compiler} ${GPU_TARGETS} -j$(nproc)
-    fi
+  if [[ ${mpi_dir} == "" ]]; then
+    echo "[ERROR] MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument." >&2
+    exit 1
+  else
+    echo "[INFO] Compiling with MPI support (Using MPI from ${mpi_dir})"
+    echo
+    make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" MPI=1 MPI_HOME="${mpi_dir}" HIPCC="${hip_compiler}" ${GPU_TARGETS:+"${GPU_TARGETS}"} -j"$(nproc)"  # quoted so whitespace cannot split an argument; GPU_TARGETS is omitted when empty
+  fi
 else
-    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so HIP_COMPILER=${hip_compiler} ${GPU_TARGETS} -j$(nproc)
+  echo "[INFO] Compiling without MPI support (MPI support requires -m and --mpi_home)"
+  echo
+  make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" HIPCC="${hip_compiler}" ${GPU_TARGETS:+"${GPU_TARGETS}"} -j"$(nproc)"  # quoted so whitespace cannot split an argument; GPU_TARGETS is omitted when empty
 fi
 check_exit_code "$?"
 
 # Optionally, run tests if they're enabled.
 if ($run_tests); then
-    if ($mpi_enabled); then
-        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
-    else
-        cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest -k "not MPI"
-    fi
+  if ($mpi_enabled); then
+    cd test && LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest  # && keeps pytest from running in the wrong directory if cd fails
+  else
+    cd test && LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest -k "not MPI"  # && keeps pytest from running in the wrong directory if cd fails
+  fi
 fi