From 501a149d575fa62b7d8bea5b2bd20304dba55025 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 18 Apr 2025 19:20:59 -0700
Subject: [PATCH 1/9] Add support for FP8 datatypes

Added new datatypes: f8e4m3, f8e5m2

Only supported on H100+ architectures and NCCL versions >= 2.24.0
---
 src/common.cu            |  71 +++++-
 src/common.h             |  20 +-
 verifiable/verifiable.cu | 473 +++++++++++++++++++++++++++------------
 verifiable/verifiable.h  |   7 +-
 4 files changed, 415 insertions(+), 156 deletions(-)
diff --git a/src/common.cu b/src/common.cu
index 0d4dfc1944..f83cdf009a 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -21,15 +21,21 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
 #if NCCL_MAJOR >= 2
   ncclDataType_t test_types[ncclNumTypes] = {
     ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
-  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #if HAVE_BF16
     , ncclBfloat16
   #endif
+  #if HAVE_FP8
+    , ncclFloat8e4m3, ncclFloat8e5m2
+  #endif
   };
   const char *test_typenames[ncclNumTypes] = {
     "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
-  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #if HAVE_BF16
     , "bfloat16"
   #endif
+  #if HAVE_FP8
+    , "f8e4m3", "f8e5m2"
+  #endif
   };
   int test_typenum = -1;
 
@@ -86,6 +92,7 @@ static int average = 1;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
 static int local_register = 0;
 #endif
+static int minCudaArch = 1<<30;
 
 #define NUM_BLOCKS 32
 
@@ -126,18 +133,18 @@ static double parsesize(const char *value) {
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault));
   CUDACHECK(cudaDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
   return testSuccess;
 }
 
@@ -358,9 +365,12 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       union {
         int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
         half f16; float f32; double f64;
-        #if defined(__CUDA_BF16_TYPES_EXIST__)
+        #if HAVE_BF16
         __nv_bfloat16 bf16;
         #endif
+        #if HAVE_FP8
+        __nv_fp8_e4m3 f8e4m3; __nv_fp8_e5m2 f8e5m2;
+        #endif
       };
       switch(type) {
       case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
@@ -372,9 +382,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
+      #if HAVE_BF16
       case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
       #endif
+      #if HAVE_FP8
+      case ncclFloat8e4m3: f8e4m3 = ncclVerifiablePremulScalar<__nv_fp8_e4m3>(rank); break;
+      case ncclFloat8e5m2: f8e5m2 = ncclVerifiablePremulScalar<__nv_fp8_e5m2>(rank); break;
+      #endif
+      default: break; // Just to silence clang
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
     }
@@ -702,13 +717,20 @@ int main(int argc, char* argv[]) {
     test_typenum = 9;
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
       test_opnum++; // ncclAvg
-      #if defined(__CUDA_BF16_TYPES_EXIST__)
-        test_typenum++; // bfloat16
-      #endif
     }
     if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
       test_opnum++; // PreMulSum
     }
+    #if defined(__CUDA_BF16_TYPES_EXIST__)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
+      test_typenum++; // bfloat16
+    }
+    #endif
+    #if defined(__CUDA_FP8_TYPES_EXIST__)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
+      test_typenum += 2; // fp8 e4m3,e5m2
+    }
+    #endif
   #endif
 
   // Parse args
@@ -1033,12 +1055,37 @@ testResult_t run() {
     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
-    if (streamnull)
+    if (streamnull) {
       streams[i] = NULL;
-    else
+    }
+    else {
       CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
+    }
+    int archMajor, archMinor;
+    CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i]));
+    CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i]));
+    minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor);
   }
 
+#ifdef MPI_SUPPORT
+  MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
+    if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
+      int n = 0;
+      for (int i=0; i < test_typenum; i++) {
+        if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
+          test_types[n] = test_types[i];
+          test_typenames[n] = test_typenames[i];
+          n += 1;
+        }
+      }
+      test_typenum = n;
+    }
+  }
+#endif
+
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
diff --git a/src/common.h b/src/common.h
index 478d7fb1c0..ff834f699d 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,16 +213,34 @@ static uint64_t getHostHash(const char* hostname) {
   return getHash(hostHash, strlen(hostHash));
 }
 
+#define HAVE_BF16 0
+#define HAVE_FP8 0
+
+#if NCCL_MAJOR >= 2
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    #undef HAVE_BF16
+    #define HAVE_BF16 1
+    #if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
+      #undef HAVE_FP8
+      #define HAVE_FP8 1
+    #endif
+  #endif
+#endif
+
 static size_t wordSize(ncclDataType_t type) {
   switch(type) {
     case ncclChar:
 #if NCCL_MAJOR >= 2
     //case ncclInt8:
     case ncclUint8:
+#endif
+#if HAVE_FP8
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
 #endif
       return 1;
     case ncclHalf:
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if HAVE_BF16
     case ncclBfloat16:
 #endif
     //case ncclFloat16:
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 5f617ee188..dcd6e6c939 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -8,6 +8,15 @@
 #if CUDART_VERSION >= 11000
 #include <cuda_bf16.h>
 #endif
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && defined(__CUDA_FP8_TYPES_EXIST__)
+  #define HAVE_ncclFloat8 1
+#else
+  #define HAVE_ncclFloat8 0
+#endif
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
   #define HAVE_ncclBfloat16 1
@@ -84,10 +93,16 @@ template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
 struct IsIntegral<half>: std::false_type {};
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if HAVE_ncclBfloat16
 template<>
 struct IsIntegral<__nv_bfloat16>: std::false_type {};
 #endif
+#if HAVE_ncclFloat8
+template<>
+struct IsIntegral<__nv_fp8_e4m3>: std::false_type {};
+template<>
+struct IsIntegral<__nv_fp8_e5m2>: std::false_type {};
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -107,23 +122,72 @@ __host__ __device__ T inhibit(T x) {
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-  template<typename Y, typename X>
-  __host__ __device__ Y castTo(X x) {
+  template<typename Y>
+  __host__ __device__ Y castTo(uint64_t x) {
     return Y(x);
   }
   template<typename Y>
   __host__ __device__ Y castTo(float x) {
     return Y(x);
   }
+  template<typename Y>
+  __host__ __device__ Y castTo(double x) {
+    return Y(x);
+  }
+
   template<>
   __host__ __device__ half castTo<half>(float x) {
     return __float2half(x);
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  template<>
+  __host__ __device__ half castTo<half>(double x) {
+    return __double2half(x);
+  }
+  template<>
+  __host__ __device__ half castTo<half>(uint64_t x) {
+    return __ull2half_rn(x);
+  }
+
+  #if HAVE_ncclBfloat16
   template<>
   __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
     return __float2bfloat16(x);
   }
+  template<>
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(double x) {
+    return __double2bfloat16(x);
+  }
+  template<>
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(uint64_t x) {
+    return __double2bfloat16((double)x);
+  }
+  #endif
+
+  #if HAVE_ncclFloat8
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(float x) {
+    return __nv_fp8_e4m3(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(double x) {
+    return __nv_fp8_e4m3(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(uint64_t x) {
+    return __nv_fp8_e4m3((double)x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(float x) {
+    return __nv_fp8_e5m2(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(double x) {
+    return __nv_fp8_e5m2(x);
+  }
+  template<>
+  __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(uint64_t x) {
+    return __nv_fp8_e5m2((double)x);
+  }
   #endif
 }
 
@@ -151,7 +215,7 @@ struct ReduceSum {
       return __float2half(__half2float(a) + __half2float(b));
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hadd(a, b);
@@ -160,6 +224,22 @@ struct ReduceSum {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hadd(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) + float(b));
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hadd(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) + float(b));
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -175,7 +255,7 @@ struct ReduceProd {
       return __float2half(__half2float(a) * __half2float(b));
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmul(a, b);
@@ -184,6 +264,22 @@ struct ReduceProd {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmul(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) * float(b));
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmul(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) * float(b));
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -201,7 +297,7 @@ struct ReduceMin {
       return __half2float(a) < __half2float(b) ? a : b;
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmin(a, b);
@@ -212,6 +308,22 @@ struct ReduceMin {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmin(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) < float(b) ? a : b);
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmin(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) < float(b) ? a : b);
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -229,7 +341,7 @@ struct ReduceMax {
       return __half2float(a) > __half2float(b) ? a : b;
     #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if HAVE_ncclBfloat16
   __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
     #if __CUDA_ARCH__ >= 800
       return __hmax(a, b);
@@ -240,6 +352,22 @@ struct ReduceMax {
     #endif
   }
   #endif
+  #if HAVE_ncclFloat8
+  __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e4m3(__hmax(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e4m3(float(a) > float(b) ? a : b);
+    #endif
+  }
+  __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const {
+    #if __CUDA_ARCH__ >= 800
+      return __nv_fp8_e5m2(__hmax(__half(a), __half(b)));
+    #else
+      return __nv_fp8_e5m2(float(a) > float(b) ? a : b);
+    #endif
+  }
+  #endif
   template<typename T>
   __host__ __device__ T postOp(T x) const { return x; }
 };
@@ -297,29 +425,47 @@ struct ReduceAvg {
 
 namespace {
 template<typename T>
-struct FloatLayout;
+struct FloatLayout { static constexpr bool is_floating_point = false; };
 template<>
 struct FloatLayout<float> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 23;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<double> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 11, mantissa_bits = 52;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<half> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 5, mantissa_bits = 10;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if HAVE_ncclBfloat16
 template<>
 struct FloatLayout<__nv_bfloat16> {
+  static constexpr bool is_floating_point = true;
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #endif
+#if HAVE_ncclFloat8
+template<>
+struct FloatLayout<__nv_fp8_e4m3> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<__nv_fp8_e5m2> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+#endif
 
 template<typename T>
 __host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
@@ -632,11 +778,12 @@ __host__ __device__ void genOutput(
 ////////////////////////////////////////////////////////////////////////////////
 // Nil reduction (byte copy functions). Optimized to assume rank_n=1
 
+// genInput specialization for integer ReduceNil.
 namespace {
-template<typename T, bool IsIntegral>
+template<typename T>
 __host__ __device__ void genInput(
     T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
-    std::integral_constant<bool, IsIntegral>
+    std::true_type /*integral*/
   ) {
   (void)rank_n, (void)rank_me; // silence unused warnings
   union { uint64_t bits; T tmp; };
@@ -646,6 +793,24 @@ __host__ __device__ void genInput(
   ans = tmp;
 }
 
+// genInput specialization for floating point ReduceNil.
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  (void)rank_n; // silence unused warnings
+  constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
+  int sign = rng & 1;
+  rng ^= rng>>1;
+  int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
+  exp += 1<<(FloatLayout<T>::exponent_bits-2);
+  rng ^= rng >> FloatLayout<T>::exponent_bits;
+  uint64_t mant = rng & mant_mask;
+  ans = makeFloat<T>(sign, exp, mant);
+}
+
 template<typename T, typename ReduceFn, bool IsIntegral>
 __host__ __device__ void genOutput(
     T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
@@ -734,20 +899,34 @@ __host__ __device__ void genOutput(
 namespace {
 template<typename T>
 __host__ __device__ void genInput(
-    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
+  // We can't control the nranks divisor in avareages so to control error we
+  // limit to two ranks contributing non-zero values. This way there is no ambiguity
+  // of summation.
+  int r = shuffleRank(rank_n, rank_me, rng);
+  uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  ans = r < 2 ? castTo<T>(1+m) : castTo<T>((uint64_t)0);
 }
 
 template<typename T>
 __host__ __device__ void genOutput(
-    T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index,
     std::false_type /*integral*/
   ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
-  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  ans = ReduceProd()(ans, T1(1)/T1(rank_n));
+  shuffleRank(rank_n, -1, rng);
+  uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  if (rank_n == 1) {
+    ans = castTo<T>(1+m0);
+  } else {
+    // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__.
+    // We account for this by using a tolerance of 2 ulps during the verification.
+    using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type;
+    ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)),
+                      (T)(TMul(1+m1)*TMul(1.0/rank_n)));
+  }
 }
 }
 
@@ -809,10 +988,9 @@ __host__ __device__ T genOutput(
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareInput2(
+__global__ void __launch_bounds__(512, 1) prepareInput2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -833,40 +1011,49 @@ __global__ void prepareInput2(
 }
 
 template<typename ReduceOp>
-void prepareInput1(
+cudaError_t prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclInt8: fn = (void const*)&prepareInput2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareInput2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareInput2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareInput2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareInput2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareInput2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareInput2<half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareInput2<__nv_bfloat16, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: fn = (void const*)&prepareInput2<__nv_fp8_e4m3, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareInput2<__nv_fp8_e5m2, ReduceOp>; break;
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareInput2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareInput2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareInput(
+cudaError_t ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
     else \
-      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -882,14 +1069,12 @@ void ncclVerifiablePrepareInput(
   }
   #undef CASE_OP
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareExpected2(
+__global__ void __launch_bounds__(512, 1) prepareExpected2(
     T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
     uint64_t seed, intptr_t elt_ix0
   ) {
@@ -909,40 +1094,49 @@ __global__ void prepareExpected2(
 }
 
 template<typename ReduceOp>
-void prepareExpected1(
+cudaError_t prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
+  void const *fn = nullptr;
   switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclInt8: fn = (void const*)&prepareExpected2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareExpected2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareExpected2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareExpected2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareExpected2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareExpected2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareExpected2<half, ReduceOp>; break;
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareExpected2<__nv_bfloat16, ReduceOp>; break;
   #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<__nv_fp8_e4m3, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<__nv_fp8_e5m2, ReduceOp>; break;
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareExpected2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareExpected2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiablePrepareExpected(
+cudaError_t ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
     uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
-      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
     else \
-      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
     break;
   switch(red_op) {
   case ncclSum: CASE_OP(ReduceSum())
@@ -958,52 +1152,10 @@ void ncclVerifiablePrepareExpected(
   }
   #undef CASE_OP
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-/* How we compare floating point values when exactness is impossible is interesting.
- * First, we take note that simply reinterpreting integer bits as floating point
- * gives us a monotonic mapping which exponentially spaces out floats. Thus
- * consecutive integers encode consecutive floats. In general, using integer
- * subraction on the bitpatterns of two floats gives us an integer which is the
- * logarithm of their relative difference. But, if the floats always have similar
- * exponents, than the integer difference is actually proportional to the
- * relative error (this is because we are counting hops in the mantissa bits only,
- * not the exponent bits). So a cheap way to compare if two floats are relatively
- * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula
- * calculates such a tolerance for a summation of n floats. This formula
- * was derived by inspecting the maximum observed integer difference over many
- * random runs of summation. The parameter values were computed by the
- * companion program "inexact_regress.cu".
- */
-__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
-  float power, coef;
-  switch(elt_ty) {
-  case ncclFloat32:
-  case ncclFloat64:
-    power = .51f;
-    coef = 1.25f;
-    break;
-  case ncclFloat16:
-    power = .91f;
-    coef = .75f;
-    break;
-  #if HAVE_ncclBfloat16
-  case ncclBfloat16:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  }
-  #if __CUDA_ARCH__
-    return 1 + unsigned(coef*powf(float(rank_n), power));
-  #else
-    return 1 + unsigned(coef*std::pow(float(rank_n), power));
-  #endif
-}
-
 template<typename T>
 __host__ __device__  uint64_t calcDelta(T a, T b) {
   union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
@@ -1020,10 +1172,9 @@ __host__ __device__  uint64_t calcDelta(T a, T b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if !SELF_TEST
 namespace {
 template<typename T>
-__global__ void verifyPrepared(
+__global__ void __launch_bounds__(512, 1) verifyPrepared(
     T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
   ) {
   intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
@@ -1039,16 +1190,34 @@ __global__ void verifyPrepared(
     bad += tolerance < delta ? 1 : 0;
     #if 0
       if(tolerance < delta) {
-        printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
+        printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance);
       }
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
+}
+
+cudaError_t verifyPrepared1(int bytePerElt,
+    void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+  ) {
+  void const *fn = nullptr;
+  switch(bytePerElt) {
+  case 1: fn = (void const*)&verifyPrepared<uint8_t>; break;
+  case 2: fn = (void const*)&verifyPrepared<uint16_t>; break;
+  case 4: fn = (void const*)&verifyPrepared<uint32_t>; break;
+  case 8: fn = (void const*)&verifyPrepared<uint64_t>; break;
+  default: assert(0); return cudaErrorInvalidValue;
+  }
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 
 template<typename T, typename Uint, typename ReduceFn>
-__global__ void verifyInline2(
+__global__ void __launch_bounds__(512, 1) verifyInline2(
     T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
     intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
   ) {
@@ -1077,39 +1246,52 @@ __global__ void verifyInline2(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
 }
 
 template<typename T, typename Uint>
-void verifyInline1(
+cudaError_t verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
   ) {
+  void const *fn = nullptr;
+  ReduceNil opnil;
+  ReduceSum opsum;
+  ReduceMin opmin;
+  ReduceMax opmax;
+  ReduceProd opprod;
+  ReduceAvg opavg{rank_n};
+  ReducePreMulSum oppremulsum;
+  void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
   #define CASE_OP(op) \
-    if(rank_n == 1) \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    else \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    break;
+    if(rank_n == 1) { \
+      fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
+      args[2] = &opnil; \
+    } else { \
+      fn = (void const*)&verifyInline2<T, Uint, decltype(op)>; \
+      args[2] = &op; \
+    } break;
   switch(red_op) {
-  case ncclSum: CASE_OP(ReduceSum())
-  case ncclMin: CASE_OP(ReduceMin())
-  case ncclMax: CASE_OP(ReduceMax())
-  case ncclProd: CASE_OP(ReduceProd())
+  case ncclSum: CASE_OP(opsum)
+  case ncclMin: CASE_OP(opmin)
+  case ncclMax: CASE_OP(opmax)
+  case ncclProd: CASE_OP(opprod)
   #if HAVE_ncclAvg
-  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
+  case ncclAvg: CASE_OP(opavg)
   #endif
   #if HAVE_ncclPreMulSum
-  default: CASE_OP(ReducePreMulSum())
+  default: CASE_OP(oppremulsum)
   #endif
   }
   #undef CASE_OP
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }
 
-void ncclVerifiableVerify(
+cudaError_t ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
     int64_t *bad_elt_n, cudaStream_t stream
@@ -1118,11 +1300,21 @@ void ncclVerifiableVerify(
   #if HAVE_ncclBfloat16
     floating |= elt_ty == ncclBfloat16;
   #endif
-
+  #if HAVE_ncclFloat8
+    floating |= elt_ty == ncclFloat8e4m3;
+    floating |= elt_ty == ncclFloat8e5m2;
+  #endif
+  
   unsigned tolerance = 0;
   #if HAVE_ncclAvg
-  if (floating && red_op == ncclAvg)
-    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
+  if (floating && red_op == ncclAvg) {
+    // Average does it's pre-multiplies in an unspecified floating point format
+    // (could be the actual type T or float or half). That means the premultiply
+    // verify does could generate a discrepancy in the least mantissa digit. After
+    // adding those two (since avg only has two non-zero contributions) we could
+    // be off by a distance of 2 units.
+    tolerance = 2;
+  }
   #endif
 
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
@@ -1130,9 +1322,9 @@ void ncclVerifiableVerify(
   *bad_elt_n = 0;
   #define CASE_TY(T, Uint) { \
       if(expected != nullptr) { \
-        verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
+        return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \
       } else { \
-        verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
+        return verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
       } \
     } break;
   switch(elt_ty) {
@@ -1143,29 +1335,30 @@ void ncclVerifiableVerify(
   case ncclInt64: CASE_TY(int64_t, uint64_t)
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
   case ncclFloat16: CASE_TY(half, uint16_t)
+  #if HAVE_ncclFloat8
+  case ncclFloat8e4m3: CASE_TY(__nv_fp8_e4m3, uint8_t)
+  case ncclFloat8e5m2: CASE_TY(__nv_fp8_e5m2, uint8_t)
+  #endif
   #if HAVE_ncclBfloat16
   case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
-  default: assert(0);
+  default: assert(0); return cudaErrorInvalidValue;
   }
   #undef CASE_TY
 }
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if SELF_TEST
-#include <iostream>
-
+namespace {
 template<typename T, typename Op>
 __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
   //if(!std::is_same<T,half>::value) return;
   //if(!std::is_same<Op,ReduceProd>::value) return;
   //if(rank_n!=3) return;
 
-  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
+  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? 2 : 0;
   uint64_t seed = 0xc8e2bed69766d533;
 
   for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
@@ -1202,7 +1395,7 @@ __device__ void sweep1(int ty, char const *tyname) {
   }
 }
 
-__global__ void sweep() {
+__global__ void __launch_bounds__(512, 1) sweep() {
   sweep1<int8_t>(ncclInt8, "int8");
   sweep1<uint8_t>(ncclUint8, "uint8");
   sweep1<int32_t>(ncclInt32, "int32");
@@ -1210,18 +1403,18 @@ __global__ void sweep() {
   sweep1<int64_t>(ncclInt64, "int64");
   sweep1<uint64_t>(ncclUint64, "uint64");
   sweep1<half>(ncclFloat16, "half");
+  #if HAVE_ncclFloat8
+    sweep1<__nv_fp8_e4m3>(ncclBfloat16, "float8e4m3");
+    sweep1<__nv_fp8_e5m2>(ncclBfloat16, "float8e5m2");
+  #endif
   #if HAVE_ncclBfloat16
     sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
 }
-
-int main(int arg_n, char **args) {
-  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
-  sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
-  return 0;
 }
-#endif
+
+void ncclVerifiableLaunchSelfTest() {
+  sweep<<<1,512>>>();
+}
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index aca0565a6b..71d5ef6649 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -34,13 +34,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 }
 
 // Enqueue kernel to generate data which is to be reduced.
-void ncclVerifiablePrepareInput(
+cudaError_t ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
-void ncclVerifiablePrepareExpected(
+cudaError_t ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
@@ -51,9 +51,10 @@ void ncclVerifiablePrepareExpected(
 // which can be costly. Thus if you plan to run the same reduction multiple
 // times it is advantageous to precompute the expected values with
 // ncclVerifiablePrepareExpected and pass them as `expected` here.
-void ncclVerifiableVerify(
+cudaError_t ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
   int64_t *bad_elt_n, cudaStream_t stream
 );
+
 #endif

From 1021260ca94ea73dcedc8a15ffc6dbfb12504b65 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 21 Apr 2025 11:26:35 -0700
Subject: [PATCH 2/9] Make verifiable a DSO and add NAME_SUFFIX support

Build option DSO=1 generates libverifiable.so which can be
used to reduce the combined binary size.

Build option NAME_SUFFIX can be used to a add suffix to all
generated binaries. e.g. NAME_SUFFIX=_mpi

Added new make target: clean_intermediates
---
 README.md                | 32 +++++++++-----
 src/Makefile             | 93 +++++++++++-----------------------------
 src/common.mk            | 69 +++++++++++++++++++++++++++++
 verifiable/Makefile      | 17 +++++---
 verifiable/main.cu       | 14 ++++++
 verifiable/verifiable.h  |  4 ++
 verifiable/verifiable.mk | 15 +++++--
 7 files changed, 156 insertions(+), 88 deletions(-)
 create mode 100644 src/common.mk
 create mode 100644 verifiable/main.cu

diff --git a/README.md b/README.md
index 957f6afb90..bdafbe5a16 100644
--- a/README.md
+++ b/README.md
@@ -4,33 +4,43 @@ These tests check both the performance and the correctness of [NCCL](http://gith
 
 ## Build
 
-To build the tests, just type `make`.
+To build the tests, just type `make` or `make -j`
 
-If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME.
+If CUDA is not installed in `/usr/local/cuda`, you may specify `CUDA_HOME`. Similarly, if NCCL is not installed in `/usr`, you may specify `NCCL_HOME`.
 
 ```shell
 $ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
 ```
 
-NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
+NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed.
 
 ```shell
 $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
 ```
 
+You can also add a suffix to the name of the generated binaries with `NAME_SUFFIX`. For example when compiling with the MPI versions you could use:
+
+```shell
+$ make MPI=1 NAME_SUFFIX=_mpi MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl
+```
+
+This will generate test binaries with names such as `all_reduce_perf_mpi`.
+
 ## Usage
 
-NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
+NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to `(number of processes)*(number of threads)*(number of GPUs per thread)`.
 
 ### Quick examples
 
 Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
 Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
 (NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
+
 ```shell
 $ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
@@ -73,7 +83,7 @@ All tests support the same set of arguments :
 
 ### Running multiple operations in parallel
 
-NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel. 
+NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel.
 To split the GPUs, NCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
 with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
 
@@ -82,13 +92,15 @@ with the same color will end up in the same group. The resulting group is printe
 `NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
 
 Here are a few examples:
- - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
- - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
- - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
+
+- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+
+- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
 
 Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
-
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
diff --git a/src/Makefile b/src/Makefile
index 5737092a86..612395f645 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,73 +1,13 @@
 #
-# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
+include common.mk
 
-CUDA_HOME ?= /usr/local/cuda
-PREFIX ?= /usr/local
-VERBOSE ?= 0
-DEBUG ?= 0
-
-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC ?= $(CUDA_HOME)/bin/nvcc
-CUDARTLIB ?= cudart
-
-CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
-CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-
-# Better define NVCC_GENCODE in your environment to the minimal set
-# of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
-# Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_90,code=sm_90 \
-		-gencode=arch=compute_100,code=sm_100 \
-		-gencode=arch=compute_120,code=sm_120 \
-		-gencode=arch=compute_120,code=compute_120
-else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-		-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_90,code=sm_90 \
-		-gencode=arch=compute_90,code=compute_90
-else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-		-gencode=arch=compute_80,code=sm_80 \
-		-gencode=arch=compute_80,code=compute_80
-else
-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_70,code=compute_70
-endif
-
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
-CXXFLAGS   := -std=c++11
-
-LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
-NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
-
-ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3 -g
-CXXFLAGS  += -O3 -g
-else
-NVCUFLAGS += -O0 -G -g
-CXXFLAGS  += -O0 -g -ggdb3
-endif
-
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
-else
-.SILENT:
-endif
+MPI ?= 0        # Set to 1 to enable MPI support (multi-process/multi-node)
+NAME_SUFFIX ?=  # e.g. _mpi when using MPI=1
+DSO ?= 0        # Set to 1 to create and use libverifiable.so to reduce binary size
 
 .PHONY: build clean
 
@@ -92,7 +32,7 @@ DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
 BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
-BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
 
 build: ${BIN_FILES}
 
@@ -103,18 +43,35 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk
 
+.PRECIOUS: ${DST_DIR}/%.o
+
 ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 
+${DST_DIR}/%$(NAME_SUFFIX).o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+
 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
+ifeq ($(DSO), 1)
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS)
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable ${NVLDFLAGS} -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
+else
+${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+endif
+
+clean_intermediates:
+	rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
 
diff --git a/src/common.mk b/src/common.mk
new file mode 100644
index 0000000000..2bc7e358a0
--- /dev/null
+++ b/src/common.mk
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC ?= $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
+else
+NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=compute_70
+endif
+
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+CXXFLAGS   := -std=c++11
+
+LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
diff --git a/verifiable/Makefile b/verifiable/Makefile
index b141a2a7c5..bb90001e1e 100644
--- a/verifiable/Makefile
+++ b/verifiable/Makefile
@@ -1,13 +1,18 @@
-include ../../makefiles/common.mk
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+include ../src/common.mk
 
 .PHONY: all clean
 
-BUILDDIR := $(abspath ../../build)
+BUILDDIR := $(abspath ../build)
 NCCLDIR := $(BUILDDIR)
 NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
-DST_DIR := $(BUILDDIR)/test/verifiable
+DST_DIR := $(BUILDDIR)/verifiable
 
-all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+all: $(DST_DIR)/self_test
 
 clean:
 	rm -rf $(DST_DIR)
@@ -18,7 +23,7 @@ include verifiable.mk
 
 self_test: $(DST_DIR)/self_test
 
-$(DST_DIR)/self_test: verifiable.cu verifiable.h
+$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+	$(NVCC) -o $@ $(NVCUFLAGS) $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(NVLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
diff --git a/verifiable/main.cu b/verifiable/main.cu
new file mode 100644
index 0000000000..4e4aef6713
--- /dev/null
+++ b/verifiable/main.cu
@@ -0,0 +1,14 @@
+
+#include <cuda_runtime.h>
+#include <iostream>
+
+#define NCCL_VERIFIABLE_SELF_TEST 1
+#include "verifiable.h"
+
+int main(int arg_n, char **args) {
+  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
+  cudaSetDevice(0);
+  ncclVerifiableLaunchSelfTest();
+  cudaDeviceSynchronize();
+  return 0;
+}
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index 71d5ef6649..62f65c8bed 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -57,4 +57,8 @@ cudaError_t ncclVerifiableVerify(
   int64_t *bad_elt_n, cudaStream_t stream
 );
 
+#ifdef NCCL_VERIFIABLE_SELF_TEST
+void ncclVerifiableLaunchSelfTest();
+#endif
+
 #endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index 225c32a3c3..2bff174050 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -1,11 +1,18 @@
-# We requires both of the following paths to be set upon including this makefile
+# We require both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
-# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
+# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>
 
 TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_LIBS = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	$(NVCC) -Xcompiler "-fPIC" -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+
+$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
+	@printf "Creating DSO %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
+	ln -sf $(notdir $@).0 $@

From e041d901e6d3dabb67a22905cba77d9ba2689898 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 7 May 2025 10:30:59 -0700
Subject: [PATCH 3/9] Re-add sm_70 support for CUDA 12.8+ and 13.0 builds

---
 src/common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 2bc7e358a0..5fd9418860 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -21,7 +21,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
 		-gencode=arch=compute_120,code=sm_120 \

From a5c539e68bb7263304997012498b0cd0667b99e8 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 19 May 2025 18:20:22 -0700
Subject: [PATCH 4/9] Add support for Symmetric Memory Registration

From NCCL 2.27.x we can now use the Symmetric Memory APIs (-R 2)
---
 README.md     |  2 +-
 src/common.cu | 58 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index bdafbe5a16..22687d4771 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ All tests support the same set of arguments :
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
   * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
-  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ### Running multiple operations in parallel
diff --git a/src/common.cu b/src/common.cu
index f83cdf009a..3987d89081 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -90,6 +90,8 @@ static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+#define LOCAL_REGISTER 1
+#define SYMMETRIC_REGISTER 2
 static int local_register = 0;
 #endif
 static int minCudaArch = 1<<30;
@@ -660,8 +662,16 @@ testResult_t threadInit(struct threadArgs* args) {
   void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
   void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
   for (int i=0; i<args->nGpus; i++) {
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+    }
   }
 #endif
 
@@ -669,8 +679,16 @@ testResult_t threadInit(struct threadArgs* args) {
 
   for (int i=0; i<args->nGpus; i++) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+    }
 #endif
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
@@ -859,8 +877,10 @@ int main(int argc, char* argv[]) {
         break;
       case 'R':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-        if ((int)strtol(optarg, NULL, 0)) {
-          local_register = 1;
+        local_register = (int)strtol(optarg, NULL, 0);
+        if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
+          printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
+          local_register = LOCAL_REGISTER;
         }
 #else
         printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
@@ -897,7 +917,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
-            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
+            "[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -1107,8 +1127,16 @@ testResult_t run() {
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+       if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+         NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+         NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+       } else
+#endif
+       {
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
+       }
      }
 #endif
   }
@@ -1188,8 +1216,16 @@ testResult_t run() {
   if (!parallel_init) {
     for(int i=0; i<nGpus*nThreads; ++i) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+      if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
+      } else
+#endif
+      {
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+      }
 #endif
       NCCLCHECK(ncclCommDestroy(comms[i]));
     }

From 0c60e6a8e42a95f50e20328b119ded5add2392f3 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 30 May 2025 17:43:30 -0700
Subject: [PATCH 5/9] Fix formatting errors in README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 22687d4771..ce952ef89d 100644
--- a/README.md
+++ b/README.md
@@ -77,8 +77,8 @@ All tests support the same set of arguments :
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
-  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
-  * `-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
+  * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
   * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ### Running multiple operations in parallel

From 8bc16f4e012ed27ba23b13b8d76e0509becaa389 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Fri, 30 May 2025 18:04:25 -0700
Subject: [PATCH 6/9] Need to drop Volta (sm_70) support from CUDA 13.0

---
 src/common.mk | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 5fd9418860..d6e5c18f39 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -19,7 +19,14 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Add Blackwell but drop Volta support if we're using CUDA13.0 or above
+NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
 NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
 		-gencode=arch=compute_80,code=sm_80 \

From 5290298ab669aa5212687d7b71e2b47888c2a8f3 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Mon, 2 Jun 2025 09:29:52 -0700
Subject: [PATCH 7/9] Reinstate Pascal suppport for CUDA 12.8+ builds

---
 src/common.mk | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/common.mk b/src/common.mk
index d6e5c18f39..3c4a483b2b 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -20,7 +20,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
-# Add Blackwell but drop Volta support if we're using CUDA13.0 or above
+# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
 NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
@@ -28,7 +28,9 @@ NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_120,code=compute_120
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
 # Include Blackwell support if we're using CUDA12.8 or above
-NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+		-gencode=arch=compute_61,code=sm_61 \
+		-gencode=arch=compute_70,code=sm_70 \
 		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \

From e7c8825b0b5344c2b3d3317986b7b9ef257df928 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 3 Jun 2025 10:36:53 -0700
Subject: [PATCH 8/9] Wrap ncclCommWindowRegister() calls within ncclGroup

---
 src/common.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/common.cu b/src/common.cu
index 3987d89081..69b892ac2b 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -659,6 +659,7 @@ testResult_t threadInit(struct threadArgs* args) {
   }
   NCCLCHECK(ncclGroupEnd());
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  NCCLCHECK(ncclGroupStart());
   void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
   void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
   for (int i=0; i<args->nGpus; i++) {
@@ -673,6 +674,7 @@ testResult_t threadInit(struct threadArgs* args) {
       if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
     }
   }
+  NCCLCHECK(ncclGroupEnd());
 #endif
 
   TESTCHECK(threadRunTests(args));
@@ -1124,6 +1126,7 @@ testResult_t run() {
        NCCLCHECK(ncclGroupEnd());
      }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+     NCCLCHECK(ncclGroupStart());
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
@@ -1138,6 +1141,7 @@ testResult_t run() {
          if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
        }
      }
+     NCCLCHECK(ncclGroupEnd());
 #endif
   }
 

From 97ee0985165722717d340f1adb7c162bc8a7491d Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Wed, 4 Jun 2025 17:54:58 -0700
Subject: [PATCH 9/9] Add Turing (SM75) support to CUDA 13.0 builds

---
 src/common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common.mk b/src/common.mk
index 3c4a483b2b..e1747b5232 100644
--- a/src/common.mk
+++ b/src/common.mk
@@ -21,7 +21,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 # of archs to reduce compile time.
 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
 # Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
-NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
+NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
+		-gencode=arch=compute_80,code=sm_80 \
 		-gencode=arch=compute_90,code=sm_90 \
 		-gencode=arch=compute_100,code=sm_100 \
 		-gencode=arch=compute_120,code=sm_120 \