Switched to using the hip_fp8 header instead of rccl_float8, resolving compatibility issues.(#109)

* addressing hip_fp8 support compatibility issue * skipping mulsum and avg test for fp8, using hip_fp8 for product * syncing with nccl-tests removing the fp8 filter for pre-hopper gpus and resolving the merge conflict --------- Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
2025-05-14 15:30:07 -05:00
@@ -65,7 +65,6 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;
-
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
@@ -89,8 +88,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+  if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
    }
@@ -31,6 +31,13 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
 int32_t gpu_block3;
 size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB

+// RCCL_FLOAT8 support
+bool rccl_float8_useFnuz = false;
+bool IsArchMatch(char const* arch, char const* target) {
+  // helper function to reduce clutter in code elsewhere.  Returns true on match.
+  return (strncmp(arch, target, strlen(target)) == 0);
+}
+
 #if NCCL_MAJOR >= 2
  ncclDataType_t test_types[ncclNumTypes] = {
    ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
@@ -38,7 +45,7 @@ size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
    , ncclBfloat16
  #endif
  #if RCCL_FLOAT8 == 1
-    , ncclFp8E4M3, ncclFp8E5M2
+    , ncclFloat8e4m3, ncclFloat8e5m2
  #endif
  };
  const char *test_typenames[ncclNumTypes] = {
@@ -196,6 +203,7 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
 }

 bool Reporter::isMainThread() { return is_main_thread == 1; }
+static int minCudaArch = 1<<30;

 #define NUM_BLOCKS 32

@@ -304,18 +312,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }

 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault));
  CUDACHECK(cudaDeviceSynchronize());
  return testSuccess;
 }

 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault));
  return testSuccess;
 }

 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
  return testSuccess;
 }

@@ -563,8 +571,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
      #endif
      #if defined(RCCL_FLOAT8)
-      case ncclFp8E4M3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
-      case ncclFp8E5M2: fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
+      case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
+      case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
      #endif
      case ncclNumTypes: break;
      }
@@ -1330,6 +1338,13 @@ testResult_t run() {
  char hostname[1024];
  getHostName(hostname, 1024);

+  hipDeviceProp_t devProp;
+  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
+  if (IsArchMatch(devProp.gcnArchName, "gfx942")) {
+    PRINT("On gfx942 architecture, using FNUZ FP8 types");
+    rccl_float8_useFnuz = true;
+  }
+
 #ifdef MPI_SUPPORT
  MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
  MPI_Comm_rank(MPI_COMM_WORLD, &proc);
@@ -1456,12 +1471,21 @@ testResult_t run() {
    gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
    CUDACHECK(cudaSetDevice(gpus[i]));
    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
-    if (streamnull)
+    if (streamnull) {
      streams[i] = NULL;
-    else
+    }
+    else {
      CUDACHECK(cudaStreamCreateWithFlags(streams.data()+i, cudaStreamNonBlocking));
+    }
+    int archMajor, archMinor;
+    CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i]));
+    CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i]));
+    minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor);
  }

+#ifdef MPI_SUPPORT
+  MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
  //if parallel init is not selected, use main thread to initialize NCCL
  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
@@ -258,8 +258,8 @@ static size_t wordSize(ncclDataType_t type) {
    //case ncclInt8:
    case ncclUint8:
 #if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
-    case ncclFp8E4M3:
-    case ncclFp8E5M2:
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
 #endif
 #endif
      return 1;
@@ -24,8 +24,9 @@
 #define ROCBLAS_FLOAT8_H

 #include <stdint.h>
+#include <hip/hip_version.h>

-#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+#if __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
 /*! \brief Struct to represent a 8 bit floating-point number. */

 typedef struct
@@ -38,7 +39,60 @@ typedef struct
    uint8_t data;
 } rccl_bfloat8;

-#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+// __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
+#elif HIP_VERSION >= 60200000
+
+#include <hip/hip_fp8.h>
+
+#if   __HIP_DEVICE_COMPILE__ && (defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) ||  (defined(__gfx1100__) || defined(__gfx1101__)))//HIP_FP8_TYPE_OCP is enabled.
+typedef __hip_fp8_e4m3 rccl_float8;
+typedef __hip_fp8_e5m2 rccl_bfloat8;
+#elif __HIP_DEVICE_COMPILE__ && (defined(__gfx942__))
+typedef __hip_fp8_e4m3_fnuz rccl_float8;
+typedef __hip_fp8_e5m2_fnuz rccl_bfloat8;
+#else
+typedef __hip_fp8_e4m3 rccl_float8;
+typedef __hip_fp8_e5m2 rccl_bfloat8;
+#endif
+
+#if   __HIP_DEVICE_COMPILE__
+inline std::ostream& operator<<(std::ostream& os, const rccl_float8& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat8& bf8)
+{
+    return os << float(bf8);
+}
+
+#else
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2& bf8)
+{
+    return os << float(bf8);
+}
+
+//adding support for those operators on the host side
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3_fnuz& f8)
+{
+    return os << float(f8);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2_fnuz& bf8)
+{
+    return os << float(bf8);
+}
+#endif
+
+extern bool rccl_float8_useFnuz;
+// For older versions of ROCm that do not include hip_fp8.h,
+// we provide a local version of the header file as a fallback.
+#else

 #define HIP_HOST_DEVICE __host__ __device__
 #define HIP_HOST __host__
@@ -344,7 +398,7 @@ struct rccl_float8
    // default constructor
    HIP_HOST_DEVICE rccl_float8() = default;

-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
    // device specific optimized F8 down-conversion code

    template <bool stochastic_rounding = false>
@@ -381,10 +435,10 @@ struct rccl_float8
        return i8data;
    }

-#endif // __gfx940__
+#endif // __gfx942__

    // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)

    // NOTE: ON-DEVICE... always optimal bias
    explicit HIP_DEVICE rccl_float8(float                        v,
@@ -402,7 +456,7 @@ struct rccl_float8
    // Host only implementation using s/w simulation
    explicit HIP_HOST
 #else
-    // both Host and DEVICE for non-gfx940 using s/w simulation
+    // both Host and DEVICE for non-gfx942 using s/w simulation
    explicit HIP_HOST_DEVICE
 #endif
        rccl_float8(float                        v,
@@ -446,7 +500,7 @@ struct rccl_float8
    }

    // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
    // upcast using device specific intrinsic
    explicit inline HIP_DEVICE operator float() const
    {
@@ -460,7 +514,7 @@ struct rccl_float8
    }

    explicit inline HIP_HOST operator float() const
-#else // non gfx940
+#else // non gfx942
    explicit inline HIP_HOST_DEVICE operator float() const
 #endif
    {
@@ -511,7 +565,7 @@ struct rccl_bfloat8
    // default constructor
    HIP_HOST_DEVICE rccl_bfloat8() = default;

-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
    // device specific optimized F8 down-conversion code

    template <bool stochastic_rounding = false>
@@ -548,10 +602,10 @@ struct rccl_bfloat8
        return i8data;
    }

-#endif // __gfx940__
+#endif // __gfx942__

    // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)

    // NOTE: ON-DEVICE... always optimal bias
    explicit HIP_DEVICE rccl_bfloat8(float                        v,
@@ -569,7 +623,7 @@ struct rccl_bfloat8
    // Host only implementation using s/w simulation
    explicit HIP_HOST
 #else
-    // both Host and DEVICE for non-gfx940 using s/w simulation
+    // both Host and DEVICE for non-gfx942 using s/w simulation
    explicit HIP_HOST_DEVICE
 #endif
        rccl_bfloat8(float                        v,
@@ -613,7 +667,7 @@ struct rccl_bfloat8
    }

    // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
    // upcast using device specific intrinsic
    explicit inline HIP_DEVICE operator float() const
    {
@@ -627,7 +681,7 @@ struct rccl_bfloat8
    }

    explicit inline HIP_HOST operator float() const
-#else // non gfx940
+#else // non gfx942
    explicit inline HIP_HOST_DEVICE operator float() const
 #endif
    {
@@ -969,7 +1023,7 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng = 0)
    return a;
 }

-// Use h/w intrinsic and optimized version when __gfx940__
+// Use h/w intrinsic and optimized version when __gfx942__
 template <
    typename T,
    typename Ta,
@@ -980,7 +1034,7 @@ template <
    = 0>
 inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx942__) || defined(__gfx950__)
    // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
    T val;
    if(std::is_same<T, rccl_float8>::value)
@@ -988,12 +1042,12 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
    else
        val.data = rccl_bfloat8::cast_to_bf8_from_f32<stochastic_rounding>(float(a), rng);
    return val;
-#else // non gfx940
+#else // non gfx942
    return T(float(a),
             stochastic_rounding ? T::rocblas_hip_f8_rounding_mode::stochastic
                                 : T::rocblas_hip_f8_rounding_mode::standard,
             rng);
-#endif // __gfx940__
+#endif // __gfx942__
 }

 // NOTE NOTE: The above code is good if we don't consider HIP-GEMM code and only consider the quantization
@@ -1016,6 +1070,6 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)

 // =================================================================================================

-#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+#endif

 #endif // ROCBLAS_FLOAT8_H
@@ -96,8 +96,8 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
      for (int k=begin_root; k<=end_root; k++) {
        TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
@@ -91,8 +91,8 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
 #if defined(RCCL_FLOAT8)
-      if((run_types[i] == ncclFp8E4M3 || run_types[i] == ncclFp8E5M2) && run_ops[j] == ncclProd)
-        continue;
+if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+    continue;
 #endif
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
    }
@@ -22,12 +22,16 @@
 #endif

 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_FLOAT8 == 1
-  #define HAVE_ncclfp8 1
-  // Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
-  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
-    #define ncclFp8E4M3 ncclFloat8e4m3
-    #define ncclFp8E5M2 ncclFloat8e5m2
-  #endif
+#if __HIP_DEVICE_COMPILE__
+  #define HAVE_ncclfp8_DEVICE 1
+#else
+  #define HAVE_ncclfp8_HOST 1
+#endif
+// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3)
+  #define ncclFp8E4M3 ncclFloat8e4m3
+  #define ncclFp8E5M2 ncclFloat8e5m2
+#endif
 #else
  #define HAVE_ncclfp8 0
 #endif
@@ -130,23 +134,39 @@ __host__ __device__ T inhibit(T x) {
 ////////////////////////////////////////////////////////////////////////////////

 namespace {
-  template<typename Y, typename X>
-  __host__ __device__ Y castTo(X x) {
+  template<typename Y>
+  __host__ __device__ Y castTo(uint64_t x) {
    return Y(x);
  }
  template<typename Y>
  __host__ __device__ Y castTo(float x) {
    return Y(x);
  }
+  template<typename Y>
+  __host__ __device__ Y castTo(double x) {
+    return Y(x);
+  }
  template<>
  __host__ __device__ __half castTo<__half>(float x) {
    return __float2half(x);
  }
+  template<>
+  __host__ __device__ half castTo<__half>(uint64_t x) {
+    return __ull2half_rn(x);
+  }
  #if RCCL_BFLOAT16 == 1
  template<>
  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
    return hip_bfloat16(x);
  }
+  template<>
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(double x) {
+    return hip_bfloat16(x);
+  }
+  template<>
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(uint64_t x) {
+    return hip_bfloat16((double)x);
+  }
  #endif
  #if RCCL_FLOAT8 == 1
  template<>
@@ -157,6 +177,22 @@ namespace {
  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(float x) {
    return static_cast<rccl_bfloat8>(x);
  }
+  template<>
+  __host__ __device__ rccl_float8 castTo<rccl_float8>(double x) {
+    return static_cast<rccl_float8>(x);
+  }
+  template<>
+  __host__ __device__ rccl_float8 castTo<rccl_float8>(uint64_t x) {
+    return static_cast<rccl_float8>((double)x);
+  }
+  template<>
+  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(double x) {
+    return static_cast<rccl_bfloat8>(x);
+  }
+  template<>
+  __host__ __device__ rccl_bfloat8 castTo<rccl_bfloat8>(uint64_t x) {
+    return static_cast<rccl_bfloat8>((double)x);
+  }
  #endif
 }

@@ -211,16 +247,16 @@ struct ReduceProd {
  #endif
  #if RCCL_FLOAT8 == 1
  __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const {
-      return static_cast<rccl_float8>(a * b);
+      return static_cast<rccl_float8>(float(a) * float(b));
  }
  __host__ __device__ rccl_float8 operator()(rccl_float8 a, float b) const {
-      return static_cast<rccl_float8>(a * b);
+      return static_cast<rccl_float8>(float(a) * float(b));
  }
  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const {
-      return static_cast<rccl_bfloat8>(a * b);
+      return static_cast<rccl_bfloat8>(float(a) * float(b));
  }
  __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, float b) const {
-      return static_cast<rccl_bfloat8>(a * b);
+      return static_cast<rccl_bfloat8>(float(a) * float(b));
  }
  #endif
  template<typename T>
@@ -328,40 +364,72 @@ struct ReduceAvg {

 namespace {
 template<typename T>
-struct FloatLayout;
+struct FloatLayout { static constexpr bool is_floating_point = false; };
 template<>
 struct FloatLayout<float> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 8, mantissa_bits = 23;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<double> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 11, mantissa_bits = 52;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<__half> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 5, mantissa_bits = 10;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #if RCCL_BFLOAT16 == 1
 template<>
 struct FloatLayout<hip_bfloat16> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 8, mantissa_bits = 7;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 #endif
 #if RCCL_FLOAT8 == 1
+#if __HIP_DEVICE_COMPILE__
 template<>
 struct FloatLayout<rccl_float8> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 4, mantissa_bits = 3;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
 struct FloatLayout<rccl_bfloat8> {
+  static constexpr bool is_floating_point = true;
  static constexpr int exponent_bits = 5, mantissa_bits = 2;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
+#else
+template<>
+struct FloatLayout<__hip_fp8_e4m3> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+template<>
+struct FloatLayout<__hip_fp8_e5m2> {
+  static constexpr bool is_floating_point = true;
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
+};
+
+template<>
+struct FloatLayout<__hip_fp8_e4m3_fnuz> {
+  static constexpr int exponent_bits = 4, mantissa_bits = 3;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1));
+};
+template<>
+struct FloatLayout<__hip_fp8_e5m2_fnuz> {
+  static constexpr int exponent_bits = 5, mantissa_bits = 2;
+  static constexpr int exponent_bias = (1<<(exponent_bits-1));
+};
+#endif
 #endif

 template<typename T>
@@ -675,11 +743,12 @@ __host__ __device__ void genOutput(
 ////////////////////////////////////////////////////////////////////////////////
 // Nil reduction (byte copy functions). Optimized to assume rank_n=1

+// genInput specialization for integer ReduceNil.
 namespace {
-template<typename T, bool IsIntegral>
+template<typename T>
 __host__ __device__ void genInput(
    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
-    std::integral_constant<bool, IsIntegral>
+    std::true_type /*integral*/
  ) {
  (void)rank_n, (void)rank_me; // silence unused warnings
  union { uint64_t bits; T tmp; };
@@ -689,6 +758,24 @@ __host__ __device__ void genInput(
  ans = tmp;
 }

+// genInput specialization for floating point ReduceNil.
+template<typename T>
+__host__ __device__ void genInput(
+    T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+    std::false_type /*integral*/
+  ) {
+  (void)rank_n; // silence unused warnings
+  constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
+  uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
+  int sign = rng & 1;
+  rng ^= rng>>1;
+  int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
+  exp += 1<<(FloatLayout<T>::exponent_bits-2);
+  rng ^= rng >> FloatLayout<T>::exponent_bits;
+  uint64_t mant = rng & mant_mask;
+  ans = makeFloat<T>(sign, exp, mant);
+}
+
 template<typename T, typename ReduceFn, bool IsIntegral>
 __host__ __device__ void genOutput(
    T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
@@ -777,22 +864,35 @@ __host__ __device__ void genOutput(
 namespace {
 template<typename T>
 __host__ __device__ void genInput(
-    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
+  T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
    std::false_type /*integral*/
  ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
+  // We can't control the nranks divisor in avareages so to control error we
+  // limit to two ranks contributing non-zero values. This way there is no ambiguity
+  // of summation.
+  int r = shuffleRank(rank_n, rank_me, rng);
+  uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  ans = r < 2 ? castTo<T>(1+m) : castTo<T>((uint64_t)0);
 }

 template<typename T>
 __host__ __device__ void genOutput(
-    T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index,
    std::false_type /*integral*/
  ) {
-  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
-  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  //ans = ReduceProd()(ans, T1(1)/T1(rank_n));
-  ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
- }
+  shuffleRank(rank_n, -1, rng);
+  uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<<FloatLayout<T>::mantissa_bits)-1);
+  if (rank_n == 1) {
+    ans = castTo<T>(1+m0);
+  } else {
+    // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__.
+    // We account for this by using a tolerance of 2 ulps during the verification.
+    using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type;
+    ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)),
+                      (T)(TMul(1+m1)*TMul(1.0/rank_n)));
+  }
+}
 }

 /////////////////////////////////////////////////////////////////////////////////
@@ -856,7 +956,7 @@ __host__ __device__ T genOutput(
 #if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareInput2(
+__global__ void __launch_bounds__(512, 1) prepareInput2(
    T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
    uint64_t seed, intptr_t elt_ix0
  ) {
@@ -877,44 +977,55 @@ __global__ void prepareInput2(
 }

 template<typename ReduceOp>
-void prepareInput1(
+cudaError_t prepareInput1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
+  void const *fn = nullptr;
  switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(__half)
+  case ncclInt8: fn = (void const*)&prepareInput2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareInput2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareInput2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareInput2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareInput2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareInput2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareInput2<__half, ReduceOp>; break;
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareInput2<hip_bfloat16, ReduceOp>; break;
  #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: fn = (void const*)&prepareInput2<rccl_float8, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareInput2<rccl_bfloat8, ReduceOp>; break;
  #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e4m3_fnuz, ReduceOp>; break;}
+  else { fn = (void const*)&prepareInput2<__hip_fp8_e4m3, ReduceOp>; break;}
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e5m2_fnuz, ReduceOp>; break;}
+  else { fn = (void const*)&prepareInput2<__hip_fp8_e5m2, ReduceOp>; break;}
+  #endif
+  case ncclFloat32: fn = (void const*)&prepareInput2<float, ReduceOp>; break;
+  case ncclFloat64: fn = (void const*)&prepareInput2<double, ReduceOp>; break;
+  default: assert(0); return cudaErrorInvalidValue;
  }
  #undef CASE_TY
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }

-void ncclVerifiablePrepareInput(
+hipError_t ncclVerifiablePrepareInput(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
-      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
    else \
-      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
+      return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
    break;
  switch(red_op) {
  case ncclSum: CASE_OP(ReduceSum())
@@ -937,7 +1048,7 @@ void ncclVerifiablePrepareInput(
 #if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
-__global__ void prepareExpected2(
+__global__ void __launch_bounds__(512, 1) prepareExpected2(
    T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
    uint64_t seed, intptr_t elt_ix0
  ) {
@@ -957,44 +1068,55 @@ __global__ void prepareExpected2(
 }

 template<typename ReduceOp>
-void prepareExpected1(
+cudaError_t prepareExpected1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
-  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
-  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
+  void const *fn = nullptr;
  switch(elt_ty) {
-  case ncclInt8: CASE_TY(int8_t)
-  case ncclUint8: CASE_TY(uint8_t)
-  case ncclInt32: CASE_TY(int32_t)
-  case ncclUint32: CASE_TY(uint32_t)
-  case ncclInt64: CASE_TY(int64_t)
-  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(__half)
+  case ncclInt8: fn = (void const*)&prepareExpected2<int8_t, ReduceOp>; break;
+  case ncclUint8: fn = (void const*)&prepareExpected2<uint8_t, ReduceOp>; break;
+  case ncclInt32: fn = (void const*)&prepareExpected2<int32_t, ReduceOp>; break;
+  case ncclUint32: fn = (void const*)&prepareExpected2<uint32_t, ReduceOp>; break;
+  case ncclInt64: fn = (void const*)&prepareExpected2<int64_t, ReduceOp>; break;
+  case ncclUint64: fn = (void const*)&prepareExpected2<uint64_t, ReduceOp>; break;
+  case ncclFloat16: fn = (void const*)&prepareExpected2<__half, ReduceOp>; break;
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: fn = (void const*)&prepareExpected2<hip_bfloat16, ReduceOp>; break;
  #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<rccl_float8, ReduceOp>; break;
+  case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<rccl_bfloat8, ReduceOp>; break;
  #endif
-  case ncclFloat32: CASE_TY(float)
-  case ncclFloat64: CASE_TY(double)
-  default: assert(0);
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3_fnuz, ReduceOp>; break; }
+  else { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3, ReduceOp>; break; }
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2_fnuz, ReduceOp>; break; }
+  else { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2, ReduceOp>; break; }
+  #endif
+  case ncclFloat32: { fn = (void const*)&prepareExpected2<float, ReduceOp>; break; }
+  case ncclFloat64: { fn = (void const*)&prepareExpected2<double, ReduceOp>; break; }
+  default: assert(0); return cudaErrorInvalidValue;
  }
-  #undef CASE_TY
+
+  dim3 grid = {1, 1, 1};
+  grid.x = (unsigned int)std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  dim3 block = {512, 1, 1};
+  void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }

-void ncclVerifiablePrepareExpected(
+hipError_t ncclVerifiablePrepareExpected(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
-      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
    else \
-      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
+      return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
    break;
  switch(red_op) {
  case ncclSum: CASE_OP(ReduceSum())
@@ -1015,54 +1137,6 @@ void ncclVerifiablePrepareExpected(
 ////////////////////////////////////////////////////////////////////////////////

 namespace {
-/* How we compare floating point values when exactness is impossible is interesting.
- * First, we take note that simply reinterpreting integer bits as floating point
- * gives us a monotonic mapping which exponentially spaces out floats. Thus
- * consecutive integers encode consecutive floats. In general, using integer
- * subraction on the bitpatterns of two floats gives us an integer which is the
- * logarithm of their relative difference. But, if the floats always have similar
- * exponents, than the integer difference is actually proportional to the
- * relative error (this is because we are counting hops in the mantissa bits only,
- * not the exponent bits). So a cheap way to compare if two floats are relatively
- * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula
- * calculates such a tolerance for a summation of n floats. This formula
- * was derived by inspecting the maximum observed integer difference over many
- * random runs of summation. The parameter values were computed by the
- * companion program "inexact_regress.cu".
- */
-__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
-  float power, coef;
-  switch(elt_ty) {
-  case ncclFloat32:
-  case ncclFloat64:
-    power = .51f;
-    coef = 1.25f;
-    break;
-  case ncclFloat16:
-    power = .91f;
-    coef = .75f;
-    break;
-  #if HAVE_ncclBfloat16
-  case ncclBfloat16:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3:
-  case ncclFp8E5M2:
-    power = .91f;
-    coef = .66f;
-    break;
-  #endif
-  }
-  #if __CUDA_ARCH__
-    return 1 + unsigned(coef*powf(float(rank_n), power));
-  #else
-    return 1 + unsigned(coef*std::pow(float(rank_n), power));
-  #endif
-}
-
 template<typename T>
 __host__ __device__  uint64_t calcDelta(T a, T b) {
  union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
@@ -1082,7 +1156,7 @@ __host__ __device__  uint64_t calcDelta(T a, T b) {
 #if !SELF_TEST
 namespace {
 template<typename T>
-__global__ void verifyPrepared(
+__global__ void __launch_bounds__(512, 1) verifyPrepared(
    T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
  ) {
  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
@@ -1098,17 +1172,35 @@ __global__ void verifyPrepared(
    bad += tolerance < delta ? 1 : 0;
    #if 0
      if(tolerance < delta) {
-        printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
+        printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance);
      }
    #endif
    i += blockDim.x;
  }
-  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory");
  atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
 }

+hipError_t verifyPrepared1(int bytePerElt,
+  void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+) {
+  void const *fn = nullptr;
+  switch(bytePerElt) {
+  case 1: fn = (void const*)&verifyPrepared<uint8_t>; break;
+  case 2: fn = (void const*)&verifyPrepared<uint16_t>; break;
+  case 4: fn = (void const*)&verifyPrepared<uint32_t>; break;
+  case 8: fn = (void const*)&verifyPrepared<uint64_t>; break;
+  default: assert(0); return cudaErrorInvalidValue;
+  }
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
+}
+
 template<typename T, typename Uint, typename ReduceFn>
-__global__ void verifyInline2(
+__global__ void __launch_bounds__(512, 1) verifyInline2(
    T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
    intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
  ) {
@@ -1142,35 +1234,49 @@ __global__ void verifyInline2(
 }

 template<typename T, typename Uint>
-void verifyInline1(
+hipError_t verifyInline1(
    T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
  ) {
+  void const *fn = nullptr;
+  ReduceNil opnil;
+  ReduceSum opsum;
+  ReduceMin opmin;
+  ReduceMax opmax;
+  ReduceProd opprod;
+  ReduceAvg opavg{rank_n};
+  ReducePreMulSum oppremulsum;
+  void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
+
  #define CASE_OP(op) \
-    if(rank_n == 1) \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    else \
-    verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
-      ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
-    break;
+    if(rank_n == 1) { \
+      fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
+      args[2] = &opnil; \
+    } else { \
+      fn = (void const*)&verifyInline2<T, Uint, decltype(op)>; \
+      args[2] = &op; \
+    } break;
  switch(red_op) {
-  case ncclSum: CASE_OP(ReduceSum())
-  case ncclMin: CASE_OP(ReduceMin())
-  case ncclMax: CASE_OP(ReduceMax())
-  case ncclProd: CASE_OP(ReduceProd())
+  case ncclSum: CASE_OP(opsum)
+  case ncclMin: CASE_OP(opmin)
+  case ncclMax: CASE_OP(opmax)
+  case ncclProd: CASE_OP(opprod)
  #if HAVE_ncclAvg
-  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
+  case ncclAvg: CASE_OP(opavg)
  #endif
  #if HAVE_ncclPreMulSum
-  default: CASE_OP(ReducePreMulSum())
+  default: CASE_OP(oppremulsum)
  #endif
  }
  #undef CASE_OP
+  dim3 grid = {(unsigned int)block_n, 1, 1};
+  dim3 block = {512, 1, 1};
+  if (grid.x == 0) return cudaSuccess;
+  return cudaLaunchKernel(fn, grid, block, args, 0, stream);
 }
 }

-void ncclVerifiableVerify(
+hipError_t ncclVerifiableVerify(
    void const *results, void const *expected, intptr_t elt_n, int elt_ty,
    int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
    int64_t *bad_elt_n, cudaStream_t stream
@@ -1179,15 +1285,21 @@ void ncclVerifiableVerify(
  #if HAVE_ncclBfloat16
    floating |= elt_ty == ncclBfloat16;
  #endif
-  #if HAVE_ncclfp8
-    floating |= elt_ty == ncclFp8E4M3;
-    floating |= elt_ty == ncclFp8E5M2;
+  #if HAVE_ncclfp8_DEVICE || HAVE_ncclfp8_HOST
+    floating |= elt_ty == ncclFloat8e4m3;
+    floating |= elt_ty == ncclFloat8e5m2;
  #endif

  unsigned tolerance = 0;
  #if HAVE_ncclAvg
-  if (floating && red_op == ncclAvg)
-    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
+  if (floating && red_op == ncclAvg) {
+    // Average does it's pre-multiplies in an unspecified floating point format
+    // (could be the actual type T or float or half). That means the premultiply
+    // verify does could generate a discrepancy in the least mantissa digit. After
+    // adding those two (since avg only has two non-zero contributions) we could
+    // be off by a distance of 2 units.
+    tolerance = 2;
+  }
  #endif

  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
@@ -1195,9 +1307,9 @@ void ncclVerifiableVerify(
  *bad_elt_n = 0;
  #define CASE_TY(T, Uint) { \
      if(expected != nullptr) { \
-        verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
+        return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \
      } else { \
-        verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
+        return verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
      } \
    } break;
  switch(elt_ty) {
@@ -1211,13 +1323,19 @@ void ncclVerifiableVerify(
  #if HAVE_ncclBfloat16
  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
  #endif
-  #if HAVE_ncclfp8
-  case ncclFp8E4M3: CASE_TY(rccl_float8, uint8_t)
-  case ncclFp8E5M2: CASE_TY(rccl_bfloat8, uint8_t)
+  #if HAVE_ncclfp8_DEVICE
+  case ncclFloat8e4m3: CASE_TY(rccl_float8, uint8_t)
+  case ncclFloat8e5m2: CASE_TY(rccl_bfloat8, uint8_t)
+  #endif
+  #if HAVE_ncclfp8_HOST
+  case ncclFloat8e4m3: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e4m3_fnuz, uint8_t);}
+  else { CASE_TY(__hip_fp8_e4m3, uint8_t);}
+  case ncclFloat8e5m2: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e5m2_fnuz, uint8_t);}
+  else { CASE_TY(__hip_fp8_e5m2, uint8_t);}
  #endif
  case ncclFloat32: CASE_TY(float, uint32_t)
  case ncclFloat64: CASE_TY(double, uint64_t)
-  default: assert(0);
+  default: assert(0); return cudaErrorInvalidValue;
  }
  #undef CASE_TY
 }
@@ -1234,7 +1352,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
  //if(!std::is_same<Op,ReduceProd>::value) return;
  //if(rank_n!=3) return;

-  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
+  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? 2 : 0;
  uint64_t seed = 0xc8e2bed69766d533;

  for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
@@ -1271,7 +1389,7 @@ __device__ void sweep1(int ty, char const *tyname) {
  }
 }

-__global__ void sweep() {
+__global__ void __launch_bounds__(512, 1) sweep() {
  sweep1<int8_t>(ncclInt8, "int8");
  sweep1<uint8_t>(ncclUint8, "uint8");
  sweep1<int32_t>(ncclInt32, "int32");
@@ -1282,19 +1400,16 @@ __global__ void sweep() {
  #if HAVE_ncclBfloat16
    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
  #endif
-  #if HAVE_ncclfp8
-    sweep1<rccl_float8>(ncclFp8E4M3, "fp8_e4m3");
-    sweep1<rccl_bfloat8>(ncclFp8E5M2, "fp8_e5m2");
+  #if HAVE_ncclfp8 && __HIP_DEVICE_COMPILE__
+    sweep1<rccl_float8>(ncclFloat8e4m3, "fp8_e4m3");
+    sweep1<rccl_bfloat8>(ncclFloat8e5m2, "fp8_e5m2");
  #endif
  sweep1<float>(ncclFloat32, "float");
  sweep1<double>(ncclFloat64, "double");
 }

-int main(int arg_n, char **args) {
-  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
+void ncclVerifiableLaunchSelfTest() {
+  sweep<<<1,512>>>();
  sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
-  return 0;
 }
 #endif
@@ -41,13 +41,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 }

 // Enqueue kernel to generate data which is to be reduced.
-void ncclVerifiablePrepareInput(
+hipError_t ncclVerifiablePrepareInput(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );

 // Enqueue kernel to generate expected results of reduction.
-void ncclVerifiablePrepareExpected(
+hipError_t ncclVerifiablePrepareExpected(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
@@ -58,7 +58,7 @@ void ncclVerifiablePrepareExpected(
 // which can be costly. Thus if you plan to run the same reduction multiple
 // times it is advantageous to precompute the expected values with
 // ncclVerifiablePrepareExpected and pass them as `expected` here.
-void ncclVerifiableVerify(
+hipError_t ncclVerifiableVerify(
  void const *results, void const *expected, intptr_t elt_n, int elt_ty,
  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
  int64_t *bad_elt_n, cudaStream_t stream