Merge remote-tracking branch 'nccl-tests/master' into develop

2025-07-23 14:22:18 -05:00
@@ -4,7 +4,7 @@ These tests check both the performance and the correctness of RCCL operations. T

 ## Build

-To build the tests, just type `make`.
+To build the tests, just type `make` or `make -j`

 If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.

@@ -75,12 +75,14 @@ RCCL Tests can run on multiple processes, multiple threads, and multiple HIP dev
 ### Quick examples

 Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```

 Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
 (NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
+
 ```shell
 $ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
@@ -138,8 +140,8 @@ All tests support the same set of arguments :
  * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
  * `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
-  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
-  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default: disabled.
+  * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
  * `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
  * `-O,--out_of_place <0=in-place only, 1=out-of-place only>`. Default: both.
  * `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
@@ -158,9 +160,12 @@ with the same color will end up in the same group. The resulting group is printe
 `NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.

 Here are a few examples:
- - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
- - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
- - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
+
+- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+
+- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.

 Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.

@@ -178,6 +183,6 @@ $ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3

 ## Copyright

-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.

 All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -1,9 +1,10 @@
 #
-# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 # Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
+include common.mk

 ROCM_PATH ?= /opt/rocm
 MPI_HOME ?= /usr/lib/x86_64-linux-gnu
@@ -21,6 +22,10 @@ HIPCUFLAGS := -std=c++14
 LDFLAGS    :=
 HIPLDFLAGS :=

+MPI ?= 0        # Set to 1 to enable MPI support (multi-process/multi-node)
+NAME_SUFFIX ?=  # e.g. _mpi when using MPI=1
+DSO ?= 0        # Set to 1 to create and use libverifiable.so to reduce binary size
+
 HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
 HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
 HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
@@ -126,7 +131,7 @@ DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
 BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
-BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})

 GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
 GIT_REV          := $(shell git log --pretty=format:'%h' -n 1)
@@ -157,20 +162,39 @@ ${HIPIFY_DIR}/%.h: %.h
 	@mkdir -p ${HIPIFY_DIR}
 	hipify-perl -quiet-warnings $< > $@

+.PRECIOUS: ${DST_DIR}/%.o
+
 ${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
 	$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<

+${DST_DIR}/%$(NAME_SUFFIX).o: %.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
+	$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
+
 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
+	$(CXX) $(CXXFLAGS) -o $@ -c $<

-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
+ifeq ($(DSO), 1)
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) $(DST_DIR)/src/git_version.cpp
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
+	$(HIPCC) -o $@ $^ $(HIPLDFLAGS) -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
+else
+${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
 	$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
+endif
+
+clean_intermediates:
+	rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)

@@ -41,19 +41,19 @@ bool IsArchMatch(char const* arch, char const* target) {
 #if NCCL_MAJOR >= 2
  ncclDataType_t test_types[ncclNumTypes] = {
    ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
-  #if RCCL_BFLOAT16 == 1
+  #if HAVE_BF16
    , ncclBfloat16
  #endif
-  #if RCCL_FLOAT8 == 1
+  #if HAVE_FP8
    , ncclFloat8e4m3, ncclFloat8e5m2
  #endif
  };
  const char *test_typenames[ncclNumTypes] = {
    "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
-  #if RCCL_BFLOAT16 == 1
+  #if HAVE_BF16
    , "bfloat16"
  #endif
-  #if RCCL_FLOAT8 == 1
+  #if HAVE_FP8
    , "fp8_e4m3", "fp8_e5m2"
  #endif
  };
@@ -122,8 +122,11 @@ static int enable_in_place = 1;
 static int enable_cache_flush = 0;
 static int enable_rotating_tensor = 0;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+#define LOCAL_REGISTER 1
+#define SYMMETRIC_REGISTER 2
 static int local_register = 0;
 #endif
+static int minCudaArch = 1<<30;

 Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) {
  if (!fileName.empty()) {
@@ -203,7 +206,6 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
 }

 bool Reporter::isMainThread() { return is_main_thread == 1; }
-static int minCudaArch = 1<<30;

 #define NUM_BLOCKS 32

@@ -550,10 +552,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      union {
        int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
        half f16; float f32; double f64;
-        #if defined(RCCL_BFLOAT16)
+        #if HAVE_BF16
        hip_bfloat16 bf16;
        #endif
-        #if defined(RCCL_FLOAT8)
+        #if HAVE_FP8
        rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
        #endif
      };
@@ -567,14 +569,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
      case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
      case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
-      #if defined(RCCL_BFLOAT16)
+      #if HAVE_BF16
      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
      #endif
-      #if defined(RCCL_FLOAT8)
+      #if HAVE_FP8
      case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
      case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
      #endif
-      case ncclNumTypes: break;
+      default: break; // Just to silence clang
      }
      NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
    }
@@ -957,20 +959,38 @@ testResult_t threadInit(struct threadArgs* args) {
  }
  NCCLCHECK(ncclGroupEnd());
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  NCCLCHECK(ncclGroupStart());
  void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
  void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
  for (int i=0; i<args->nGpus; i++) {
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+      NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+    }
  }
+  NCCLCHECK(ncclGroupEnd());
 #endif

  TESTCHECK(threadRunTests(args));

  for (int i=0; i<args->nGpus; i++) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
-    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
+      NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
+    } else
+#endif
+    {
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+    }
 #endif
    NCCLCHECK(ncclCommDestroy(args->comms[i]));
  }
@@ -1046,17 +1066,20 @@ int main(int argc, char* argv[]) {
    test_typenum = 9;
    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
      test_opnum++; // ncclAvg
-      #if defined(RCCL_BFLOAT16)
-        test_typenum++; // bfloat16
-      #endif
-      #if defined(RCCL_FLOAT8)
-        test_typenum++; // fp8_e4m3
-        test_typenum++; // fp8_e5m2
-      #endif
    }
    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
      test_opnum++; // PreMulSum
    }
+    #if defined(RCCL_BFLOAT16)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
+      test_typenum++; // bfloat16
+    }
+    #endif
+    #if defined(RCCL_FLOAT8)
+    if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
+      test_typenum += 2; // fp8 e4m3,e5m2
+    }
+    #endif
  #endif

  // Parse args
@@ -1194,8 +1217,10 @@ int main(int argc, char* argv[]) {
        break;
      case 'R':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-        if ((int)strtol(optarg, NULL, 0)) {
-          local_register = 1;
+        local_register = (int)strtol(optarg, NULL, 0);
+        if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
+          printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
+          local_register = LOCAL_REGISTER;
        }
 #else
        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
@@ -1269,7 +1294,7 @@ int main(int argc, char* argv[]) {
            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-C,--report_cputime <0/1>] \n\t"
            "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
-            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
+            "[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
 	    "[-O,--out_of_place <0/1>] \n\t"
@@ -1486,6 +1511,22 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
  MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
 #endif
+#if defined(RCCL_FLOAT8)
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
+    if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
+      int n = 0;
+      for (int i=0; i < test_typenum; i++) {
+        if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
+          test_types[n] = test_types[i];
+          test_typenames[n] = test_typenames[i];
+          n += 1;
+        }
+      }
+      test_typenum = n;
+    }
+  }
+#endif
+
  //if parallel init is not selected, use main thread to initialize NCCL
  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
@@ -1504,12 +1545,22 @@ testResult_t run() {
       NCCLCHECK(ncclGroupEnd());
     }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+     NCCLCHECK(ncclGroupStart());
     sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
     recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
     for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &sendbuffs[i], maxBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &recvbuffs[i], maxBytes, &recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+       if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+         NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+         NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
+       } else
+#endif
+       {
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+         if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
+       }
     }
+     NCCLCHECK(ncclGroupEnd());
 #endif
  }

@@ -1607,8 +1658,16 @@ testResult_t run() {
  if (!parallel_init) {
    for(int i=0; i<nGpus*nThreads; ++i) {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
-      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+      if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
+        NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
+      } else
+#endif
+      {
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
+        if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+      }
 #endif
      NCCLCHECK(ncclCommDestroy(comms[i]));
    }
@@ -252,20 +252,42 @@ static uint64_t getHostHash(const char* hostname) {
  return getHash(hostHash, strlen(hostHash));
 }

+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+#define HAVE_BF16 1
+#else
+#define HAVE_BF16 0
+#endif
+#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
+#define HAVE_FP8 1
+#else
+#define HAVE_FP8 0
+#endif
+
+#if NCCL_MAJOR >= 2
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    #undef HAVE_BF16
+    #define HAVE_BF16 1
+    #if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
+      #undef HAVE_FP8
+      #define HAVE_FP8 1
+    #endif
+  #endif
+#endif
+
 static size_t wordSize(ncclDataType_t type) {
  switch(type) {
    case ncclChar:
 #if NCCL_MAJOR >= 2
    //case ncclInt8:
    case ncclUint8:
-#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
+#if HAVE_FP8
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
 #endif
 #endif
      return 1;
    case ncclHalf:
-#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+#if HAVE_BF16
    case ncclBfloat16:
 #endif
    //case ncclFloat16:
@@ -0,0 +1,80 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC ?= $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
+NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+		-gencode=arch=compute_61,code=sm_61 \
+		-gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
+else
+NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=compute_70
+endif
+
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+CXXFLAGS   := -std=c++11
+
+LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 # Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
@@ -68,7 +68,7 @@ include verifiable.mk

 self_test: $(DST_DIR)/self_test

-$(DST_DIR)/self_test: verifiable.cu verifiable.h
+$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(HIPLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include <cuda_runtime.h>
+#include <iostream>
+
+#define NCCL_VERIFIABLE_SELF_TEST 1
+#include "verifiable.h"
+
+int main(int arg_n, char **args) {
+  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
+  cudaSetDevice(0);
+  ncclVerifiableLaunchSelfTest();
+  cudaDeviceSynchronize();
+  return 0;
+}
@@ -105,7 +105,7 @@ template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
 struct IsIntegral<__half>: std::false_type {};
-#if RCCL_BFLOAT16 == 1
+#if HAVE_ncclBfloat16
 template<>
 struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
@@ -150,6 +150,10 @@ namespace {
  __host__ __device__ __half castTo<__half>(float x) {
    return __float2half(x);
  }
+  // template<>
+  // __host__ __device__ __half castTo<half>(double x) {
+  //   return __double2half(x);
+  // }
  template<>
  __host__ __device__ half castTo<__half>(uint64_t x) {
    return __ull2half_rn(x);
@@ -864,7 +868,7 @@ __host__ __device__ void genOutput(
 namespace {
 template<typename T>
 __host__ __device__ void genInput(
-  T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
+    T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
    std::false_type /*integral*/
  ) {
  // We can't control the nranks divisor in avareages so to control error we
@@ -953,7 +957,6 @@ __host__ __device__ T genOutput(

 ////////////////////////////////////////////////////////////////////////////////

-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
 __global__ void __launch_bounds__(512, 1) prepareInput2(
@@ -1040,11 +1043,9 @@ hipError_t ncclVerifiablePrepareInput(
  }
  #undef CASE_OP
 }
-#endif

 ////////////////////////////////////////////////////////////////////////////////

-#if !SELF_TEST
 namespace {
 template<typename T, typename ReduceFn>
 __global__ void __launch_bounds__(512, 1) prepareExpected2(
@@ -1130,7 +1131,6 @@ hipError_t ncclVerifiablePrepareExpected(
  }
  #undef CASE_OP
 }
-#endif

 ////////////////////////////////////////////////////////////////////////////////

@@ -1151,7 +1151,6 @@ __host__ __device__  uint64_t calcDelta(T a, T b) {

 ////////////////////////////////////////////////////////////////////////////////

-#if !SELF_TEST
 namespace {
 template<typename T>
 __global__ void __launch_bounds__(512, 1) verifyPrepared(
@@ -1245,7 +1244,6 @@ hipError_t verifyInline1(
  ReduceAvg opavg{rank_n};
  ReducePreMulSum oppremulsum;
  void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
-
  #define CASE_OP(op) \
    if(rank_n == 1) { \
      fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
@@ -1336,13 +1334,10 @@ hipError_t ncclVerifiableVerify(
  }
  #undef CASE_TY
 }
-#endif

 ////////////////////////////////////////////////////////////////////////////////
-
 #if SELF_TEST
-#include <iostream>
-
+namespace {
 template<typename T, typename Op>
 __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
  //if(!std::is_same<T,half>::value) return;
@@ -1397,16 +1392,16 @@ __global__ void __launch_bounds__(512, 1) sweep() {
  #if HAVE_ncclBfloat16
    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
  #endif
-  #if HAVE_ncclfp8 && __HIP_DEVICE_COMPILE__
+  #if HAVE_ncclfp8_DEVICE
    sweep1<rccl_float8>(ncclFloat8e4m3, "fp8_e4m3");
    sweep1<rccl_bfloat8>(ncclFloat8e5m2, "fp8_e5m2");
  #endif
  sweep1<float>(ncclFloat32, "float");
  sweep1<double>(ncclFloat64, "double");
 }
+}

 void ncclVerifiableLaunchSelfTest() {
  sweep<<<1,512>>>();
-  sweep<<<1,512>>>();
 }
-#endif
+#endif
@@ -63,4 +63,9 @@ hipError_t ncclVerifiableVerify(
  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
  int64_t *bad_elt_n, cudaStream_t stream
 );
+
+#ifdef NCCL_VERIFIABLE_SELF_TEST
+void ncclVerifiableLaunchSelfTest();
+#endif
+
 #endif
@@ -1,15 +1,16 @@
-# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
+# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information


-# We requires both of the following paths to be set upon including this makefile
+# We require both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
-# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
+# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>

 TEST_VERIFIABLE_HDRS      = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_LIBS      = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so

 ${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
 	@printf "Hipifying  %-35s > %s\n" $< $@
@@ -31,3 +32,9 @@ $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPI
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
 	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
 	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
+
+$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
+	@printf "Creating DSO %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
+	ln -sf $(notdir $@).0 $@