update build rccl-tests infrastructure and add getAlgoProtoChannels support (#2212)

2025-12-11 18:29:06 +05:30
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 # Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
@@ -15,6 +15,8 @@ CUSTOM_RCCL_LIB ?= ""

 HIPCC ?= $(ROCM_PATH)/bin/amdclang++
 HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
+HIPIFY_PL_EXE = $(ROCM_PATH)/bin/hipify-perl
+HIPIFY_PL_FLAGS = -quiet-warnings
 CXX = $(HIPCC)

 HIPCUFLAGS := -std=c++14
@@ -30,7 +32,10 @@ HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
 # Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
 ifndef GPU_TARGETS
 GPU_TARGETS = gfx906 gfx908 gfx90a
-  ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
+  ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0)
+    # Include gfx942 and gfx950 support if we're using ROCm 7.0 or above
+    GPU_TARGETS += gfx942 gfx950
+  else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
    # Include gfx942 support if we're using ROCm 6.0 or above
    GPU_TARGETS += gfx942
    ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)
@@ -43,41 +48,6 @@ endif

 GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")

-#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
-#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-#
-## Better define NVCC_GENCODE in your environment to the minimal set
-## of archs to reduce compile time.
-#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
-## Include Blackwell support if we're using CUDA12.8 or above
-#NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
-#		-gencode=arch=compute_90,code=sm_90 \
-#		-gencode=arch=compute_100,code=sm_100 \
-#		-gencode=arch=compute_120,code=sm_120 \
-#		-gencode=arch=compute_120,code=compute_120
-#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
-#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-#                -gencode=arch=compute_61,code=sm_61 \
-#                -gencode=arch=compute_70,code=sm_70 \
-#		-gencode=arch=compute_80,code=sm_80 \
-#		-gencode=arch=compute_90,code=sm_90 \
-#		-gencode=arch=compute_90,code=compute_90
-#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-#                -gencode=arch=compute_61,code=sm_61 \
-#                -gencode=arch=compute_70,code=sm_70 \
-#		-gencode=arch=compute_80,code=sm_80 \
-#		-gencode=arch=compute_80,code=compute_80
-#else
-#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-#                -gencode=arch=compute_50,code=sm_50 \
-#                -gencode=arch=compute_60,code=sm_60 \
-#                -gencode=arch=compute_61,code=sm_61 \
-#                -gencode=arch=compute_70,code=sm_70 \
-#                -gencode=arch=compute_70,code=compute_70
-#endif
-
 ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
 HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
@@ -117,7 +87,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I
 HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
 endif

-LIBRARIES += rccl
+LIBRARIES += rccl dl
 HIPLDFLAGS += $(LIBRARIES:%=-l%)

 DST_DIR := $(BUILDDIR)
@@ -148,14 +118,14 @@ $(GIT_VERSION_FILE):
 ${HIPIFY_DIR}/%.cpp: %.cpp
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@

 ${HIPIFY_DIR}/%.h: %.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@

-${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h ${HIPIFY_DIR}/rccl_compat.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
@@ -166,9 +136,8 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 	@mkdir -p ${DST_DIR}
 	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc

-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
+${DST_DIR}/%_perf: ${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
 	$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
-
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 AllGatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -47,6 +48,16 @@ AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
    return testSuccess;
 }

+testResult_t
+AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
+                              int* algo, int* proto, int* nchannels)
+{
+    if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo,
+                                   proto, nchannels));
+    return testSuccess;
+}
+
 void
 AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
               int nranks)
@@ -60,14 +71,17 @@ AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* bu

 testResult_t
 AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                 void* bias = nullptr)
 {
+    (void) bias;
    NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
    return testSuccess;
 }

-struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount,
-                                  AllGatherInitData, AllGatherGetBw, AllGatherRunColl };
+struct testColl allGatherTest = { "AllGather",       AllGatherGetCollByteCount,
+                                  AllGatherInitData, AllGatherGetBw,
+                                  AllGatherRunColl,  AllGatherGetAlgoProtoChannels };

 void
 AllGatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 AllReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -41,6 +42,16 @@ AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
    return testSuccess;
 }

+testResult_t
+AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
+                              int* algo, int* proto, int* nchannels)
+{
+    if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type, 0, 0, 1, algo,
+                                   proto, nchannels));
+    return testSuccess;
+}
+
 void
 AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
               int nranks)
@@ -54,14 +65,17 @@ AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* bu

 testResult_t
 AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                 void* bias = nullptr)
 {
+    (void) bias;
    NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
    return testSuccess;
 }

-struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount,
-                                  AllReduceInitData, AllReduceGetBw, AllReduceRunColl };
+struct testColl allReduceTest = { "AllReduce",       AllReduceGetCollByteCount,
+                                  AllReduceInitData, AllReduceGetBw,
+                                  AllReduceRunColl,  AllReduceGetAlgoProtoChannels };

 void
 AllReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 AlltoAllGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -62,14 +63,17 @@ AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* bus

 testResult_t
 AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                void* bias = nullptr)
 {
+    (void) bias;
    NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
    return testSuccess;
 }

-struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount, AlltoAllInitData,
-                                 AlltoAllGetBw, AlltoAllRunColl };
+struct testColl alltoAllTest = { "AlltoAll",       AlltoAllGetCollByteCount,
+                                 AlltoAllInitData, AlltoAllGetBw,
+                                 AlltoAllRunColl,  nullptr };

 void
 AlltoAllGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 #define USE_RCCL_GATHER_SCATTER

@@ -101,8 +102,10 @@ AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* bu

 testResult_t
 AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                 void* bias = nullptr)
 {
+    (void) bias;
    int nranks;
    NCCLCHECK(ncclCommCount(comm, &nranks));
    int rank;
@@ -174,8 +177,9 @@ AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
    return testSuccess;
 }

-struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount,
-                                 AlltoAllvInitData, AlltoAllvGetBw, AlltoAllvRunColl };
+struct testColl alltoAllTest = { "AlltoAllv",       AlltoAllvGetCollByteCount,
+                                 AlltoAllvInitData, AlltoAllvGetBw,
+                                 AlltoAllvRunColl,  nullptr };

 void
 AlltoAllvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 BroadcastGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -41,6 +42,16 @@ BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
    return testSuccess;
 }

+testResult_t
+BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
+                              int* algo, int* proto, int* nchannels)
+{
+    if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo,
+                                   proto, nchannels));
+    return testSuccess;
+}
+
 void
 BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
               int nranks)
@@ -54,8 +65,10 @@ BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* bu

 testResult_t
 BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                 ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                 void* bias = nullptr)
 {
+    (void) bias;
    int rank;
    NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
@@ -73,8 +86,9 @@ BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
    return testSuccess;
 }

-struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount,
-                                  BroadcastInitData, BroadcastGetBw, BroadcastRunColl };
+struct testColl broadcastTest = { "Broadcast",       BroadcastGetCollByteCount,
+                                  BroadcastInitData, BroadcastGetBw,
+                                  BroadcastRunColl,  BroadcastGetAlgoProtoChannels };

 void
 BroadcastGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -1,18 +1,20 @@

 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

 #include "common.h"
-#include "cuda.h"
-#include "cuda_runtime.h"
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
 #include "rccl_float8.h"
 #include <cstdio>
 #include <ctype.h>
+#include <dlfcn.h>
+#include <errno.h>
 #include <getopt.h>
 #include <hip/hip_bfloat16.h>
 #include <libgen.h>
@@ -27,10 +29,46 @@
 #include "git_version.h"
 #include "verifiable.h"

+#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+
 int     test_ncclVersion = 0;  // init'd with ncclGetVersion()
 int32_t gpu_block3;
 size_t  cache_bytes = 192 * 1024 * 1024;  // Use 192MB

+rcclTestsGetAlgoInfo_t     rcclTestsGetAlgoInfo     = NULL;
+rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
+rcclTestsGetAlgoName_t     rcclTestsGetAlgoName     = NULL;
+
+static void
+loadRcclSyms()
+{
+    static void* handle  = NULL;
+    const char*  libname = "librccl.so";
+    if(!handle)
+    {
+        handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
+        if(!handle)
+        {
+            fprintf(stderr, "dlopen failed: %s\n", dlerror());
+            return;
+        }
+    }
+    rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
+    rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
+    rcclTestsGetProtocolName =
+        (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");
+}
+
+// RCCL_FLOAT8 support
+bool rccl_float8_useFnuz = false;
+
+bool
+IsArchMatch(char const* arch, char const* target)
+{
+    // helper function to reduce clutter in code elsewhere. Returns true on match.
+    return (strncmp(arch, target, strlen(target)) == 0);
+}
+
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
                                            ncclUint8,
@@ -41,14 +79,14 @@ ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
                                            ncclHalf,
                                            ncclFloat,
                                            ncclDouble
-#    if RCCL_BFLOAT16 == 1
+#    if HAVE_BF16
                                            ,
                                            ncclBfloat16
 #    endif
-#    if RCCL_FLOAT8 == 1
+#    if HAVE_FP8
                                            ,
-                                            ncclFp8E4M3,
-                                            ncclFp8E5M2
+                                            ncclFloat8e4m3,
+                                            ncclFloat8e5m2
 #    endif
 };
 const char* test_typenames[ncclNumTypes] = { "int8",
@@ -60,11 +98,11 @@ const char* test_typenames[ncclNumTypes] = { "int8",
                                             "half",
                                             "float",
                                             "double"
-#    if RCCL_BFLOAT16 == 1
+#    if HAVE_BF16
                                             ,
                                             "bfloat16"
 #    endif
-#    if RCCL_FLOAT8 == 1
+#    if HAVE_FP8
                                             ,
                                             "fp8_e4m3",
                                             "fp8_e5m2"
@@ -374,6 +412,15 @@ InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType
    return testSuccess;
 }

+testResult_t
+InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
+                  ncclDataType_t type, ncclRedOp_t op)
+{
+    ncclVerifiableApplyBias(expected, bias, count, (int) type, (int) op, offset,
+                            cudaStreamDefault);
+    return testSuccess;
+}
+
 testResult_t
 InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
         ncclRedOp_t op, uint64_t seed, int nranks, int rank)
@@ -716,7 +763,7 @@ startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int
        TESTCHECK(args->collTest->runColl(
            (void*) (in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff),
            (void*) (in_place ? recvBuff + args->recvInplaceOffset * rank : recvBuff),
-            count, type, op, root, args->comms[i], args->streams[i]));
+            count, type, op, root, args->comms[i], args->streams[i], args->bias[i]));

 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
        if(opIndex >= ncclNumOps)
@@ -1239,7 +1286,7 @@ threadLaunch(struct testThread* thread)

 testResult_t
 AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
-              void** expected, size_t nbytes)
+              void** expected, size_t nbytes, void** bias)
 {
    if(enable_rotating_tensor)
    {
@@ -1252,6 +1299,8 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
        {
            CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
            CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
+            if(bias)
+                CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
            if(datacheck)
                CUDACHECK(
                    hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
@@ -1262,6 +1311,9 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
                hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
            CUDACHECK(
                hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+            if(bias)
+                CUDACHECK(
+                    hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
            if(datacheck)
                CUDACHECK(hipExtMallocWithFlags(expected, recvBytes,
                                                hipDeviceMallocFinegrained));
@@ -1271,12 +1323,14 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
    {
        CUDACHECK(hipHostMalloc(sendbuff, nbytes));
        CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+        if(bias) CUDACHECK(hipHostMalloc(bias, nbytes));
        if(datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
    }
    else if(memorytype == ncclManaged)
    {
        CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
        CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+        if(bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
        if(datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
    CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
@@ -1288,6 +1342,7 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
    {
        CUDACHECK(cudaMalloc(sendbuff, nbytes));
        CUDACHECK(cudaMalloc(recvbuff, nbytes));
+        if(bias) CUDACHECK(cudaMalloc(bias, nbytes));
        if(datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
    }
    CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
@@ -1333,6 +1388,7 @@ main(int argc, char* argv[])
        test_opnum++;  // PreMulSum
    }
 #endif
+    loadRcclSyms();

    // Parse args
    // Replace getopt_long with manual argument parsing
@@ -1764,6 +1820,7 @@ run()
    std::vector<cudaStream_t> streams(nGpus * nThreads);
    std::vector<void*>        sendbuffs(nGpus * nThreads);
    std::vector<void*>        recvbuffs(nGpus * nThreads);
+    std::vector<void*>        bias(nGpus * nThreads);
    std::vector<void*>        expected(nGpus * nThreads);
    size_t                    sendBytes, recvBytes;

@@ -1777,7 +1834,8 @@ run()
        gpus[i] = ((gpu0 != -1 ? gpu0 : localRank * nThreads * nGpus) + i) % numDevices;
        CUDACHECK(cudaSetDevice(gpus[i]));
        TESTCHECK(AllocateBuffs(sendbuffs.data() + i, sendBytes, recvbuffs.data() + i,
-                                recvBytes, expected.data() + i, (size_t) maxBytes));
+                                recvBytes, expected.data() + i, (size_t) maxBytes,
+                                bias.data() + i));
        if(streamnull)
            streams[i] = NULL;
        else
@@ -1895,6 +1953,7 @@ run()
        threads[t].args.gpus                   = gpus.data() + t * nGpus;
        threads[t].args.sendbuffs              = sendbuffs.data() + t * nGpus;
        threads[t].args.recvbuffs              = recvbuffs.data() + t * nGpus;
+        threads[t].args.bias                   = bias.data() + t * nGpus;
        threads[t].args.expected               = expected.data() + t * nGpus;
        threads[t].args.ncclId                 = ncclId;
        threads[t].args.comms                  = comms + t * nGpus;
@@ -1952,6 +2011,7 @@ run()
    {
        if(sendbuffs[i]) CUDACHECK(cudaFree((char*) sendbuffs[i]));
        if(recvbuffs[i]) CUDACHECK(cudaFree((char*) recvbuffs[i]));
+        if(bias[i]) CUDACHECK(cudaFree((char*) bias[i]));
        if(datacheck) CUDACHECK(cudaFree(expected[i]));
    }
    CUDACHECK(cudaFreeHost(delta));
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -11,18 +11,26 @@
 #include "rccl/rccl.h"
 #include <algorithm>
 #include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <pthread.h>
 #include <stdio.h>
+#include <string>
+#include <utility>
+#include <vector>
 #ifdef MPI_SUPPORT
 #    include "mpi.h"
 #endif
 #include "nccl1_compat.h"
 #include "timer.h"
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <pthread.h>
-// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 3)
+
+// Ensures backward compatibility for FP8 datatypes
+#if NCCL_VERSION_CODE < NCCL_VERSION(2, 24, 3)
+#    define ncclFloat8e4m3 ncclFp8E4M3
+#    define ncclFloat8e5m2 ncclFp8E5M2
+#else
+// For newer RCCL versions, define old names in terms of new names
 #    define ncclFp8E4M3 ncclFloat8e4m3
 #    define ncclFp8E5M2 ncclFloat8e5m2
 #endif
@@ -114,7 +122,10 @@ struct testColl
                  int nranks);
    testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count,
                            ncclDataType_t type, ncclRedOp_t op, int root,
-                            ncclComm_t comm, cudaStream_t stream);
+                            ncclComm_t comm, cudaStream_t stream, void* bias);
+    testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count,
+                                         ncclDataType_t type, int* algo, int* proto,
+                                         int* nchannels);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -139,6 +150,7 @@ public:
    void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes,
                   int inPlace, double timeUsec, double algBw, double busBw,
                   int64_t wrongElts = -1);
+    void writeFile();

 private:
    bool isMainThread();
@@ -154,13 +166,14 @@ private:
        return std::make_pair("\"" + v + "\"", k);
    };

-    bool          _outputValid = false;
-    std::ofstream _out;
-    std::string   _outputFormat;
-    size_t        _numCycle = 0;
-    std::string   _collectiveName;
-    std::string   _typeName;
-    std::string   _opName;
+    bool                                                          _outputValid = false;
+    std::ofstream                                                 _out;
+    std::string                                                   _outputFormat;
+    size_t                                                        _numCycle = 0;
+    std::string                                                   _collectiveName;
+    std::string                                                   _typeName;
+    std::string                                                   _opName;
+    std::vector<std::vector<std::pair<std::string, std::string>>> _outputData;
 };

 struct testEngine
@@ -200,6 +213,7 @@ struct threadArgs
    ncclUniqueId  ncclId;
    ncclComm_t*   comms;
    cudaStream_t* streams;
+    void**        bias;

    void**  expected;
    size_t  expectedBytes;
@@ -233,11 +247,14 @@ extern testResult_t
 InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type,
               ncclRedOp_t op, const uint64_t seed, const int nranks);
 extern testResult_t
+InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
+                  ncclDataType_t type, ncclRedOp_t op);
+extern testResult_t
 InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
         ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
-extern void
-AllocateBuffs(void** sendbuff, void** recvbuff, void** expected, void** expectedHost,
-              size_t nbytes, int nranks);
+extern testResult_t
+AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
+              void** expected, size_t nbytes, void** bias);

 #include <unistd.h>

@@ -304,6 +321,29 @@ getHostHash(const char* hostname)
    return getHash(hostHash, strlen(hostHash));
 }

+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+#    define HAVE_BF16 1
+#else
+#    define HAVE_BF16 0
+#endif
+#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
+#    define HAVE_FP8 1
+#else
+#    define HAVE_FP8 0
+#endif
+
+#if NCCL_MAJOR >= 2
+#    if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+#        undef HAVE_BF16
+#        define HAVE_BF16 1
+#        if defined(__CUDA_FP8_TYPES_EXIST__) &&                                         \
+            NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 0)
+#            undef HAVE_FP8
+#            define HAVE_FP8 1
+#        endif
+#    endif
+#endif
+
 static size_t
 wordSize(ncclDataType_t type)
 {
@@ -313,14 +353,14 @@ wordSize(ncclDataType_t type)
 #if NCCL_MAJOR >= 2
        // case ncclInt8:
        case ncclUint8:
-#    if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
-        case ncclFp8E4M3:
-        case ncclFp8E5M2:
+#    if HAVE_FP8
+        case ncclFloat8e4m3:
+        case ncclFloat8e5m2:
 #    endif
 #endif
            return 1;
        case ncclHalf:
-#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+#if HAVE_BF16
        case ncclBfloat16:
 #endif
            // case ncclFloat16:
@@ -427,4 +467,30 @@ extern thread_local int is_main_thread;
 #define PRINT                                                                            \
    if(is_main_thread) printf

+typedef enum
+{
+    ncclFuncBroadcast         = 0,
+    ncclFuncReduce            = 1,
+    ncclFuncAllGather         = 2,
+    ncclFuncReduceScatter     = 3,
+    ncclFuncAllReduce         = 4,
+    ncclFuncAllReduceWithBias = 5,
+    ncclFuncSendRecv          = 6,
+    ncclFuncSend              = 7,
+    ncclFuncRecv              = 8,
+    ncclFuncAllToAllPivot     = 9,
+    ncclNumFuncs              = 10
+} ncclFunc_t;
+
+typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll,
+                                               uint64_t count, ncclDataType_t dataType,
+                                               int collNetSupport, int nvlsSupport,
+                                               int numPipeOps, int* algo, int* protocol,
+                                               int* maxChannels);
+typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
+typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int          protocol,
+                                                   const char** protocolName);
+
+#include "rccl_compat.h"
+
 #endif
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 GatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -61,8 +62,10 @@ GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw

 testResult_t
 GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-              ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+              ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+              void* bias = nullptr)
 {
+    (void) bias;
    int nRanks;
    NCCLCHECK(ncclCommCount(comm, &nRanks));
    int rank;
@@ -85,8 +88,8 @@ GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
    return testSuccess;
 }

-struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData,
-                               GatherGetBw, GatherRunColl };
+struct testColl gatherTest = { "Gather",    GatherGetCollByteCount, GatherInitData,
+                               GatherGetBw, GatherRunColl,          nullptr };

 void
 GatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -0,0 +1,30 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef RCCL_COMPAT_H
+#define RCCL_COMPAT_H
+
+extern rcclTestsGetAlgoInfo_t     rcclTestsGetAlgoInfo;
+extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
+extern rcclTestsGetAlgoName_t     rcclTestsGetAlgoName;
+
+#endif
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 ReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -45,6 +46,16 @@ ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int
    return testSuccess;
 }

+testResult_t
+ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo,
+                           int* proto, int* nchannels)
+{
+    if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, 0, 0, 1, algo,
+                                   proto, nchannels));
+    return testSuccess;
+}
+
 void
 ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
            int nranks)
@@ -56,14 +67,17 @@ ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw

 testResult_t
 ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-              ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+              ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+              void* bias = nullptr)
 {
+    (void) bias;
    NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
    return testSuccess;
 }

-struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount, ReduceInitData,
-                               ReduceGetBw, ReduceRunColl };
+struct testColl reduceTest = { "Reduce",       ReduceGetCollByteCount,
+                               ReduceInitData, ReduceGetBw,
+                               ReduceRunColl,  ReduceGetAlgoProtoChannels };

 void
 ReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 ReduceScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -45,6 +46,16 @@ ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
    return testSuccess;
 }

+testResult_t
+ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
+                                  int* algo, int* proto, int* nchannels)
+{
+    if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1,
+                                   algo, proto, nchannels));
+    return testSuccess;
+}
+
 void
 ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
                   int nranks)
@@ -58,15 +69,18 @@ ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double

 testResult_t
 ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                     void* bias = nullptr)
 {
+    (void) bias;
    NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
    return testSuccess;
 }

-struct testColl reduceScatterTest = { "ReduceScatter", ReduceScatterGetCollByteCount,
-                                      ReduceScatterInitData, ReduceScatterGetBw,
-                                      ReduceScatterRunColl };
+struct testColl reduceScatterTest = {
+    "ReduceScatter",    ReduceScatterGetCollByteCount, ReduceScatterInitData,
+    ReduceScatterGetBw, ReduceScatterRunColl,          ReduceScatterGetAlgoProtoChannels
+};

 void
 ReduceScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 ScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -55,8 +56,10 @@ ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busB

 testResult_t
 ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-               ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+               ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+               void* bias = nullptr)
 {
+    (void) bias;
    int nRanks;
    NCCLCHECK(ncclCommCount(comm, &nRanks));
    int rank;
@@ -79,8 +82,8 @@ ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type
    return testSuccess;
 }

-struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData,
-                                ScatterGetBw, ScatterRunColl };
+struct testColl scatterTest = { "Scatter",    ScatterGetCollByteCount, ScatterInitData,
+                                ScatterGetBw, ScatterRunColl,          nullptr };

 void
 ScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@

 #include "common.h"
 #include "cuda_runtime.h"
+#include "rccl_compat.h"

 void
 SendRecvGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -58,8 +59,10 @@ SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* bus

 testResult_t
 SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-                ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
+                ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
+                void* bias = nullptr)
 {
+    (void) bias;
    int nRanks;
    NCCLCHECK(ncclCommCount(comm, &nRanks));
    int rank;
@@ -74,8 +77,9 @@ SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t typ
    return testSuccess;
 }

-struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount, SendRecvInitData,
-                                 SendRecvGetBw, SendRecvRunColl };
+struct testColl sendRecvTest = { "SendRecv",       SendRecvGetCollByteCount,
+                                 SendRecvInitData, SendRecvGetBw,
+                                 SendRecvRunColl,  nullptr };

 void
 SendRecvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1108,7 +1108,7 @@ prepareInput1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
 }
 }  // namespace

-void
+hipError_t
 ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
                           int rank_me, uint64_t seed, intptr_t elt_ix0,
                           cudaStream_t stream)
@@ -1135,6 +1135,7 @@ ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, i
 #    endif
    }
 #    undef CASE_OP
+    return hipSuccess;
 }
 #endif

@@ -1198,7 +1199,7 @@ prepareExpected1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n
 }
 }  // namespace

-void
+hipError_t
 ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
                              int rank_n, uint64_t seed, intptr_t elt_ix0,
                              cudaStream_t stream)
@@ -1224,6 +1225,7 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
 #    endif
    }
 #    undef CASE_OP
+    return hipSuccess;
 }
 #endif

@@ -1411,7 +1413,7 @@ verifyInline1(T const* results, intptr_t elt_n, int red_op, int rank_n, uint64_t
 }
 }  // namespace

-void
+hipError_t
 ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
                     int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
                     int64_t* bad_elt_n, cudaStream_t stream)
@@ -1470,6 +1472,68 @@ ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
        default: assert(0);
    }
 #    undef CASE_TY
+    return hipSuccess;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply bias to expected results
+
+namespace
+{
+template <typename T>
+__global__ void
+applyBias2(T* elts, T* bias, intptr_t elt_n)
+{
+    intptr_t i0 = blockIdx.x * (elt_n / gridDim.x);
+    i0 += blockIdx.x < elt_n % gridDim.x ? blockIdx.x : elt_n % gridDim.x;
+    intptr_t i1 = (blockIdx.x + 1) * (elt_n / gridDim.x);
+    i1 += blockIdx.x + 1 < elt_n % gridDim.x ? blockIdx.x + 1 : elt_n % gridDim.x;
+    intptr_t i = i0 + threadIdx.x;
+    while(i < i1)
+    {
+        elts[i] = ReduceSum()(elts[i], bias[i]);
+        i += blockDim.x;
+    }
+}
+
+void
+applyBias1(void* elts, void* bias, intptr_t elt_n, int elt_ty, cudaStream_t stream)
+{
+    int block_n = std::min<intptr_t>(32, (elt_n + 4 * 512 - 1) / (4 * 512));
+#    define CASE_TY(T)                                                                   \
+        applyBias2<<<block_n, 512, 0, stream>>>((T*) elts, (T*) bias, elt_n);            \
+        break;
+    switch(elt_ty)
+    {
+        case ncclInt8: CASE_TY(int8_t)
+        case ncclUint8: CASE_TY(uint8_t)
+        case ncclInt32: CASE_TY(int32_t)
+        case ncclUint32: CASE_TY(uint32_t)
+        case ncclInt64: CASE_TY(int64_t)
+        case ncclUint64: CASE_TY(uint64_t)
+        case ncclFloat16: CASE_TY(__half)
+#    if HAVE_ncclBfloat16
+        case ncclBfloat16: CASE_TY(hip_bfloat16)
+#    endif
+#    if HAVE_ncclfp8
+        case ncclFp8E4M3: CASE_TY(rccl_float8)
+        case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+#    endif
+        case ncclFloat32: CASE_TY(float)
+        case ncclFloat64: CASE_TY(double)
+        default: assert(0);
+    }
+#    undef CASE_TY
+}
+}  // namespace
+
+void
+ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
+                        intptr_t elt_ix0, cudaStream_t stream)
+{
+    (void) red_op;
+    (void) elt_ix0;
+    applyBias1(elts, bias, elt_n, elt_ty, stream);
 }
 #endif

@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -43,13 +43,13 @@ ncclVerifiablePremulScalar(int rank_me)
 }

 // Enqueue kernel to generate data which is to be reduced.
-void
+hipError_t
 ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
                           int rank_me, uint64_t seed, intptr_t elt_ix0,
                           cudaStream_t stream);

 // Enqueue kernel to generate expected results of reduction.
-void
+hipError_t
 ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
                              int rank_n, uint64_t seed, intptr_t elt_ix0,
                              cudaStream_t stream);
@@ -60,8 +60,19 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
 // which can be costly. Thus if you plan to run the same reduction multiple
 // times it is advantageous to precompute the expected values with
 // ncclVerifiablePrepareExpected and pass them as `expected` here.
-void
+hipError_t
 ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
                     int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
                     int64_t* bad_elt_n, cudaStream_t stream);
+
+// Enqueue kernel that applies bias to expected results
+void
+ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
+                        intptr_t elt_ix0, cudaStream_t stream);
+
+#ifdef NCCL_VERIFIABLE_SELF_TEST
+void
+ncclVerifiableLaunchSelfTest();
+#endif
+
 #endif
@@ -1,5 +1,5 @@
 # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information

@@ -14,17 +14,17 @@ TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 ${HIPIFY_DIR}/verifiable.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cpp
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@

 ${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@

 ${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
 	@printf "Hipifying  %-35s > %s\n" $< $@
 	@mkdir -p ${HIPIFY_DIR}
-	hipify-perl -quiet-warnings $< > $@
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@

 $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
 	@printf "Compiling %s\n" $@