From db6ea5a5947c620f26712a9dc003fa67b3e3f0d7 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Mon, 13 Oct 2025 14:09:10 -0700
Subject: [PATCH] Add all_reduce_bias_perf to support All Reduce with Bias
 (#130)

Use dynamic symbol loading of ncclAllReduceWithBias

Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com>
---
 src/CMakeLists.txt       |   1 +
 src/Makefile             |   2 +-
 src/all_gather.cu        |   2 +-
 src/all_reduce.cu        |   2 +-
 src/all_reduce_bias.cu   | 123 +++++++++++++++++++++++++++++++++++++++
 src/alltoall.cu          |   2 +-
 src/alltoallv.cu         |   2 +-
 src/broadcast.cu         |   2 +-
 src/common.cu            |  24 ++++++--
 src/common.h             |   8 ++-
 src/gather.cu            |   2 +-
 src/hypercube.cu         |   2 +-
 src/reduce.cu            |   2 +-
 src/reduce_scatter.cu    |   2 +-
 src/scatter.cu           |   2 +-
 src/sendrecv.cu          |   2 +-
 verifiable/verifiable.cu |  70 ++++++++++++++++++++++
 verifiable/verifiable.h  |   6 ++
 18 files changed, 237 insertions(+), 19 deletions(-)
 create mode 100644 src/all_reduce_bias.cu

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 387bccfc97..8de04365ae 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -145,3 +145,4 @@ add_rccl_test(reduce_scatter)
 add_rccl_test(reduce)
 add_rccl_test(scatter)
 add_rccl_test(sendrecv)
+add_rccl_test(all_reduce_bias)
diff --git a/src/Makefile b/src/Makefile
index fcc020f7c2..7f6ea37279 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -135,7 +135,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
 
 GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
diff --git a/src/all_gather.cu b/src/all_gather.cu
index dbbd977ec0..54ca880d8b 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -52,7 +52,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 038188a74e..43c75032c7 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -48,7 +48,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce_bias.cu b/src/all_reduce_bias.cu
new file mode 100644
index 0000000000..8d49c67483
--- /dev/null
+++ b/src/all_reduce_bias.cu
@@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include <dlfcn.h>
+
+typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
+#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+DECLARE_RCCL_PFN(ncclAllReduceWithBias);
+static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
+
+static void initOnceFunc() {
+  void *librccl = dlopen("librccl.so", RTLD_NOLOAD);
+  pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl, "ncclAllReduceWithBias");
+}
+
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+  pthread_once(&initOnceControl, initOnceFunc);
+}
+
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
+    TESTCHECK(InitData(args->bias[i], sendcount, 0, type, op, rep+0x12345678, nranks, rank));
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    TESTCHECK(InitDataApplyBias(args->expected[i], args->bias[i], recvcount, 0, type, op));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(2*(nranks - 1)))/((double)nranks);
+  *busBw = baseBw * factor;
+}
+
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  if (pfn_ncclAllReduceWithBias == nullptr) {
+    fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n");
+    return testNcclError;
+  }
+  NCCLCHECK((*pfn_ncclAllReduceWithBias)(sendbuff, recvbuff, count, type, op, comm, stream, bias));
+  return testSuccess;
+}
+
+struct testColl allReduceTest = {
+  "AllReduce",
+  AllReduceGetCollByteCount,
+  AllReduceInitData,
+  AllReduceGetBw,
+  AllReduceRunColl
+};
+
+void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allReduceTest;
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) {
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else {
+    op_count = test_opnum;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+#if defined(RCCL_FLOAT8)
+      if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
+      continue;
+#endif
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine ncclTestEngine = {
+  AllReduceGetBuffSize,
+  AllReduceRunTest
+};
diff --git a/src/alltoall.cu b/src/alltoall.cu
index eeab700902..c81f2ff285 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -47,7 +47,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 8195826f31..1ca0fe64c4 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -84,7 +84,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nranks;
   NCCLCHECK(ncclCommCount(comm, &nranks));
   int rank;
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 18d09b7285..6c57b0d177 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -47,7 +47,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
diff --git a/src/common.cu b/src/common.cu
index 3934d22463..f9ae4caf58 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -364,6 +364,11 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset,
   return testSuccess;
 }
 
+testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op) {
+  ncclVerifiableApplyBias(expected, bias, count, (int)type, (int)op, offset, cudaStreamDefault);
+  return testSuccess;
+}
+
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
   CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
   return testSuccess;
@@ -469,7 +474,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
 
-#if 1 && DEBUG_PRINT
+#if 1 && defined(DEBUG_PRINT)
     if (args->reportErrors && wrongPerGpu[i] != 0) {
       printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
       char *expectedHost = (char*)malloc(args->expectedBytes);
@@ -582,6 +587,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
     char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
+    char* bias = ((char*)args->bias[i]) + shift;
     ncclRedOp_t op;
 
     if(opIndex < ncclNumOps) {
@@ -629,7 +635,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     TESTCHECK(args->collTest->runColl(
           (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
           (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
-        count, type, op, root, args->comms[i], args->streams[i]));
+        count, type, op, root, args->comms[i], args->streams[i], bias));
 
     #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
     if(opIndex >= ncclNumOps) {
@@ -1060,7 +1066,7 @@ testResult_t threadLaunch(struct testThread* thread) {
   return testSuccess;
 }
 
-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias) {
   if(enable_rotating_tensor) {
     recvBytes = recvBytes + cache_bytes;
     nbytes = nbytes + cache_bytes;
@@ -1069,22 +1075,26 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
     if(HIP_VERSION >= 50700000) {
       CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
       CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
+      if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
       if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
     }
     else {
       CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
       CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+      if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
       if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
     }
   }
   else if (memorytype == ncclHost) {
     CUDACHECK(hipHostMalloc(sendbuff, nbytes));
     CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(hipHostMalloc(bias, nbytes));
     if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
   }
   else if (memorytype == ncclManaged) {
     CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
     CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
     if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
     CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
@@ -1096,14 +1106,17 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
     NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
     NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
     if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
 #else
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
+    if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
 #endif
   }
   CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
+  if (bias) CUDACHECK(hipMemset(*bias, 1, nbytes));
   if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
   return testSuccess;
 }
@@ -1554,6 +1567,7 @@ testResult_t run() {
   std::vector<cudaStream_t> streams(nGpus*nThreads);
   std::vector<void*> sendbuffs(nGpus*nThreads);
   std::vector<void*> recvbuffs(nGpus*nThreads);
+  std::vector<void*> bias(nGpus*nThreads);
   std::vector<void*> expected(nGpus*nThreads);
   size_t sendBytes, recvBytes;
 
@@ -1564,7 +1578,7 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
     CUDACHECK(cudaSetDevice(gpus[i]));
-    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
+    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes, bias.data()+i));
     if (streamnull) {
       streams[i] = NULL;
     }
@@ -1699,6 +1713,7 @@ testResult_t run() {
     threads[t].args.gpus=gpus.data()+t*nGpus;
     threads[t].args.sendbuffs = sendbuffs.data()+t*nGpus;
     threads[t].args.recvbuffs = recvbuffs.data()+t*nGpus;
+    threads[t].args.bias = bias.data()+t*nGpus;
     threads[t].args.expected = expected.data()+t*nGpus;
     threads[t].args.ncclId = ncclId;
     threads[t].args.comms=comms+t*nGpus;
@@ -1764,6 +1779,7 @@ testResult_t run() {
 #else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
+    if (bias[i]) CUDACHECK(cudaFree((char*)bias[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
 #endif
   }
diff --git a/src/common.h b/src/common.h
index 645d3a1f0a..4c6af89f1e 100644
--- a/src/common.h
+++ b/src/common.h
@@ -104,9 +104,9 @@ struct testColl {
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
-  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias);
   testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
+
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -173,6 +173,7 @@ struct threadArgs {
   ncclUniqueId ncclId;
   ncclComm_t* comms;
   cudaStream_t* streams;
+  void** bias;
 
   void** expected;
   size_t expectedBytes;
@@ -199,8 +200,9 @@ struct testThread {
 extern void Barrier(struct threadArgs* args);
 extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
 extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
+extern testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op);
 extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
-extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
+extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks, void **bias);
 
 #include <unistd.h>
 
diff --git a/src/gather.cu b/src/gather.cu
index a0dc00de56..24156de7bd 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -45,7 +45,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/hypercube.cu b/src/hypercube.cu
index f5d94f026d..c35cc765cc 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   char* sbuff = (char*)sendbuff;
   char* rbuff = (char*)recvbuff;
   int nRanks;
diff --git a/src/reduce.cu b/src/reduce.cu
index c2353c3fc0..bf37a2b016 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -48,7 +48,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw;
 }
 
-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
   return testSuccess;
 }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index fe906ce372..fd589e2738 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -51,7 +51,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
   *busBw = baseBw * factor;
 }
 
-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/scatter.cu b/src/scatter.cu
index d0323fa36d..ca0f6c10ad 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -41,7 +41,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index 4f5f6b8a7b..c9c4c4bc0a 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -45,7 +45,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index 20df5907a4..0896c724d0 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -1044,6 +1044,76 @@ hipError_t ncclVerifiablePrepareInput(
   #undef CASE_OP
 }
 
+namespace {
+template<typename T, typename ReduceFn>
+__global__ void applyBias2(
+    T *elts, T *bias, intptr_t elt_n, ReduceFn op, intptr_t elt_ix0
+  ) {
+  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
+  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
+  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
+  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
+  intptr_t i = i0 + threadIdx.x;
+  while(i < i1) {
+    elts[i] = op(elts[i], bias[i]);
+    #if 0
+    T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
+    printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
+      std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
+    #endif
+    i += blockDim.x;
+  }
+}
+
+template<typename ReduceOp>
+void applyBias1(
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, ReduceOp op,
+    intptr_t elt_ix0, cudaStream_t stream
+  ) {
+  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
+  #define CASE_TY(T) applyBias2<<<block_n, 512, 0, stream>>>((T*)elts, (T*)bias, elt_n, op, elt_ix0); break;
+  switch(elt_ty) {
+  case ncclInt8: CASE_TY(int8_t)
+  case ncclUint8: CASE_TY(uint8_t)
+  case ncclInt32: CASE_TY(int32_t)
+  case ncclUint32: CASE_TY(uint32_t)
+  case ncclInt64: CASE_TY(int64_t)
+  case ncclUint64: CASE_TY(uint64_t)
+  case ncclFloat16: CASE_TY(__half)
+  #if HAVE_ncclBfloat16
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  #endif
+  #if HAVE_ncclfp8
+  case ncclFp8E4M3: CASE_TY(rccl_float8)
+  case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
+  #endif
+  case ncclFloat32: CASE_TY(float)
+  case ncclFloat64: CASE_TY(double)
+  default: assert(0);
+  }
+  #undef CASE_TY
+}
+}
+
+void ncclVerifiableApplyBias(
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
+    cudaStream_t stream
+  ) {
+  #define CASE_OP(op) \
+    applyBias1(elts, bias, elt_n, elt_ty, op, elt_ix0, stream); \
+    break;
+  switch(red_op) {
+  case ncclSum: CASE_OP(ReduceSum())
+  case ncclMin: CASE_OP(ReduceMin())
+  case ncclMax: CASE_OP(ReduceMax())
+  case ncclProd: CASE_OP(ReduceProd())
+  #if HAVE_ncclPreMulSum
+  default: CASE_OP(ReducePreMulSum())
+  #endif
+  }
+  #undef CASE_OP
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index f4452d6d32..b248492aee 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -64,6 +64,12 @@ hipError_t ncclVerifiableVerify(
   int64_t *bad_elt_n, cudaStream_t stream
 );
 
+// Enqueue kernel that applies bias to expected results
+void ncclVerifiableApplyBias(
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
+    cudaStream_t stream
+);
+
 #ifdef NCCL_VERIFIABLE_SELF_TEST
 void ncclVerifiableLaunchSelfTest();
 #endif