From db6ea5a5947c620f26712a9dc003fa67b3e3f0d7 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 13 Oct 2025 14:09:10 -0700 Subject: [PATCH] Add all_reduce_bias_perf to support All Reduce with Bias (#130) Use dynamic symbol loading of ncclAllReduceWithBias Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com> --- src/CMakeLists.txt | 1 + src/Makefile | 2 +- src/all_gather.cu | 2 +- src/all_reduce.cu | 2 +- src/all_reduce_bias.cu | 123 +++++++++++++++++++++++++++++++++++++++ src/alltoall.cu | 2 +- src/alltoallv.cu | 2 +- src/broadcast.cu | 2 +- src/common.cu | 24 ++++++-- src/common.h | 8 ++- src/gather.cu | 2 +- src/hypercube.cu | 2 +- src/reduce.cu | 2 +- src/reduce_scatter.cu | 2 +- src/scatter.cu | 2 +- src/sendrecv.cu | 2 +- verifiable/verifiable.cu | 70 ++++++++++++++++++++++ verifiable/verifiable.h | 6 ++ 18 files changed, 237 insertions(+), 19 deletions(-) create mode 100644 src/all_reduce_bias.cu diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 387bccfc97..8de04365ae 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -145,3 +145,4 @@ add_rccl_test(reduce_scatter) add_rccl_test(reduce) add_rccl_test(scatter) add_rccl_test(sendrecv) +add_rccl_test(all_reduce_bias) diff --git a/src/Makefile b/src/Makefile index fcc020f7c2..7f6ea37279 100644 --- a/src/Makefile +++ b/src/Makefile @@ -135,7 +135,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX}) GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp diff --git a/src/all_gather.cu b/src/all_gather.cu index dbbd977ec0..54ca880d8b 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -52,7 +52,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 038188a74e..43c75032c7 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -48,7 +48,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/all_reduce_bias.cu b/src/all_reduce_bias.cu new file mode 100644 index 0000000000..8d49c67483 --- /dev/null +++ b/src/all_reduce_bias.cu @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" +#include + +typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc); +#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr +DECLARE_RCCL_PFN(ncclAllReduceWithBias); +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; + +static void initOnceFunc() { + void *librccl = dlopen("librccl.so", RTLD_NOLOAD); + pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl, "ncclAllReduceWithBias"); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; + pthread_once(&initOnceControl, initOnceFunc); +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); + TESTCHECK(InitData(args->bias[i], sendcount, 0, type, op, rep+0x12345678, nranks, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + TESTCHECK(InitDataApplyBias(args->expected[i], args->bias[i], recvcount, 0, type, op)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { + if (pfn_ncclAllReduceWithBias == nullptr) { + fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n"); + return testNcclError; + } + NCCLCHECK((*pfn_ncclAllReduceWithBias)(sendbuff, recvbuff, count, type, op, comm, stream, bias)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index 3934d22463..f9ae4caf58 100644 --- a/src/common.cu +++ b/src/common.cu @@ -364,6 +364,11 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, return testSuccess; } +testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op) { + ncclVerifiableApplyBias(expected, bias, count, (int)type, (int)op, offset, cudaStreamDefault); + return testSuccess; +} + testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault)); return testSuccess; @@ -469,7 +474,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); -#if 1 && DEBUG_PRINT +#if 1 && defined(DEBUG_PRINT) if (args->reportErrors && wrongPerGpu[i] != 0) { printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); char *expectedHost = (char*)malloc(args->expectedBytes); @@ -582,6 +587,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + char* bias = ((char*)args->bias[i]) + shift; ncclRedOp_t op; if(opIndex < ncclNumOps) { @@ -629,7 +635,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(args->collTest->runColl( (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); + count, type, op, root, args->comms[i], args->streams[i], bias)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) if(opIndex >= ncclNumOps) { @@ -1060,7 +1066,7 @@ testResult_t threadLaunch(struct testThread* thread) { return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias) { if(enable_rotating_tensor) { recvBytes = recvBytes + cache_bytes; nbytes = nbytes + cache_bytes; @@ -1069,22 +1075,26 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s if(HIP_VERSION >= 50700000) { CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached)); CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached)); + if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached)); if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached)); } else { CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); + if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained)); if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained)); } } else if (memorytype == ncclHost) { CUDACHECK(hipHostMalloc(sendbuff, nbytes)); CUDACHECK(hipHostMalloc(recvbuff, nbytes)); + if (bias) CUDACHECK(hipHostMalloc(bias, nbytes)); if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes)); } else if (memorytype == ncclManaged) { CUDACHECK(cudaMallocManaged(sendbuff, nbytes)); CUDACHECK(cudaMallocManaged(recvbuff, nbytes)); + if (bias) CUDACHECK(cudaMallocManaged(bias, nbytes)); if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes)); #if 0 CUDACHECK(cudaMemset(*sendbuff, 0, nbytes)); @@ -1096,14 +1106,17 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) NCCLCHECK(ncclMemAlloc(sendbuff, nbytes)); NCCLCHECK(ncclMemAlloc(recvbuff, nbytes)); + if (bias) CUDACHECK(cudaMalloc(bias, nbytes)); if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes)); #else CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (bias) CUDACHECK(cudaMalloc(bias, nbytes)); if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); #endif } CUDACHECK(hipMemset(*sendbuff, 1, nbytes)); + if (bias) CUDACHECK(hipMemset(*bias, 1, nbytes)); if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes)); return testSuccess; } @@ -1554,6 +1567,7 @@ testResult_t run() { std::vector streams(nGpus*nThreads); std::vector sendbuffs(nGpus*nThreads); std::vector recvbuffs(nGpus*nThreads); + std::vector bias(nGpus*nThreads); std::vector expected(nGpus*nThreads); size_t sendBytes, recvBytes; @@ -1564,7 +1578,7 @@ testResult_t run() { for (int i=0; i diff --git a/src/gather.cu b/src/gather.cu index a0dc00de56..24156de7bd 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -45,7 +45,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/hypercube.cu b/src/hypercube.cu index f5d94f026d..c35cc765cc 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; diff --git a/src/reduce.cu b/src/reduce.cu index c2353c3fc0..bf37a2b016 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -48,7 +48,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index fe906ce372..fd589e2738 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -51,7 +51,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/scatter.cu b/src/scatter.cu index d0323fa36d..ca0f6c10ad 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -41,7 +41,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index 4f5f6b8a7b..c9c4c4bc0a 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -45,7 +45,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu index 20df5907a4..0896c724d0 100644 --- a/verifiable/verifiable.cu +++ b/verifiable/verifiable.cu @@ -1044,6 +1044,76 @@ hipError_t ncclVerifiablePrepareInput( #undef CASE_OP } +namespace { +template +__global__ void applyBias2( + T *elts, T *bias, intptr_t elt_n, ReduceFn op, intptr_t elt_ix0 + ) { + intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); + i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; + intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x); + i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x; + intptr_t i = i0 + threadIdx.x; + while(i < i1) { + elts[i] = op(elts[i], bias[i]); + #if 0 + T output = genOutput(op, rank_n, seed, elt_ix0+i); + printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n", + std::is_same::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts); + #endif + i += blockDim.x; + } +} + +template +void applyBias1( + void *elts, void* bias, intptr_t elt_n, int elt_ty, ReduceOp op, + intptr_t elt_ix0, cudaStream_t stream + ) { + int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); + #define CASE_TY(T) applyBias2<<>>((T*)elts, (T*)bias, elt_n, op, elt_ix0); break; + switch(elt_ty) { + case ncclInt8: CASE_TY(int8_t) + case ncclUint8: CASE_TY(uint8_t) + case ncclInt32: CASE_TY(int32_t) + case ncclUint32: CASE_TY(uint32_t) + case ncclInt64: CASE_TY(int64_t) + case ncclUint64: CASE_TY(uint64_t) + case ncclFloat16: CASE_TY(__half) + #if HAVE_ncclBfloat16 + case ncclBfloat16: CASE_TY(hip_bfloat16) + #endif + #if HAVE_ncclfp8 + case ncclFp8E4M3: CASE_TY(rccl_float8) + case ncclFp8E5M2: CASE_TY(rccl_bfloat8) + #endif + case ncclFloat32: CASE_TY(float) + case ncclFloat64: CASE_TY(double) + default: assert(0); + } + #undef CASE_TY +} +} + +void ncclVerifiableApplyBias( + void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0, + cudaStream_t stream + ) { + #define CASE_OP(op) \ + applyBias1(elts, bias, elt_n, elt_ty, op, elt_ix0, stream); \ + break; + switch(red_op) { + case ncclSum: CASE_OP(ReduceSum()) + case ncclMin: CASE_OP(ReduceMin()) + case ncclMax: CASE_OP(ReduceMax()) + case ncclProd: CASE_OP(ReduceProd()) + #if HAVE_ncclPreMulSum + default: CASE_OP(ReducePreMulSum()) + #endif + } + #undef CASE_OP +} + //////////////////////////////////////////////////////////////////////////////// namespace { diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h index f4452d6d32..b248492aee 100644 --- a/verifiable/verifiable.h +++ b/verifiable/verifiable.h @@ -64,6 +64,12 @@ hipError_t ncclVerifiableVerify( int64_t *bad_elt_n, cudaStream_t stream ); +// Enqueue kernel that applies bias to expected results +void ncclVerifiableApplyBias( + void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0, + cudaStream_t stream +); + #ifdef NCCL_VERIFIABLE_SELF_TEST void ncclVerifiableLaunchSelfTest(); #endif