From e4aef195118c27d1a624917649041b9e2f06968d Mon Sep 17 00:00:00 2001 From: Atul Kulkarni Date: Wed, 3 Dec 2025 17:37:34 -0600 Subject: [PATCH] Added new unit tests for AllReduce with Bias API (#2036) * Added new unit tests for AllReduce with Bias API * Address review comments [ROCm/rccl commit: 7c12b0b76bbf622d1a3051c3352f5171118657b7] --- projects/rccl/test/AllReduceTests.cpp | 444 +++++++++++++++++++ projects/rccl/test/common/CollectiveArgs.cpp | 20 + projects/rccl/test/common/CollectiveArgs.hpp | 16 + projects/rccl/test/common/EnvVars.cpp | 2 + projects/rccl/test/common/EnvVars.hpp | 1 + projects/rccl/test/common/PrepDataFuncs.cpp | 65 ++- projects/rccl/test/common/TestBed.cpp | 5 + projects/rccl/test/common/TestBed.hpp | 6 +- projects/rccl/test/common/TestBedChild.cpp | 45 +- 9 files changed, 586 insertions(+), 18 deletions(-) diff --git a/projects/rccl/test/AllReduceTests.cpp b/projects/rccl/test/AllReduceTests.cpp index 9e1d3d4410..39c25ae8a9 100644 --- a/projects/rccl/test/AllReduceTests.cpp +++ b/projects/rccl/test/AllReduceTests.cpp @@ -249,4 +249,448 @@ namespace RcclUnitTesting } callCollectiveForked(nranks, ncclCollAllReduce, sendBuff, recvBuff, expected, use_managed_mem); } + +#ifdef RCCL_ALLREDUCE_WITH_BIAS + // Note: All bias tests require: + // nRanks >= 2 (bias NOT supported for single rank) + + // Named constants for bias test configuration + namespace BiasTestConstants + { + // Element counts for different operations + constexpr std::initializer_list STANDARD_ELEM_COUNTS = {2048, 384}; // For Sum/Max/Min + constexpr std::initializer_list PROD_ELEM_COUNTS_MEDIUM = {32}; // For Int32/Uint32 Prod + constexpr std::initializer_list PROD_ELEM_COUNTS_LARGE = {64}; // For Int8/Uint8/Int64/Uint64/Float Prod + + // Bias and input pattern constants + constexpr int BIAS_CONSTANT_ONE = 1; // Use constant bias value of 1 (prevents overflow) + constexpr int BIAS_INCREMENTAL_PATTERN + = -1; // Use incremental pattern: bias[i] = i (more thorough testing) + constexpr int INPUT_RANK_BASED_PATTERN + = -1; // Use rank-based pattern: input[rank][i] = (rank+i)%256 + constexpr int INPUT_CONSTANT_ONE = 1; // Use constant input value of 1 (prevents overflow) + } // namespace BiasTestConstants + + /* + * @brief Helper function for running bias tests with specific datatype and redOp + * @param dataType Data type + * @param redOp Reduction operation + * @param numElements Number of elements + * @param biasConstVal Bias constant value, -1 for incremental bias + * @param inputConstVal Input constant value, -1 for rank-based input + */ + void RunBiasTest(ncclDataType_t dataType, + ncclRedOp_t redOp, + std::vector numElements, + int biasConstVal = BiasTestConstants::BIAS_INCREMENTAL_PATTERN, + int inputConstVal = BiasTestConstants::INPUT_RANK_BASED_PATTERN) + { + // Create TestBed first (doesn't create child processes yet) + TestBed testBed; + + // Check if architecture is gfx94 (covers gfx942) or gfx95 (covers gfx950) + if (!testBed.ev.isGfx94 && !testBed.ev.isGfx95) + { + INFO("SKIPPED: AllReduce with Bias is only supported on gfx942 or gfx950 architectures.\n"); + return; + } + + bool const inPlace = false; + bool const useManagedMem = false; + bool const useHipGraph = false; + + OptionalColArgs options; + options.useBias = true; + options.redOp = redOp; + options.biasConstantValue = biasConstVal; + options.inputConstantValue = inputConstVal; + + bool isCorrect = true; + + for(int totalRanks : testBed.ev.GetNumGpusList()) + { + if(totalRanks < 2) + continue; + + int const numProcesses = totalRanks; + bool const isMultiProcess = true; + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder)); + + for(auto numElem : numElements) + { + if(!isCorrect) + break; + + if(testBed.ev.showNames) + { + std::string name = testBed.GetTestCaseName(totalRanks, + isMultiProcess, + ncclCollAllReduce, + dataType, + redOp, + -1, + inPlace, + useManagedMem, + useHipGraph); + INFO(" %s (with bias, count=%d)\n", name.c_str(), numElem); + } + + options.biasNumElements = numElem; + + testBed.SetCollectiveArgs(ncclCollAllReduce, + dataType, + numElem, + numElem, + options, + -1, + 0, + -1); + testBed.AllocateMem(inPlace, useManagedMem); + testBed.PrepareData(); + testBed.ExecuteCollectives({}, useHipGraph); + testBed.ValidateResults(isCorrect); + testBed.DeallocateMem(); + } + testBed.DestroyComms(); + } + testBed.Finalize(); + } + + // Int8 Tests + TEST(AllReduce, BiasInt8_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt8, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt8_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt8, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt8_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt8, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt8_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt8, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_CONSTANT_ONE, + INPUT_CONSTANT_ONE); + } + + // Uint8 Tests + TEST(AllReduce, BiasUint8_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint8, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint8_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint8, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint8_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint8, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint8_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint8, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_CONSTANT_ONE, + INPUT_CONSTANT_ONE); + } + + // Int32 Tests + TEST(AllReduce, BiasInt32_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt32, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt32_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt32, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt32_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt32, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt32_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt32, + ncclProd, + PROD_ELEM_COUNTS_MEDIUM, + BIAS_CONSTANT_ONE, + INPUT_CONSTANT_ONE); + } + + // Uint32 Tests + TEST(AllReduce, BiasUint32_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint32, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint32_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint32, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint32_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint32, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_CONSTANT_ONE, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint32_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint32, + ncclProd, + PROD_ELEM_COUNTS_MEDIUM, + BIAS_CONSTANT_ONE, + INPUT_CONSTANT_ONE); + } + + // Int64 Tests + TEST(AllReduce, BiasInt64_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt64, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt64_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt64, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt64_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt64, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasInt64_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclInt64, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_INCREMENTAL_PATTERN, + INPUT_CONSTANT_ONE); + } + + // Uint64 Tests + TEST(AllReduce, BiasUint64_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint64, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint64_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint64, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint64_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint64, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasUint64_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclUint64, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_INCREMENTAL_PATTERN, + INPUT_CONSTANT_ONE); + } + + // Float32 Tests + TEST(AllReduce, BiasFloat32_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat32, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat32_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat32, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat32_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat32, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat32_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat32, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_INCREMENTAL_PATTERN, + INPUT_CONSTANT_ONE); + } + + // Float64 Tests + TEST(AllReduce, BiasFloat64_Sum) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat64, + ncclSum, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat64_Max) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat64, + ncclMax, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat64_Min) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat64, + ncclMin, + STANDARD_ELEM_COUNTS, + BIAS_INCREMENTAL_PATTERN, + INPUT_RANK_BASED_PATTERN); + } + + TEST(AllReduce, BiasFloat64_Prod) + { + using namespace BiasTestConstants; + RunBiasTest(ncclFloat64, + ncclProd, + PROD_ELEM_COUNTS_LARGE, + BIAS_INCREMENTAL_PATTERN, + INPUT_CONSTANT_ONE); + } + +#else + // If RCCL_ALLREDUCE_WITH_BIAS is not defined, skip all bias tests + TEST(AllReduce, BiasNotAvailable) + { + INFO("SKIPPED: RCCL_ALLREDUCE_WITH_BIAS not defined - bias tests skipped\n"); + return; + } +#endif } diff --git a/projects/rccl/test/common/CollectiveArgs.cpp b/projects/rccl/test/common/CollectiveArgs.cpp index 28ce1b1c09..93fe0d588f 100644 --- a/projects/rccl/test/common/CollectiveArgs.cpp +++ b/projects/rccl/test/common/CollectiveArgs.cpp @@ -102,6 +102,17 @@ namespace RcclUnitTesting CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated)); } CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated)); + + // Allocate bias buffers if bias is enabled + if (this->options.useBias) + { + this->numBiasElements = this->options.biasNumElements; + this->numBiasBytesAllocated = this->numBiasElements * DataTypeToBytes(this->dataType); + CHECK_CALL(this->biasGpu.AllocateGpuMem(this->numBiasBytesAllocated, useManagedMem, userRegistered)); + CHECK_CALL(this->biasCpu.AllocateCpuMem(this->numBiasBytesAllocated)); + this->biasRegHandle = nullptr; + } + return TEST_SUCCESS; } @@ -155,6 +166,15 @@ namespace RcclUnitTesting if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr)); this->localScalar.Attach(nullptr); } + + // Deallocate bias buffers if they were allocated + if (this->options.useBias && this->numBiasBytesAllocated > 0) + { + this->biasGpu.FreeGpuMem(this->userRegistered); + this->biasCpu.FreeCpuMem(); + this->biasRegHandle = nullptr; + } + return TEST_SUCCESS; } diff --git a/projects/rccl/test/common/CollectiveArgs.hpp b/projects/rccl/test/common/CollectiveArgs.hpp index aa497ebf01..0c7745aac8 100644 --- a/projects/rccl/test/common/CollectiveArgs.hpp +++ b/projects/rccl/test/common/CollectiveArgs.hpp @@ -82,6 +82,15 @@ namespace RcclUnitTesting ScalarTransport scalarTransport; // Used for custom reduction operators int scalarMode = -1; // -1 if scalar not used + // Bias support for fused AllReduce+Bias operations + bool useBias = false; // Enable bias addition + void* biasPtr = nullptr; // Pointer to bias buffer (GPU memory) + size_t biasNumElements = 0; // Number of elements in bias buffer + int biasConstantValue = -1; // If >= 0, use constant value for all bias elements (instead of incremental pattern) + + // Input data pattern control (useful for ncclProd to avoid overflow at high rank counts) + int inputConstantValue = -1; // If >= 0, use constant value for all input elements (instead of rank-based pattern) + // allToAllv args size_t sendcounts[MAX_RANKS*MAX_RANKS]; size_t sdispls[MAX_RANKS*MAX_RANKS]; @@ -122,6 +131,13 @@ namespace RcclUnitTesting size_t numInputElementsAllocated; size_t numOutputElementsAllocated; + // Bias data for fused AllReduce+Bias operations + PtrUnion biasGpu; // Bias buffer on GPU + PtrUnion biasCpu; // Bias buffer on CPU (for initialization/validation) + void* biasRegHandle; // Handle for registered bias buffer + size_t numBiasElements; // Number of elements in bias buffer + size_t numBiasBytesAllocated; // Number of bytes allocated for bias + // Set collective arguments ErrCode SetArgs(int const globalRank, int const totalRanks, diff --git a/projects/rccl/test/common/EnvVars.cpp b/projects/rccl/test/common/EnvVars.cpp index edcda6f457..de8b44ef51 100644 --- a/projects/rccl/test/common/EnvVars.cpp +++ b/projects/rccl/test/common/EnvVars.cpp @@ -200,6 +200,8 @@ namespace RcclUnitTesting numDetectedGpus = min(numDetectedGpus, 16); isGfx94 = false; getArchInfo(&isGfx94, "gfx94"); + isGfx95 = false; + getArchInfo(&isGfx95, "gfx95"); isGfx12 = false; getArchInfo(&isGfx12, "gfx12"); isGfx90 = false; diff --git a/projects/rccl/test/common/EnvVars.hpp b/projects/rccl/test/common/EnvVars.hpp index 914c4c1566..c2e211665f 100644 --- a/projects/rccl/test/common/EnvVars.hpp +++ b/projects/rccl/test/common/EnvVars.hpp @@ -33,6 +33,7 @@ namespace RcclUnitTesting bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD] bool isGfx94; // Detects if architecture is gfx94 + bool isGfx95; // Detects if architecture is gfx95 bool isGfx12; // Detects if architecture is gfx12 bool isGfx90; // Detects if architecture is gfx90 diff --git a/projects/rccl/test/common/PrepDataFuncs.cpp b/projects/rccl/test/common/PrepDataFuncs.cpp index cde4d73c34..31d99c548f 100644 --- a/projects/rccl/test/common/PrepDataFuncs.cpp +++ b/projects/rccl/test/common/PrepDataFuncs.cpp @@ -47,6 +47,22 @@ namespace RcclUnitTesting collArgs.numOutputElements, collArgs.numOutputElementsAllocated); return TEST_FAIL; } + + // Check bias allocation if bias is enabled + if (collArgs.options.useBias) + { + if (collArgs.numBiasElements == 0 || collArgs.numBiasBytesAllocated == 0) + { + ERROR("Bias is enabled but bias buffers are not allocated\n"); + return TEST_FAIL; + } + if (collArgs.numBiasElements != collArgs.numOutputElements) + { + ERROR("Number of bias elements (%lu) must match number of output elements (%lu)\n", + collArgs.numBiasElements, collArgs.numOutputElements); + return TEST_FAIL; + } + } return TEST_SUCCESS; } @@ -108,7 +124,22 @@ namespace RcclUnitTesting for (int rank = 0; rank < collArgs.totalRanks; ++rank) { // Generate temporary input for this rank - CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); + if (collArgs.options.inputConstantValue >= 0) + { + // Use constant value for all input elements across all ranks + // This is useful for ncclProd at high rank counts to avoid factorial overflow + for (size_t i = 0; i < collArgs.numInputElements; i++) + { + CHECK_CALL(tempInputCpu.Set(collArgs.dataType, i, + collArgs.options.inputConstantValue, + (double)collArgs.options.inputConstantValue)); + } + } + else + { + // Use rank-based pattern: value[rank][i] = (rank + i) % 256 (default behavior) + CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); + } // Copy the pre-scaled input into GPU memory for the correct rank if (rank == collArgs.globalRank) @@ -144,6 +175,38 @@ namespace RcclUnitTesting { CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks)); } + + // Add bias to expected output if bias is enabled + if (collArgs.options.useBias && (isAllReduce || collArgs.options.root == collArgs.globalRank)) + { + // Initialize bias data on CPU + if (collArgs.options.biasConstantValue >= 0) + { + // Use constant value for all bias elements (useful for ncclProd to avoid overflow) + for (size_t i = 0; i < collArgs.numBiasElements; i++) + { + CHECK_CALL(collArgs.biasCpu.Set(collArgs.dataType, i, + collArgs.options.biasConstantValue, + (double)collArgs.options.biasConstantValue)); + } + } + else + { + // Use incremental pattern: bias[i] = i (default behavior) + CHECK_CALL(collArgs.biasCpu.FillPattern(collArgs.dataType, collArgs.numBiasElements, 0, false)); + } + + // Copy bias data to GPU + size_t const biasBytes = collArgs.numBiasBytesAllocated; + CHECK_HIP(hipMemcpy(collArgs.biasGpu.ptr, collArgs.biasCpu.ptr, biasBytes, hipMemcpyHostToDevice)); + + // Apply bias to expected output using the SAME reduction operation as AllReduce + CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, collArgs.biasCpu, tempOp)); + + // Update the biasPtr in options to point to the GPU buffer + collArgs.options.biasPtr = collArgs.biasGpu.ptr; + } + return TEST_SUCCESS; } diff --git a/projects/rccl/test/common/TestBed.cpp b/projects/rccl/test/common/TestBed.cpp index 0825a847d1..f870ae0735 100644 --- a/projects/rccl/test/common/TestBed.cpp +++ b/projects/rccl/test/common/TestBed.cpp @@ -718,6 +718,11 @@ namespace RcclUnitTesting &numOutputElements); optionalArgs.redOp = redOps[rdIdx]; optionalArgs.root = roots[rtIdx] % this->numActiveRanks; + // Set biasNumElements if bias is enabled + if (optionalArgs.useBias) + { + optionalArgs.biasNumElements = numOutputElements; + } this->SetCollectiveArgs(funcTypes[ftIdx], dataTypes[dtIdx], numInputElements, diff --git a/projects/rccl/test/common/TestBed.hpp b/projects/rccl/test/common/TestBed.hpp index 6192289e61..e1a3215da0 100644 --- a/projects/rccl/test/common/TestBed.hpp +++ b/projects/rccl/test/common/TestBed.hpp @@ -41,7 +41,7 @@ namespace RcclUnitTesting std::vector const& numStreamsPerGroup, int const numGroupCalls = 1, bool const useBlocking = true); - + // Prepare TestBed for use with GPUs across multiple child processes void InitComms(std::vector> const& deviceIdsPerChild, int const numCollectivesInGroup = 1, @@ -96,7 +96,7 @@ namespace RcclUnitTesting // Execute all collectives on all test children // Blocks until collective is completed void ExecuteCollectives(std::vector const ¤tRanks = {}, - int const groupId = -1, + int const groupId = -1, bool const useHipGraph = false); // Perform results validation - compare output to expected @@ -140,7 +140,7 @@ namespace RcclUnitTesting int const numGpus, int const ranksPerGpu, const std::vector& gpuPriorityOrder); - + static std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus, const std::vector& gpuPriorityOrder); diff --git a/projects/rccl/test/common/TestBedChild.cpp b/projects/rccl/test/common/TestBedChild.cpp index 381d8f1dbb..c3e670b6df 100644 --- a/projects/rccl/test/common/TestBedChild.cpp +++ b/projects/rccl/test/common/TestBedChild.cpp @@ -598,15 +598,32 @@ namespace RcclUnitTesting "ncclReduceScatter"); break; case ncclCollAllReduce: - CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce( - collArg.inputGpu.ptr, - collArg.outputGpu.ptr, - collArg.numInputElements, - collArg.dataType, - collArg.options.redOp, - this->comms[localRank], - this->streams[groupId][localRank][collArg.streamIdx]), - "ncclAllReduce"); + // Use ncclAllReduceWithBias if bias is enabled + if (collArg.options.useBias) + { + CHILD_NCCL_CALL_RANK(errCode, ncclAllReduceWithBias( + collArg.inputGpu.ptr, + collArg.outputGpu.ptr, + collArg.numInputElements, + collArg.dataType, + collArg.options.redOp, + this->comms[localRank], + this->streams[groupId][localRank][collArg.streamIdx], + collArg.options.biasPtr), + "ncclAllReduceWithBias"); + } + else + { + CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce( + collArg.inputGpu.ptr, + collArg.outputGpu.ptr, + collArg.numInputElements, + collArg.dataType, + collArg.options.redOp, + this->comms[localRank], + this->streams[groupId][localRank][collArg.streamIdx]), + "ncclAllReduce"); + } break; case ncclCollGather: CHILD_NCCL_CALL_RANK(errCode, ncclGather( @@ -762,7 +779,7 @@ namespace RcclUnitTesting { streamsToComplete.erase(streamsToComplete.begin() + i); i--; - } + } } usElapsed = duration_cast(Clock::now() - start).count(); } @@ -773,13 +790,13 @@ namespace RcclUnitTesting if (this->verbose) INFO("Collective timed out, aborting\n"); for (int localRank : localRanksToExecute) { - ncclCommAbort(this->comms[localRank]); + ncclCommAbort(this->comms[localRank]); timedout = 1; } } // extra sync to flush GPU cache for validation later - // TODO: remove this after figuring out & fixing the exact behavior + // TODO: remove this after figuring out & fixing the exact behavior // of fencing between kernels and at hipStreamQuery for (int localRank : localRanksToExecute) { @@ -970,7 +987,7 @@ namespace RcclUnitTesting PIPE_READ(groupId); // Release graphs - for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank) + for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank) { CHECK_HIP(hipSetDevice(this->deviceIds[localRank])); for (int streamIdx = 0; streamIdx < this->numStreamsPerGroup[groupId]; ++streamIdx) @@ -989,7 +1006,7 @@ namespace RcclUnitTesting { for (int i = 0; i < this->numStreamsPerGroup[groupId]; ++i) CHECK_HIP(hipStreamSynchronize(this->streams[groupId][localRank][i])); - } + } this->graphs[groupId].clear(); this->graphExecs[groupId].clear();