diff --git a/CMakeLists.txt b/CMakeLists.txt index 842caef63c..c64c96c2d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ option(BUILD_TESTS "Build test programs" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF) -#Set the header wrapper ON by default. +#Set the header wrapper ON by default. option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON) # parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist @@ -292,7 +292,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY) GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS rccl ) #install the wrapper header file to package - rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h" + rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h" DESTINATION "./rccl/include/" ) endif() diff --git a/test/AllReduce_GroupCall.cpp b/test/AllReduce_GroupCall.cpp index 404935eb71..5d10a56c83 100644 --- a/test/AllReduce_GroupCall.cpp +++ b/test/AllReduce_GroupCall.cpp @@ -16,11 +16,11 @@ namespace RcclUnitTesting std::vector const& dataTypes = {ncclFloat}; std::vector const& redOps = {ncclSum}; std::vector const numElements = {1048576, 53327, 1024}; - int const root = 0; bool const inPlace = false; bool const useManagedMem = false; int const numCollPerGroup = numElements.size(); + OptionalColArgs options; // This tests runs 3 collectives in the same group call bool isCorrect = true; for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) @@ -33,30 +33,32 @@ namespace RcclUnitTesting testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx) - for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { - if (testBed.ev.showNames) - INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n", - isMultiProcess ? "MP" : "SP", - totalRanks, numCollPerGroup, - ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]); - - // Run all element sizes in parallel as single group - for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx) + options.redOp = redOps[redOpIdx]; + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { - testBed.SetCollectiveArgs(funcType, - dataTypes[dataIdx], - redOps[redOpIdx], - root, - numElements[collIdx], - numElements[collIdx], - collIdx); + if (testBed.ev.showNames) + INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n", + isMultiProcess ? "MP" : "SP", + totalRanks, numCollPerGroup, + ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]); + + // Run all element sizes in parallel as single group + for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx) + { + testBed.SetCollectiveArgs(funcType, + dataTypes[dataIdx], + numElements[collIdx], + numElements[collIdx], + options, + collIdx); + } + testBed.AllocateMem(inPlace, useManagedMem); + testBed.PrepareData(); + testBed.ExecuteCollectives(); + testBed.ValidateResults(isCorrect); + testBed.DeallocateMem(); } - testBed.AllocateMem(inPlace, useManagedMem); - testBed.PrepareData(); - testBed.ExecuteCollectives(); - testBed.ValidateResults(isCorrect); - testBed.DeallocateMem(); } testBed.DestroyComms(); } diff --git a/test/AllReduce_PreMultScalar.cpp b/test/AllReduce_PreMultScalar.cpp index 37d8a064b7..9be19000ad 100644 --- a/test/AllReduce_PreMultScalar.cpp +++ b/test/AllReduce_PreMultScalar.cpp @@ -17,10 +17,10 @@ namespace RcclUnitTesting std::vector const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64}; ncclRedOp_t const redOp = ncclSum; std::vector const numElements = {1048576, 1024}; - int const root = 0; bool const inPlace = false; bool const useManagedMem = false; + OptionalColArgs options; // Terminate the test as soon as first failure occurs bool isCorrect = true; for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) @@ -43,6 +43,8 @@ namespace RcclUnitTesting double F = i; scalarsPerRank.Set(dataType, i, i, F); } + int const numBytes = totalRanks * DataTypeToBytes(dataType); + memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes); // Test various scalar residence modes for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode) @@ -54,9 +56,11 @@ namespace RcclUnitTesting for (int i = 0; i < numElements.size() && isCorrect; ++i) { - testBed.SetCollectiveArgs(funcType, dataType, redOp, root, + options.scalarMode = scalarMode; + options.redOp = redOp; + testBed.SetCollectiveArgs(funcType, dataType, numElements[i], numElements[i], - -1, -1, scalarsPerRank, scalarMode); + options); // For performance, only allocate and prepare data on largest size if (i == 0) { diff --git a/test/AllToAllv_OutOfPlace.cpp b/test/AllToAllv_OutOfPlace.cpp new file mode 100644 index 0000000000..083f3353cd --- /dev/null +++ b/test/AllToAllv_OutOfPlace.cpp @@ -0,0 +1,96 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + + void sendRecvPrep(size_t numInputElementsArray[], + size_t numOutputElementsArray[], + OptionalColArgs &options, + int totalRanks, int numElementsBase) + { + for (int sendRank = 0; sendRank < totalRanks; ++sendRank) + for (int recvRank = 0; recvRank < totalRanks; ++recvRank ) + { + //create send counts, and build other arrays from that + options.sendcounts[sendRank*totalRanks+recvRank] = numElementsBase * (recvRank + 1); + options.recvcounts[recvRank*totalRanks+sendRank] = options.sendcounts[sendRank*totalRanks+recvRank ]; + } + + for (int sendRank = 0; sendRank < totalRanks; ++sendRank) + { + options.sdispls[sendRank*totalRanks] = 0; + options.rdispls[sendRank*totalRanks] = 0; + for (int recvRank = 1; recvRank < totalRanks; ++recvRank ) + { + options.sdispls[sendRank*totalRanks+recvRank] = + options.sdispls[sendRank*totalRanks+recvRank-1] + options.sendcounts[sendRank*totalRanks+recvRank-1]; + options.rdispls[sendRank*totalRanks+recvRank] = + options.rdispls[sendRank*totalRanks+recvRank-1] + options.recvcounts[sendRank*totalRanks+recvRank-1]; + } + numInputElementsArray[sendRank] = options.sdispls[(sendRank+1)*totalRanks-1] + options.sendcounts[(sendRank+1)*totalRanks-1]; + numOutputElementsArray[sendRank] = options.rdispls[(sendRank+1)*totalRanks-1] + options.recvcounts[(sendRank+1)*totalRanks-1]; + } + + } + + TEST(AllToAllv, OutOfPlace) + { + TestBed testBed; + // Configuration + std::vector const& dataTypes = {ncclInt32, ncclFloat64}; + std::vector const numElementsBase = {1048576, 53327, 1024}; + bool const inPlace = false; + bool const useManagedMem = false; + + OptionalColArgs options; + size_t numInputElementsArray[MAX_RANKS], numOutputElementsArray[MAX_RANKS]; + bool isCorrect = true; + for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) + for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) + { + if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; + + int const numProcesses = isMultiProcess ? totalRanks : 1; + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) + for (int numIdx = 0; numIdx < numElementsBase.size() && isCorrect; ++numIdx) + { + if (testBed.ev.showNames) + { + std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess, + ncclCollAllToAllv, dataTypes[dataIdx], + ncclSum, -1, + inPlace, useManagedMem); + INFO("%s\n", name.c_str()); + + } + sendRecvPrep(numInputElementsArray, numOutputElementsArray, options, totalRanks, numElementsBase[numIdx]); + for (int rank = 0; rank < totalRanks; ++rank) + { + testBed.SetCollectiveArgs(ncclCollAllToAllv, + dataTypes[dataIdx], + numInputElementsArray[rank], + numOutputElementsArray[rank], + options, + -1, + rank); + + } + testBed.AllocateMem(inPlace, useManagedMem); + testBed.PrepareData(); + testBed.ExecuteCollectives(); + testBed.ValidateResults(isCorrect); + testBed.DeallocateMem(); + + } + testBed.DestroyComms(); + } + testBed.Finalize(); + } +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 75c892a854..a40b610c03 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -65,6 +65,8 @@ if(BUILD_TESTS) #AllToAll AllToAll_OutOfPlace.cpp AllToAll_ManagedMem.cpp + #AllToAllv + AllToAllv_OutOfPlace.cpp #Broadcast Broadcast_InPlace.cpp Broadcast_ManagedMem.cpp diff --git a/test/SendRecv_SinglePairs.cpp b/test/SendRecv_SinglePairs.cpp index e23a64f754..1911fcf6dd 100644 --- a/test/SendRecv_SinglePairs.cpp +++ b/test/SendRecv_SinglePairs.cpp @@ -17,6 +17,7 @@ namespace RcclUnitTesting bool const inPlace = false; bool const useManagedMem = false; + OptionalColArgs options; bool isCorrect = true; int totalRanks = testBed.ev.maxGpus; for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) @@ -32,12 +33,12 @@ namespace RcclUnitTesting { for (int recvRank = 0; recvRank < totalRanks; ++recvRank) { + options.root = recvRank; testBed.SetCollectiveArgs(ncclCollSend, dataTypes[dataIdx], - ncclSum, - recvRank, numElements[numIdx], numElements[numIdx], + options, 0, sendRank); if (recvRank == 0) @@ -55,18 +56,17 @@ namespace RcclUnitTesting recvRank, numElements[numIdx]); - + options.root = sendRank; testBed.SetCollectiveArgs(ncclCollRecv, dataTypes[dataIdx], - ncclSum, - sendRank, numElements[numIdx], numElements[numIdx], + options, 0, recvRank); testBed.AllocateMem(inPlace, useManagedMem, 0, recvRank); testBed.PrepareData(0, recvRank); - testBed.ExecuteCollectives({sendRank,recvRank }); + testBed.ExecuteCollectives({sendRank, recvRank}); testBed.ValidateResults(isCorrect, 0, recvRank); testBed.DeallocateMem(0, recvRank); } diff --git a/test/common/CollectiveArgs.cpp b/test/common/CollectiveArgs.cpp index 976ce1bca8..08beed737d 100644 --- a/test/common/CollectiveArgs.cpp +++ b/test/common/CollectiveArgs.cpp @@ -14,20 +14,17 @@ namespace RcclUnitTesting int const deviceId, ncclFunc_t const funcType, ncclDataType_t const dataType, - ncclRedOp_t const redOp, - int const root, size_t const numInputElements, size_t const numOutputElements, - ScalarTransport const scalarTransport, - int const scalarMode) + OptionalColArgs const &optionalColArgs) { // Free scalar based on previous scalarMode - if (scalarMode != -1) + if (optionalColArgs.scalarMode != -1) { if (this->localScalar.ptr != nullptr) { - if (this->scalarMode == 0) this->localScalar.FreeGpuMem(); - if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr); + if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem(); + if (this->options.scalarMode == 1) hipHostFree(this->localScalar.ptr); } } @@ -36,26 +33,23 @@ namespace RcclUnitTesting this->deviceId = deviceId; this->funcType = funcType; this->dataType = dataType; - this->redOp = redOp; - this->root = root; this->numInputElements = numInputElements; this->numOutputElements = numOutputElements; - this->scalarTransport = scalarTransport; - this->scalarMode = scalarMode; + this->options = optionalColArgs; - if (scalarMode != -1) + if (this->options.scalarMode != -1) { size_t const numBytes = DataTypeToBytes(dataType); - if (scalarMode == ncclScalarDevice) + if (this->options.scalarMode == ncclScalarDevice) { CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes)); - CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), + CHECK_HIP(hipMemcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes, hipMemcpyHostToDevice)); } - else if (scalarMode == ncclScalarHostImmediate) + else if (this->options.scalarMode == ncclScalarHostImmediate) { CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0)); - memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes); + memcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes); } } return TEST_SUCCESS; @@ -116,8 +110,8 @@ namespace RcclUnitTesting ErrCode CollectiveArgs::ValidateResults() { // Ignore non-root outputs for collectives with a root - if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS; - + if (CollectiveArgs::UsesRoot(this->funcType) && this->options.root != this->globalRank) return TEST_SUCCESS; + if (this->funcType == ncclCollSend) return TEST_SUCCESS; // on the send receive pair only recv needs to be checked size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType)); CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost)); @@ -153,8 +147,9 @@ namespace RcclUnitTesting if (this->localScalar.ptr != nullptr) { - if (this->scalarMode == 0) this->localScalar.FreeGpuMem(); - if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr)); + if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem(); + if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr)); + this->localScalar.Attach(nullptr); } return TEST_SUCCESS; } @@ -174,6 +169,7 @@ namespace RcclUnitTesting case ncclCollGather: ss << "ncclGather"; break; case ncclCollScatter: ss << "ncclScatter"; break; case ncclCollAllToAll: ss << "ncclAllToAll"; break; + case ncclCollAllToAllv: ss << "ncclAllToAllv"; break; case ncclCollSend: ss << "ncclSend"; break; case ncclCollRecv: ss << "ncclRecv"; break; default: ss << "[Unknown]"; break; @@ -184,9 +180,9 @@ namespace RcclUnitTesting this->funcType == ncclCollReduceScatter || this->funcType == ncclCollAllReduce) { - if (this->redOp < ncclNumOps) + if (this->options.redOp < ncclNumOps) { - ss << ncclRedOpNames[this->redOp] << " "; + ss << ncclRedOpNames[this->options.redOp] << " "; } else { @@ -215,13 +211,13 @@ namespace RcclUnitTesting this->funcType == ncclCollGather || this->funcType == ncclCollScatter) { - ss << "Root " << this->root << " "; + ss << "Root " << this->options.root << " "; } if (this->funcType == ncclCollSend || this->funcType == ncclCollRecv) { - ss << "Peer " << this->root << " "; + ss << "Peer " << this->options.root << " "; } ss << "#In: " << this->numInputElements; @@ -277,7 +273,6 @@ namespace RcclUnitTesting return (funcType == ncclCollBroadcast || funcType == ncclCollReduce || funcType == ncclCollGather || - funcType == ncclCollScatter || - funcType == ncclCollSend); // this is incorrect but it works because in Send root is not root it is the peer + funcType == ncclCollScatter); } } diff --git a/test/common/CollectiveArgs.hpp b/test/common/CollectiveArgs.hpp index e7d294bc81..9caf2ab33f 100644 --- a/test/common/CollectiveArgs.hpp +++ b/test/common/CollectiveArgs.hpp @@ -22,6 +22,7 @@ namespace RcclUnitTesting ncclCollGather, ncclCollScatter, ncclCollAllToAll, + ncclCollAllToAllv, ncclCollSend, ncclCollRecv, ncclNumFuncs @@ -37,6 +38,7 @@ namespace RcclUnitTesting "Gather", "Scatter", "AllToAll", + "AllToAllv", "Send", "Recv" }; @@ -72,6 +74,19 @@ namespace RcclUnitTesting char ptr[MAX_RANKS * sizeof(double)]; }; + struct OptionalColArgs + { + ncclRedOp_t redOp = ncclSum; + int root = 0; // Used as "peer" for Send/Recv + ScalarTransport scalarTransport; // Used for custom reduction operators + int scalarMode = -1; // -1 if scalar not used + // allToAllv args + size_t sendcounts[MAX_RANKS*MAX_RANKS]; + size_t sdispls[MAX_RANKS*MAX_RANKS]; + size_t recvcounts[MAX_RANKS*MAX_RANKS]; + size_t rdispls[MAX_RANKS*MAX_RANKS]; + }; + // Function pointer for functions that operate on CollectiveArgs // e.g. For filling input / computing expected results typedef ErrCode (*CollFuncPtr)(CollectiveArgs &); @@ -85,13 +100,10 @@ namespace RcclUnitTesting int deviceId; ncclFunc_t funcType; ncclDataType_t dataType; - ncclRedOp_t redOp; - int root; // Used as "peer" for Send/Recv size_t numInputElements; size_t numOutputElements; - ScalarTransport scalarTransport; // Used for custom reduction operators PtrUnion localScalar; - int scalarMode; // -1 if scalar not used + OptionalColArgs options; // Data PtrUnion inputGpu; @@ -111,12 +123,9 @@ namespace RcclUnitTesting int const deviceId, ncclFunc_t const funcType, ncclDataType_t const dataType, - ncclRedOp_t const redOp, - int const root, size_t const numInputElements, size_t const numOutputElements, - ScalarTransport const scalarsPerRank, - int const scalarMode = -1); + OptionalColArgs const &optionalArgs = {}); // Allocates GPU memory for input/output and CPU memory for expected // When inPlace is true, input and output share the same memory diff --git a/test/common/PrepDataFuncs.cpp b/test/common/PrepDataFuncs.cpp index 4f837d76c7..cde4d73c34 100644 --- a/test/common/PrepDataFuncs.cpp +++ b/test/common/PrepDataFuncs.cpp @@ -23,6 +23,7 @@ namespace RcclUnitTesting case ncclCollGather: return DefaultPrepData_Gather(collArgs, false); case ncclCollScatter: return DefaultPrepData_Scatter(collArgs); case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs); + case ncclCollAllToAllv: return DefaultPrepData_AllToAllv(collArgs); case ncclCollSend: return DefaultPrepData_Send(collArgs); case ncclCollRecv: return DefaultPrepData_Recv(collArgs); default: @@ -64,15 +65,15 @@ namespace RcclUnitTesting CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes)); // Only root needs input pattern - if (collArgs.globalRank == collArgs.root) + if (collArgs.globalRank == collArgs.options.root) CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType, collArgs.numInputElements, - collArgs.root, true)); + collArgs.options.root, true)); // Otherwise all other ranks expected output is the same as input of root return collArgs.expected.FillPattern(collArgs.dataType, collArgs.numInputElements, - collArgs.root, + collArgs.options.root, false); } @@ -96,11 +97,11 @@ namespace RcclUnitTesting CHECK_CALL(result.ClearCpuMem(numBytes)); // If average or custom reduction operator is used, perform a summation instead - ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp); + ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp); // Loop over each rank and generate their input into a temp buffer, then reduce PtrUnion scalarsPerRank; - scalarsPerRank.Attach(collArgs.scalarTransport.ptr); + scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr); PtrUnion tempInputCpu; CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu)); @@ -117,14 +118,14 @@ namespace RcclUnitTesting // Scale the temporary input by local scalar for this rank // (Used by custom reduction ops) - if (collArgs.scalarMode >= 0) + if (collArgs.options.scalarMode >= 0) { CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements, scalarsPerRank, rank)); } // Any rank that requires output reduces the scaled-inputs - if (isAllReduce || collArgs.root == collArgs.globalRank) + if (isAllReduce || collArgs.options.root == collArgs.globalRank) { if (rank == 0) { @@ -139,7 +140,7 @@ namespace RcclUnitTesting } // Perform averaging if necessary - if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank)) + if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank)) { CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks)); } @@ -176,7 +177,7 @@ namespace RcclUnitTesting { CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice)); } - if (isAllGather || collArgs.root == collArgs.globalRank) + if (isAllGather || collArgs.options.root == collArgs.globalRank) { memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes); } @@ -207,11 +208,11 @@ namespace RcclUnitTesting CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes)); // If average or custom reduction operator is used, perform a summation instead - ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp); + ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp); // Loop over each rank and generate the input / scale / reduce PtrUnion scalarsPerRank; - scalarsPerRank.Attach(collArgs.scalarTransport.ptr); + scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr); for (int rank = 0; rank < collArgs.totalRanks; ++rank) { CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); @@ -229,7 +230,7 @@ namespace RcclUnitTesting // Scale the temporary input by local scalar for this rank // (Used by custom reduction ops) - if (collArgs.scalarMode >= 0) + if (collArgs.options.scalarMode >= 0) { CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements, scalarsPerRank, rank)); @@ -247,7 +248,7 @@ namespace RcclUnitTesting } // Perform averaging if necessary - if (collArgs.redOp == ncclAvg) + if (collArgs.options.redOp == ncclAvg) { CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks)); } @@ -279,10 +280,10 @@ namespace RcclUnitTesting // Generate input as if on root rank - each rank will receive a portion PtrUnion tempInput; tempInput.AllocateCpuMem(numInputBytes); - tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false); + tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false); // Copy input to root rank - if (collArgs.globalRank == collArgs.root) + if (collArgs.globalRank == collArgs.options.root) { if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess) { @@ -341,6 +342,45 @@ namespace RcclUnitTesting return TEST_SUCCESS; } + ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs) + { + + CHECK_CALL(CheckAllocation(collArgs)); + size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType); + + // calculating maxNumElements as the maximum number of input bytes out of all the ranks + size_t maxNumElements = 0; + for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank) + for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank) + { + size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank]; + maxNumElements = std::max(maxNumElements, rankSendCount); + } + + // Clear outputs on all ranks (prior to input in case of in-place) + collArgs.outputGpu.ClearGpuMem(numOutputBytes); + + // Generate input on root rank - each rank will receive a portion + PtrUnion tempInput; + tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType)); + + for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank) + { + tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false); + size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType); + size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType); + size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType); + memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes); + } + tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false); + + CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice)); + + tempInput.FreeCpuMem(); + return TEST_SUCCESS; + } + ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs) { CHECK_CALL(CheckAllocation(collArgs)); @@ -354,7 +394,7 @@ namespace RcclUnitTesting CHECK_CALL(CheckAllocation(collArgs)); return collArgs.expected.FillPattern(collArgs.dataType, collArgs.numOutputElements, - collArgs.root, + collArgs.options.root, false); } } diff --git a/test/common/PrepDataFuncs.hpp b/test/common/PrepDataFuncs.hpp index 1dc7f0dd3a..d610b917cd 100644 --- a/test/common/PrepDataFuncs.hpp +++ b/test/common/PrepDataFuncs.hpp @@ -22,6 +22,7 @@ namespace RcclUnitTesting ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs); ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs); ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs); ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs); ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs); } diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index edbab4faec..d3a962ac7f 100644 --- a/test/common/TestBed.cpp +++ b/test/common/TestBed.cpp @@ -155,32 +155,19 @@ namespace RcclUnitTesting InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup); } - void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, - ncclDataType_t const dataType, - ncclRedOp_t const redOp, - int const root, - size_t const numInputElements, - size_t const numOutputElements, - int const collId, - int const rank, - PtrUnion const scalarsPerRank, - int const scalarMode) + void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, + ncclDataType_t const dataType, + size_t const numInputElements, + size_t const numOutputElements, + OptionalColArgs const &optionalArgs, + int const collId, + int const rank) { // Build list of ranks this applies to (-1 for rank means to set for all) std::vector rankList; for (int i = 0; i < this->numActiveRanks; ++i) if (rank == -1 || rank == i) rankList.push_back(i); - ScalarTransport scalarTransport; - if (scalarMode >= 0) - { - ASSERT_TRUE(scalarsPerRank.ptr != NULL); - - // Capture scalars per rank in format to share with child processes - int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType); - memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes); - } - // Loop over all ranks and send CollectiveArgs to appropriate child process int const cmd = TestBedChild::CHILD_SET_COLL_ARGS; for (auto currRank : rankList) @@ -191,12 +178,9 @@ namespace RcclUnitTesting PIPE_WRITE(childId, collId); PIPE_WRITE(childId, funcType); PIPE_WRITE(childId, dataType); - PIPE_WRITE(childId, redOp); - PIPE_WRITE(childId, root); PIPE_WRITE(childId, numInputElements); PIPE_WRITE(childId, numOutputElements); - PIPE_WRITE(childId, scalarMode); - PIPE_WRITE(childId, scalarTransport); + PIPE_WRITE(childId, optionalArgs); PIPE_CHECK(childId); } } @@ -412,7 +396,7 @@ namespace RcclUnitTesting // Sort numElements in descending order to cut down on # of allocations std::vector sortedN = numElements; std::sort(sortedN.rbegin(), sortedN.rend()); - + OptionalColArgs optionalArgs; // Filter out any unsupported datatypes, in case only subset has been compiled for std::vector const& supportedDataTypes = this->GetAllSupportedDataTypes(); std::vector dataTypes; @@ -479,13 +463,13 @@ namespace RcclUnitTesting totalRanks, &numInputElements, &numOutputElements); - + optionalArgs.redOp = redOps[rdIdx]; + optionalArgs.root = roots[rtIdx]; this->SetCollectiveArgs(funcTypes[ftIdx], dataTypes[dtIdx], - redOps[rdIdx], - roots[rtIdx], numInputElements, - numOutputElements); + numOutputElements, + optionalArgs); // Only allocate once for largest size if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]); diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp index f06bb031ee..73e4070fdd 100644 --- a/test/common/TestBed.hpp +++ b/test/common/TestBed.hpp @@ -41,16 +41,13 @@ namespace RcclUnitTesting // Using collId = -1 (default) applies settings to all collectives in group // Using rank = -1 (default) applies settings to all ranks - void SetCollectiveArgs(ncclFunc_t const funcType, - ncclDataType_t const dataType, - ncclRedOp_t const redOp, - int const root, - size_t const numInputElements, - size_t const numOutputElements, - int const collId = -1, - int const rank = -1, - PtrUnion const scalarsPerRank = {nullptr}, - int const scalarMode = -1); + void SetCollectiveArgs(ncclFunc_t const funcType, + ncclDataType_t const dataType, + size_t const numInputElements, + size_t const numOutputElements, + OptionalColArgs const &optionalArgs = {}, + int const collId = -1, + int const rank = -1); // Allocate memory for specified collective / rank // - Requires SetCollectiveArgs to have been called already diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index a291f741c6..ceaa9522c2 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -188,29 +188,17 @@ namespace RcclUnitTesting int collId; ncclFunc_t funcType; ncclDataType_t dataType; - ncclRedOp_t redOp; - int root; size_t numInputElements; size_t numOutputElements; - ScalarTransport scalarTransport; - int scalarMode; + OptionalColArgs options; PIPE_READ(globalRank); PIPE_READ(collId); PIPE_READ(funcType); PIPE_READ(dataType); - PIPE_READ(redOp); - PIPE_READ(root); PIPE_READ(numInputElements); PIPE_READ(numOutputElements); - PIPE_READ(scalarMode); - PIPE_READ(scalarTransport); - - for (int i = 0; i < this->totalRanks; i++) - { - PtrUnion scalarsPerRank; - scalarsPerRank.Attach(scalarTransport.ptr); - } + PIPE_READ(options); if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank)) { @@ -227,24 +215,24 @@ namespace RcclUnitTesting CollectiveArgs& collArg = this->collArgs[localRank][collIdx]; CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks, this->deviceIds[localRank], - funcType, dataType, redOp, root, + funcType, dataType, numInputElements, numOutputElements, - scalarTransport, scalarMode)); + options)); if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n", globalRank, this->childId, collIdx, collArg.GetDescription().c_str()); // If pre-mult scalars are provided, then create a custom reduction operator - if (scalarMode >= 0) + if (options.scalarMode >= 0) { - CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp, + CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.options.redOp, collArg.localScalar.ptr, dataType, - (ncclScalarResidence_t)scalarMode, + (ncclScalarResidence_t)options.scalarMode, this->comms[localRank]), "ncclRedOpCreatePreMulSum"); if (verbose) INFO("Child %d created custom redop %d for collective %d\n", - this->childId, collArg.redOp, collIdx); + this->childId, collArg.options.redOp, collIdx); } } } @@ -383,7 +371,7 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numInputElements, collArg.dataType, - collArg.root, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclBroadcast"); @@ -393,8 +381,8 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numInputElements, collArg.dataType, - collArg.redOp, - collArg.root, + collArg.options.redOp, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclReduce"); @@ -413,7 +401,7 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numOutputElements, collArg.dataType, - collArg.redOp, + collArg.options.redOp, this->comms[localRank], this->streams[localRank]), "ncclReduceScatter"); @@ -423,7 +411,7 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numInputElements, collArg.dataType, - collArg.redOp, + collArg.options.redOp, this->comms[localRank], this->streams[localRank]), "ncclAllReduce"); @@ -433,7 +421,7 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numInputElements, collArg.dataType, - collArg.root, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclGather"); @@ -443,7 +431,7 @@ namespace RcclUnitTesting collArg.outputGpu.ptr, collArg.numOutputElements, collArg.dataType, - collArg.root, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclScatter"); @@ -457,11 +445,23 @@ namespace RcclUnitTesting this->streams[localRank]), "ncclAllToAll"); break; + case ncclCollAllToAllv: + CHILD_NCCL_CALL(ncclAllToAllv(collArg.inputGpu.ptr, + collArg.options.sendcounts + (this->rankOffset + localRank)*this->totalRanks, + collArg.options.sdispls + (this->rankOffset + localRank)*this->totalRanks, + collArg.outputGpu.ptr, + collArg.options.recvcounts + (this->rankOffset + localRank)*this->totalRanks, + collArg.options.rdispls + (this->rankOffset + localRank)*this->totalRanks, + collArg.dataType, + this->comms[localRank], + this->streams[localRank]), + "ncclAllToAllv"); + break; case ncclCollSend: CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr, collArg.numInputElements, collArg.dataType, - collArg.root, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclSend"); @@ -470,7 +470,7 @@ namespace RcclUnitTesting CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr, collArg.numOutputElements, collArg.dataType, - collArg.root, + collArg.options.root, this->comms[localRank], this->streams[localRank]), "ncclRecv"); @@ -579,12 +579,12 @@ namespace RcclUnitTesting CHECK_CALL(collArg.DeallocateMem()); } - if (collArg.scalarMode != -1) + if (collArg.options.scalarMode != -1) { - CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]), + CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.options.redOp, this->comms[localRank]), "ncclRedOpDestroy"); if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n", - this->childId, collArg.redOp, collIdx); + this->childId, collArg.options.redOp, collIdx); } } if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);