Added alltoallv test and optional args variable on collective args (#514)

* Added alltoallv test and optional args variable on collective args

[ROCm/rccl commit: 65ea3d80db]
This commit is contained in:
akolliasAMD
2022-03-18 13:55:11 -04:00
gecommit door GitHub
bovenliggende a44ff0fad5
commit 3493750b6b
13 gewijzigde bestanden met toevoegingen van 284 en 154 verwijderingen
+2 -2
Bestand weergeven
@@ -48,7 +48,7 @@ option(BUILD_TESTS "Build test programs" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF)
#Set the header wrapper ON by default.
#Set the header wrapper ON by default.
option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)
# parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
@@ -292,7 +292,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
GUARDS SYMLINK WRAPPER
WRAPPER_LOCATIONS rccl )
#install the wrapper header file to package
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
DESTINATION "./rccl/include/" )
endif()
@@ -16,11 +16,11 @@ namespace RcclUnitTesting
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat};
std::vector<ncclRedOp_t> const& redOps = {ncclSum};
std::vector<int> const numElements = {1048576, 53327, 1024};
int const root = 0;
bool const inPlace = false;
bool const useManagedMem = false;
int const numCollPerGroup = numElements.size();
OptionalColArgs options;
// This tests runs 3 collectives in the same group call
bool isCorrect = true;
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -33,30 +33,32 @@ namespace RcclUnitTesting
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
if (testBed.ev.showNames)
INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
isMultiProcess ? "MP" : "SP",
totalRanks, numCollPerGroup,
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
// Run all element sizes in parallel as single group
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
options.redOp = redOps[redOpIdx];
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
testBed.SetCollectiveArgs(funcType,
dataTypes[dataIdx],
redOps[redOpIdx],
root,
numElements[collIdx],
numElements[collIdx],
collIdx);
if (testBed.ev.showNames)
INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
isMultiProcess ? "MP" : "SP",
totalRanks, numCollPerGroup,
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
// Run all element sizes in parallel as single group
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
{
testBed.SetCollectiveArgs(funcType,
dataTypes[dataIdx],
numElements[collIdx],
numElements[collIdx],
options,
collIdx);
}
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
testBed.DeallocateMem();
}
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
testBed.DeallocateMem();
}
testBed.DestroyComms();
}
@@ -17,10 +17,10 @@ namespace RcclUnitTesting
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64};
ncclRedOp_t const redOp = ncclSum;
std::vector<int> const numElements = {1048576, 1024};
int const root = 0;
bool const inPlace = false;
bool const useManagedMem = false;
OptionalColArgs options;
// Terminate the test as soon as first failure occurs
bool isCorrect = true;
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -43,6 +43,8 @@ namespace RcclUnitTesting
double F = i;
scalarsPerRank.Set(dataType, i, i, F);
}
int const numBytes = totalRanks * DataTypeToBytes(dataType);
memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
// Test various scalar residence modes
for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
@@ -54,9 +56,11 @@ namespace RcclUnitTesting
for (int i = 0; i < numElements.size() && isCorrect; ++i)
{
testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
options.scalarMode = scalarMode;
options.redOp = redOp;
testBed.SetCollectiveArgs(funcType, dataType,
numElements[i], numElements[i],
-1, -1, scalarsPerRank, scalarMode);
options);
// For performance, only allocate and prepare data on largest size
if (i == 0)
{
@@ -0,0 +1,96 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
void sendRecvPrep(size_t numInputElementsArray[],
size_t numOutputElementsArray[],
OptionalColArgs &options,
int totalRanks, int numElementsBase)
{
for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
for (int recvRank = 0; recvRank < totalRanks; ++recvRank )
{
//create send counts, and build other arrays from that
options.sendcounts[sendRank*totalRanks+recvRank] = numElementsBase * (recvRank + 1);
options.recvcounts[recvRank*totalRanks+sendRank] = options.sendcounts[sendRank*totalRanks+recvRank ];
}
for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
{
options.sdispls[sendRank*totalRanks] = 0;
options.rdispls[sendRank*totalRanks] = 0;
for (int recvRank = 1; recvRank < totalRanks; ++recvRank )
{
options.sdispls[sendRank*totalRanks+recvRank] =
options.sdispls[sendRank*totalRanks+recvRank-1] + options.sendcounts[sendRank*totalRanks+recvRank-1];
options.rdispls[sendRank*totalRanks+recvRank] =
options.rdispls[sendRank*totalRanks+recvRank-1] + options.recvcounts[sendRank*totalRanks+recvRank-1];
}
numInputElementsArray[sendRank] = options.sdispls[(sendRank+1)*totalRanks-1] + options.sendcounts[(sendRank+1)*totalRanks-1];
numOutputElementsArray[sendRank] = options.rdispls[(sendRank+1)*totalRanks-1] + options.recvcounts[(sendRank+1)*totalRanks-1];
}
}
TEST(AllToAllv, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat64};
std::vector<int> const numElementsBase = {1048576, 53327, 1024};
bool const inPlace = false;
bool const useManagedMem = false;
OptionalColArgs options;
size_t numInputElementsArray[MAX_RANKS], numOutputElementsArray[MAX_RANKS];
bool isCorrect = true;
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
{
if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElementsBase.size() && isCorrect; ++numIdx)
{
if (testBed.ev.showNames)
{
std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess,
ncclCollAllToAllv, dataTypes[dataIdx],
ncclSum, -1,
inPlace, useManagedMem);
INFO("%s\n", name.c_str());
}
sendRecvPrep(numInputElementsArray, numOutputElementsArray, options, totalRanks, numElementsBase[numIdx]);
for (int rank = 0; rank < totalRanks; ++rank)
{
testBed.SetCollectiveArgs(ncclCollAllToAllv,
dataTypes[dataIdx],
numInputElementsArray[rank],
numOutputElementsArray[rank],
options,
-1,
rank);
}
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
testBed.DeallocateMem();
}
testBed.DestroyComms();
}
testBed.Finalize();
}
}
@@ -65,6 +65,8 @@ if(BUILD_TESTS)
#AllToAll
AllToAll_OutOfPlace.cpp
AllToAll_ManagedMem.cpp
#AllToAllv
AllToAllv_OutOfPlace.cpp
#Broadcast
Broadcast_InPlace.cpp
Broadcast_ManagedMem.cpp
@@ -17,6 +17,7 @@ namespace RcclUnitTesting
bool const inPlace = false;
bool const useManagedMem = false;
OptionalColArgs options;
bool isCorrect = true;
int totalRanks = testBed.ev.maxGpus;
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
@@ -32,12 +33,12 @@ namespace RcclUnitTesting
{
for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
{
options.root = recvRank;
testBed.SetCollectiveArgs(ncclCollSend,
dataTypes[dataIdx],
ncclSum,
recvRank,
numElements[numIdx],
numElements[numIdx],
options,
0,
sendRank);
if (recvRank == 0)
@@ -55,18 +56,17 @@ namespace RcclUnitTesting
recvRank,
numElements[numIdx]);
options.root = sendRank;
testBed.SetCollectiveArgs(ncclCollRecv,
dataTypes[dataIdx],
ncclSum,
sendRank,
numElements[numIdx],
numElements[numIdx],
options,
0,
recvRank);
testBed.AllocateMem(inPlace, useManagedMem, 0, recvRank);
testBed.PrepareData(0, recvRank);
testBed.ExecuteCollectives({sendRank,recvRank });
testBed.ExecuteCollectives({sendRank, recvRank});
testBed.ValidateResults(isCorrect, 0, recvRank);
testBed.DeallocateMem(0, recvRank);
}
@@ -14,20 +14,17 @@ namespace RcclUnitTesting
int const deviceId,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
ScalarTransport const scalarTransport,
int const scalarMode)
OptionalColArgs const &optionalColArgs)
{
// Free scalar based on previous scalarMode
if (scalarMode != -1)
if (optionalColArgs.scalarMode != -1)
{
if (this->localScalar.ptr != nullptr)
{
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->options.scalarMode == 1) hipHostFree(this->localScalar.ptr);
}
}
@@ -36,26 +33,23 @@ namespace RcclUnitTesting
this->deviceId = deviceId;
this->funcType = funcType;
this->dataType = dataType;
this->redOp = redOp;
this->root = root;
this->numInputElements = numInputElements;
this->numOutputElements = numOutputElements;
this->scalarTransport = scalarTransport;
this->scalarMode = scalarMode;
this->options = optionalColArgs;
if (scalarMode != -1)
if (this->options.scalarMode != -1)
{
size_t const numBytes = DataTypeToBytes(dataType);
if (scalarMode == ncclScalarDevice)
if (this->options.scalarMode == ncclScalarDevice)
{
CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
CHECK_HIP(hipMemcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes),
numBytes, hipMemcpyHostToDevice));
}
else if (scalarMode == ncclScalarHostImmediate)
else if (this->options.scalarMode == ncclScalarHostImmediate)
{
CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
memcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes);
}
}
return TEST_SUCCESS;
@@ -116,8 +110,8 @@ namespace RcclUnitTesting
ErrCode CollectiveArgs::ValidateResults()
{
// Ignore non-root outputs for collectives with a root
if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
if (CollectiveArgs::UsesRoot(this->funcType) && this->options.root != this->globalRank) return TEST_SUCCESS;
if (this->funcType == ncclCollSend) return TEST_SUCCESS; // on the send receive pair only recv needs to be checked
size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
@@ -153,8 +147,9 @@ namespace RcclUnitTesting
if (this->localScalar.ptr != nullptr)
{
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
this->localScalar.Attach(nullptr);
}
return TEST_SUCCESS;
}
@@ -174,6 +169,7 @@ namespace RcclUnitTesting
case ncclCollGather: ss << "ncclGather"; break;
case ncclCollScatter: ss << "ncclScatter"; break;
case ncclCollAllToAll: ss << "ncclAllToAll"; break;
case ncclCollAllToAllv: ss << "ncclAllToAllv"; break;
case ncclCollSend: ss << "ncclSend"; break;
case ncclCollRecv: ss << "ncclRecv"; break;
default: ss << "[Unknown]"; break;
@@ -184,9 +180,9 @@ namespace RcclUnitTesting
this->funcType == ncclCollReduceScatter ||
this->funcType == ncclCollAllReduce)
{
if (this->redOp < ncclNumOps)
if (this->options.redOp < ncclNumOps)
{
ss << ncclRedOpNames[this->redOp] << " ";
ss << ncclRedOpNames[this->options.redOp] << " ";
}
else
{
@@ -215,13 +211,13 @@ namespace RcclUnitTesting
this->funcType == ncclCollGather ||
this->funcType == ncclCollScatter)
{
ss << "Root " << this->root << " ";
ss << "Root " << this->options.root << " ";
}
if (this->funcType == ncclCollSend ||
this->funcType == ncclCollRecv)
{
ss << "Peer " << this->root << " ";
ss << "Peer " << this->options.root << " ";
}
ss << "#In: " << this->numInputElements;
@@ -277,7 +273,6 @@ namespace RcclUnitTesting
return (funcType == ncclCollBroadcast ||
funcType == ncclCollReduce ||
funcType == ncclCollGather ||
funcType == ncclCollScatter ||
funcType == ncclCollSend); // this is incorrect but it works because in Send root is not root it is the peer
funcType == ncclCollScatter);
}
}
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
ncclCollGather,
ncclCollScatter,
ncclCollAllToAll,
ncclCollAllToAllv,
ncclCollSend,
ncclCollRecv,
ncclNumFuncs
@@ -37,6 +38,7 @@ namespace RcclUnitTesting
"Gather",
"Scatter",
"AllToAll",
"AllToAllv",
"Send",
"Recv"
};
@@ -72,6 +74,19 @@ namespace RcclUnitTesting
char ptr[MAX_RANKS * sizeof(double)];
};
struct OptionalColArgs
{
ncclRedOp_t redOp = ncclSum;
int root = 0; // Used as "peer" for Send/Recv
ScalarTransport scalarTransport; // Used for custom reduction operators
int scalarMode = -1; // -1 if scalar not used
// allToAllv args
size_t sendcounts[MAX_RANKS*MAX_RANKS];
size_t sdispls[MAX_RANKS*MAX_RANKS];
size_t recvcounts[MAX_RANKS*MAX_RANKS];
size_t rdispls[MAX_RANKS*MAX_RANKS];
};
// Function pointer for functions that operate on CollectiveArgs
// e.g. For filling input / computing expected results
typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
@@ -85,13 +100,10 @@ namespace RcclUnitTesting
int deviceId;
ncclFunc_t funcType;
ncclDataType_t dataType;
ncclRedOp_t redOp;
int root; // Used as "peer" for Send/Recv
size_t numInputElements;
size_t numOutputElements;
ScalarTransport scalarTransport; // Used for custom reduction operators
PtrUnion localScalar;
int scalarMode; // -1 if scalar not used
OptionalColArgs options;
// Data
PtrUnion inputGpu;
@@ -111,12 +123,9 @@ namespace RcclUnitTesting
int const deviceId,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
ScalarTransport const scalarsPerRank,
int const scalarMode = -1);
OptionalColArgs const &optionalArgs = {});
// Allocates GPU memory for input/output and CPU memory for expected
// When inPlace is true, input and output share the same memory
@@ -23,6 +23,7 @@ namespace RcclUnitTesting
case ncclCollGather: return DefaultPrepData_Gather(collArgs, false);
case ncclCollScatter: return DefaultPrepData_Scatter(collArgs);
case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs);
case ncclCollAllToAllv: return DefaultPrepData_AllToAllv(collArgs);
case ncclCollSend: return DefaultPrepData_Send(collArgs);
case ncclCollRecv: return DefaultPrepData_Recv(collArgs);
default:
@@ -64,15 +65,15 @@ namespace RcclUnitTesting
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
// Only root needs input pattern
if (collArgs.globalRank == collArgs.root)
if (collArgs.globalRank == collArgs.options.root)
CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
collArgs.numInputElements,
collArgs.root, true));
collArgs.options.root, true));
// Otherwise all other ranks expected output is the same as input of root
return collArgs.expected.FillPattern(collArgs.dataType,
collArgs.numInputElements,
collArgs.root,
collArgs.options.root,
false);
}
@@ -96,11 +97,11 @@ namespace RcclUnitTesting
CHECK_CALL(result.ClearCpuMem(numBytes));
// If average or custom reduction operator is used, perform a summation instead
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
// Loop over each rank and generate their input into a temp buffer, then reduce
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
PtrUnion tempInputCpu;
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
@@ -117,14 +118,14 @@ namespace RcclUnitTesting
// Scale the temporary input by local scalar for this rank
// (Used by custom reduction ops)
if (collArgs.scalarMode >= 0)
if (collArgs.options.scalarMode >= 0)
{
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
scalarsPerRank, rank));
}
// Any rank that requires output reduces the scaled-inputs
if (isAllReduce || collArgs.root == collArgs.globalRank)
if (isAllReduce || collArgs.options.root == collArgs.globalRank)
{
if (rank == 0)
{
@@ -139,7 +140,7 @@ namespace RcclUnitTesting
}
// Perform averaging if necessary
if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank))
{
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
}
@@ -176,7 +177,7 @@ namespace RcclUnitTesting
{
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
}
if (isAllGather || collArgs.root == collArgs.globalRank)
if (isAllGather || collArgs.options.root == collArgs.globalRank)
{
memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
}
@@ -207,11 +208,11 @@ namespace RcclUnitTesting
CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
// If average or custom reduction operator is used, perform a summation instead
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
// Loop over each rank and generate the input / scale / reduce
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
@@ -229,7 +230,7 @@ namespace RcclUnitTesting
// Scale the temporary input by local scalar for this rank
// (Used by custom reduction ops)
if (collArgs.scalarMode >= 0)
if (collArgs.options.scalarMode >= 0)
{
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
scalarsPerRank, rank));
@@ -247,7 +248,7 @@ namespace RcclUnitTesting
}
// Perform averaging if necessary
if (collArgs.redOp == ncclAvg)
if (collArgs.options.redOp == ncclAvg)
{
CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
}
@@ -279,10 +280,10 @@ namespace RcclUnitTesting
// Generate input as if on root rank - each rank will receive a portion
PtrUnion tempInput;
tempInput.AllocateCpuMem(numInputBytes);
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false);
// Copy input to root rank
if (collArgs.globalRank == collArgs.root)
if (collArgs.globalRank == collArgs.options.root)
{
if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
{
@@ -341,6 +342,45 @@ namespace RcclUnitTesting
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
// calculating maxNumElements as the maximum number of input bytes out of all the ranks
size_t maxNumElements = 0;
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank)
{
size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank];
maxNumElements = std::max(maxNumElements, rankSendCount);
}
// Clear outputs on all ranks (prior to input in case of in-place)
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
// Generate input on root rank - each rank will receive a portion
PtrUnion tempInput;
tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType));
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
{
tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false);
size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType);
size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes);
}
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false);
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
tempInput.FreeCpuMem();
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
@@ -354,7 +394,7 @@ namespace RcclUnitTesting
CHECK_CALL(CheckAllocation(collArgs));
return collArgs.expected.FillPattern(collArgs.dataType,
collArgs.numOutputElements,
collArgs.root,
collArgs.options.root,
false);
}
}
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs);
}
+13 -29
Bestand weergeven
@@ -155,32 +155,19 @@ namespace RcclUnitTesting
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
}
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
int const collId,
int const rank,
PtrUnion const scalarsPerRank,
int const scalarMode)
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
size_t const numInputElements,
size_t const numOutputElements,
OptionalColArgs const &optionalArgs,
int const collId,
int const rank)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
ScalarTransport scalarTransport;
if (scalarMode >= 0)
{
ASSERT_TRUE(scalarsPerRank.ptr != NULL);
// Capture scalars per rank in format to share with child processes
int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
}
// Loop over all ranks and send CollectiveArgs to appropriate child process
int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
for (auto currRank : rankList)
@@ -191,12 +178,9 @@ namespace RcclUnitTesting
PIPE_WRITE(childId, collId);
PIPE_WRITE(childId, funcType);
PIPE_WRITE(childId, dataType);
PIPE_WRITE(childId, redOp);
PIPE_WRITE(childId, root);
PIPE_WRITE(childId, numInputElements);
PIPE_WRITE(childId, numOutputElements);
PIPE_WRITE(childId, scalarMode);
PIPE_WRITE(childId, scalarTransport);
PIPE_WRITE(childId, optionalArgs);
PIPE_CHECK(childId);
}
}
@@ -412,7 +396,7 @@ namespace RcclUnitTesting
// Sort numElements in descending order to cut down on # of allocations
std::vector<int> sortedN = numElements;
std::sort(sortedN.rbegin(), sortedN.rend());
OptionalColArgs optionalArgs;
// Filter out any unsupported datatypes, in case only subset has been compiled for
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
std::vector<ncclDataType_t> dataTypes;
@@ -479,13 +463,13 @@ namespace RcclUnitTesting
totalRanks,
&numInputElements,
&numOutputElements);
optionalArgs.redOp = redOps[rdIdx];
optionalArgs.root = roots[rtIdx];
this->SetCollectiveArgs(funcTypes[ftIdx],
dataTypes[dtIdx],
redOps[rdIdx],
roots[rtIdx],
numInputElements,
numOutputElements);
numOutputElements,
optionalArgs);
// Only allocate once for largest size
if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
+7 -10
Bestand weergeven
@@ -41,16 +41,13 @@ namespace RcclUnitTesting
// Using collId = -1 (default) applies settings to all collectives in group
// Using rank = -1 (default) applies settings to all ranks
void SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
int const collId = -1,
int const rank = -1,
PtrUnion const scalarsPerRank = {nullptr},
int const scalarMode = -1);
void SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
size_t const numInputElements,
size_t const numOutputElements,
OptionalColArgs const &optionalArgs = {},
int const collId = -1,
int const rank = -1);
// Allocate memory for specified collective / rank
// - Requires SetCollectiveArgs to have been called already
@@ -188,29 +188,17 @@ namespace RcclUnitTesting
int collId;
ncclFunc_t funcType;
ncclDataType_t dataType;
ncclRedOp_t redOp;
int root;
size_t numInputElements;
size_t numOutputElements;
ScalarTransport scalarTransport;
int scalarMode;
OptionalColArgs options;
PIPE_READ(globalRank);
PIPE_READ(collId);
PIPE_READ(funcType);
PIPE_READ(dataType);
PIPE_READ(redOp);
PIPE_READ(root);
PIPE_READ(numInputElements);
PIPE_READ(numOutputElements);
PIPE_READ(scalarMode);
PIPE_READ(scalarTransport);
for (int i = 0; i < this->totalRanks; i++)
{
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(scalarTransport.ptr);
}
PIPE_READ(options);
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
@@ -227,24 +215,24 @@ namespace RcclUnitTesting
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
this->deviceIds[localRank],
funcType, dataType, redOp, root,
funcType, dataType,
numInputElements, numOutputElements,
scalarTransport, scalarMode));
options));
if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
globalRank, this->childId, collIdx,
collArg.GetDescription().c_str());
// If pre-mult scalars are provided, then create a custom reduction operator
if (scalarMode >= 0)
if (options.scalarMode >= 0)
{
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.options.redOp,
collArg.localScalar.ptr,
dataType,
(ncclScalarResidence_t)scalarMode,
(ncclScalarResidence_t)options.scalarMode,
this->comms[localRank]),
"ncclRedOpCreatePreMulSum");
if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
this->childId, collArg.redOp, collIdx);
this->childId, collArg.options.redOp, collIdx);
}
}
}
@@ -383,7 +371,7 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclBroadcast");
@@ -393,8 +381,8 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.redOp,
collArg.root,
collArg.options.redOp,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclReduce");
@@ -413,7 +401,7 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.redOp,
collArg.options.redOp,
this->comms[localRank],
this->streams[localRank]),
"ncclReduceScatter");
@@ -423,7 +411,7 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.redOp,
collArg.options.redOp,
this->comms[localRank],
this->streams[localRank]),
"ncclAllReduce");
@@ -433,7 +421,7 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclGather");
@@ -443,7 +431,7 @@ namespace RcclUnitTesting
collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.root,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclScatter");
@@ -457,11 +445,23 @@ namespace RcclUnitTesting
this->streams[localRank]),
"ncclAllToAll");
break;
case ncclCollAllToAllv:
CHILD_NCCL_CALL(ncclAllToAllv(collArg.inputGpu.ptr,
collArg.options.sendcounts + (this->rankOffset + localRank)*this->totalRanks,
collArg.options.sdispls + (this->rankOffset + localRank)*this->totalRanks,
collArg.outputGpu.ptr,
collArg.options.recvcounts + (this->rankOffset + localRank)*this->totalRanks,
collArg.options.rdispls + (this->rankOffset + localRank)*this->totalRanks,
collArg.dataType,
this->comms[localRank],
this->streams[localRank]),
"ncclAllToAllv");
break;
case ncclCollSend:
CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclSend");
@@ -470,7 +470,7 @@ namespace RcclUnitTesting
CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.root,
collArg.options.root,
this->comms[localRank],
this->streams[localRank]),
"ncclRecv");
@@ -579,12 +579,12 @@ namespace RcclUnitTesting
CHECK_CALL(collArg.DeallocateMem());
}
if (collArg.scalarMode != -1)
if (collArg.options.scalarMode != -1)
{
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.options.redOp, this->comms[localRank]),
"ncclRedOpDestroy");
if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
this->childId, collArg.redOp, collIdx);
this->childId, collArg.options.redOp, collIdx);
}
}
if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);