Added alltoallv test and optional args variable on collective args (#514)
* Added alltoallv test and optional args variable on collective args
这个提交包含在:
+56
-16
@@ -23,6 +23,7 @@ namespace RcclUnitTesting
|
||||
case ncclCollGather: return DefaultPrepData_Gather(collArgs, false);
|
||||
case ncclCollScatter: return DefaultPrepData_Scatter(collArgs);
|
||||
case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs);
|
||||
case ncclCollAllToAllv: return DefaultPrepData_AllToAllv(collArgs);
|
||||
case ncclCollSend: return DefaultPrepData_Send(collArgs);
|
||||
case ncclCollRecv: return DefaultPrepData_Recv(collArgs);
|
||||
default:
|
||||
@@ -64,15 +65,15 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
|
||||
|
||||
// Only root needs input pattern
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
if (collArgs.globalRank == collArgs.options.root)
|
||||
CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root, true));
|
||||
collArgs.options.root, true));
|
||||
|
||||
// Otherwise all other ranks expected output is the same as input of root
|
||||
return collArgs.expected.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root,
|
||||
collArgs.options.root,
|
||||
false);
|
||||
}
|
||||
|
||||
@@ -96,11 +97,11 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(result.ClearCpuMem(numBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
|
||||
|
||||
// Loop over each rank and generate their input into a temp buffer, then reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
|
||||
|
||||
PtrUnion tempInputCpu;
|
||||
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
|
||||
@@ -117,14 +118,14 @@ namespace RcclUnitTesting
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
if (collArgs.options.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
}
|
||||
|
||||
// Any rank that requires output reduces the scaled-inputs
|
||||
if (isAllReduce || collArgs.root == collArgs.globalRank)
|
||||
if (isAllReduce || collArgs.options.root == collArgs.globalRank)
|
||||
{
|
||||
if (rank == 0)
|
||||
{
|
||||
@@ -139,7 +140,7 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
|
||||
if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank))
|
||||
{
|
||||
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
@@ -176,7 +177,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
if (isAllGather || collArgs.root == collArgs.globalRank)
|
||||
if (isAllGather || collArgs.options.root == collArgs.globalRank)
|
||||
{
|
||||
memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
|
||||
}
|
||||
@@ -207,11 +208,11 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
|
||||
|
||||
// Loop over each rank and generate the input / scale / reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
@@ -229,7 +230,7 @@ namespace RcclUnitTesting
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
if (collArgs.options.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
@@ -247,7 +248,7 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg)
|
||||
if (collArgs.options.redOp == ncclAvg)
|
||||
{
|
||||
CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
@@ -279,10 +280,10 @@ namespace RcclUnitTesting
|
||||
// Generate input as if on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.AllocateCpuMem(numInputBytes);
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false);
|
||||
|
||||
// Copy input to root rank
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
if (collArgs.globalRank == collArgs.options.root)
|
||||
{
|
||||
if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
|
||||
{
|
||||
@@ -341,6 +342,45 @@ namespace RcclUnitTesting
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs)
|
||||
{
|
||||
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// calculating maxNumElements as the maximum number of input bytes out of all the ranks
|
||||
size_t maxNumElements = 0;
|
||||
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
|
||||
for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank)
|
||||
{
|
||||
size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank];
|
||||
maxNumElements = std::max(maxNumElements, rankSendCount);
|
||||
}
|
||||
|
||||
// Clear outputs on all ranks (prior to input in case of in-place)
|
||||
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
|
||||
|
||||
// Generate input on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType));
|
||||
|
||||
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
|
||||
{
|
||||
tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false);
|
||||
size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
|
||||
size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType);
|
||||
size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
|
||||
memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes);
|
||||
}
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false);
|
||||
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
|
||||
tempInput.FreeCpuMem();
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
@@ -354,7 +394,7 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
return collArgs.expected.FillPattern(collArgs.dataType,
|
||||
collArgs.numOutputElements,
|
||||
collArgs.root,
|
||||
collArgs.options.root,
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
在新工单中引用
屏蔽一个用户