Added alltoallv test and optional args variable on collective args (#514)
* Added alltoallv test and optional args variable on collective args
This commit is contained in:
+2
-2
@@ -48,7 +48,7 @@ option(BUILD_TESTS "Build test programs" OFF)
|
||||
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
|
||||
option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
|
||||
option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF)
|
||||
#Set the header wrapper ON by default.
|
||||
#Set the header wrapper ON by default.
|
||||
option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)
|
||||
|
||||
# parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
|
||||
@@ -292,7 +292,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
|
||||
GUARDS SYMLINK WRAPPER
|
||||
WRAPPER_LOCATIONS rccl )
|
||||
#install the wrapper header file to package
|
||||
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
|
||||
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
|
||||
DESTINATION "./rccl/include/" )
|
||||
endif()
|
||||
|
||||
|
||||
@@ -16,11 +16,11 @@ namespace RcclUnitTesting
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat};
|
||||
std::vector<ncclRedOp_t> const& redOps = {ncclSum};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
int const root = 0;
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
int const numCollPerGroup = numElements.size();
|
||||
|
||||
OptionalColArgs options;
|
||||
// This tests runs 3 collectives in the same group call
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
|
||||
@@ -33,30 +33,32 @@ namespace RcclUnitTesting
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
|
||||
|
||||
for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
{
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
|
||||
isMultiProcess ? "MP" : "SP",
|
||||
totalRanks, numCollPerGroup,
|
||||
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
|
||||
|
||||
// Run all element sizes in parallel as single group
|
||||
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
|
||||
options.redOp = redOps[redOpIdx];
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
{
|
||||
testBed.SetCollectiveArgs(funcType,
|
||||
dataTypes[dataIdx],
|
||||
redOps[redOpIdx],
|
||||
root,
|
||||
numElements[collIdx],
|
||||
numElements[collIdx],
|
||||
collIdx);
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
|
||||
isMultiProcess ? "MP" : "SP",
|
||||
totalRanks, numCollPerGroup,
|
||||
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
|
||||
|
||||
// Run all element sizes in parallel as single group
|
||||
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
|
||||
{
|
||||
testBed.SetCollectiveArgs(funcType,
|
||||
dataTypes[dataIdx],
|
||||
numElements[collIdx],
|
||||
numElements[collIdx],
|
||||
options,
|
||||
collIdx);
|
||||
}
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
testBed.ExecuteCollectives();
|
||||
testBed.ValidateResults(isCorrect);
|
||||
testBed.DeallocateMem();
|
||||
}
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
testBed.ExecuteCollectives();
|
||||
testBed.ValidateResults(isCorrect);
|
||||
testBed.DeallocateMem();
|
||||
}
|
||||
testBed.DestroyComms();
|
||||
}
|
||||
|
||||
@@ -17,10 +17,10 @@ namespace RcclUnitTesting
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64};
|
||||
ncclRedOp_t const redOp = ncclSum;
|
||||
std::vector<int> const numElements = {1048576, 1024};
|
||||
int const root = 0;
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
// Terminate the test as soon as first failure occurs
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
|
||||
@@ -43,6 +43,8 @@ namespace RcclUnitTesting
|
||||
double F = i;
|
||||
scalarsPerRank.Set(dataType, i, i, F);
|
||||
}
|
||||
int const numBytes = totalRanks * DataTypeToBytes(dataType);
|
||||
memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
|
||||
|
||||
// Test various scalar residence modes
|
||||
for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
|
||||
@@ -54,9 +56,11 @@ namespace RcclUnitTesting
|
||||
|
||||
for (int i = 0; i < numElements.size() && isCorrect; ++i)
|
||||
{
|
||||
testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
|
||||
options.scalarMode = scalarMode;
|
||||
options.redOp = redOp;
|
||||
testBed.SetCollectiveArgs(funcType, dataType,
|
||||
numElements[i], numElements[i],
|
||||
-1, -1, scalarsPerRank, scalarMode);
|
||||
options);
|
||||
// For performance, only allocate and prepare data on largest size
|
||||
if (i == 0)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
|
||||
void sendRecvPrep(size_t numInputElementsArray[],
|
||||
size_t numOutputElementsArray[],
|
||||
OptionalColArgs &options,
|
||||
int totalRanks, int numElementsBase)
|
||||
{
|
||||
for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
|
||||
for (int recvRank = 0; recvRank < totalRanks; ++recvRank )
|
||||
{
|
||||
//create send counts, and build other arrays from that
|
||||
options.sendcounts[sendRank*totalRanks+recvRank] = numElementsBase * (recvRank + 1);
|
||||
options.recvcounts[recvRank*totalRanks+sendRank] = options.sendcounts[sendRank*totalRanks+recvRank ];
|
||||
}
|
||||
|
||||
for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
|
||||
{
|
||||
options.sdispls[sendRank*totalRanks] = 0;
|
||||
options.rdispls[sendRank*totalRanks] = 0;
|
||||
for (int recvRank = 1; recvRank < totalRanks; ++recvRank )
|
||||
{
|
||||
options.sdispls[sendRank*totalRanks+recvRank] =
|
||||
options.sdispls[sendRank*totalRanks+recvRank-1] + options.sendcounts[sendRank*totalRanks+recvRank-1];
|
||||
options.rdispls[sendRank*totalRanks+recvRank] =
|
||||
options.rdispls[sendRank*totalRanks+recvRank-1] + options.recvcounts[sendRank*totalRanks+recvRank-1];
|
||||
}
|
||||
numInputElementsArray[sendRank] = options.sdispls[(sendRank+1)*totalRanks-1] + options.sendcounts[(sendRank+1)*totalRanks-1];
|
||||
numOutputElementsArray[sendRank] = options.rdispls[(sendRank+1)*totalRanks-1] + options.recvcounts[(sendRank+1)*totalRanks-1];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TEST(AllToAllv, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
// Configuration
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat64};
|
||||
std::vector<int> const numElementsBase = {1048576, 53327, 1024};
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
size_t numInputElementsArray[MAX_RANKS], numOutputElementsArray[MAX_RANKS];
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
{
|
||||
if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
|
||||
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
for (int numIdx = 0; numIdx < numElementsBase.size() && isCorrect; ++numIdx)
|
||||
{
|
||||
if (testBed.ev.showNames)
|
||||
{
|
||||
std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess,
|
||||
ncclCollAllToAllv, dataTypes[dataIdx],
|
||||
ncclSum, -1,
|
||||
inPlace, useManagedMem);
|
||||
INFO("%s\n", name.c_str());
|
||||
|
||||
}
|
||||
sendRecvPrep(numInputElementsArray, numOutputElementsArray, options, totalRanks, numElementsBase[numIdx]);
|
||||
for (int rank = 0; rank < totalRanks; ++rank)
|
||||
{
|
||||
testBed.SetCollectiveArgs(ncclCollAllToAllv,
|
||||
dataTypes[dataIdx],
|
||||
numInputElementsArray[rank],
|
||||
numOutputElementsArray[rank],
|
||||
options,
|
||||
-1,
|
||||
rank);
|
||||
|
||||
}
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
testBed.ExecuteCollectives();
|
||||
testBed.ValidateResults(isCorrect);
|
||||
testBed.DeallocateMem();
|
||||
|
||||
}
|
||||
testBed.DestroyComms();
|
||||
}
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -65,6 +65,8 @@ if(BUILD_TESTS)
|
||||
#AllToAll
|
||||
AllToAll_OutOfPlace.cpp
|
||||
AllToAll_ManagedMem.cpp
|
||||
#AllToAllv
|
||||
AllToAllv_OutOfPlace.cpp
|
||||
#Broadcast
|
||||
Broadcast_InPlace.cpp
|
||||
Broadcast_ManagedMem.cpp
|
||||
|
||||
@@ -17,6 +17,7 @@ namespace RcclUnitTesting
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
bool isCorrect = true;
|
||||
int totalRanks = testBed.ev.maxGpus;
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
@@ -32,12 +33,12 @@ namespace RcclUnitTesting
|
||||
{
|
||||
for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
|
||||
{
|
||||
options.root = recvRank;
|
||||
testBed.SetCollectiveArgs(ncclCollSend,
|
||||
dataTypes[dataIdx],
|
||||
ncclSum,
|
||||
recvRank,
|
||||
numElements[numIdx],
|
||||
numElements[numIdx],
|
||||
options,
|
||||
0,
|
||||
sendRank);
|
||||
if (recvRank == 0)
|
||||
@@ -55,18 +56,17 @@ namespace RcclUnitTesting
|
||||
recvRank,
|
||||
numElements[numIdx]);
|
||||
|
||||
|
||||
options.root = sendRank;
|
||||
testBed.SetCollectiveArgs(ncclCollRecv,
|
||||
dataTypes[dataIdx],
|
||||
ncclSum,
|
||||
sendRank,
|
||||
numElements[numIdx],
|
||||
numElements[numIdx],
|
||||
options,
|
||||
0,
|
||||
recvRank);
|
||||
testBed.AllocateMem(inPlace, useManagedMem, 0, recvRank);
|
||||
testBed.PrepareData(0, recvRank);
|
||||
testBed.ExecuteCollectives({sendRank,recvRank });
|
||||
testBed.ExecuteCollectives({sendRank, recvRank});
|
||||
testBed.ValidateResults(isCorrect, 0, recvRank);
|
||||
testBed.DeallocateMem(0, recvRank);
|
||||
}
|
||||
|
||||
@@ -14,20 +14,17 @@ namespace RcclUnitTesting
|
||||
int const deviceId,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
ScalarTransport const scalarTransport,
|
||||
int const scalarMode)
|
||||
OptionalColArgs const &optionalColArgs)
|
||||
{
|
||||
// Free scalar based on previous scalarMode
|
||||
if (scalarMode != -1)
|
||||
if (optionalColArgs.scalarMode != -1)
|
||||
{
|
||||
if (this->localScalar.ptr != nullptr)
|
||||
{
|
||||
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
|
||||
if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->options.scalarMode == 1) hipHostFree(this->localScalar.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,26 +33,23 @@ namespace RcclUnitTesting
|
||||
this->deviceId = deviceId;
|
||||
this->funcType = funcType;
|
||||
this->dataType = dataType;
|
||||
this->redOp = redOp;
|
||||
this->root = root;
|
||||
this->numInputElements = numInputElements;
|
||||
this->numOutputElements = numOutputElements;
|
||||
this->scalarTransport = scalarTransport;
|
||||
this->scalarMode = scalarMode;
|
||||
this->options = optionalColArgs;
|
||||
|
||||
if (scalarMode != -1)
|
||||
if (this->options.scalarMode != -1)
|
||||
{
|
||||
size_t const numBytes = DataTypeToBytes(dataType);
|
||||
if (scalarMode == ncclScalarDevice)
|
||||
if (this->options.scalarMode == ncclScalarDevice)
|
||||
{
|
||||
CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
|
||||
CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
|
||||
CHECK_HIP(hipMemcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes),
|
||||
numBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
else if (scalarMode == ncclScalarHostImmediate)
|
||||
else if (this->options.scalarMode == ncclScalarHostImmediate)
|
||||
{
|
||||
CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
|
||||
memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
|
||||
memcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes);
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
@@ -116,8 +110,8 @@ namespace RcclUnitTesting
|
||||
ErrCode CollectiveArgs::ValidateResults()
|
||||
{
|
||||
// Ignore non-root outputs for collectives with a root
|
||||
if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
|
||||
|
||||
if (CollectiveArgs::UsesRoot(this->funcType) && this->options.root != this->globalRank) return TEST_SUCCESS;
|
||||
if (this->funcType == ncclCollSend) return TEST_SUCCESS; // on the send receive pair only recv needs to be checked
|
||||
size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
|
||||
|
||||
CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
|
||||
@@ -153,8 +147,9 @@ namespace RcclUnitTesting
|
||||
|
||||
if (this->localScalar.ptr != nullptr)
|
||||
{
|
||||
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
|
||||
if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
|
||||
this->localScalar.Attach(nullptr);
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
@@ -174,6 +169,7 @@ namespace RcclUnitTesting
|
||||
case ncclCollGather: ss << "ncclGather"; break;
|
||||
case ncclCollScatter: ss << "ncclScatter"; break;
|
||||
case ncclCollAllToAll: ss << "ncclAllToAll"; break;
|
||||
case ncclCollAllToAllv: ss << "ncclAllToAllv"; break;
|
||||
case ncclCollSend: ss << "ncclSend"; break;
|
||||
case ncclCollRecv: ss << "ncclRecv"; break;
|
||||
default: ss << "[Unknown]"; break;
|
||||
@@ -184,9 +180,9 @@ namespace RcclUnitTesting
|
||||
this->funcType == ncclCollReduceScatter ||
|
||||
this->funcType == ncclCollAllReduce)
|
||||
{
|
||||
if (this->redOp < ncclNumOps)
|
||||
if (this->options.redOp < ncclNumOps)
|
||||
{
|
||||
ss << ncclRedOpNames[this->redOp] << " ";
|
||||
ss << ncclRedOpNames[this->options.redOp] << " ";
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -215,13 +211,13 @@ namespace RcclUnitTesting
|
||||
this->funcType == ncclCollGather ||
|
||||
this->funcType == ncclCollScatter)
|
||||
{
|
||||
ss << "Root " << this->root << " ";
|
||||
ss << "Root " << this->options.root << " ";
|
||||
}
|
||||
|
||||
if (this->funcType == ncclCollSend ||
|
||||
this->funcType == ncclCollRecv)
|
||||
{
|
||||
ss << "Peer " << this->root << " ";
|
||||
ss << "Peer " << this->options.root << " ";
|
||||
}
|
||||
|
||||
ss << "#In: " << this->numInputElements;
|
||||
@@ -277,7 +273,6 @@ namespace RcclUnitTesting
|
||||
return (funcType == ncclCollBroadcast ||
|
||||
funcType == ncclCollReduce ||
|
||||
funcType == ncclCollGather ||
|
||||
funcType == ncclCollScatter ||
|
||||
funcType == ncclCollSend); // this is incorrect but it works because in Send root is not root it is the peer
|
||||
funcType == ncclCollScatter);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
|
||||
ncclCollGather,
|
||||
ncclCollScatter,
|
||||
ncclCollAllToAll,
|
||||
ncclCollAllToAllv,
|
||||
ncclCollSend,
|
||||
ncclCollRecv,
|
||||
ncclNumFuncs
|
||||
@@ -37,6 +38,7 @@ namespace RcclUnitTesting
|
||||
"Gather",
|
||||
"Scatter",
|
||||
"AllToAll",
|
||||
"AllToAllv",
|
||||
"Send",
|
||||
"Recv"
|
||||
};
|
||||
@@ -72,6 +74,19 @@ namespace RcclUnitTesting
|
||||
char ptr[MAX_RANKS * sizeof(double)];
|
||||
};
|
||||
|
||||
struct OptionalColArgs
|
||||
{
|
||||
ncclRedOp_t redOp = ncclSum;
|
||||
int root = 0; // Used as "peer" for Send/Recv
|
||||
ScalarTransport scalarTransport; // Used for custom reduction operators
|
||||
int scalarMode = -1; // -1 if scalar not used
|
||||
// allToAllv args
|
||||
size_t sendcounts[MAX_RANKS*MAX_RANKS];
|
||||
size_t sdispls[MAX_RANKS*MAX_RANKS];
|
||||
size_t recvcounts[MAX_RANKS*MAX_RANKS];
|
||||
size_t rdispls[MAX_RANKS*MAX_RANKS];
|
||||
};
|
||||
|
||||
// Function pointer for functions that operate on CollectiveArgs
|
||||
// e.g. For filling input / computing expected results
|
||||
typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
|
||||
@@ -85,13 +100,10 @@ namespace RcclUnitTesting
|
||||
int deviceId;
|
||||
ncclFunc_t funcType;
|
||||
ncclDataType_t dataType;
|
||||
ncclRedOp_t redOp;
|
||||
int root; // Used as "peer" for Send/Recv
|
||||
size_t numInputElements;
|
||||
size_t numOutputElements;
|
||||
ScalarTransport scalarTransport; // Used for custom reduction operators
|
||||
PtrUnion localScalar;
|
||||
int scalarMode; // -1 if scalar not used
|
||||
OptionalColArgs options;
|
||||
|
||||
// Data
|
||||
PtrUnion inputGpu;
|
||||
@@ -111,12 +123,9 @@ namespace RcclUnitTesting
|
||||
int const deviceId,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
ScalarTransport const scalarsPerRank,
|
||||
int const scalarMode = -1);
|
||||
OptionalColArgs const &optionalArgs = {});
|
||||
|
||||
// Allocates GPU memory for input/output and CPU memory for expected
|
||||
// When inPlace is true, input and output share the same memory
|
||||
|
||||
@@ -23,6 +23,7 @@ namespace RcclUnitTesting
|
||||
case ncclCollGather: return DefaultPrepData_Gather(collArgs, false);
|
||||
case ncclCollScatter: return DefaultPrepData_Scatter(collArgs);
|
||||
case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs);
|
||||
case ncclCollAllToAllv: return DefaultPrepData_AllToAllv(collArgs);
|
||||
case ncclCollSend: return DefaultPrepData_Send(collArgs);
|
||||
case ncclCollRecv: return DefaultPrepData_Recv(collArgs);
|
||||
default:
|
||||
@@ -64,15 +65,15 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
|
||||
|
||||
// Only root needs input pattern
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
if (collArgs.globalRank == collArgs.options.root)
|
||||
CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root, true));
|
||||
collArgs.options.root, true));
|
||||
|
||||
// Otherwise all other ranks expected output is the same as input of root
|
||||
return collArgs.expected.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root,
|
||||
collArgs.options.root,
|
||||
false);
|
||||
}
|
||||
|
||||
@@ -96,11 +97,11 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(result.ClearCpuMem(numBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
|
||||
|
||||
// Loop over each rank and generate their input into a temp buffer, then reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
|
||||
|
||||
PtrUnion tempInputCpu;
|
||||
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
|
||||
@@ -117,14 +118,14 @@ namespace RcclUnitTesting
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
if (collArgs.options.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
}
|
||||
|
||||
// Any rank that requires output reduces the scaled-inputs
|
||||
if (isAllReduce || collArgs.root == collArgs.globalRank)
|
||||
if (isAllReduce || collArgs.options.root == collArgs.globalRank)
|
||||
{
|
||||
if (rank == 0)
|
||||
{
|
||||
@@ -139,7 +140,7 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
|
||||
if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank))
|
||||
{
|
||||
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
@@ -176,7 +177,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
if (isAllGather || collArgs.root == collArgs.globalRank)
|
||||
if (isAllGather || collArgs.options.root == collArgs.globalRank)
|
||||
{
|
||||
memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
|
||||
}
|
||||
@@ -207,11 +208,11 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
|
||||
|
||||
// Loop over each rank and generate the input / scale / reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
@@ -229,7 +230,7 @@ namespace RcclUnitTesting
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
if (collArgs.options.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
@@ -247,7 +248,7 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg)
|
||||
if (collArgs.options.redOp == ncclAvg)
|
||||
{
|
||||
CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
@@ -279,10 +280,10 @@ namespace RcclUnitTesting
|
||||
// Generate input as if on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.AllocateCpuMem(numInputBytes);
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false);
|
||||
|
||||
// Copy input to root rank
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
if (collArgs.globalRank == collArgs.options.root)
|
||||
{
|
||||
if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
|
||||
{
|
||||
@@ -341,6 +342,45 @@ namespace RcclUnitTesting
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs)
|
||||
{
|
||||
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// calculating maxNumElements as the maximum number of input bytes out of all the ranks
|
||||
size_t maxNumElements = 0;
|
||||
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
|
||||
for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank)
|
||||
{
|
||||
size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank];
|
||||
maxNumElements = std::max(maxNumElements, rankSendCount);
|
||||
}
|
||||
|
||||
// Clear outputs on all ranks (prior to input in case of in-place)
|
||||
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
|
||||
|
||||
// Generate input on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType));
|
||||
|
||||
for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
|
||||
{
|
||||
tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false);
|
||||
size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
|
||||
size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType);
|
||||
size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
|
||||
memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes);
|
||||
}
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false);
|
||||
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
|
||||
tempInput.FreeCpuMem();
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
@@ -354,7 +394,7 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
return collArgs.expected.FillPattern(collArgs.dataType,
|
||||
collArgs.numOutputElements,
|
||||
collArgs.root,
|
||||
collArgs.options.root,
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
|
||||
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs);
|
||||
}
|
||||
|
||||
+13
-29
@@ -155,32 +155,19 @@ namespace RcclUnitTesting
|
||||
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
|
||||
}
|
||||
|
||||
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
int const collId,
|
||||
int const rank,
|
||||
PtrUnion const scalarsPerRank,
|
||||
int const scalarMode)
|
||||
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
OptionalColArgs const &optionalArgs,
|
||||
int const collId,
|
||||
int const rank)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
ScalarTransport scalarTransport;
|
||||
if (scalarMode >= 0)
|
||||
{
|
||||
ASSERT_TRUE(scalarsPerRank.ptr != NULL);
|
||||
|
||||
// Capture scalars per rank in format to share with child processes
|
||||
int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
|
||||
memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
|
||||
}
|
||||
|
||||
// Loop over all ranks and send CollectiveArgs to appropriate child process
|
||||
int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
|
||||
for (auto currRank : rankList)
|
||||
@@ -191,12 +178,9 @@ namespace RcclUnitTesting
|
||||
PIPE_WRITE(childId, collId);
|
||||
PIPE_WRITE(childId, funcType);
|
||||
PIPE_WRITE(childId, dataType);
|
||||
PIPE_WRITE(childId, redOp);
|
||||
PIPE_WRITE(childId, root);
|
||||
PIPE_WRITE(childId, numInputElements);
|
||||
PIPE_WRITE(childId, numOutputElements);
|
||||
PIPE_WRITE(childId, scalarMode);
|
||||
PIPE_WRITE(childId, scalarTransport);
|
||||
PIPE_WRITE(childId, optionalArgs);
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
@@ -412,7 +396,7 @@ namespace RcclUnitTesting
|
||||
// Sort numElements in descending order to cut down on # of allocations
|
||||
std::vector<int> sortedN = numElements;
|
||||
std::sort(sortedN.rbegin(), sortedN.rend());
|
||||
|
||||
OptionalColArgs optionalArgs;
|
||||
// Filter out any unsupported datatypes, in case only subset has been compiled for
|
||||
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
@@ -479,13 +463,13 @@ namespace RcclUnitTesting
|
||||
totalRanks,
|
||||
&numInputElements,
|
||||
&numOutputElements);
|
||||
|
||||
optionalArgs.redOp = redOps[rdIdx];
|
||||
optionalArgs.root = roots[rtIdx];
|
||||
this->SetCollectiveArgs(funcTypes[ftIdx],
|
||||
dataTypes[dtIdx],
|
||||
redOps[rdIdx],
|
||||
roots[rtIdx],
|
||||
numInputElements,
|
||||
numOutputElements);
|
||||
numOutputElements,
|
||||
optionalArgs);
|
||||
|
||||
// Only allocate once for largest size
|
||||
if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
|
||||
+7
-10
@@ -41,16 +41,13 @@ namespace RcclUnitTesting
|
||||
// Using collId = -1 (default) applies settings to all collectives in group
|
||||
// Using rank = -1 (default) applies settings to all ranks
|
||||
|
||||
void SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
int const collId = -1,
|
||||
int const rank = -1,
|
||||
PtrUnion const scalarsPerRank = {nullptr},
|
||||
int const scalarMode = -1);
|
||||
void SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
OptionalColArgs const &optionalArgs = {},
|
||||
int const collId = -1,
|
||||
int const rank = -1);
|
||||
|
||||
// Allocate memory for specified collective / rank
|
||||
// - Requires SetCollectiveArgs to have been called already
|
||||
|
||||
@@ -188,29 +188,17 @@ namespace RcclUnitTesting
|
||||
int collId;
|
||||
ncclFunc_t funcType;
|
||||
ncclDataType_t dataType;
|
||||
ncclRedOp_t redOp;
|
||||
int root;
|
||||
size_t numInputElements;
|
||||
size_t numOutputElements;
|
||||
ScalarTransport scalarTransport;
|
||||
int scalarMode;
|
||||
OptionalColArgs options;
|
||||
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
PIPE_READ(funcType);
|
||||
PIPE_READ(dataType);
|
||||
PIPE_READ(redOp);
|
||||
PIPE_READ(root);
|
||||
PIPE_READ(numInputElements);
|
||||
PIPE_READ(numOutputElements);
|
||||
PIPE_READ(scalarMode);
|
||||
PIPE_READ(scalarTransport);
|
||||
|
||||
for (int i = 0; i < this->totalRanks; i++)
|
||||
{
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(scalarTransport.ptr);
|
||||
}
|
||||
PIPE_READ(options);
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
@@ -227,24 +215,24 @@ namespace RcclUnitTesting
|
||||
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
|
||||
CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
|
||||
this->deviceIds[localRank],
|
||||
funcType, dataType, redOp, root,
|
||||
funcType, dataType,
|
||||
numInputElements, numOutputElements,
|
||||
scalarTransport, scalarMode));
|
||||
options));
|
||||
if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
|
||||
globalRank, this->childId, collIdx,
|
||||
collArg.GetDescription().c_str());
|
||||
|
||||
// If pre-mult scalars are provided, then create a custom reduction operator
|
||||
if (scalarMode >= 0)
|
||||
if (options.scalarMode >= 0)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
|
||||
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.options.redOp,
|
||||
collArg.localScalar.ptr,
|
||||
dataType,
|
||||
(ncclScalarResidence_t)scalarMode,
|
||||
(ncclScalarResidence_t)options.scalarMode,
|
||||
this->comms[localRank]),
|
||||
"ncclRedOpCreatePreMulSum");
|
||||
if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
|
||||
this->childId, collArg.redOp, collIdx);
|
||||
this->childId, collArg.options.redOp, collIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -383,7 +371,7 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclBroadcast");
|
||||
@@ -393,8 +381,8 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
collArg.root,
|
||||
collArg.options.redOp,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclReduce");
|
||||
@@ -413,7 +401,7 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
collArg.options.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclReduceScatter");
|
||||
@@ -423,7 +411,7 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
collArg.options.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclAllReduce");
|
||||
@@ -433,7 +421,7 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclGather");
|
||||
@@ -443,7 +431,7 @@ namespace RcclUnitTesting
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclScatter");
|
||||
@@ -457,11 +445,23 @@ namespace RcclUnitTesting
|
||||
this->streams[localRank]),
|
||||
"ncclAllToAll");
|
||||
break;
|
||||
case ncclCollAllToAllv:
|
||||
CHILD_NCCL_CALL(ncclAllToAllv(collArg.inputGpu.ptr,
|
||||
collArg.options.sendcounts + (this->rankOffset + localRank)*this->totalRanks,
|
||||
collArg.options.sdispls + (this->rankOffset + localRank)*this->totalRanks,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.options.recvcounts + (this->rankOffset + localRank)*this->totalRanks,
|
||||
collArg.options.rdispls + (this->rankOffset + localRank)*this->totalRanks,
|
||||
collArg.dataType,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclAllToAllv");
|
||||
break;
|
||||
case ncclCollSend:
|
||||
CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclSend");
|
||||
@@ -470,7 +470,7 @@ namespace RcclUnitTesting
|
||||
CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
collArg.options.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclRecv");
|
||||
@@ -579,12 +579,12 @@ namespace RcclUnitTesting
|
||||
|
||||
CHECK_CALL(collArg.DeallocateMem());
|
||||
}
|
||||
if (collArg.scalarMode != -1)
|
||||
if (collArg.options.scalarMode != -1)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
|
||||
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.options.redOp, this->comms[localRank]),
|
||||
"ncclRedOpDestroy");
|
||||
if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
|
||||
this->childId, collArg.redOp, collIdx);
|
||||
this->childId, collArg.options.redOp, collIdx);
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);
|
||||
|
||||
Reference in New Issue
Block a user