Added new unit tests for AllReduce with Bias API (#2036)
* Added new unit tests for AllReduce with Bias API
* Address review comments
[ROCm/rccl commit: 7c12b0b76b]
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
3e650467fa
commit
e4aef19511
@@ -249,4 +249,448 @@ namespace RcclUnitTesting
|
||||
}
|
||||
callCollectiveForked(nranks, ncclCollAllReduce, sendBuff, recvBuff, expected, use_managed_mem);
|
||||
}
|
||||
|
||||
#ifdef RCCL_ALLREDUCE_WITH_BIAS
|
||||
// Note: All bias tests require:
|
||||
// nRanks >= 2 (bias NOT supported for single rank)
|
||||
|
||||
// Named constants for bias test configuration
|
||||
namespace BiasTestConstants
|
||||
{
|
||||
// Element counts for different operations
|
||||
constexpr std::initializer_list<int> STANDARD_ELEM_COUNTS = {2048, 384}; // For Sum/Max/Min
|
||||
constexpr std::initializer_list<int> PROD_ELEM_COUNTS_MEDIUM = {32}; // For Int32/Uint32 Prod
|
||||
constexpr std::initializer_list<int> PROD_ELEM_COUNTS_LARGE = {64}; // For Int8/Uint8/Int64/Uint64/Float Prod
|
||||
|
||||
// Bias and input pattern constants
|
||||
constexpr int BIAS_CONSTANT_ONE = 1; // Use constant bias value of 1 (prevents overflow)
|
||||
constexpr int BIAS_INCREMENTAL_PATTERN
|
||||
= -1; // Use incremental pattern: bias[i] = i (more thorough testing)
|
||||
constexpr int INPUT_RANK_BASED_PATTERN
|
||||
= -1; // Use rank-based pattern: input[rank][i] = (rank+i)%256
|
||||
constexpr int INPUT_CONSTANT_ONE = 1; // Use constant input value of 1 (prevents overflow)
|
||||
} // namespace BiasTestConstants
|
||||
|
||||
/*
|
||||
* @brief Helper function for running bias tests with specific datatype and redOp
|
||||
* @param dataType Data type
|
||||
* @param redOp Reduction operation
|
||||
* @param numElements Number of elements
|
||||
* @param biasConstVal Bias constant value, -1 for incremental bias
|
||||
* @param inputConstVal Input constant value, -1 for rank-based input
|
||||
*/
|
||||
void RunBiasTest(ncclDataType_t dataType,
|
||||
ncclRedOp_t redOp,
|
||||
std::vector<int> numElements,
|
||||
int biasConstVal = BiasTestConstants::BIAS_INCREMENTAL_PATTERN,
|
||||
int inputConstVal = BiasTestConstants::INPUT_RANK_BASED_PATTERN)
|
||||
{
|
||||
// Create TestBed first (doesn't create child processes yet)
|
||||
TestBed testBed;
|
||||
|
||||
// Check if architecture is gfx94 (covers gfx942) or gfx95 (covers gfx950)
|
||||
if (!testBed.ev.isGfx94 && !testBed.ev.isGfx95)
|
||||
{
|
||||
INFO("SKIPPED: AllReduce with Bias is only supported on gfx942 or gfx950 architectures.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
bool const useHipGraph = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
options.useBias = true;
|
||||
options.redOp = redOp;
|
||||
options.biasConstantValue = biasConstVal;
|
||||
options.inputConstantValue = inputConstVal;
|
||||
|
||||
bool isCorrect = true;
|
||||
|
||||
for(int totalRanks : testBed.ev.GetNumGpusList())
|
||||
{
|
||||
if(totalRanks < 2)
|
||||
continue;
|
||||
|
||||
int const numProcesses = totalRanks;
|
||||
bool const isMultiProcess = true;
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
|
||||
|
||||
for(auto numElem : numElements)
|
||||
{
|
||||
if(!isCorrect)
|
||||
break;
|
||||
|
||||
if(testBed.ev.showNames)
|
||||
{
|
||||
std::string name = testBed.GetTestCaseName(totalRanks,
|
||||
isMultiProcess,
|
||||
ncclCollAllReduce,
|
||||
dataType,
|
||||
redOp,
|
||||
-1,
|
||||
inPlace,
|
||||
useManagedMem,
|
||||
useHipGraph);
|
||||
INFO(" %s (with bias, count=%d)\n", name.c_str(), numElem);
|
||||
}
|
||||
|
||||
options.biasNumElements = numElem;
|
||||
|
||||
testBed.SetCollectiveArgs(ncclCollAllReduce,
|
||||
dataType,
|
||||
numElem,
|
||||
numElem,
|
||||
options,
|
||||
-1,
|
||||
0,
|
||||
-1);
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
testBed.ExecuteCollectives({}, useHipGraph);
|
||||
testBed.ValidateResults(isCorrect);
|
||||
testBed.DeallocateMem();
|
||||
}
|
||||
testBed.DestroyComms();
|
||||
}
|
||||
testBed.Finalize();
|
||||
}
|
||||
|
||||
// Int8 Tests
|
||||
TEST(AllReduce, BiasInt8_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt8,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt8_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt8,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt8_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt8,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt8_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt8,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Uint8 Tests
|
||||
TEST(AllReduce, BiasUint8_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint8,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint8_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint8,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint8_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint8,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint8_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint8,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Int32 Tests
|
||||
TEST(AllReduce, BiasInt32_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt32,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt32_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt32,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt32_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt32,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt32_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt32,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_MEDIUM,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Uint32 Tests
|
||||
TEST(AllReduce, BiasUint32_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint32,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint32_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint32,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint32_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint32,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint32_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint32,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_MEDIUM,
|
||||
BIAS_CONSTANT_ONE,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Int64 Tests
|
||||
TEST(AllReduce, BiasInt64_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt64,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt64_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt64,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt64_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt64,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasInt64_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclInt64,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Uint64 Tests
|
||||
TEST(AllReduce, BiasUint64_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint64,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint64_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint64,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint64_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint64,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasUint64_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclUint64,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Float32 Tests
|
||||
TEST(AllReduce, BiasFloat32_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat32,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat32_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat32,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat32_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat32,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat32_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat32,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
// Float64 Tests
|
||||
TEST(AllReduce, BiasFloat64_Sum)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat64,
|
||||
ncclSum,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat64_Max)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat64,
|
||||
ncclMax,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat64_Min)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat64,
|
||||
ncclMin,
|
||||
STANDARD_ELEM_COUNTS,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_RANK_BASED_PATTERN);
|
||||
}
|
||||
|
||||
TEST(AllReduce, BiasFloat64_Prod)
|
||||
{
|
||||
using namespace BiasTestConstants;
|
||||
RunBiasTest(ncclFloat64,
|
||||
ncclProd,
|
||||
PROD_ELEM_COUNTS_LARGE,
|
||||
BIAS_INCREMENTAL_PATTERN,
|
||||
INPUT_CONSTANT_ONE);
|
||||
}
|
||||
|
||||
#else
|
||||
// If RCCL_ALLREDUCE_WITH_BIAS is not defined, skip all bias tests
|
||||
TEST(AllReduce, BiasNotAvailable)
|
||||
{
|
||||
INFO("SKIPPED: RCCL_ALLREDUCE_WITH_BIAS not defined - bias tests skipped\n");
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -102,6 +102,17 @@ namespace RcclUnitTesting
|
||||
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
|
||||
}
|
||||
CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
|
||||
|
||||
// Allocate bias buffers if bias is enabled
|
||||
if (this->options.useBias)
|
||||
{
|
||||
this->numBiasElements = this->options.biasNumElements;
|
||||
this->numBiasBytesAllocated = this->numBiasElements * DataTypeToBytes(this->dataType);
|
||||
CHECK_CALL(this->biasGpu.AllocateGpuMem(this->numBiasBytesAllocated, useManagedMem, userRegistered));
|
||||
CHECK_CALL(this->biasCpu.AllocateCpuMem(this->numBiasBytesAllocated));
|
||||
this->biasRegHandle = nullptr;
|
||||
}
|
||||
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -155,6 +166,15 @@ namespace RcclUnitTesting
|
||||
if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
|
||||
this->localScalar.Attach(nullptr);
|
||||
}
|
||||
|
||||
// Deallocate bias buffers if they were allocated
|
||||
if (this->options.useBias && this->numBiasBytesAllocated > 0)
|
||||
{
|
||||
this->biasGpu.FreeGpuMem(this->userRegistered);
|
||||
this->biasCpu.FreeCpuMem();
|
||||
this->biasRegHandle = nullptr;
|
||||
}
|
||||
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -82,6 +82,15 @@ namespace RcclUnitTesting
|
||||
ScalarTransport scalarTransport; // Used for custom reduction operators
|
||||
int scalarMode = -1; // -1 if scalar not used
|
||||
|
||||
// Bias support for fused AllReduce+Bias operations
|
||||
bool useBias = false; // Enable bias addition
|
||||
void* biasPtr = nullptr; // Pointer to bias buffer (GPU memory)
|
||||
size_t biasNumElements = 0; // Number of elements in bias buffer
|
||||
int biasConstantValue = -1; // If >= 0, use constant value for all bias elements (instead of incremental pattern)
|
||||
|
||||
// Input data pattern control (useful for ncclProd to avoid overflow at high rank counts)
|
||||
int inputConstantValue = -1; // If >= 0, use constant value for all input elements (instead of rank-based pattern)
|
||||
|
||||
// allToAllv args
|
||||
size_t sendcounts[MAX_RANKS*MAX_RANKS];
|
||||
size_t sdispls[MAX_RANKS*MAX_RANKS];
|
||||
@@ -122,6 +131,13 @@ namespace RcclUnitTesting
|
||||
size_t numInputElementsAllocated;
|
||||
size_t numOutputElementsAllocated;
|
||||
|
||||
// Bias data for fused AllReduce+Bias operations
|
||||
PtrUnion biasGpu; // Bias buffer on GPU
|
||||
PtrUnion biasCpu; // Bias buffer on CPU (for initialization/validation)
|
||||
void* biasRegHandle; // Handle for registered bias buffer
|
||||
size_t numBiasElements; // Number of elements in bias buffer
|
||||
size_t numBiasBytesAllocated; // Number of bytes allocated for bias
|
||||
|
||||
// Set collective arguments
|
||||
ErrCode SetArgs(int const globalRank,
|
||||
int const totalRanks,
|
||||
|
||||
@@ -200,6 +200,8 @@ namespace RcclUnitTesting
|
||||
numDetectedGpus = min(numDetectedGpus, 16);
|
||||
isGfx94 = false;
|
||||
getArchInfo(&isGfx94, "gfx94");
|
||||
isGfx95 = false;
|
||||
getArchInfo(&isGfx95, "gfx95");
|
||||
isGfx12 = false;
|
||||
getArchInfo(&isGfx12, "gfx12");
|
||||
isGfx90 = false;
|
||||
|
||||
@@ -33,6 +33,7 @@ namespace RcclUnitTesting
|
||||
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
|
||||
|
||||
bool isGfx94; // Detects if architecture is gfx94
|
||||
bool isGfx95; // Detects if architecture is gfx95
|
||||
bool isGfx12; // Detects if architecture is gfx12
|
||||
bool isGfx90; // Detects if architecture is gfx90
|
||||
|
||||
|
||||
@@ -47,6 +47,22 @@ namespace RcclUnitTesting
|
||||
collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
// Check bias allocation if bias is enabled
|
||||
if (collArgs.options.useBias)
|
||||
{
|
||||
if (collArgs.numBiasElements == 0 || collArgs.numBiasBytesAllocated == 0)
|
||||
{
|
||||
ERROR("Bias is enabled but bias buffers are not allocated\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
if (collArgs.numBiasElements != collArgs.numOutputElements)
|
||||
{
|
||||
ERROR("Number of bias elements (%lu) must match number of output elements (%lu)\n",
|
||||
collArgs.numBiasElements, collArgs.numOutputElements);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -108,7 +124,22 @@ namespace RcclUnitTesting
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
// Generate temporary input for this rank
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
if (collArgs.options.inputConstantValue >= 0)
|
||||
{
|
||||
// Use constant value for all input elements across all ranks
|
||||
// This is useful for ncclProd at high rank counts to avoid factorial overflow
|
||||
for (size_t i = 0; i < collArgs.numInputElements; i++)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Set(collArgs.dataType, i,
|
||||
collArgs.options.inputConstantValue,
|
||||
(double)collArgs.options.inputConstantValue));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use rank-based pattern: value[rank][i] = (rank + i) % 256 (default behavior)
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
}
|
||||
|
||||
// Copy the pre-scaled input into GPU memory for the correct rank
|
||||
if (rank == collArgs.globalRank)
|
||||
@@ -144,6 +175,38 @@ namespace RcclUnitTesting
|
||||
{
|
||||
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
|
||||
// Add bias to expected output if bias is enabled
|
||||
if (collArgs.options.useBias && (isAllReduce || collArgs.options.root == collArgs.globalRank))
|
||||
{
|
||||
// Initialize bias data on CPU
|
||||
if (collArgs.options.biasConstantValue >= 0)
|
||||
{
|
||||
// Use constant value for all bias elements (useful for ncclProd to avoid overflow)
|
||||
for (size_t i = 0; i < collArgs.numBiasElements; i++)
|
||||
{
|
||||
CHECK_CALL(collArgs.biasCpu.Set(collArgs.dataType, i,
|
||||
collArgs.options.biasConstantValue,
|
||||
(double)collArgs.options.biasConstantValue));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use incremental pattern: bias[i] = i (default behavior)
|
||||
CHECK_CALL(collArgs.biasCpu.FillPattern(collArgs.dataType, collArgs.numBiasElements, 0, false));
|
||||
}
|
||||
|
||||
// Copy bias data to GPU
|
||||
size_t const biasBytes = collArgs.numBiasBytesAllocated;
|
||||
CHECK_HIP(hipMemcpy(collArgs.biasGpu.ptr, collArgs.biasCpu.ptr, biasBytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Apply bias to expected output using the SAME reduction operation as AllReduce
|
||||
CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, collArgs.biasCpu, tempOp));
|
||||
|
||||
// Update the biasPtr in options to point to the GPU buffer
|
||||
collArgs.options.biasPtr = collArgs.biasGpu.ptr;
|
||||
}
|
||||
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -718,6 +718,11 @@ namespace RcclUnitTesting
|
||||
&numOutputElements);
|
||||
optionalArgs.redOp = redOps[rdIdx];
|
||||
optionalArgs.root = roots[rtIdx] % this->numActiveRanks;
|
||||
// Set biasNumElements if bias is enabled
|
||||
if (optionalArgs.useBias)
|
||||
{
|
||||
optionalArgs.biasNumElements = numOutputElements;
|
||||
}
|
||||
this->SetCollectiveArgs(funcTypes[ftIdx],
|
||||
dataTypes[dtIdx],
|
||||
numInputElements,
|
||||
|
||||
@@ -41,7 +41,7 @@ namespace RcclUnitTesting
|
||||
std::vector<int> const& numStreamsPerGroup,
|
||||
int const numGroupCalls = 1,
|
||||
bool const useBlocking = true);
|
||||
|
||||
|
||||
// Prepare TestBed for use with GPUs across multiple child processes
|
||||
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild,
|
||||
int const numCollectivesInGroup = 1,
|
||||
@@ -96,7 +96,7 @@ namespace RcclUnitTesting
|
||||
// Execute all collectives on all test children
|
||||
// Blocks until collective is completed
|
||||
void ExecuteCollectives(std::vector<int> const ¤tRanks = {},
|
||||
int const groupId = -1,
|
||||
int const groupId = -1,
|
||||
bool const useHipGraph = false);
|
||||
|
||||
// Perform results validation - compare output to expected
|
||||
@@ -140,7 +140,7 @@ namespace RcclUnitTesting
|
||||
int const numGpus,
|
||||
int const ranksPerGpu,
|
||||
const std::vector<int>& gpuPriorityOrder);
|
||||
|
||||
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus,
|
||||
const std::vector<int>& gpuPriorityOrder);
|
||||
|
||||
@@ -598,15 +598,32 @@ namespace RcclUnitTesting
|
||||
"ncclReduceScatter");
|
||||
break;
|
||||
case ncclCollAllReduce:
|
||||
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce(
|
||||
collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.options.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[groupId][localRank][collArg.streamIdx]),
|
||||
"ncclAllReduce");
|
||||
// Use ncclAllReduceWithBias if bias is enabled
|
||||
if (collArg.options.useBias)
|
||||
{
|
||||
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduceWithBias(
|
||||
collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.options.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[groupId][localRank][collArg.streamIdx],
|
||||
collArg.options.biasPtr),
|
||||
"ncclAllReduceWithBias");
|
||||
}
|
||||
else
|
||||
{
|
||||
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce(
|
||||
collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.options.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[groupId][localRank][collArg.streamIdx]),
|
||||
"ncclAllReduce");
|
||||
}
|
||||
break;
|
||||
case ncclCollGather:
|
||||
CHILD_NCCL_CALL_RANK(errCode, ncclGather(
|
||||
@@ -762,7 +779,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
streamsToComplete.erase(streamsToComplete.begin() + i);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
usElapsed = duration_cast<microseconds>(Clock::now() - start).count();
|
||||
}
|
||||
@@ -773,13 +790,13 @@ namespace RcclUnitTesting
|
||||
if (this->verbose) INFO("Collective timed out, aborting\n");
|
||||
for (int localRank : localRanksToExecute)
|
||||
{
|
||||
ncclCommAbort(this->comms[localRank]);
|
||||
ncclCommAbort(this->comms[localRank]);
|
||||
timedout = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// extra sync to flush GPU cache for validation later
|
||||
// TODO: remove this after figuring out & fixing the exact behavior
|
||||
// TODO: remove this after figuring out & fixing the exact behavior
|
||||
// of fencing between kernels and at hipStreamQuery
|
||||
for (int localRank : localRanksToExecute)
|
||||
{
|
||||
@@ -970,7 +987,7 @@ namespace RcclUnitTesting
|
||||
PIPE_READ(groupId);
|
||||
|
||||
// Release graphs
|
||||
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
|
||||
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
|
||||
{
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
for (int streamIdx = 0; streamIdx < this->numStreamsPerGroup[groupId]; ++streamIdx)
|
||||
@@ -989,7 +1006,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
for (int i = 0; i < this->numStreamsPerGroup[groupId]; ++i)
|
||||
CHECK_HIP(hipStreamSynchronize(this->streams[groupId][localRank][i]));
|
||||
}
|
||||
}
|
||||
|
||||
this->graphs[groupId].clear();
|
||||
this->graphExecs[groupId].clear();
|
||||
|
||||
Reference in New Issue
Block a user