Added new unit tests for AllReduce with Bias API (#2036)

* Added new unit tests for AllReduce with Bias API

* Address review comments

[ROCm/rccl commit: 7c12b0b76b]
This commit is contained in:
Atul Kulkarni
2025-12-03 17:37:34 -06:00
zatwierdzone przez GitHub
rodzic 3e650467fa
commit e4aef19511
9 zmienionych plików z 586 dodań i 18 usunięć
+444
Wyświetl plik
@@ -249,4 +249,448 @@ namespace RcclUnitTesting
}
callCollectiveForked(nranks, ncclCollAllReduce, sendBuff, recvBuff, expected, use_managed_mem);
}
#ifdef RCCL_ALLREDUCE_WITH_BIAS
// Note: All bias tests require:
// nRanks >= 2 (bias NOT supported for single rank)
// Named constants for bias test configuration
namespace BiasTestConstants
{
// Element counts for different operations
constexpr std::initializer_list<int> STANDARD_ELEM_COUNTS = {2048, 384}; // For Sum/Max/Min
constexpr std::initializer_list<int> PROD_ELEM_COUNTS_MEDIUM = {32}; // For Int32/Uint32 Prod
constexpr std::initializer_list<int> PROD_ELEM_COUNTS_LARGE = {64}; // For Int8/Uint8/Int64/Uint64/Float Prod
// Bias and input pattern constants
constexpr int BIAS_CONSTANT_ONE = 1; // Use constant bias value of 1 (prevents overflow)
constexpr int BIAS_INCREMENTAL_PATTERN
= -1; // Use incremental pattern: bias[i] = i (more thorough testing)
constexpr int INPUT_RANK_BASED_PATTERN
= -1; // Use rank-based pattern: input[rank][i] = (rank+i)%256
constexpr int INPUT_CONSTANT_ONE = 1; // Use constant input value of 1 (prevents overflow)
} // namespace BiasTestConstants
/*
* @brief Helper function for running bias tests with specific datatype and redOp
* @param dataType Data type
* @param redOp Reduction operation
* @param numElements Number of elements
* @param biasConstVal Bias constant value, -1 for incremental bias
* @param inputConstVal Input constant value, -1 for rank-based input
*/
void RunBiasTest(ncclDataType_t dataType,
ncclRedOp_t redOp,
std::vector<int> numElements,
int biasConstVal = BiasTestConstants::BIAS_INCREMENTAL_PATTERN,
int inputConstVal = BiasTestConstants::INPUT_RANK_BASED_PATTERN)
{
// Create TestBed first (doesn't create child processes yet)
TestBed testBed;
// Check if architecture is gfx94 (covers gfx942) or gfx95 (covers gfx950)
if (!testBed.ev.isGfx94 && !testBed.ev.isGfx95)
{
INFO("SKIPPED: AllReduce with Bias is only supported on gfx942 or gfx950 architectures.\n");
return;
}
bool const inPlace = false;
bool const useManagedMem = false;
bool const useHipGraph = false;
OptionalColArgs options;
options.useBias = true;
options.redOp = redOp;
options.biasConstantValue = biasConstVal;
options.inputConstantValue = inputConstVal;
bool isCorrect = true;
for(int totalRanks : testBed.ev.GetNumGpusList())
{
if(totalRanks < 2)
continue;
int const numProcesses = totalRanks;
bool const isMultiProcess = true;
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
for(auto numElem : numElements)
{
if(!isCorrect)
break;
if(testBed.ev.showNames)
{
std::string name = testBed.GetTestCaseName(totalRanks,
isMultiProcess,
ncclCollAllReduce,
dataType,
redOp,
-1,
inPlace,
useManagedMem,
useHipGraph);
INFO(" %s (with bias, count=%d)\n", name.c_str(), numElem);
}
options.biasNumElements = numElem;
testBed.SetCollectiveArgs(ncclCollAllReduce,
dataType,
numElem,
numElem,
options,
-1,
0,
-1);
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
testBed.ExecuteCollectives({}, useHipGraph);
testBed.ValidateResults(isCorrect);
testBed.DeallocateMem();
}
testBed.DestroyComms();
}
testBed.Finalize();
}
// Int8 Tests
TEST(AllReduce, BiasInt8_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt8,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt8_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt8,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt8_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt8,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt8_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt8,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_CONSTANT_ONE,
INPUT_CONSTANT_ONE);
}
// Uint8 Tests
TEST(AllReduce, BiasUint8_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint8,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint8_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint8,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint8_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint8,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint8_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint8,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_CONSTANT_ONE,
INPUT_CONSTANT_ONE);
}
// Int32 Tests
TEST(AllReduce, BiasInt32_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt32,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt32_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt32,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt32_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt32,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt32_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt32,
ncclProd,
PROD_ELEM_COUNTS_MEDIUM,
BIAS_CONSTANT_ONE,
INPUT_CONSTANT_ONE);
}
// Uint32 Tests
TEST(AllReduce, BiasUint32_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint32,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint32_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint32,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint32_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint32,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_CONSTANT_ONE,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint32_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint32,
ncclProd,
PROD_ELEM_COUNTS_MEDIUM,
BIAS_CONSTANT_ONE,
INPUT_CONSTANT_ONE);
}
// Int64 Tests
TEST(AllReduce, BiasInt64_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt64,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt64_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt64,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt64_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt64,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasInt64_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclInt64,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_INCREMENTAL_PATTERN,
INPUT_CONSTANT_ONE);
}
// Uint64 Tests
TEST(AllReduce, BiasUint64_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint64,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint64_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint64,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint64_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint64,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasUint64_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclUint64,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_INCREMENTAL_PATTERN,
INPUT_CONSTANT_ONE);
}
// Float32 Tests
TEST(AllReduce, BiasFloat32_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat32,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat32_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat32,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat32_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat32,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat32_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat32,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_INCREMENTAL_PATTERN,
INPUT_CONSTANT_ONE);
}
// Float64 Tests
TEST(AllReduce, BiasFloat64_Sum)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat64,
ncclSum,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat64_Max)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat64,
ncclMax,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat64_Min)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat64,
ncclMin,
STANDARD_ELEM_COUNTS,
BIAS_INCREMENTAL_PATTERN,
INPUT_RANK_BASED_PATTERN);
}
TEST(AllReduce, BiasFloat64_Prod)
{
using namespace BiasTestConstants;
RunBiasTest(ncclFloat64,
ncclProd,
PROD_ELEM_COUNTS_LARGE,
BIAS_INCREMENTAL_PATTERN,
INPUT_CONSTANT_ONE);
}
#else
// If RCCL_ALLREDUCE_WITH_BIAS is not defined, skip all bias tests
TEST(AllReduce, BiasNotAvailable)
{
INFO("SKIPPED: RCCL_ALLREDUCE_WITH_BIAS not defined - bias tests skipped\n");
return;
}
#endif
}
@@ -102,6 +102,17 @@ namespace RcclUnitTesting
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
}
CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
// Allocate bias buffers if bias is enabled
if (this->options.useBias)
{
this->numBiasElements = this->options.biasNumElements;
this->numBiasBytesAllocated = this->numBiasElements * DataTypeToBytes(this->dataType);
CHECK_CALL(this->biasGpu.AllocateGpuMem(this->numBiasBytesAllocated, useManagedMem, userRegistered));
CHECK_CALL(this->biasCpu.AllocateCpuMem(this->numBiasBytesAllocated));
this->biasRegHandle = nullptr;
}
return TEST_SUCCESS;
}
@@ -155,6 +166,15 @@ namespace RcclUnitTesting
if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
this->localScalar.Attach(nullptr);
}
// Deallocate bias buffers if they were allocated
if (this->options.useBias && this->numBiasBytesAllocated > 0)
{
this->biasGpu.FreeGpuMem(this->userRegistered);
this->biasCpu.FreeCpuMem();
this->biasRegHandle = nullptr;
}
return TEST_SUCCESS;
}
@@ -82,6 +82,15 @@ namespace RcclUnitTesting
ScalarTransport scalarTransport; // Used for custom reduction operators
int scalarMode = -1; // -1 if scalar not used
// Bias support for fused AllReduce+Bias operations
bool useBias = false; // Enable bias addition
void* biasPtr = nullptr; // Pointer to bias buffer (GPU memory)
size_t biasNumElements = 0; // Number of elements in bias buffer
int biasConstantValue = -1; // If >= 0, use constant value for all bias elements (instead of incremental pattern)
// Input data pattern control (useful for ncclProd to avoid overflow at high rank counts)
int inputConstantValue = -1; // If >= 0, use constant value for all input elements (instead of rank-based pattern)
// allToAllv args
size_t sendcounts[MAX_RANKS*MAX_RANKS];
size_t sdispls[MAX_RANKS*MAX_RANKS];
@@ -122,6 +131,13 @@ namespace RcclUnitTesting
size_t numInputElementsAllocated;
size_t numOutputElementsAllocated;
// Bias data for fused AllReduce+Bias operations
PtrUnion biasGpu; // Bias buffer on GPU
PtrUnion biasCpu; // Bias buffer on CPU (for initialization/validation)
void* biasRegHandle; // Handle for registered bias buffer
size_t numBiasElements; // Number of elements in bias buffer
size_t numBiasBytesAllocated; // Number of bytes allocated for bias
// Set collective arguments
ErrCode SetArgs(int const globalRank,
int const totalRanks,
@@ -200,6 +200,8 @@ namespace RcclUnitTesting
numDetectedGpus = min(numDetectedGpus, 16);
isGfx94 = false;
getArchInfo(&isGfx94, "gfx94");
isGfx95 = false;
getArchInfo(&isGfx95, "gfx95");
isGfx12 = false;
getArchInfo(&isGfx12, "gfx12");
isGfx90 = false;
@@ -33,6 +33,7 @@ namespace RcclUnitTesting
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
bool isGfx94; // Detects if architecture is gfx94
bool isGfx95; // Detects if architecture is gfx95
bool isGfx12; // Detects if architecture is gfx12
bool isGfx90; // Detects if architecture is gfx90
@@ -47,6 +47,22 @@ namespace RcclUnitTesting
collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
return TEST_FAIL;
}
// Check bias allocation if bias is enabled
if (collArgs.options.useBias)
{
if (collArgs.numBiasElements == 0 || collArgs.numBiasBytesAllocated == 0)
{
ERROR("Bias is enabled but bias buffers are not allocated\n");
return TEST_FAIL;
}
if (collArgs.numBiasElements != collArgs.numOutputElements)
{
ERROR("Number of bias elements (%lu) must match number of output elements (%lu)\n",
collArgs.numBiasElements, collArgs.numOutputElements);
return TEST_FAIL;
}
}
return TEST_SUCCESS;
}
@@ -108,7 +124,22 @@ namespace RcclUnitTesting
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
// Generate temporary input for this rank
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
if (collArgs.options.inputConstantValue >= 0)
{
// Use constant value for all input elements across all ranks
// This is useful for ncclProd at high rank counts to avoid factorial overflow
for (size_t i = 0; i < collArgs.numInputElements; i++)
{
CHECK_CALL(tempInputCpu.Set(collArgs.dataType, i,
collArgs.options.inputConstantValue,
(double)collArgs.options.inputConstantValue));
}
}
else
{
// Use rank-based pattern: value[rank][i] = (rank + i) % 256 (default behavior)
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
}
// Copy the pre-scaled input into GPU memory for the correct rank
if (rank == collArgs.globalRank)
@@ -144,6 +175,38 @@ namespace RcclUnitTesting
{
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
}
// Add bias to expected output if bias is enabled
if (collArgs.options.useBias && (isAllReduce || collArgs.options.root == collArgs.globalRank))
{
// Initialize bias data on CPU
if (collArgs.options.biasConstantValue >= 0)
{
// Use constant value for all bias elements (useful for ncclProd to avoid overflow)
for (size_t i = 0; i < collArgs.numBiasElements; i++)
{
CHECK_CALL(collArgs.biasCpu.Set(collArgs.dataType, i,
collArgs.options.biasConstantValue,
(double)collArgs.options.biasConstantValue));
}
}
else
{
// Use incremental pattern: bias[i] = i (default behavior)
CHECK_CALL(collArgs.biasCpu.FillPattern(collArgs.dataType, collArgs.numBiasElements, 0, false));
}
// Copy bias data to GPU
size_t const biasBytes = collArgs.numBiasBytesAllocated;
CHECK_HIP(hipMemcpy(collArgs.biasGpu.ptr, collArgs.biasCpu.ptr, biasBytes, hipMemcpyHostToDevice));
// Apply bias to expected output using the SAME reduction operation as AllReduce
CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, collArgs.biasCpu, tempOp));
// Update the biasPtr in options to point to the GPU buffer
collArgs.options.biasPtr = collArgs.biasGpu.ptr;
}
return TEST_SUCCESS;
}
@@ -718,6 +718,11 @@ namespace RcclUnitTesting
&numOutputElements);
optionalArgs.redOp = redOps[rdIdx];
optionalArgs.root = roots[rtIdx] % this->numActiveRanks;
// Set biasNumElements if bias is enabled
if (optionalArgs.useBias)
{
optionalArgs.biasNumElements = numOutputElements;
}
this->SetCollectiveArgs(funcTypes[ftIdx],
dataTypes[dtIdx],
numInputElements,
+3 -3
Wyświetl plik
@@ -41,7 +41,7 @@ namespace RcclUnitTesting
std::vector<int> const& numStreamsPerGroup,
int const numGroupCalls = 1,
bool const useBlocking = true);
// Prepare TestBed for use with GPUs across multiple child processes
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild,
int const numCollectivesInGroup = 1,
@@ -96,7 +96,7 @@ namespace RcclUnitTesting
// Execute all collectives on all test children
// Blocks until collective is completed
void ExecuteCollectives(std::vector<int> const &currentRanks = {},
int const groupId = -1,
int const groupId = -1,
bool const useHipGraph = false);
// Perform results validation - compare output to expected
@@ -140,7 +140,7 @@ namespace RcclUnitTesting
int const numGpus,
int const ranksPerGpu,
const std::vector<int>& gpuPriorityOrder);
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus,
const std::vector<int>& gpuPriorityOrder);
+31 -14
Wyświetl plik
@@ -598,15 +598,32 @@ namespace RcclUnitTesting
"ncclReduceScatter");
break;
case ncclCollAllReduce:
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce(
collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.options.redOp,
this->comms[localRank],
this->streams[groupId][localRank][collArg.streamIdx]),
"ncclAllReduce");
// Use ncclAllReduceWithBias if bias is enabled
if (collArg.options.useBias)
{
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduceWithBias(
collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.options.redOp,
this->comms[localRank],
this->streams[groupId][localRank][collArg.streamIdx],
collArg.options.biasPtr),
"ncclAllReduceWithBias");
}
else
{
CHILD_NCCL_CALL_RANK(errCode, ncclAllReduce(
collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.options.redOp,
this->comms[localRank],
this->streams[groupId][localRank][collArg.streamIdx]),
"ncclAllReduce");
}
break;
case ncclCollGather:
CHILD_NCCL_CALL_RANK(errCode, ncclGather(
@@ -762,7 +779,7 @@ namespace RcclUnitTesting
{
streamsToComplete.erase(streamsToComplete.begin() + i);
i--;
}
}
}
usElapsed = duration_cast<microseconds>(Clock::now() - start).count();
}
@@ -773,13 +790,13 @@ namespace RcclUnitTesting
if (this->verbose) INFO("Collective timed out, aborting\n");
for (int localRank : localRanksToExecute)
{
ncclCommAbort(this->comms[localRank]);
ncclCommAbort(this->comms[localRank]);
timedout = 1;
}
}
// extra sync to flush GPU cache for validation later
// TODO: remove this after figuring out & fixing the exact behavior
// TODO: remove this after figuring out & fixing the exact behavior
// of fencing between kernels and at hipStreamQuery
for (int localRank : localRanksToExecute)
{
@@ -970,7 +987,7 @@ namespace RcclUnitTesting
PIPE_READ(groupId);
// Release graphs
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
{
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
for (int streamIdx = 0; streamIdx < this->numStreamsPerGroup[groupId]; ++streamIdx)
@@ -989,7 +1006,7 @@ namespace RcclUnitTesting
{
for (int i = 0; i < this->numStreamsPerGroup[groupId]; ++i)
CHECK_HIP(hipStreamSynchronize(this->streams[groupId][localRank][i]));
}
}
this->graphs[groupId].clear();
this->graphExecs[groupId].clear();