Adding UT_DEBUG_PAUSE to unit tests (#1653)
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
ac8ec4c08c
Коммит
ee85a70bb4
@@ -177,7 +177,7 @@ namespace RcclUnitTesting
|
||||
close(pipefd[0]);
|
||||
close(pipefd[1]);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int status;
|
||||
if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
|
||||
@@ -205,6 +205,7 @@ namespace RcclUnitTesting
|
||||
isGfx90 = false;
|
||||
getArchInfo(&isGfx90, "gfx90");
|
||||
|
||||
debugPause = GetEnvVar("UT_DEBUG_PAUSE" , 0);
|
||||
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
|
||||
minGpus = GetEnvVar("UT_MIN_GPUS" , 1);
|
||||
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDetectedGpus);
|
||||
@@ -348,6 +349,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
std::vector<std::tuple<std::string, int, std::string>> supported =
|
||||
{
|
||||
std::make_tuple("UT_DEBUG_PAUSE" , debugPause , "Pause for debugger attach"),
|
||||
std::make_tuple("UT_SHOW_NAMES" , showNames , "Show test case names"),
|
||||
std::make_tuple("UT_MIN_GPUS" , minGpus , "Minimum number of GPUs to use"),
|
||||
std::make_tuple("UT_MAX_GPUS" , maxGpus , "Maximum number of GPUs to use"),
|
||||
|
||||
@@ -18,6 +18,7 @@ namespace RcclUnitTesting
|
||||
class EnvVars
|
||||
{
|
||||
public:
|
||||
bool debugPause; // Pause for debugger attach [UT_DEBUG_PAUSE]
|
||||
bool showNames; // List test case names during run [UT_SHOW_NAMES]
|
||||
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
|
||||
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
|
||||
@@ -30,6 +31,7 @@ namespace RcclUnitTesting
|
||||
bool useInteractive; // Run in interactive mode [UT_INTERACTIVE]
|
||||
int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US]
|
||||
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
|
||||
|
||||
bool isGfx94; // Detects if architecture is gfx94
|
||||
bool isGfx12; // Detects if architecture is gfx12
|
||||
bool isGfx90; // Detects if architecture is gfx90
|
||||
|
||||
@@ -120,6 +120,19 @@ namespace RcclUnitTesting
|
||||
}
|
||||
}
|
||||
|
||||
// If debugging is enabled, pause here to allow users to attach debugger
|
||||
if (ev.debugPause) {
|
||||
INFO("============================================================\n");
|
||||
INFO(" Pausing for debug attach: (e.g. sudo rocgdb -p <PID>)\n");
|
||||
INFO("============================================================\n");
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId) {
|
||||
INFO(" Child %02d: processID: %d\n", childId, childList[childId]->pid);
|
||||
}
|
||||
INFO("============================================================\n");
|
||||
INFO("<Press enter to continue>\n");
|
||||
scanf("%*c");
|
||||
}
|
||||
|
||||
// Determine number of unique GPUs being used.
|
||||
std::set<int> unique_devices;
|
||||
for (auto a: this->rankToDeviceMap)
|
||||
@@ -252,7 +265,7 @@ namespace RcclUnitTesting
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
|
||||
// Build list of groups this applies to (-1 for groupId means to set for all)
|
||||
std::vector<int> groupList;
|
||||
for (int i = 0; i < this->numGroupCalls; ++i)
|
||||
@@ -287,7 +300,7 @@ namespace RcclUnitTesting
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
|
||||
// Build list of groups this applies to (-1 for groupId means to set for all)
|
||||
std::vector<int> groupList;
|
||||
for (int i = 0; i < this->numGroupCalls; ++i)
|
||||
@@ -311,7 +324,7 @@ namespace RcclUnitTesting
|
||||
InteractiveWait("Finishing PrepareData");
|
||||
}
|
||||
|
||||
void TestBed::ExecuteCollectives(std::vector<int> const &currentRanks, int const groupId,
|
||||
void TestBed::ExecuteCollectives(std::vector<int> const &currentRanks, int const groupId,
|
||||
bool const useHipGraph)
|
||||
{
|
||||
InteractiveWait("Starting ExecuteCollectives");
|
||||
@@ -367,7 +380,7 @@ namespace RcclUnitTesting
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
|
||||
// Build list of groups this applies to (-1 for groupId means to set for all)
|
||||
std::vector<int> groupList;
|
||||
for (int i = 0; i < this->numGroupCalls; ++i)
|
||||
@@ -408,7 +421,7 @@ namespace RcclUnitTesting
|
||||
if (groupId == -1 || groupId == i) groupList.push_back(i);
|
||||
|
||||
int const cmd = TestBedChild::CHILD_LAUNCH_GRAPHS;
|
||||
for (auto currGroup : groupList)
|
||||
for (auto currGroup : groupList)
|
||||
{
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
@@ -550,7 +563,7 @@ namespace RcclUnitTesting
|
||||
return ev.GetAllSupportedDataTypes();
|
||||
}
|
||||
|
||||
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
|
||||
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
|
||||
int numGroupCalls)
|
||||
{
|
||||
return std::vector<int>(numGroupCalls, numCollectivesInGroup);
|
||||
|
||||
Ссылка в новой задаче
Block a user