Adding UT_DEBUG_PAUSE to unit tests (#1653)

Этот коммит содержится в:
gilbertlee-amd
2025-04-21 21:15:07 -06:00
коммит произвёл GitHub
родитель ac8ec4c08c
Коммит ee85a70bb4
3 изменённых файла: 24 добавления и 7 удалений
+3 -1
Просмотреть файл
@@ -177,7 +177,7 @@ namespace RcclUnitTesting
close(pipefd[0]);
close(pipefd[1]);
exit(EXIT_SUCCESS);
}
}
else {
int status;
if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
@@ -205,6 +205,7 @@ namespace RcclUnitTesting
isGfx90 = false;
getArchInfo(&isGfx90, "gfx90");
debugPause = GetEnvVar("UT_DEBUG_PAUSE" , 0);
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
minGpus = GetEnvVar("UT_MIN_GPUS" , 1);
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDetectedGpus);
@@ -348,6 +349,7 @@ namespace RcclUnitTesting
{
std::vector<std::tuple<std::string, int, std::string>> supported =
{
std::make_tuple("UT_DEBUG_PAUSE" , debugPause , "Pause for debugger attach"),
std::make_tuple("UT_SHOW_NAMES" , showNames , "Show test case names"),
std::make_tuple("UT_MIN_GPUS" , minGpus , "Minimum number of GPUs to use"),
std::make_tuple("UT_MAX_GPUS" , maxGpus , "Maximum number of GPUs to use"),
+2
Просмотреть файл
@@ -18,6 +18,7 @@ namespace RcclUnitTesting
class EnvVars
{
public:
bool debugPause; // Pause for debugger attach [UT_DEBUG_PAUSE]
bool showNames; // List test case names during run [UT_SHOW_NAMES]
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
@@ -30,6 +31,7 @@ namespace RcclUnitTesting
bool useInteractive; // Run in interactive mode [UT_INTERACTIVE]
int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US]
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
bool isGfx94; // Detects if architecture is gfx94
bool isGfx12; // Detects if architecture is gfx12
bool isGfx90; // Detects if architecture is gfx90
+19 -6
Просмотреть файл
@@ -120,6 +120,19 @@ namespace RcclUnitTesting
}
}
// Optional debugger-attach pause, taken only when ev.debugPause is set
// (the flag is read from the UT_DEBUG_PAUSE environment variable).
// Prints the process ID of every active child so a debugger can be attached
// to it, then blocks until a line has been entered on stdin.
if (ev.debugPause) {
  INFO("============================================================\n");
  INFO(" Pausing for debug attach: (e.g. sudo rocgdb -p <PID>)\n");
  INFO("============================================================\n");
  for (int childId = 0; childId < this->numActiveChildren; ++childId) {
    INFO(" Child %02d: processID: %d\n", childId, childList[childId]->pid);
  }
  INFO("============================================================\n");
  INFO("<Press enter to continue>\n");
  // Drain the entire input line (stopping at EOF as well) instead of reading
  // a single character: otherwise anything typed before Enter stays queued in
  // stdin and a later blocking read would return immediately.
  for (int ch = getchar(); ch != '\n' && ch != EOF; ch = getchar()) {
  }
}
// Determine number of unique GPUs being used.
std::set<int> unique_devices;
for (auto a: this->rankToDeviceMap)
@@ -252,7 +265,7 @@ namespace RcclUnitTesting
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
// Build list of groups this applies to (-1 for groupId means to set for all)
std::vector<int> groupList;
for (int i = 0; i < this->numGroupCalls; ++i)
@@ -287,7 +300,7 @@ namespace RcclUnitTesting
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
// Build list of groups this applies to (-1 for groupId means to set for all)
std::vector<int> groupList;
for (int i = 0; i < this->numGroupCalls; ++i)
@@ -311,7 +324,7 @@ namespace RcclUnitTesting
InteractiveWait("Finishing PrepareData");
}
void TestBed::ExecuteCollectives(std::vector<int> const &currentRanks, int const groupId,
void TestBed::ExecuteCollectives(std::vector<int> const &currentRanks, int const groupId,
bool const useHipGraph)
{
InteractiveWait("Starting ExecuteCollectives");
@@ -367,7 +380,7 @@ namespace RcclUnitTesting
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
// Build list of groups this applies to (-1 for groupId means to set for all)
std::vector<int> groupList;
for (int i = 0; i < this->numGroupCalls; ++i)
@@ -408,7 +421,7 @@ namespace RcclUnitTesting
if (groupId == -1 || groupId == i) groupList.push_back(i);
int const cmd = TestBedChild::CHILD_LAUNCH_GRAPHS;
for (auto currGroup : groupList)
for (auto currGroup : groupList)
{
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
@@ -550,7 +563,7 @@ namespace RcclUnitTesting
return ev.GetAllSupportedDataTypes();
}
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
int numGroupCalls)
{
return std::vector<int>(numGroupCalls, numCollectivesInGroup);