diff --git a/test/common/EnvVars.cpp b/test/common/EnvVars.cpp index 268ae5b62e..50d4661055 100644 --- a/test/common/EnvVars.cpp +++ b/test/common/EnvVars.cpp @@ -177,7 +177,7 @@ namespace RcclUnitTesting close(pipefd[0]); close(pipefd[1]); exit(EXIT_SUCCESS); - } + } else { int status; if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL; @@ -205,6 +205,7 @@ namespace RcclUnitTesting isGfx90 = false; getArchInfo(&isGfx90, "gfx90"); + debugPause = GetEnvVar("UT_DEBUG_PAUSE" , 0); showNames = GetEnvVar("UT_SHOW_NAMES" , 1); minGpus = GetEnvVar("UT_MIN_GPUS" , 1); maxGpus = GetEnvVar("UT_MAX_GPUS" , numDetectedGpus); @@ -348,6 +349,7 @@ namespace RcclUnitTesting { std::vector> supported = { + std::make_tuple("UT_DEBUG_PAUSE" , debugPause , "Pause for debugger attach"), std::make_tuple("UT_SHOW_NAMES" , showNames , "Show test case names"), std::make_tuple("UT_MIN_GPUS" , minGpus , "Minimum number of GPUs to use"), std::make_tuple("UT_MAX_GPUS" , maxGpus , "Maximum number of GPUs to use"), diff --git a/test/common/EnvVars.hpp b/test/common/EnvVars.hpp index c557b0050e..914c4c1566 100644 --- a/test/common/EnvVars.hpp +++ b/test/common/EnvVars.hpp @@ -18,6 +18,7 @@ namespace RcclUnitTesting class EnvVars { public: + bool debugPause; // Pause for debugger attach [UT_DEBUG_PAUSE] bool showNames; // List test case names during run [UT_SHOW_NAMES] int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS] int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS] @@ -30,6 +31,7 @@ namespace RcclUnitTesting bool useInteractive; // Run in interactive mode [UT_INTERACTIVE] int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US] bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD] + bool isGfx94; // Detects if architecture is gfx94 bool isGfx12; // Detects if architecture is gfx12 bool isGfx90; // Detects if architecture is gfx90 
diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index f33342ff82..5ea0efce11 100644 --- a/test/common/TestBed.cpp +++ b/test/common/TestBed.cpp @@ -120,6 +120,19 @@ namespace RcclUnitTesting } } + // If debugging is enabled, pause here to allow users to attach debugger + if (ev.debugPause) { + INFO("============================================================\n"); + INFO(" Pausing for debug attach: (e.g. sudo rocgdb -p )\n"); + INFO("============================================================\n"); + for (int childId = 0; childId < this->numActiveChildren; ++childId) { + INFO(" Child %02d: processID: %d\n", childId, childList[childId]->pid); + } + INFO("============================================================\n"); + INFO("\n"); + scanf("%*c"); + } + // Determine number of unique GPUs being used. std::set unique_devices; for (auto a: this->rankToDeviceMap) @@ -252,7 +265,7 @@ namespace RcclUnitTesting std::vector rankList; for (int i = 0; i < this->numActiveRanks; ++i) if (rank == -1 || rank == i) rankList.push_back(i); - + // Build list of groups this applies to (-1 for groupId means to set for all) std::vector groupList; for (int i = 0; i < this->numGroupCalls; ++i) @@ -287,7 +300,7 @@ namespace RcclUnitTesting std::vector rankList; for (int i = 0; i < this->numActiveRanks; ++i) if (rank == -1 || rank == i) rankList.push_back(i); - + // Build list of groups this applies to (-1 for groupId means to set for all) std::vector groupList; for (int i = 0; i < this->numGroupCalls; ++i) @@ -311,7 +324,7 @@ namespace RcclUnitTesting InteractiveWait("Finishing PrepareData"); } - void TestBed::ExecuteCollectives(std::vector const &currentRanks, int const groupId, + void TestBed::ExecuteCollectives(std::vector const &currentRanks, int const groupId, bool const useHipGraph) { InteractiveWait("Starting ExecuteCollectives"); @@ -367,7 +380,7 @@ namespace RcclUnitTesting std::vector rankList; for (int i = 0; i < this->numActiveRanks; ++i) if (rank == -1 || rank == i) 
rankList.push_back(i); - + // Build list of groups this applies to (-1 for groupId means to set for all) std::vector groupList; for (int i = 0; i < this->numGroupCalls; ++i) @@ -408,7 +421,7 @@ namespace RcclUnitTesting if (groupId == -1 || groupId == i) groupList.push_back(i); int const cmd = TestBedChild::CHILD_LAUNCH_GRAPHS; - for (auto currGroup : groupList) + for (auto currGroup : groupList) { for (int childId = 0; childId < this->numActiveChildren; ++childId) { @@ -550,7 +563,7 @@ namespace RcclUnitTesting return ev.GetAllSupportedDataTypes(); } - std::vector const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup, + std::vector const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup, int numGroupCalls) { return std::vector(numGroupCalls, numCollectivesInGroup);